mirror of https://github.com/FFmpeg/FFmpeg.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1868 lines
70 KiB
1868 lines
70 KiB
/* |
|
* Loongson LSX optimized swscale |
|
* |
|
* Copyright (c) 2023 Loongson Technology Corporation Limited |
|
* Contributed by Lu Wang <wanglu@loongson.cn> |
|
* |
|
* This file is part of FFmpeg. |
|
* |
|
* FFmpeg is free software; you can redistribute it and/or |
|
* modify it under the terms of the GNU Lesser General Public |
|
* License as published by the Free Software Foundation; either |
|
* version 2.1 of the License, or (at your option) any later version. |
|
* |
|
* FFmpeg is distributed in the hope that it will be useful, |
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
* Lesser General Public License for more details. |
|
* |
|
* You should have received a copy of the GNU Lesser General Public |
|
* License along with FFmpeg; if not, write to the Free Software |
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
*/ |
|
|
|
#include "libavcodec/loongarch/loongson_asm.S" |
|
|
|
/* void ff_hscale_8_to_15_lsx(SwsContext *c, int16_t *dst, int dstW,
 *                            const uint8_t *src, const int16_t *filter,
 *                            const int32_t *filterPos, int filterSize)
 *
 * Horizontal FIR scaler, 8-bit input samples to 15-bit output samples.
 * For every output sample i in [0, dstW):
 *
 *     sum    = SUM(j = 0 .. filterSize - 1)
 *                  src[filterPos[i] + j] * filter[filterSize * i + j]
 *     dst[i] = min(sum >> 7, 32767)
 *
 * Arguments: a0 = c (unused), a1 = dst, a2 = dstW, a3 = src, a4 = filter,
 *            a5 = filterPos, a6 = filterSize.
 * Constants: t0 = 32767 (upper clamp), t7 = 4, t8 = 8, vr0 = 0,
 *            vr20 = t0 replicated into every 32-bit lane.
 *
 * Code paths:
 *   filterSize == 4 : .LOOP_DSTW4, 8 outputs per iteration
 *   filterSize == 8 : .LOOP_DSTW8, 8 outputs per iteration
 *   filterSize  > 8 : .LOOP_START, 4 outputs per iteration; taps are
 *                     consumed 8 at a time with a scalar tail for the rest
 *   anything else   : .END_DSTW4, scalar, one output per iteration
 *
 * Every vector loop finishes a whole iteration once entered, so it is only
 * entered while at least one full iteration of outputs remains.  Whatever is
 * left (including a whole row shorter than one vector iteration) is produced
 * by the scalar loop at .END_DSTW4.  Nothing is written when dstW <= 0.
 */
function ff_hscale_8_to_15_lsx
    /* 9 callee-saved registers need 72 bytes; the frame is rounded up to 80
     * so that sp stays 16-byte aligned. */
    addi.d          sp, sp, -80
    st.d            s0, sp, 0
    st.d            s1, sp, 8
    st.d            s2, sp, 16
    st.d            s3, sp, 24
    st.d            s4, sp, 32
    st.d            s5, sp, 40
    st.d            s6, sp, 48
    st.d            s7, sp, 56
    st.d            s8, sp, 64
    li.w            t0, 32767
    li.w            t8, 8
    li.w            t7, 4
    vldi            vr0, 0
    vreplgr2vr.w    vr20, t0
    /* Dispatch.  Rows too short for one full vector iteration go straight to
     * the scalar loop so that no vector path can run past dstW. */
    bge             zero, a2, .END_LOOP     /* dstW <= 0: nothing to do */
    blt             a2, t7, .END_DSTW4      /* dstW < 4: scalar only */
    blt             t8, a6, .LOOP_START     /* filterSize > 8 (dstW >= 4) */
    blt             a2, t8, .END_DSTW4      /* 4/8-tap loops need 8 outputs */
    beq             a6, t7, .LOOP_DSTW4
    beq             a6, t8, .LOOP_DSTW8
    b               .END_DSTW4

    /* ---- filterSize > 8: one group of 4 outputs per pass ----------------
     * t1 = index (0..3) of the output inside the group (scalar tail only)
     * s1 = number of taps already consumed by the vector steps
     * s2..s5 = scalar tail sums, vr22 = vector sums, for outputs 0..3
     * s7 / s8 / t6 = byte offsets of filter rows 1 / 2 / 3 from row 0
     */
.LOOP_START:
    li.w            t1, 0
    li.w            s1, 0
    li.w            s2, 0
    li.w            s3, 0
    li.w            s4, 0
    li.w            s5, 0
    vldi            vr22, 0
    addi.w          s0, a6, -7              /* vector steps run while s1 < filterSize - 7 */
    slli.w          s7, a6, 1               /* filterSize * 2 bytes */
    slli.w          s8, a6, 2               /* filterSize * 4 bytes */
    add.w           t6, s7, s8              /* filterSize * 6 bytes */
.LOOP_DSTW:
    ld.w            t2, a5, 0               /* filterPos of the 4 outputs */
    ld.w            t3, a5, 4
    ld.w            t4, a5, 8
    ld.w            t5, a5, 12
    fldx.d          f1, a3, t2              /* 8 source bytes per output */
    fldx.d          f2, a3, t3
    fldx.d          f3, a3, t4
    fldx.d          f4, a3, t5
    vld             vr9, a4, 0              /* 8 coefficients per output */
    vldx            vr10, a4, s7
    vldx            vr11, a4, s8
    vldx            vr12, a4, t6
    vilvl.b         vr1, vr0, vr1           /* interleave with zero: bytes -> halfwords */
    vilvl.b         vr2, vr0, vr2
    vilvl.b         vr3, vr0, vr3
    vilvl.b         vr4, vr0, vr4
    vdp2.w.h        vr17, vr1, vr9          /* pairwise products summed into 4 words */
    vdp2.w.h        vr18, vr2, vr10
    vdp2.w.h        vr19, vr3, vr11
    vdp2.w.h        vr21, vr4, vr12
    vhaddw.d.w      vr1, vr17, vr17         /* horizontal add: 4 words -> 2 dwords */
    vhaddw.d.w      vr2, vr18, vr18
    vhaddw.d.w      vr3, vr19, vr19
    vhaddw.d.w      vr4, vr21, vr21
    vhaddw.q.d      vr1, vr1, vr1           /* 2 dwords -> 1 total per output */
    vhaddw.q.d      vr2, vr2, vr2
    vhaddw.q.d      vr3, vr3, vr3
    vhaddw.q.d      vr4, vr4, vr4
    vilvl.w         vr1, vr2, vr1           /* gather the 4 totals into one vector */
    vilvl.w         vr3, vr4, vr3
    vilvl.d         vr1, vr3, vr1
    vadd.w          vr22, vr22, vr1
    addi.w          s1, s1, 8
    addi.d          a3, a3, 8               /* src and filter move on by 8 taps */
    addi.d          a4, a4, 16
    blt             s1, s0, .LOOP_DSTW
    blt             s1, a6, .DSTWA          /* taps left over: scalar tail */
    b               .END_FILTER

    /* Scalar tail, one loop per output of the group.  a3/a4 already point
     * past the vectorised taps, so the tap index t3 restarts at 0 while s6
     * counts absolute taps from s1 up to filterSize. */
.DSTWA:
    ld.w            t2, a5, 0
    li.w            t3, 0
    move            s6, s1
.FILTERSIZEA:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4              /* source sample */
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6              /* coefficient */
    mul.w           t6, t5, t6
    add.w           s2, s2, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERSIZEA

    ld.w            t2, a5, 4
    li.w            t3, 0
    move            s6, s1
    addi.w          t1, t1, 1
.FILTERSIZEB:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s3, s3, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERSIZEB
    ld.w            t2, a5, 8
    addi.w          t1, t1, 1
    li.w            t3, 0
    move            s6, s1
.FILTERSIZEC:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s4, s4, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERSIZEC
    ld.w            t2, a5, 12
    addi.w          t1, t1, 1
    move            s6, s1
    li.w            t3, 0
.FILTERSIZED:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s5, s5, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERSIZED
.END_FILTER:
    vpickve2gr.w    t1, vr22, 0             /* vector sums + scalar tail sums */
    vpickve2gr.w    t2, vr22, 1
    vpickve2gr.w    t3, vr22, 2
    vpickve2gr.w    t4, vr22, 3
    add.w           s2, s2, t1
    add.w           s3, s3, t2
    add.w           s4, s4, t3
    add.w           s5, s5, t4
    srai.w          s2, s2, 7
    srai.w          s3, s3, 7
    srai.w          s4, s4, 7
    srai.w          s5, s5, 7
    slt             t1, s2, t0              /* 1 when the value is below the clamp */
    slt             t2, s3, t0
    slt             t3, s4, t0
    slt             t4, s5, t0
    maskeqz         s2, s2, t1              /* keep the value when below ... */
    maskeqz         s3, s3, t2
    maskeqz         s4, s4, t3
    maskeqz         s5, s5, t4
    masknez         t1, t0, t1              /* ... otherwise select 32767 */
    masknez         t2, t0, t2
    masknez         t3, t0, t3
    masknez         t4, t0, t4
    or              s2, s2, t1
    or              s3, s3, t2
    or              s4, s4, t3
    or              s5, s5, t4
    st.h            s2, a1, 0
    st.h            s3, a1, 2
    st.h            s4, a1, 4
    st.h            s5, a1, 6

    addi.d          a1, a1, 8
    sub.d           a3, a3, s1              /* rewind src to the start of the row */
    addi.d          a5, a5, 16
    slli.d          t3, a6, 3               /* filter += 4 rows, minus the 2 * s1 */
    add.d           a4, a4, t3              /* bytes already stepped over above */
    sub.d           a4, a4, s1
    sub.d           a4, a4, s1
    addi.d          a2, a2, -4
    bge             a2, t7, .LOOP_START
    blt             zero, a2, .END_DSTW4    /* 1..3 outputs left: scalar loop */
    b               .END_LOOP

    /* ---- filterSize == 8: 8 outputs per iteration ---------------------- */
.LOOP_DSTW8:
    ld.w            t1, a5, 0
    ld.w            t2, a5, 4
    ld.w            t3, a5, 8
    ld.w            t4, a5, 12
    fldx.d          f1, a3, t1
    fldx.d          f2, a3, t2
    fldx.d          f3, a3, t3
    fldx.d          f4, a3, t4
    ld.w            t1, a5, 16
    ld.w            t2, a5, 20
    ld.w            t3, a5, 24
    ld.w            t4, a5, 28
    fldx.d          f5, a3, t1
    fldx.d          f6, a3, t2
    fldx.d          f7, a3, t3
    fldx.d          f8, a3, t4
    vld             vr9, a4, 0              /* one 8-coefficient row per output */
    vld             vr10, a4, 16
    vld             vr11, a4, 32
    vld             vr12, a4, 48
    vld             vr13, a4, 64
    vld             vr14, a4, 80
    vld             vr15, a4, 96
    vld             vr16, a4, 112
    vilvl.b         vr1, vr0, vr1           /* bytes -> halfwords */
    vilvl.b         vr2, vr0, vr2
    vilvl.b         vr3, vr0, vr3
    vilvl.b         vr4, vr0, vr4
    vilvl.b         vr5, vr0, vr5
    vilvl.b         vr6, vr0, vr6
    vilvl.b         vr7, vr0, vr7
    vilvl.b         vr8, vr0, vr8

    vdp2.w.h        vr17, vr1, vr9
    vdp2.w.h        vr18, vr2, vr10
    vdp2.w.h        vr19, vr3, vr11
    vdp2.w.h        vr21, vr4, vr12
    vdp2.w.h        vr1, vr5, vr13
    vdp2.w.h        vr2, vr6, vr14
    vdp2.w.h        vr3, vr7, vr15
    vdp2.w.h        vr4, vr8, vr16
    vhaddw.d.w      vr5, vr1, vr1           /* outputs 4..7 */
    vhaddw.d.w      vr6, vr2, vr2
    vhaddw.d.w      vr7, vr3, vr3
    vhaddw.d.w      vr8, vr4, vr4
    vhaddw.d.w      vr1, vr17, vr17         /* outputs 0..3 */
    vhaddw.d.w      vr2, vr18, vr18
    vhaddw.d.w      vr3, vr19, vr19
    vhaddw.d.w      vr4, vr21, vr21
    vhaddw.q.d      vr1, vr1, vr1
    vhaddw.q.d      vr2, vr2, vr2
    vhaddw.q.d      vr3, vr3, vr3
    vhaddw.q.d      vr4, vr4, vr4
    vhaddw.q.d      vr5, vr5, vr5
    vhaddw.q.d      vr6, vr6, vr6
    vhaddw.q.d      vr7, vr7, vr7
    vhaddw.q.d      vr8, vr8, vr8
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.w         vr5, vr6, vr5
    vilvl.w         vr7, vr8, vr7
    vilvl.d         vr1, vr3, vr1           /* sums of outputs 0..3 */
    vilvl.d         vr5, vr7, vr5           /* sums of outputs 4..7 */
    vsrai.w         vr1, vr1, 7
    vsrai.w         vr5, vr5, 7
    vmin.w          vr1, vr1, vr20          /* clamp to 32767 */
    vmin.w          vr5, vr5, vr20

    vpickev.h       vr1, vr5, vr1           /* low halfword of each of the 8 words */
    vst             vr1, a1, 0
    addi.d          a1, a1, 16
    addi.d          a5, a5, 32
    addi.d          a4, a4, 128
    addi.d          a2, a2, -8
    bge             a2, t8, .LOOP_DSTW8
    blt             zero, a2, .END_DSTW4    /* 1..7 outputs left: scalar loop */
    b               .END_LOOP

    /* ---- filterSize == 4: 8 outputs per iteration ---------------------- */
.LOOP_DSTW4:
    ld.w            t1, a5, 0
    ld.w            t2, a5, 4
    ld.w            t3, a5, 8
    ld.w            t4, a5, 12
    fldx.s          f1, a3, t1              /* 4 source bytes per output */
    fldx.s          f2, a3, t2
    fldx.s          f3, a3, t3
    fldx.s          f4, a3, t4
    ld.w            t1, a5, 16
    ld.w            t2, a5, 20
    ld.w            t3, a5, 24
    ld.w            t4, a5, 28
    fldx.s          f5, a3, t1
    fldx.s          f6, a3, t2
    fldx.s          f7, a3, t3
    fldx.s          f8, a3, t4
    vld             vr9, a4, 0              /* each vector holds 2 rows of 4 coefficients */
    vld             vr10, a4, 16
    vld             vr11, a4, 32
    vld             vr12, a4, 48
    vilvl.w         vr1, vr2, vr1           /* pair the samples of 2 outputs */
    vilvl.w         vr3, vr4, vr3
    vilvl.w         vr5, vr6, vr5
    vilvl.w         vr7, vr8, vr7
    vilvl.b         vr1, vr0, vr1           /* bytes -> halfwords */
    vilvl.b         vr3, vr0, vr3
    vilvl.b         vr5, vr0, vr5
    vilvl.b         vr7, vr0, vr7

    vdp2.w.h        vr13, vr1, vr9
    vdp2.w.h        vr14, vr3, vr10
    vdp2.w.h        vr15, vr5, vr11
    vdp2.w.h        vr16, vr7, vr12
    vhaddw.d.w      vr13, vr13, vr13        /* one dword sum per output */
    vhaddw.d.w      vr14, vr14, vr14
    vhaddw.d.w      vr15, vr15, vr15
    vhaddw.d.w      vr16, vr16, vr16
    vpickev.w       vr13, vr14, vr13        /* sums of outputs 0..3 */
    vpickev.w       vr15, vr16, vr15        /* sums of outputs 4..7 */
    vsrai.w         vr13, vr13, 7
    vsrai.w         vr15, vr15, 7
    vmin.w          vr13, vr13, vr20        /* clamp to 32767 */
    vmin.w          vr15, vr15, vr20

    vpickev.h       vr13, vr15, vr13
    vst             vr13, a1, 0
    addi.d          a1, a1, 16
    addi.d          a5, a5, 32
    addi.d          a4, a4, 64
    addi.d          a2, a2, -8
    bge             a2, t8, .LOOP_DSTW4
    blt             zero, a2, .END_DSTW4    /* 1..7 outputs left: scalar loop */
    b               .END_LOOP

    /* ---- scalar loop: any filterSize, a2 (> 0) outputs ------------------
     * Also serves as the remainder loop of every vector path; a1/a4/a5 then
     * already point at the first remaining output.  t7/t8 are reused as
     * scratch here, which is fine because no vector loop runs afterwards.
     * t1 = output index, t3 = tap index, t8 = running sum.
     */
.END_DSTW4:
    li.w            t1, 0
.LOOP_DSTW1:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2              /* t2 = filterPos[t1] */
    li.w            t3, 0
    li.w            t8, 0
.FILTERSIZE1:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4              /* src[filterPos[t1] + t3] */
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7              /* filter[filterSize * t1 + t3] */
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTERSIZE1
    srai.w          t8, t8, 7
    slt             t5, t8, t0              /* clamp to 32767 */
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 1
    stx.h           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .LOOP_DSTW1
.END_LOOP:

    ld.d            s0, sp, 0
    ld.d            s1, sp, 8
    ld.d            s2, sp, 16
    ld.d            s3, sp, 24
    ld.d            s4, sp, 32
    ld.d            s5, sp, 40
    ld.d            s6, sp, 48
    ld.d            s7, sp, 56
    ld.d            s8, sp, 64
    addi.d          sp, sp, 80
endfunc
|
|
|
/* void ff_hscale_8_to_19_lsx(SwsContext *c, int16_t *dst, int dstW,
 *                            const uint8_t *src, const int16_t *filter,
 *                            const int32_t *filterPos, int filterSize)
 *
 * Horizontal FIR scaler, 8-bit input samples to 19-bit output samples.
 * dst is declared as int16_t * but every output is stored as a 32-bit word.
 * For every output sample i in [0, dstW):
 *
 *     sum    = SUM(j = 0 .. filterSize - 1)
 *                  src[filterPos[i] + j] * filter[filterSize * i + j]
 *     dst[i] = min(sum >> 3, 524287)
 *
 * Arguments: a0 = c (unused), a1 = dst, a2 = dstW, a3 = src, a4 = filter,
 *            a5 = filterPos, a6 = filterSize.
 * Constants: t0 = 524287 (upper clamp), t7 = 4, t8 = 8, vr0 = 0,
 *            vr20 = t0 replicated into every 32-bit lane.
 *
 * Code paths:
 *   filterSize == 4 : .LOOP_DST4, 8 outputs per iteration
 *   filterSize == 8 : .LOOP_DST8, 8 outputs per iteration
 *   filterSize  > 8 : .LOOP, 4 outputs per iteration; taps are consumed
 *                     8 at a time with a scalar tail for the rest
 *   anything else   : .END_DST4, scalar, one output per iteration
 *
 * Every vector loop finishes a whole iteration once entered, so it is only
 * entered while at least one full iteration of outputs remains.  Whatever is
 * left (including a whole row shorter than one vector iteration) is produced
 * by the scalar loop at .END_DST4.  Nothing is written when dstW <= 0.
 */
function ff_hscale_8_to_19_lsx
    /* 9 callee-saved registers need 72 bytes; the frame is rounded up to 80
     * so that sp stays 16-byte aligned. */
    addi.d          sp, sp, -80
    st.d            s0, sp, 0
    st.d            s1, sp, 8
    st.d            s2, sp, 16
    st.d            s3, sp, 24
    st.d            s4, sp, 32
    st.d            s5, sp, 40
    st.d            s6, sp, 48
    st.d            s7, sp, 56
    st.d            s8, sp, 64
    li.w            t0, 524287
    li.w            t8, 8
    li.w            t7, 4
    vldi            vr0, 0
    vreplgr2vr.w    vr20, t0
    /* Dispatch.  Rows too short for one full vector iteration go straight to
     * the scalar loop so that no vector path can run past dstW. */
    bge             zero, a2, .END          /* dstW <= 0: nothing to do */
    blt             a2, t7, .END_DST4       /* dstW < 4: scalar only */
    blt             t8, a6, .LOOP           /* filterSize > 8 (dstW >= 4) */
    blt             a2, t8, .END_DST4       /* 4/8-tap loops need 8 outputs */
    beq             a6, t7, .LOOP_DST4
    beq             a6, t8, .LOOP_DST8
    b               .END_DST4

    /* ---- filterSize > 8: one group of 4 outputs per pass ----------------
     * t1 = index (0..3) of the output inside the group (scalar tail only)
     * s1 = number of taps already consumed by the vector steps
     * s2..s5 = scalar tail sums, vr22 = vector sums, for outputs 0..3
     * s7 / s8 / t6 = byte offsets of filter rows 1 / 2 / 3 from row 0
     */
.LOOP:
    li.w            t1, 0
    li.w            s1, 0
    li.w            s2, 0
    li.w            s3, 0
    li.w            s4, 0
    li.w            s5, 0
    vldi            vr22, 0
    addi.w          s0, a6, -7              /* vector steps run while s1 < filterSize - 7 */
    slli.w          s7, a6, 1               /* filterSize * 2 bytes */
    slli.w          s8, a6, 2               /* filterSize * 4 bytes */
    add.w           t6, s7, s8              /* filterSize * 6 bytes */
.LOOP_DST:
    ld.w            t2, a5, 0               /* filterPos of the 4 outputs */
    ld.w            t3, a5, 4
    ld.w            t4, a5, 8
    ld.w            t5, a5, 12
    fldx.d          f1, a3, t2              /* 8 source bytes per output */
    fldx.d          f2, a3, t3
    fldx.d          f3, a3, t4
    fldx.d          f4, a3, t5
    vld             vr9, a4, 0              /* 8 coefficients per output */
    vldx            vr10, a4, s7
    vldx            vr11, a4, s8
    vldx            vr12, a4, t6
    vilvl.b         vr1, vr0, vr1           /* interleave with zero: bytes -> halfwords */
    vilvl.b         vr2, vr0, vr2
    vilvl.b         vr3, vr0, vr3
    vilvl.b         vr4, vr0, vr4
    vdp2.w.h        vr17, vr1, vr9          /* pairwise products summed into 4 words */
    vdp2.w.h        vr18, vr2, vr10
    vdp2.w.h        vr19, vr3, vr11
    vdp2.w.h        vr21, vr4, vr12
    vhaddw.d.w      vr1, vr17, vr17         /* horizontal add: 4 words -> 2 dwords */
    vhaddw.d.w      vr2, vr18, vr18
    vhaddw.d.w      vr3, vr19, vr19
    vhaddw.d.w      vr4, vr21, vr21
    vhaddw.q.d      vr1, vr1, vr1           /* 2 dwords -> 1 total per output */
    vhaddw.q.d      vr2, vr2, vr2
    vhaddw.q.d      vr3, vr3, vr3
    vhaddw.q.d      vr4, vr4, vr4
    vilvl.w         vr1, vr2, vr1           /* gather the 4 totals into one vector */
    vilvl.w         vr3, vr4, vr3
    vilvl.d         vr1, vr3, vr1
    vadd.w          vr22, vr22, vr1
    addi.w          s1, s1, 8
    addi.d          a3, a3, 8               /* src and filter move on by 8 taps */
    addi.d          a4, a4, 16
    blt             s1, s0, .LOOP_DST
    blt             s1, a6, .DSTA           /* taps left over: scalar tail */
    b               .END_FILTERA

    /* Scalar tail, one loop per output of the group.  a3/a4 already point
     * past the vectorised taps, so the tap index t3 restarts at 0 while s6
     * counts absolute taps from s1 up to filterSize. */
.DSTA:
    ld.w            t2, a5, 0
    li.w            t3, 0
    move            s6, s1
.FILTERA:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4              /* source sample */
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6              /* coefficient */
    mul.w           t6, t5, t6
    add.w           s2, s2, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERA

    ld.w            t2, a5, 4
    li.w            t3, 0
    move            s6, s1
    addi.w          t1, t1, 1
.FILTERB:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s3, s3, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERB
    ld.w            t2, a5, 8
    addi.w          t1, t1, 1
    li.w            t3, 0
    move            s6, s1
.FILTERC:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s4, s4, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERC
    ld.w            t2, a5, 12
    addi.w          t1, t1, 1
    move            s6, s1
    li.w            t3, 0
.FILTERD:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s5, s5, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERD
.END_FILTERA:
    vpickve2gr.w    t1, vr22, 0             /* vector sums + scalar tail sums */
    vpickve2gr.w    t2, vr22, 1
    vpickve2gr.w    t3, vr22, 2
    vpickve2gr.w    t4, vr22, 3
    add.w           s2, s2, t1
    add.w           s3, s3, t2
    add.w           s4, s4, t3
    add.w           s5, s5, t4
    srai.w          s2, s2, 3
    srai.w          s3, s3, 3
    srai.w          s4, s4, 3
    srai.w          s5, s5, 3
    slt             t1, s2, t0              /* 1 when the value is below the clamp */
    slt             t2, s3, t0
    slt             t3, s4, t0
    slt             t4, s5, t0
    maskeqz         s2, s2, t1              /* keep the value when below ... */
    maskeqz         s3, s3, t2
    maskeqz         s4, s4, t3
    maskeqz         s5, s5, t4
    masknez         t1, t0, t1              /* ... otherwise select 524287 */
    masknez         t2, t0, t2
    masknez         t3, t0, t3
    masknez         t4, t0, t4
    or              s2, s2, t1
    or              s3, s3, t2
    or              s4, s4, t3
    or              s5, s5, t4
    st.w            s2, a1, 0
    st.w            s3, a1, 4
    st.w            s4, a1, 8
    st.w            s5, a1, 12

    addi.d          a1, a1, 16
    sub.d           a3, a3, s1              /* rewind src to the start of the row */
    addi.d          a5, a5, 16
    slli.d          t3, a6, 3               /* filter += 4 rows, minus the 2 * s1 */
    add.d           a4, a4, t3              /* bytes already stepped over above */
    sub.d           a4, a4, s1
    sub.d           a4, a4, s1
    addi.d          a2, a2, -4
    bge             a2, t7, .LOOP
    blt             zero, a2, .END_DST4     /* 1..3 outputs left: scalar loop */
    b               .END

    /* ---- filterSize == 8: 8 outputs per iteration ---------------------- */
.LOOP_DST8:
    ld.w            t1, a5, 0
    ld.w            t2, a5, 4
    ld.w            t3, a5, 8
    ld.w            t4, a5, 12
    fldx.d          f1, a3, t1
    fldx.d          f2, a3, t2
    fldx.d          f3, a3, t3
    fldx.d          f4, a3, t4
    ld.w            t1, a5, 16
    ld.w            t2, a5, 20
    ld.w            t3, a5, 24
    ld.w            t4, a5, 28
    fldx.d          f5, a3, t1
    fldx.d          f6, a3, t2
    fldx.d          f7, a3, t3
    fldx.d          f8, a3, t4
    vld             vr9, a4, 0              /* one 8-coefficient row per output */
    vld             vr10, a4, 16
    vld             vr11, a4, 32
    vld             vr12, a4, 48
    vld             vr13, a4, 64
    vld             vr14, a4, 80
    vld             vr15, a4, 96
    vld             vr16, a4, 112
    vilvl.b         vr1, vr0, vr1           /* bytes -> halfwords */
    vilvl.b         vr2, vr0, vr2
    vilvl.b         vr3, vr0, vr3
    vilvl.b         vr4, vr0, vr4
    vilvl.b         vr5, vr0, vr5
    vilvl.b         vr6, vr0, vr6
    vilvl.b         vr7, vr0, vr7
    vilvl.b         vr8, vr0, vr8

    vdp2.w.h        vr17, vr1, vr9
    vdp2.w.h        vr18, vr2, vr10
    vdp2.w.h        vr19, vr3, vr11
    vdp2.w.h        vr21, vr4, vr12
    vdp2.w.h        vr1, vr5, vr13
    vdp2.w.h        vr2, vr6, vr14
    vdp2.w.h        vr3, vr7, vr15
    vdp2.w.h        vr4, vr8, vr16
    vhaddw.d.w      vr5, vr1, vr1           /* outputs 4..7 */
    vhaddw.d.w      vr6, vr2, vr2
    vhaddw.d.w      vr7, vr3, vr3
    vhaddw.d.w      vr8, vr4, vr4
    vhaddw.d.w      vr1, vr17, vr17         /* outputs 0..3 */
    vhaddw.d.w      vr2, vr18, vr18
    vhaddw.d.w      vr3, vr19, vr19
    vhaddw.d.w      vr4, vr21, vr21
    vhaddw.q.d      vr1, vr1, vr1
    vhaddw.q.d      vr2, vr2, vr2
    vhaddw.q.d      vr3, vr3, vr3
    vhaddw.q.d      vr4, vr4, vr4
    vhaddw.q.d      vr5, vr5, vr5
    vhaddw.q.d      vr6, vr6, vr6
    vhaddw.q.d      vr7, vr7, vr7
    vhaddw.q.d      vr8, vr8, vr8
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.w         vr5, vr6, vr5
    vilvl.w         vr7, vr8, vr7
    vilvl.d         vr1, vr3, vr1           /* sums of outputs 0..3 */
    vilvl.d         vr5, vr7, vr5           /* sums of outputs 4..7 */
    vsrai.w         vr1, vr1, 3
    vsrai.w         vr5, vr5, 3
    vmin.w          vr1, vr1, vr20          /* clamp to 524287 */
    vmin.w          vr5, vr5, vr20

    vst             vr1, a1, 0              /* 8 outputs, one 32-bit word each */
    vst             vr5, a1, 16
    addi.d          a1, a1, 32
    addi.d          a5, a5, 32
    addi.d          a4, a4, 128
    addi.d          a2, a2, -8
    bge             a2, t8, .LOOP_DST8
    blt             zero, a2, .END_DST4     /* 1..7 outputs left: scalar loop */
    b               .END

    /* ---- filterSize == 4: 8 outputs per iteration ---------------------- */
.LOOP_DST4:
    ld.w            t1, a5, 0
    ld.w            t2, a5, 4
    ld.w            t3, a5, 8
    ld.w            t4, a5, 12
    fldx.s          f1, a3, t1              /* 4 source bytes per output */
    fldx.s          f2, a3, t2
    fldx.s          f3, a3, t3
    fldx.s          f4, a3, t4
    ld.w            t1, a5, 16
    ld.w            t2, a5, 20
    ld.w            t3, a5, 24
    ld.w            t4, a5, 28
    fldx.s          f5, a3, t1
    fldx.s          f6, a3, t2
    fldx.s          f7, a3, t3
    fldx.s          f8, a3, t4
    vld             vr9, a4, 0              /* each vector holds 2 rows of 4 coefficients */
    vld             vr10, a4, 16
    vld             vr11, a4, 32
    vld             vr12, a4, 48
    vilvl.w         vr1, vr2, vr1           /* pair the samples of 2 outputs */
    vilvl.w         vr3, vr4, vr3
    vilvl.w         vr5, vr6, vr5
    vilvl.w         vr7, vr8, vr7
    vilvl.b         vr1, vr0, vr1           /* bytes -> halfwords */
    vilvl.b         vr3, vr0, vr3
    vilvl.b         vr5, vr0, vr5
    vilvl.b         vr7, vr0, vr7

    vdp2.w.h        vr13, vr1, vr9
    vdp2.w.h        vr14, vr3, vr10
    vdp2.w.h        vr15, vr5, vr11
    vdp2.w.h        vr16, vr7, vr12
    vhaddw.d.w      vr13, vr13, vr13        /* one dword sum per output */
    vhaddw.d.w      vr14, vr14, vr14
    vhaddw.d.w      vr15, vr15, vr15
    vhaddw.d.w      vr16, vr16, vr16
    vpickev.w       vr13, vr14, vr13        /* sums of outputs 0..3 */
    vpickev.w       vr15, vr16, vr15        /* sums of outputs 4..7 */
    vsrai.w         vr13, vr13, 3
    vsrai.w         vr15, vr15, 3
    vmin.w          vr13, vr13, vr20        /* clamp to 524287 */
    vmin.w          vr15, vr15, vr20

    vst             vr13, a1, 0             /* 8 outputs, one 32-bit word each */
    vst             vr15, a1, 16
    addi.d          a1, a1, 32
    addi.d          a5, a5, 32
    addi.d          a4, a4, 64
    addi.d          a2, a2, -8
    bge             a2, t8, .LOOP_DST4
    blt             zero, a2, .END_DST4     /* 1..7 outputs left: scalar loop */
    b               .END

    /* ---- scalar loop: any filterSize, a2 (> 0) outputs ------------------
     * Also serves as the remainder loop of every vector path; a1/a4/a5 then
     * already point at the first remaining output.  t7/t8 are reused as
     * scratch here, which is fine because no vector loop runs afterwards.
     * t1 = output index, t3 = tap index, t8 = running sum.
     */
.END_DST4:
    li.w            t1, 0
.LOOP_DST1:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2              /* t2 = filterPos[t1] */
    li.w            t3, 0
    li.w            t8, 0
.FILTER1:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4              /* src[filterPos[t1] + t3] */
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7              /* filter[filterSize * t1 + t3] */
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTER1
    srai.w          t8, t8, 3
    slt             t5, t8, t0              /* clamp to 524287 */
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 2
    stx.w           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .LOOP_DST1
.END:

    ld.d            s0, sp, 0
    ld.d            s1, sp, 8
    ld.d            s2, sp, 16
    ld.d            s3, sp, 24
    ld.d            s4, sp, 32
    ld.d            s5, sp, 40
    ld.d            s6, sp, 48
    ld.d            s7, sp, 56
    ld.d            s8, sp, 64
    addi.d          sp, sp, 80
endfunc
|
|
|
/* void ff_hscale_16_to_15_sub_lsx(SwsContext *c, int16_t *dst, int dstW,
 *                                 const uint8_t *src, const int16_t *filter,
 *                                 const int32_t *filterPos, int filterSize, int sh)
 *
 * Horizontal FIR scaler, 16-bit input samples to 15-bit output samples.
 * src is declared as uint8_t * but is read as unsigned 16-bit samples
 * (src16 below).  For every output sample i in [0, dstW):
 *
 *     sum    = SUM(j = 0 .. filterSize - 1)
 *                  src16[filterPos[i] + j] * filter[filterSize * i + j]
 *     dst[i] = min(sum >> sh, 32767)
 *
 * Arguments: a0 = c (unused), a1 = dst, a2 = dstW, a3 = src, a4 = filter,
 *            a5 = filterPos, a6 = filterSize, a7 = sh.
 * Constants: t0 = 32767 (upper clamp), t7 = 4, t8 = 8,
 *            vr20 = t0 and vr0 = sh, each replicated into every 32-bit lane.
 *
 * Code paths:
 *   filterSize == 4 : .LOOP_HS15_DST4, 8 outputs per iteration
 *   filterSize == 8 : .LOOP_HS15_DST8, 8 outputs per iteration
 *   filterSize  > 8 : .LOOP_HS15, 4 outputs per iteration; taps are consumed
 *                     8 at a time with a scalar tail for the rest
 *   anything else   : .END_HS15_DST4, scalar, one output per iteration
 *
 * Every vector loop finishes a whole iteration once entered, so it is only
 * entered while at least one full iteration of outputs remains.  Whatever is
 * left (including a whole row shorter than one vector iteration) is produced
 * by the scalar loop at .END_HS15_DST4.  Nothing is written when dstW <= 0.
 */
function ff_hscale_16_to_15_sub_lsx
    /* 9 callee-saved registers need 72 bytes; the frame is rounded up to 80
     * so that sp stays 16-byte aligned. */
    addi.d          sp, sp, -80
    st.d            s0, sp, 0
    st.d            s1, sp, 8
    st.d            s2, sp, 16
    st.d            s3, sp, 24
    st.d            s4, sp, 32
    st.d            s5, sp, 40
    st.d            s6, sp, 48
    st.d            s7, sp, 56
    st.d            s8, sp, 64
    li.w            t0, 32767
    li.w            t8, 8
    li.w            t7, 4
    vreplgr2vr.w    vr20, t0
    vreplgr2vr.w    vr0, a7
    /* Dispatch.  Rows too short for one full vector iteration go straight to
     * the scalar loop so that no vector path can run past dstW. */
    bge             zero, a2, .HS15_END         /* dstW <= 0: nothing to do */
    blt             a2, t7, .END_HS15_DST4      /* dstW < 4: scalar only */
    blt             t8, a6, .LOOP_HS15          /* filterSize > 8 (dstW >= 4) */
    blt             a2, t8, .END_HS15_DST4      /* 4/8-tap loops need 8 outputs */
    beq             a6, t7, .LOOP_HS15_DST4
    beq             a6, t8, .LOOP_HS15_DST8
    b               .END_HS15_DST4

    /* ---- filterSize > 8: one group of 4 outputs per pass ----------------
     * t1 = index (0..3) of the output inside the group (scalar tail only)
     * s1 = number of taps already consumed by the vector steps
     * s2..s5 = scalar tail sums, vr22 = vector sums, for outputs 0..3
     * s7 / s8 / t6 = byte offsets of filter rows 1 / 2 / 3 from row 0
     */
.LOOP_HS15:
    li.w            t1, 0
    li.w            s1, 0
    li.w            s2, 0
    li.w            s3, 0
    li.w            s4, 0
    li.w            s5, 0
    vldi            vr22, 0
    addi.w          s0, a6, -7              /* vector steps run while s1 < filterSize - 7 */
    slli.w          s7, a6, 1               /* filterSize * 2 bytes */
    slli.w          s8, a6, 2               /* filterSize * 4 bytes */
    add.w           t6, s7, s8              /* filterSize * 6 bytes */
.LOOP_HS15_DST:
    ld.w            t2, a5, 0               /* filterPos of the 4 outputs */
    ld.w            t3, a5, 4
    ld.w            t4, a5, 8
    ld.w            t5, a5, 12
    slli.w          t2, t2, 1               /* sample index -> byte offset */
    slli.w          t3, t3, 1
    slli.w          t4, t4, 1
    slli.w          t5, t5, 1
    vldx            vr1, a3, t2             /* 8 source samples per output */
    vldx            vr2, a3, t3
    vldx            vr3, a3, t4
    vldx            vr4, a3, t5
    vld             vr9, a4, 0              /* 8 coefficients per output */
    vldx            vr10, a4, s7
    vldx            vr11, a4, s8
    vldx            vr12, a4, t6
    vmulwev.w.hu.h  vr17, vr1, vr9          /* even taps: unsigned sample * signed coeff */
    vmulwev.w.hu.h  vr18, vr2, vr10
    vmulwev.w.hu.h  vr19, vr3, vr11
    vmulwev.w.hu.h  vr21, vr4, vr12
    vmaddwod.w.hu.h vr17, vr1, vr9          /* accumulate the odd taps */
    vmaddwod.w.hu.h vr18, vr2, vr10
    vmaddwod.w.hu.h vr19, vr3, vr11
    vmaddwod.w.hu.h vr21, vr4, vr12
    vhaddw.d.w      vr1, vr17, vr17         /* horizontal add: 4 words -> 2 dwords */
    vhaddw.d.w      vr2, vr18, vr18
    vhaddw.d.w      vr3, vr19, vr19
    vhaddw.d.w      vr4, vr21, vr21
    vhaddw.q.d      vr1, vr1, vr1           /* 2 dwords -> 1 total per output */
    vhaddw.q.d      vr2, vr2, vr2
    vhaddw.q.d      vr3, vr3, vr3
    vhaddw.q.d      vr4, vr4, vr4
    vilvl.w         vr1, vr2, vr1           /* gather the 4 totals into one vector */
    vilvl.w         vr3, vr4, vr3
    vilvl.d         vr1, vr3, vr1
    vadd.w          vr22, vr22, vr1
    addi.w          s1, s1, 8
    addi.d          a3, a3, 16              /* src and filter move on by 8 taps */
    addi.d          a4, a4, 16
    blt             s1, s0, .LOOP_HS15_DST
    blt             s1, a6, .HS15_DSTA      /* taps left over: scalar tail */
    b               .END_HS15_FILTERA

    /* Scalar tail, one loop per output of the group.  a3/a4 already point
     * past the vectorised taps, so the tap index t3 restarts at 0 while s6
     * counts absolute taps from s1 up to filterSize. */
.HS15_DSTA:
    ld.w            t2, a5, 0
    li.w            t3, 0
    move            s6, s1
.HS15_FILTERA:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4              /* source sample */
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6              /* coefficient */
    mul.w           t6, t5, t6
    add.w           s2, s2, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .HS15_FILTERA

    ld.w            t2, a5, 4
    li.w            t3, 0
    move            s6, s1
    addi.w          t1, t1, 1
.HS15_FILTERB:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s3, s3, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .HS15_FILTERB
    ld.w            t2, a5, 8
    addi.w          t1, t1, 1
    li.w            t3, 0
    move            s6, s1
.HS15_FILTERC:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s4, s4, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .HS15_FILTERC
    ld.w            t2, a5, 12
    addi.w          t1, t1, 1
    move            s6, s1
    li.w            t3, 0
.HS15_FILTERD:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s5, s5, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .HS15_FILTERD
.END_HS15_FILTERA:
    vpickve2gr.w    t1, vr22, 0             /* vector sums + scalar tail sums */
    vpickve2gr.w    t2, vr22, 1
    vpickve2gr.w    t3, vr22, 2
    vpickve2gr.w    t4, vr22, 3
    add.w           s2, s2, t1
    add.w           s3, s3, t2
    add.w           s4, s4, t3
    add.w           s5, s5, t4
    sra.w           s2, s2, a7
    sra.w           s3, s3, a7
    sra.w           s4, s4, a7
    sra.w           s5, s5, a7
    slt             t1, s2, t0              /* 1 when the value is below the clamp */
    slt             t2, s3, t0
    slt             t3, s4, t0
    slt             t4, s5, t0
    maskeqz         s2, s2, t1              /* keep the value when below ... */
    maskeqz         s3, s3, t2
    maskeqz         s4, s4, t3
    maskeqz         s5, s5, t4
    masknez         t1, t0, t1              /* ... otherwise select 32767 */
    masknez         t2, t0, t2
    masknez         t3, t0, t3
    masknez         t4, t0, t4
    or              s2, s2, t1
    or              s3, s3, t2
    or              s4, s4, t3
    or              s5, s5, t4
    st.h            s2, a1, 0
    st.h            s3, a1, 2
    st.h            s4, a1, 4
    st.h            s5, a1, 6

    addi.d          a1, a1, 8
    sub.d           a3, a3, s1              /* rewind src by 2 * s1 bytes */
    sub.d           a3, a3, s1
    addi.d          a5, a5, 16
    slli.d          t3, a6, 3               /* filter += 4 rows, minus the 2 * s1 */
    add.d           a4, a4, t3              /* bytes already stepped over above */
    sub.d           a4, a4, s1
    sub.d           a4, a4, s1
    addi.d          a2, a2, -4
    bge             a2, t7, .LOOP_HS15
    blt             zero, a2, .END_HS15_DST4    /* 1..3 outputs left: scalar loop */
    b               .HS15_END

    /* ---- filterSize == 8: 8 outputs per iteration ---------------------- */
.LOOP_HS15_DST8:
    ld.w            t1, a5, 0
    ld.w            t2, a5, 4
    ld.w            t3, a5, 8
    ld.w            t4, a5, 12
    slli.w          t1, t1, 1               /* sample index -> byte offset */
    slli.w          t2, t2, 1
    slli.w          t3, t3, 1
    slli.w          t4, t4, 1
    vldx            vr1, a3, t1
    vldx            vr2, a3, t2
    vldx            vr3, a3, t3
    vldx            vr4, a3, t4
    ld.w            t1, a5, 16
    ld.w            t2, a5, 20
    ld.w            t3, a5, 24
    ld.w            t4, a5, 28
    slli.w          t1, t1, 1
    slli.w          t2, t2, 1
    slli.w          t3, t3, 1
    slli.w          t4, t4, 1
    vldx            vr5, a3, t1
    vldx            vr6, a3, t2
    vldx            vr7, a3, t3
    vldx            vr8, a3, t4
    vld             vr9, a4, 0              /* one 8-coefficient row per output */
    vld             vr10, a4, 16
    vld             vr11, a4, 32
    vld             vr12, a4, 48
    vld             vr13, a4, 64
    vld             vr14, a4, 80
    vld             vr15, a4, 96
    vld             vr16, a4, 112

    vmulwev.w.hu.h  vr17, vr1, vr9
    vmulwev.w.hu.h  vr18, vr2, vr10
    vmulwev.w.hu.h  vr19, vr3, vr11
    vmulwev.w.hu.h  vr21, vr4, vr12
    vmaddwod.w.hu.h vr17, vr1, vr9
    vmaddwod.w.hu.h vr18, vr2, vr10
    vmaddwod.w.hu.h vr19, vr3, vr11
    vmaddwod.w.hu.h vr21, vr4, vr12
    vmulwev.w.hu.h  vr1, vr5, vr13
    vmulwev.w.hu.h  vr2, vr6, vr14
    vmulwev.w.hu.h  vr3, vr7, vr15
    vmulwev.w.hu.h  vr4, vr8, vr16
    vmaddwod.w.hu.h vr1, vr5, vr13
    vmaddwod.w.hu.h vr2, vr6, vr14
    vmaddwod.w.hu.h vr3, vr7, vr15
    vmaddwod.w.hu.h vr4, vr8, vr16
    vhaddw.d.w      vr5, vr1, vr1           /* outputs 4..7 */
    vhaddw.d.w      vr6, vr2, vr2
    vhaddw.d.w      vr7, vr3, vr3
    vhaddw.d.w      vr8, vr4, vr4
    vhaddw.d.w      vr1, vr17, vr17         /* outputs 0..3 */
    vhaddw.d.w      vr2, vr18, vr18
    vhaddw.d.w      vr3, vr19, vr19
    vhaddw.d.w      vr4, vr21, vr21
    vhaddw.q.d      vr1, vr1, vr1
    vhaddw.q.d      vr2, vr2, vr2
    vhaddw.q.d      vr3, vr3, vr3
    vhaddw.q.d      vr4, vr4, vr4
    vhaddw.q.d      vr5, vr5, vr5
    vhaddw.q.d      vr6, vr6, vr6
    vhaddw.q.d      vr7, vr7, vr7
    vhaddw.q.d      vr8, vr8, vr8
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.w         vr5, vr6, vr5
    vilvl.w         vr7, vr8, vr7
    vilvl.d         vr1, vr3, vr1           /* sums of outputs 0..3 */
    vilvl.d         vr5, vr7, vr5           /* sums of outputs 4..7 */
    vsra.w          vr1, vr1, vr0           /* >> sh */
    vsra.w          vr5, vr5, vr0
    vmin.w          vr1, vr1, vr20          /* clamp to 32767 */
    vmin.w          vr5, vr5, vr20

    vpickev.h       vr1, vr5, vr1           /* low halfword of each of the 8 words */
    vst             vr1, a1, 0
    addi.d          a1, a1, 16
    addi.d          a5, a5, 32
    addi.d          a4, a4, 128
    addi.d          a2, a2, -8
    bge             a2, t8, .LOOP_HS15_DST8
    blt             zero, a2, .END_HS15_DST4    /* 1..7 outputs left: scalar loop */
    b               .HS15_END

    /* ---- filterSize == 4: 8 outputs per iteration ---------------------- */
.LOOP_HS15_DST4:
    ld.w            t1, a5, 0
    ld.w            t2, a5, 4
    ld.w            t3, a5, 8
    ld.w            t4, a5, 12
    slli.w          t1, t1, 1               /* sample index -> byte offset */
    slli.w          t2, t2, 1
    slli.w          t3, t3, 1
    slli.w          t4, t4, 1
    fldx.d          f1, a3, t1              /* 4 source samples per output */
    fldx.d          f2, a3, t2
    fldx.d          f3, a3, t3
    fldx.d          f4, a3, t4
    ld.w            t1, a5, 16
    ld.w            t2, a5, 20
    ld.w            t3, a5, 24
    ld.w            t4, a5, 28
    slli.w          t1, t1, 1
    slli.w          t2, t2, 1
    slli.w          t3, t3, 1
    slli.w          t4, t4, 1
    fldx.d          f5, a3, t1
    fldx.d          f6, a3, t2
    fldx.d          f7, a3, t3
    fldx.d          f8, a3, t4
    vld             vr9, a4, 0              /* each vector holds 2 rows of 4 coefficients */
    vld             vr10, a4, 16
    vld             vr11, a4, 32
    vld             vr12, a4, 48
    vilvl.d         vr1, vr2, vr1           /* pair the samples of 2 outputs */
    vilvl.d         vr3, vr4, vr3
    vilvl.d         vr5, vr6, vr5
    vilvl.d         vr7, vr8, vr7
    vmulwev.w.hu.h  vr13, vr1, vr9
    vmulwev.w.hu.h  vr14, vr3, vr10
    vmulwev.w.hu.h  vr15, vr5, vr11
    vmulwev.w.hu.h  vr16, vr7, vr12
    vmaddwod.w.hu.h vr13, vr1, vr9
    vmaddwod.w.hu.h vr14, vr3, vr10
    vmaddwod.w.hu.h vr15, vr5, vr11
    vmaddwod.w.hu.h vr16, vr7, vr12
    vhaddw.d.w      vr13, vr13, vr13        /* one dword sum per output */
    vhaddw.d.w      vr14, vr14, vr14
    vhaddw.d.w      vr15, vr15, vr15
    vhaddw.d.w      vr16, vr16, vr16
    vpickev.w       vr13, vr14, vr13        /* sums of outputs 0..3 */
    vpickev.w       vr15, vr16, vr15        /* sums of outputs 4..7 */
    vsra.w          vr13, vr13, vr0         /* >> sh */
    vsra.w          vr15, vr15, vr0
    vmin.w          vr13, vr13, vr20        /* clamp to 32767 */
    vmin.w          vr15, vr15, vr20

    vpickev.h       vr13, vr15, vr13
    vst             vr13, a1, 0
    addi.d          a1, a1, 16
    addi.d          a5, a5, 32
    addi.d          a4, a4, 64
    addi.d          a2, a2, -8
    bge             a2, t8, .LOOP_HS15_DST4
    blt             zero, a2, .END_HS15_DST4    /* 1..7 outputs left: scalar loop */
    b               .HS15_END

    /* ---- scalar loop: any filterSize, a2 (> 0) outputs ------------------
     * Also serves as the remainder loop of every vector path; a1/a4/a5 then
     * already point at the first remaining output.  t7/t8 are reused as
     * scratch here, which is fine because no vector loop runs afterwards.
     * t1 = output index, t3 = tap index, t8 = running sum.
     */
.END_HS15_DST4:
    li.w            t1, 0
.LOOP_HS15_DST1:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2              /* t2 = filterPos[t1] */
    li.w            t3, 0
    li.w            t8, 0
.HS15_FILTER1:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4              /* src16[filterPos[t1] + t3] */
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7              /* filter[filterSize * t1 + t3] */
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .HS15_FILTER1
    sra.w           t8, t8, a7
    slt             t5, t8, t0              /* clamp to 32767 */
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 1
    stx.h           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .LOOP_HS15_DST1
.HS15_END:

    ld.d            s0, sp, 0
    ld.d            s1, sp, 8
    ld.d            s2, sp, 16
    ld.d            s3, sp, 24
    ld.d            s4, sp, 32
    ld.d            s5, sp, 40
    ld.d            s6, sp, 48
    ld.d            s7, sp, 56
    ld.d            s8, sp, 64
    addi.d          sp, sp, 80
endfunc
|
|
|
/* void ff_hscale_16_to_19_sub_lsx(SwsContext *c, int16_t *dst, int dstW,
 *                                 const uint8_t *src, const int16_t *filter,
 *                                 const int32_t *filterPos, int filterSize, int sh)
 *
 * Horizontal scaling, one output per filter row. For every i in [0, dstW):
 *
 *     acc    = sum over j in [0, filterSize) of
 *              src[filterPos[i] + j] * filter[filterSize * i + j]
 *     dst[i] = min(acc >> sh, 524287)
 *
 * src samples are loaded as unsigned 16-bit values, filter taps as signed
 * 16-bit values, the accumulation is done in 32 bits and the shift is
 * arithmetic. Each result is stored as a 32-bit word, i.e. dst is indexed
 * with a 4-byte stride even though the prototype spells it int16_t *.
 *
 * Register roles:
 *   a1 = dst cursor        a2 = outputs still to produce
 *   a3 = src               a4 = filter cursor
 *   a5 = filterPos cursor  a6 = filterSize        a7 = sh
 *   t0 = 524287 (clip)     t7 = 4                 t8 = 8
 *   vr0 = sh in every word lane, vr20 = clip value in every word lane
 *
 * Paths:
 *   .LOOP_HS19_DST4  filterSize == 4, eight outputs per pass
 *   .LOOP_HS19_DST8  filterSize == 8, eight outputs per pass
 *   .LOOP_HS19       filterSize  > 8, four outputs per pass
 *   .HS19_RESA       scalar loop for every other filterSize and for the
 *                    outputs left over once a vector path can no longer
 *                    run a full pass
 * A vector path is only entered while a2 covers a full pass, so nothing is
 * stored beyond dst[dstW - 1] and nothing is done at all when dstW <= 0.
 */
function ff_hscale_16_to_19_sub_lsx
    /* Nine 8-byte saves; the frame is rounded up to 80 bytes so that sp
     * stays 16-byte aligned. */
    addi.d           sp,      sp,      -80
    st.d             s0,      sp,      0
    st.d             s1,      sp,      8
    st.d             s2,      sp,      16
    st.d             s3,      sp,      24
    st.d             s4,      sp,      32
    st.d             s5,      sp,      40
    st.d             s6,      sp,      48
    st.d             s7,      sp,      56
    st.d             s8,      sp,      64

    li.w             t0,      524287
    li.w             t8,      8
    li.w             t7,      4
    vreplgr2vr.w     vr20,    t0
    vreplgr2vr.w     vr0,     a7

    /* Dispatch. Every vector path produces a fixed batch of outputs before
     * it looks at a2, so short rows have to take the scalar loop. */
    bge              zero,    a2,      .HS19_END        /* dstW <= 0          */
    blt              a2,      t7,      .HS19_RESA       /* dstW < 4           */
    blt              t8,      a6,      .LOOP_HS19       /* filterSize > 8     */
    blt              a2,      t8,      .HS19_RESA       /* dstW < 8           */
    beq              a6,      t7,      .LOOP_HS19_DST4  /* filterSize == 4    */
    beq              a6,      t8,      .LOOP_HS19_DST8  /* filterSize == 8    */
    b                .HS19_RESA                         /* any other size     */

/* ---- filterSize > 8: four outputs per pass ------------------------------ */
.LOOP_HS19:
    li.w             t1,      0                 /* filter row index for the tap remainder */
    li.w             s1,      0                 /* taps consumed by the vector loop       */
    li.w             s2,      0                 /* s2..s5: scalar sums of outputs 0..3    */
    li.w             s3,      0
    li.w             s4,      0
    li.w             s5,      0
    vldi             vr22,    0                 /* vector sums of outputs 0..3            */
    addi.w           s0,      a6,      -7
    slli.w           s7,      a6,      1        /* byte offset of filter row 1            */
    slli.w           s8,      a6,      2        /* byte offset of filter row 2            */
    add.w            t6,      s7,      s8       /* byte offset of filter row 3            */
.LOOP_HS19_DST:
    /* Eight taps of each of the four outputs. */
    ld.w             t2,      a5,      0
    ld.w             t3,      a5,      4
    ld.w             t4,      a5,      8
    ld.w             t5,      a5,      12
    slli.w           t2,      t2,      1
    slli.w           t3,      t3,      1
    slli.w           t4,      t4,      1
    slli.w           t5,      t5,      1
    vldx             vr1,     a3,      t2
    vldx             vr2,     a3,      t3
    vldx             vr3,     a3,      t4
    vldx             vr4,     a3,      t5
    vld              vr9,     a4,      0
    vldx             vr10,    a4,      s7
    vldx             vr11,    a4,      s8
    vldx             vr12,    a4,      t6
    vmulwev.w.hu.h   vr17,    vr1,     vr9
    vmulwev.w.hu.h   vr18,    vr2,     vr10
    vmulwev.w.hu.h   vr19,    vr3,     vr11
    vmulwev.w.hu.h   vr21,    vr4,     vr12
    vmaddwod.w.hu.h  vr17,    vr1,     vr9
    vmaddwod.w.hu.h  vr18,    vr2,     vr10
    vmaddwod.w.hu.h  vr19,    vr3,     vr11
    vmaddwod.w.hu.h  vr21,    vr4,     vr12
    /* Reduce each product vector to a single sum in its lowest word. */
    vhaddw.d.w       vr1,     vr17,    vr17
    vhaddw.d.w       vr2,     vr18,    vr18
    vhaddw.d.w       vr3,     vr19,    vr19
    vhaddw.d.w       vr4,     vr21,    vr21
    vhaddw.q.d       vr1,     vr1,     vr1
    vhaddw.q.d       vr2,     vr2,     vr2
    vhaddw.q.d       vr3,     vr3,     vr3
    vhaddw.q.d       vr4,     vr4,     vr4
    /* Gather the four sums into one vector and accumulate. */
    vilvl.w          vr1,     vr2,     vr1
    vilvl.w          vr3,     vr4,     vr3
    vilvl.d          vr1,     vr3,     vr1
    vadd.w           vr22,    vr22,    vr1
    addi.w           s1,      s1,      8
    addi.d           a3,      a3,      16
    addi.d           a4,      a4,      16
    blt              s1,      s0,      .LOOP_HS19_DST
    bge              s1,      a6,      .END_HS19_FILTERA  /* no taps left over */

    /* Remaining (filterSize - s1) taps, one output at a time. a3 and a4 are
     * already advanced by s1 taps, so tap indices restart at 0. Note that t6
     * is reused as scratch here; it is rebuilt at .LOOP_HS19. */
.HS19_DSTA:
    ld.w             t2,      a5,      0
    li.w             t3,      0
    move             s6,      s1
.HS19_FILTERA:
    add.w            t4,      t2,      t3
    slli.w           t4,      t4,      1
    ldx.hu           t5,      a3,      t4
    mul.w            t6,      a6,      t1
    add.w            t6,      t6,      t3
    slli.w           t6,      t6,      1
    ldx.h            t6,      a4,      t6
    mul.w            t6,      t5,      t6
    add.w            s2,      s2,      t6
    addi.w           t3,      t3,      1
    addi.w           s6,      s6,      1
    blt              s6,      a6,      .HS19_FILTERA

    ld.w             t2,      a5,      4
    li.w             t3,      0
    move             s6,      s1
    addi.w           t1,      t1,      1
.HS19_FILTERB:
    add.w            t4,      t2,      t3
    slli.w           t4,      t4,      1
    ldx.hu           t5,      a3,      t4
    mul.w            t6,      a6,      t1
    add.w            t6,      t6,      t3
    slli.w           t6,      t6,      1
    ldx.h            t6,      a4,      t6
    mul.w            t6,      t5,      t6
    add.w            s3,      s3,      t6
    addi.w           t3,      t3,      1
    addi.w           s6,      s6,      1
    blt              s6,      a6,      .HS19_FILTERB

    ld.w             t2,      a5,      8
    addi.w           t1,      t1,      1
    li.w             t3,      0
    move             s6,      s1
.HS19_FILTERC:
    add.w            t4,      t2,      t3
    slli.w           t4,      t4,      1
    ldx.hu           t5,      a3,      t4
    mul.w            t6,      a6,      t1
    add.w            t6,      t6,      t3
    slli.w           t6,      t6,      1
    ldx.h            t6,      a4,      t6
    mul.w            t6,      t5,      t6
    add.w            s4,      s4,      t6
    addi.w           t3,      t3,      1
    addi.w           s6,      s6,      1
    blt              s6,      a6,      .HS19_FILTERC

    ld.w             t2,      a5,      12
    addi.w           t1,      t1,      1
    move             s6,      s1
    li.w             t3,      0
.HS19_FILTERD:
    add.w            t4,      t2,      t3
    slli.w           t4,      t4,      1
    ldx.hu           t5,      a3,      t4
    mul.w            t6,      a6,      t1
    add.w            t6,      t6,      t3
    slli.w           t6,      t6,      1
    ldx.h            t6,      a4,      t6
    mul.w            t6,      t5,      t6
    add.w            s5,      s5,      t6
    addi.w           t3,      t3,      1
    addi.w           s6,      s6,      1
    blt              s6,      a6,      .HS19_FILTERD

.END_HS19_FILTERA:
    /* Add the vector sums to the scalar sums, shift, clip and store. */
    vpickve2gr.w     t1,      vr22,    0
    vpickve2gr.w     t2,      vr22,    1
    vpickve2gr.w     t3,      vr22,    2
    vpickve2gr.w     t4,      vr22,    3
    add.w            s2,      s2,      t1
    add.w            s3,      s3,      t2
    add.w            s4,      s4,      t3
    add.w            s5,      s5,      t4
    sra.w            s2,      s2,      a7
    sra.w            s3,      s3,      a7
    sra.w            s4,      s4,      a7
    sra.w            s5,      s5,      a7
    /* tN = (value < clip); keep the value when set, else take the clip. */
    slt              t1,      s2,      t0
    slt              t2,      s3,      t0
    slt              t3,      s4,      t0
    slt              t4,      s5,      t0
    maskeqz          s2,      s2,      t1
    maskeqz          s3,      s3,      t2
    maskeqz          s4,      s4,      t3
    maskeqz          s5,      s5,      t4
    masknez          t1,      t0,      t1
    masknez          t2,      t0,      t2
    masknez          t3,      t0,      t3
    masknez          t4,      t0,      t4
    or               s2,      s2,      t1
    or               s3,      s3,      t2
    or               s4,      s4,      t3
    or               s5,      s5,      t4
    st.w             s2,      a1,      0
    st.w             s3,      a1,      4
    st.w             s4,      a1,      8
    st.w             s5,      a1,      12

    addi.d           a1,      a1,      16
    /* Undo the s1 taps (2 bytes each) the vector loop added to a3. */
    sub.d            a3,      a3,      s1
    sub.d            a3,      a3,      s1
    addi.d           a5,      a5,      16
    /* Net effect on a4: advance by four filter rows (8 * filterSize bytes). */
    slli.d           t3,      a6,      3
    add.d            a4,      a4,      t3
    sub.d            a4,      a4,      s1
    sub.d            a4,      a4,      s1
    addi.d           a2,      a2,      -4
    bge              a2,      t7,      .LOOP_HS19
    blt              zero,    a2,      .HS19_RESA
    b                .HS19_END

/* ---- filterSize == 8: eight outputs per pass ---------------------------- */
.LOOP_HS19_DST8:
    ld.w             t1,      a5,      0
    ld.w             t2,      a5,      4
    ld.w             t3,      a5,      8
    ld.w             t4,      a5,      12
    slli.w           t1,      t1,      1
    slli.w           t2,      t2,      1
    slli.w           t3,      t3,      1
    slli.w           t4,      t4,      1
    vldx             vr1,     a3,      t1
    vldx             vr2,     a3,      t2
    vldx             vr3,     a3,      t3
    vldx             vr4,     a3,      t4
    ld.w             t1,      a5,      16
    ld.w             t2,      a5,      20
    ld.w             t3,      a5,      24
    ld.w             t4,      a5,      28
    slli.w           t1,      t1,      1
    slli.w           t2,      t2,      1
    slli.w           t3,      t3,      1
    slli.w           t4,      t4,      1
    vldx             vr5,     a3,      t1
    vldx             vr6,     a3,      t2
    vldx             vr7,     a3,      t3
    vldx             vr8,     a3,      t4
    /* One 16-byte filter row per output. */
    vld              vr9,     a4,      0
    vld              vr10,    a4,      16
    vld              vr11,    a4,      32
    vld              vr12,    a4,      48
    vld              vr13,    a4,      64
    vld              vr14,    a4,      80
    vld              vr15,    a4,      96
    vld              vr16,    a4,      112
    vmulwev.w.hu.h   vr17,    vr1,     vr9
    vmulwev.w.hu.h   vr18,    vr2,     vr10
    vmulwev.w.hu.h   vr19,    vr3,     vr11
    vmulwev.w.hu.h   vr21,    vr4,     vr12
    vmaddwod.w.hu.h  vr17,    vr1,     vr9
    vmaddwod.w.hu.h  vr18,    vr2,     vr10
    vmaddwod.w.hu.h  vr19,    vr3,     vr11
    vmaddwod.w.hu.h  vr21,    vr4,     vr12
    vmulwev.w.hu.h   vr1,     vr5,     vr13
    vmulwev.w.hu.h   vr2,     vr6,     vr14
    vmulwev.w.hu.h   vr3,     vr7,     vr15
    vmulwev.w.hu.h   vr4,     vr8,     vr16
    vmaddwod.w.hu.h  vr1,     vr5,     vr13
    vmaddwod.w.hu.h  vr2,     vr6,     vr14
    vmaddwod.w.hu.h  vr3,     vr7,     vr15
    vmaddwod.w.hu.h  vr4,     vr8,     vr16
    /* Outputs 4..7 are reduced into vr5..vr8, outputs 0..3 into vr1..vr4. */
    vhaddw.d.w       vr5,     vr1,     vr1
    vhaddw.d.w       vr6,     vr2,     vr2
    vhaddw.d.w       vr7,     vr3,     vr3
    vhaddw.d.w       vr8,     vr4,     vr4
    vhaddw.d.w       vr1,     vr17,    vr17
    vhaddw.d.w       vr2,     vr18,    vr18
    vhaddw.d.w       vr3,     vr19,    vr19
    vhaddw.d.w       vr4,     vr21,    vr21
    vhaddw.q.d       vr1,     vr1,     vr1
    vhaddw.q.d       vr2,     vr2,     vr2
    vhaddw.q.d       vr3,     vr3,     vr3
    vhaddw.q.d       vr4,     vr4,     vr4
    vhaddw.q.d       vr5,     vr5,     vr5
    vhaddw.q.d       vr6,     vr6,     vr6
    vhaddw.q.d       vr7,     vr7,     vr7
    vhaddw.q.d       vr8,     vr8,     vr8
    vilvl.w          vr1,     vr2,     vr1
    vilvl.w          vr3,     vr4,     vr3
    vilvl.w          vr5,     vr6,     vr5
    vilvl.w          vr7,     vr8,     vr7
    vilvl.d          vr1,     vr3,     vr1
    vilvl.d          vr5,     vr7,     vr5
    vsra.w           vr1,     vr1,     vr0
    vsra.w           vr5,     vr5,     vr0
    vmin.w           vr1,     vr1,     vr20
    vmin.w           vr5,     vr5,     vr20

    vst              vr1,     a1,      0
    vst              vr5,     a1,      16
    addi.d           a1,      a1,      32
    addi.d           a5,      a5,      32
    addi.d           a4,      a4,      128
    addi.d           a2,      a2,      -8
    bge              a2,      t8,      .LOOP_HS19_DST8
    blt              zero,    a2,      .HS19_RESA
    b                .HS19_END

/* ---- filterSize == 4: eight outputs per pass ---------------------------- */
.LOOP_HS19_DST4:
    ld.w             t1,      a5,      0
    ld.w             t2,      a5,      4
    ld.w             t3,      a5,      8
    ld.w             t4,      a5,      12
    slli.w           t1,      t1,      1
    slli.w           t2,      t2,      1
    slli.w           t3,      t3,      1
    slli.w           t4,      t4,      1
    /* 8-byte loads: four source samples per output. */
    fldx.d           f1,      a3,      t1
    fldx.d           f2,      a3,      t2
    fldx.d           f3,      a3,      t3
    fldx.d           f4,      a3,      t4
    ld.w             t1,      a5,      16
    ld.w             t2,      a5,      20
    ld.w             t3,      a5,      24
    ld.w             t4,      a5,      28
    slli.w           t1,      t1,      1
    slli.w           t2,      t2,      1
    slli.w           t3,      t3,      1
    slli.w           t4,      t4,      1
    fldx.d           f5,      a3,      t1
    fldx.d           f6,      a3,      t2
    fldx.d           f7,      a3,      t3
    fldx.d           f8,      a3,      t4
    /* Each 16-byte filter vector holds the rows of two consecutive outputs. */
    vld              vr9,     a4,      0
    vld              vr10,    a4,      16
    vld              vr11,    a4,      32
    vld              vr12,    a4,      48
    /* Pair the source samples of two outputs per vector to match. */
    vilvl.d          vr1,     vr2,     vr1
    vilvl.d          vr3,     vr4,     vr3
    vilvl.d          vr5,     vr6,     vr5
    vilvl.d          vr7,     vr8,     vr7
    vmulwev.w.hu.h   vr13,    vr1,     vr9
    vmulwev.w.hu.h   vr14,    vr3,     vr10
    vmulwev.w.hu.h   vr15,    vr5,     vr11
    vmulwev.w.hu.h   vr16,    vr7,     vr12
    vmaddwod.w.hu.h  vr13,    vr1,     vr9
    vmaddwod.w.hu.h  vr14,    vr3,     vr10
    vmaddwod.w.hu.h  vr15,    vr5,     vr11
    vmaddwod.w.hu.h  vr16,    vr7,     vr12
    vhaddw.d.w       vr13,    vr13,    vr13
    vhaddw.d.w       vr14,    vr14,    vr14
    vhaddw.d.w       vr15,    vr15,    vr15
    vhaddw.d.w       vr16,    vr16,    vr16
    /* Keep the low word of every 64-bit sum: four outputs per vector. */
    vpickev.w        vr13,    vr14,    vr13
    vpickev.w        vr15,    vr16,    vr15
    vsra.w           vr13,    vr13,    vr0
    vsra.w           vr15,    vr15,    vr0
    vmin.w           vr13,    vr13,    vr20
    vmin.w           vr15,    vr15,    vr20

    vst              vr13,    a1,      0
    vst              vr15,    a1,      16
    addi.d           a1,      a1,      32
    addi.d           a5,      a5,      32
    addi.d           a4,      a4,      64
    addi.d           a2,      a2,      -8
    bge              a2,      t8,      .LOOP_HS19_DST4
    bge              zero,    a2,      .HS19_END
    /* 1..7 outputs left: fall through into the scalar loop. */

/* ---- scalar loop: a2 (> 0) outputs starting at the current cursors ------ */
.HS19_RESA:
    li.w             t1,      0                 /* t1 = output index i          */
.HS19_DST:
    slli.w           t2,      t1,      2
    ldx.w            t2,      a5,      t2       /* t2 = filterPos[i]            */
    li.w             t3,      0                 /* t3 = tap index j             */
    li.w             t8,      0                 /* t8 = running sum             */
.HS19_FILTER:
    add.w            t4,      t2,      t3
    slli.w           t4,      t4,      1
    ldx.hu           t5,      a3,      t4       /* src[filterPos[i] + j]        */
    mul.w            t6,      a6,      t1
    add.w            t6,      t6,      t3
    slli.w           t7,      t6,      1
    ldx.h            t7,      a4,      t7       /* filter[filterSize * i + j]   */
    mul.w            t7,      t5,      t7
    add.w            t8,      t8,      t7
    addi.w           t3,      t3,      1
    blt              t3,      a6,      .HS19_FILTER
    sra.w            t8,      t8,      a7
    /* t8 = min(t8, t0) */
    slt              t5,      t8,      t0
    maskeqz          t8,      t8,      t5
    masknez          t5,      t0,      t5
    or               t8,      t8,      t5
    slli.w           t4,      t1,      2
    stx.w            t8,      a1,      t4
    addi.w           t1,      t1,      1
    blt              t1,      a2,      .HS19_DST

.HS19_END:
    ld.d             s0,      sp,      0
    ld.d             s1,      sp,      8
    ld.d             s2,      sp,      16
    ld.d             s3,      sp,      24
    ld.d             s4,      sp,      32
    ld.d             s5,      sp,      40
    ld.d             s6,      sp,      48
    ld.d             s7,      sp,      56
    ld.d             s8,      sp,      64
    addi.d           sp,      sp,      80
endfunc
|
|
|