avcodec/la: Add LSX optimization for h264 chroma and intrapred.

./configure --disable-lasx
ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an
before: 199fps
after:  214fps

Reviewed-by: Shiyou Yin <yinshiyou-hf@loongson.cn>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
Author: Lu Wang
Committer: Michael Niedermayer
parent 7845b5ecd6
commit 8815a7719e
 libavcodec/loongarch/Makefile                         |    4
 libavcodec/loongarch/h264_intrapred_init_loongarch.c  |   18
 libavcodec/loongarch/h264_intrapred_lasx.c            |  121
 libavcodec/loongarch/h264_intrapred_loongarch.h       |   12
 libavcodec/loongarch/h264chroma.S                     |  966
 libavcodec/loongarch/h264chroma_init_loongarch.c      |   10
 libavcodec/loongarch/h264chroma_lasx.c                | 1280
 libavcodec/loongarch/h264chroma_lasx.h                |   36
 libavcodec/loongarch/h264chroma_loongarch.h           |   41
 libavcodec/loongarch/h264intrapred.S                  |  299
 10 files changed

libavcodec/loongarch/Makefile
@@ -9,11 +9,9 @@ OBJS-$(CONFIG_HPELDSP) += loongarch/hpeldsp_init_loongarch.o
OBJS-$(CONFIG_IDCTDSP) += loongarch/idctdsp_init_loongarch.o
OBJS-$(CONFIG_VIDEODSP) += loongarch/videodsp_init.o
OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_init_loongarch.o
LASX-OBJS-$(CONFIG_H264CHROMA) += loongarch/h264chroma_lasx.o
LASX-OBJS-$(CONFIG_H264QPEL) += loongarch/h264qpel_lasx.o
LASX-OBJS-$(CONFIG_H264DSP) += loongarch/h264dsp_lasx.o \
loongarch/h264_deblock_lasx.o
LASX-OBJS-$(CONFIG_H264PRED) += loongarch/h264_intrapred_lasx.o
LASX-OBJS-$(CONFIG_VC1_DECODER) += loongarch/vc1dsp_lasx.o
LASX-OBJS-$(CONFIG_HPELDSP) += loongarch/hpeldsp_lasx.o
LASX-OBJS-$(CONFIG_IDCTDSP) += loongarch/simple_idct_lasx.o \
@@ -33,3 +31,5 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \
LSX-OBJS-$(CONFIG_H264DSP) += loongarch/h264idct.o \
loongarch/h264idct_loongarch.o \
loongarch/h264dsp.o
LSX-OBJS-$(CONFIG_H264CHROMA) += loongarch/h264chroma.o
LSX-OBJS-$(CONFIG_H264PRED) += loongarch/h264intrapred.o

libavcodec/loongarch/h264_intrapred_init_loongarch.c
@@ -21,7 +21,7 @@
#include "libavutil/loongarch/cpu.h"
#include "libavcodec/h264pred.h"
#include "h264_intrapred_lasx.h"
#include "h264_intrapred_loongarch.h"
av_cold void ff_h264_pred_init_loongarch(H264PredContext *h, int codec_id,
const int bit_depth,
@@ -30,6 +30,22 @@ av_cold void ff_h264_pred_init_loongarch(H264PredContext *h, int codec_id,
int cpu_flags = av_get_cpu_flags();
if (bit_depth == 8) {
if (have_lsx(cpu_flags)) {
if (chroma_format_idc <= 1) {
}
if (codec_id == AV_CODEC_ID_VP7 || codec_id == AV_CODEC_ID_VP8) {
} else {
if (chroma_format_idc <= 1) {
}
if (codec_id == AV_CODEC_ID_SVQ3) {
h->pred16x16[PLANE_PRED8x8] = ff_h264_pred16x16_plane_svq3_8_lsx;
} else if (codec_id == AV_CODEC_ID_RV40) {
h->pred16x16[PLANE_PRED8x8] = ff_h264_pred16x16_plane_rv40_8_lsx;
} else {
h->pred16x16[PLANE_PRED8x8] = ff_h264_pred16x16_plane_h264_8_lsx;
}
}
}
if (have_lasx(cpu_flags)) {
if (chroma_format_idc <= 1) {
}

libavcodec/loongarch/h264_intrapred_lasx.c
@@ -1,121 +0,0 @@
/*
* Copyright (c) 2021 Loongson Technology Corporation Limited
* Contributed by Hao Chen <chenhao@loongson.cn>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/loongarch/loongson_intrinsics.h"
#include "h264_intrapred_lasx.h"
#define PRED16X16_PLANE \
ptrdiff_t stride_1, stride_2, stride_3, stride_4, stride_5, stride_6; \
ptrdiff_t stride_8, stride_15; \
int32_t res0, res1, res2, res3, cnt; \
uint8_t *src0, *src1; \
__m256i reg0, reg1, reg2, reg3, reg4; \
__m256i tmp0, tmp1, tmp2, tmp3; \
__m256i shuff = {0x0B040A0509060807, 0x0F000E010D020C03, 0, 0}; \
__m256i mult = {0x0004000300020001, 0x0008000700060005, 0, 0}; \
__m256i int_mult1 = {0x0000000100000000, 0x0000000300000002, \
0x0000000500000004, 0x0000000700000006}; \
\
stride_1 = -stride; \
stride_2 = stride << 1; \
stride_3 = stride_2 + stride; \
stride_4 = stride_2 << 1; \
stride_5 = stride_4 + stride; \
stride_6 = stride_3 << 1; \
stride_8 = stride_4 << 1; \
stride_15 = (stride_8 << 1) - stride; \
src0 = src - 1; \
src1 = src0 + stride_8; \
\
reg0 = __lasx_xvldx(src0, -stride); \
reg1 = __lasx_xvldx(src, (8 - stride)); \
reg0 = __lasx_xvilvl_d(reg1, reg0); \
reg0 = __lasx_xvshuf_b(reg0, reg0, shuff); \
reg0 = __lasx_xvhsubw_hu_bu(reg0, reg0); \
reg0 = __lasx_xvmul_h(reg0, mult); \
res1 = (src1[0] - src0[stride_6]) + \
2 * (src1[stride] - src0[stride_5]) + \
3 * (src1[stride_2] - src0[stride_4]) + \
4 * (src1[stride_3] - src0[stride_3]) + \
5 * (src1[stride_4] - src0[stride_2]) + \
6 * (src1[stride_5] - src0[stride]) + \
7 * (src1[stride_6] - src0[0]) + \
8 * (src0[stride_15] - src0[stride_1]); \
reg0 = __lasx_xvhaddw_w_h(reg0, reg0); \
reg0 = __lasx_xvhaddw_d_w(reg0, reg0); \
reg0 = __lasx_xvhaddw_q_d(reg0, reg0); \
res0 = __lasx_xvpickve2gr_w(reg0, 0); \
#define PRED16X16_PLANE_END \
res2 = (src0[stride_15] + src[15 - stride] + 1) << 4; \
res3 = 7 * (res0 + res1); \
res2 -= res3; \
reg0 = __lasx_xvreplgr2vr_w(res0); \
reg1 = __lasx_xvreplgr2vr_w(res1); \
reg2 = __lasx_xvreplgr2vr_w(res2); \
reg3 = __lasx_xvmul_w(reg0, int_mult1); \
reg4 = __lasx_xvslli_w(reg0, 3); \
reg4 = __lasx_xvadd_w(reg4, reg3); \
for (cnt = 8; cnt--;) { \
tmp0 = __lasx_xvadd_w(reg2, reg3); \
tmp1 = __lasx_xvadd_w(reg2, reg4); \
tmp0 = __lasx_xvssrani_hu_w(tmp1, tmp0, 5); \
tmp0 = __lasx_xvpermi_d(tmp0, 0xD8); \
reg2 = __lasx_xvadd_w(reg2, reg1); \
tmp2 = __lasx_xvadd_w(reg2, reg3); \
tmp3 = __lasx_xvadd_w(reg2, reg4); \
tmp1 = __lasx_xvssrani_hu_w(tmp3, tmp2, 5); \
tmp1 = __lasx_xvpermi_d(tmp1, 0xD8); \
tmp0 = __lasx_xvssrani_bu_h(tmp1, tmp0, 0); \
reg2 = __lasx_xvadd_w(reg2, reg1); \
__lasx_xvstelm_d(tmp0, src, 0, 0); \
__lasx_xvstelm_d(tmp0, src, 8, 2); \
src += stride; \
__lasx_xvstelm_d(tmp0, src, 0, 1); \
__lasx_xvstelm_d(tmp0, src, 8, 3); \
src += stride; \
}
void ff_h264_pred16x16_plane_h264_8_lasx(uint8_t *src, ptrdiff_t stride)
{
PRED16X16_PLANE
res0 = (5 * res0 + 32) >> 6;
res1 = (5 * res1 + 32) >> 6;
PRED16X16_PLANE_END
}
void ff_h264_pred16x16_plane_rv40_8_lasx(uint8_t *src, ptrdiff_t stride)
{
PRED16X16_PLANE
res0 = (res0 + (res0 >> 2)) >> 4;
res1 = (res1 + (res1 >> 2)) >> 4;
PRED16X16_PLANE_END
}
void ff_h264_pred16x16_plane_svq3_8_lasx(uint8_t *src, ptrdiff_t stride)
{
PRED16X16_PLANE
cnt = (5 * (res0/4)) / 16;
res0 = (5 * (res1/4)) / 16;
res1 = cnt;
PRED16X16_PLANE_END
}

libavcodec/loongarch/h264_intrapred_loongarch.h
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Loongson Technology Corporation Limited
* Copyright (c) 2023 Loongson Technology Corporation Limited
* Contributed by Hao Chen <chenhao@loongson.cn>
*
* This file is part of FFmpeg.
@@ -19,13 +19,17 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_LOONGARCH_H264_INTRAPRED_LASX_H
#define AVCODEC_LOONGARCH_H264_INTRAPRED_LASX_H
#ifndef AVCODEC_LOONGARCH_H264_INTRAPRED_LOONGARCH_H
#define AVCODEC_LOONGARCH_H264_INTRAPRED_LOONGARCH_H
#include "libavcodec/avcodec.h"
void ff_h264_pred16x16_plane_h264_8_lsx(uint8_t *src, ptrdiff_t stride);
void ff_h264_pred16x16_plane_rv40_8_lsx(uint8_t *src, ptrdiff_t stride);
void ff_h264_pred16x16_plane_svq3_8_lsx(uint8_t *src, ptrdiff_t stride);
void ff_h264_pred16x16_plane_h264_8_lasx(uint8_t *src, ptrdiff_t stride);
void ff_h264_pred16x16_plane_rv40_8_lasx(uint8_t *src, ptrdiff_t stride);
void ff_h264_pred16x16_plane_svq3_8_lasx(uint8_t *src, ptrdiff_t stride);
#endif // #ifndef AVCODEC_LOONGARCH_H264_INTRAPRED_LASX_H
#endif // #ifndef AVCODEC_LOONGARCH_H264_INTRAPRED_LOONGARCH_H

libavcodec/loongarch/h264chroma.S
@@ -0,0 +1,966 @@
/*
* Loongson LSX/LASX optimized h264chroma
*
* Copyright (c) 2023 Loongson Technology Corporation Limited
* Contributed by Lu Wang <wanglu@loongson.cn>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "loongson_asm.S"
/* void ff_put_h264_chroma_mc8_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y) */
function ff_put_h264_chroma_mc8_lsx
li.d t8, 8
sub.d t1, t8, a4 // 8-x
sub.d t2, t8, a5 // 8-y
mul.d t3, t1, t2 // A
mul.d t4, a4, t2 // B
mul.d t5, t1, a5 // C
mul.d t6, a4, a5 // D
add.d t0, t4, t5 // E
vreplgr2vr.b vr0, t3
vreplgr2vr.b vr1, t4
vreplgr2vr.b vr2, t5
vreplgr2vr.b vr3, t6
vreplgr2vr.b vr4, t0
slli.d t2, a2, 1
add.d t3, t2, a2
slli.d t4, a2, 2
bge zero, t6, .ENDLOOP_D
move t1, a3
vilvl.b vr9, vr1, vr0
vilvl.b vr10, vr3, vr2
.LOOP_D:
vld vr5, a1, 0
vld vr6, a1, 1
add.d a1, a1, a2
vld vr7, a1, 0
vld vr8, a1, 1
vilvl.b vr11, vr6, vr5
vilvl.b vr12, vr8, vr7
vmulwev.h.bu vr13, vr9, vr11
vmaddwod.h.bu vr13, vr9, vr11
vmulwev.h.bu vr14, vr10, vr12
vmaddwod.h.bu vr14, vr10, vr12
vadd.h vr13, vr13, vr14
vsrarni.b.h vr13, vr13, 6
vstelm.d vr13, a0, 0, 0
add.d a0, a0, a2
add.d a1, a1, a2
vld vr5, a1, 0
vld vr6, a1, 1
vilvl.b vr11, vr8, vr7
vilvl.b vr12, vr6, vr5
vmulwev.h.bu vr13, vr9, vr11
vmaddwod.h.bu vr13, vr9, vr11
vmulwev.h.bu vr14, vr10, vr12
vmaddwod.h.bu vr14, vr10, vr12
vadd.h vr13, vr13, vr14
vsrarni.b.h vr13, vr13, 6
vstelm.d vr13, a0, 0, 0
add.d a0, a0, a2
add.d a1, a1, a2
vld vr7, a1, 0
vld vr8, a1, 1
vilvl.b vr11, vr6, vr5
vilvl.b vr12, vr8, vr7
vmulwev.h.bu vr13, vr9, vr11
vmaddwod.h.bu vr13, vr9, vr11
vmulwev.h.bu vr14, vr10, vr12
vmaddwod.h.bu vr14, vr10, vr12
vadd.h vr13, vr13, vr14
vsrarni.b.h vr13, vr13, 6
vstelm.d vr13, a0, 0, 0
add.d a0, a0, a2
add.d a1, a1, a2
vld vr5, a1, 0
vld vr6, a1, 1
vilvl.b vr11, vr8, vr7
vilvl.b vr12, vr6, vr5
vmulwev.h.bu vr13, vr9, vr11
vmaddwod.h.bu vr13, vr9, vr11
vmulwev.h.bu vr14, vr10, vr12
vmaddwod.h.bu vr14, vr10, vr12
vadd.h vr13, vr13, vr14
vsrarni.b.h vr13, vr13, 6
vstelm.d vr13, a0, 0, 0
add.d a0, a0, a2
addi.d t1, t1, -4
blt zero, t1, .LOOP_D
b .ENDLOOP
.ENDLOOP_D:
bge zero, t0, .ENDLOOP_E
move t1, a3
li.d t7, 1
slt t8, zero, t5
maskeqz t5, a2, t8
masknez t7, t7, t8
or t7, t7, t5
vilvl.b vr7, vr4, vr0
.LOOP_E:
vld vr5, a1, 0
vldx vr6, a1, t7
vilvl.b vr5, vr6, vr5
vmulwev.h.bu vr6, vr7, vr5
vmaddwod.h.bu vr6, vr7, vr5
vsrarni.b.h vr6, vr6, 6
vstelm.d vr6, a0, 0, 0
add.d a0, a0, a2
add.d a1, a1, a2
vld vr5, a1, 0
vldx vr6, a1, t7
vilvl.b vr5, vr6, vr5
vmulwev.h.bu vr6, vr7, vr5
vmaddwod.h.bu vr6, vr7, vr5
vsrarni.b.h vr6, vr6, 6
vstelm.d vr6, a0, 0, 0
add.d a0, a0, a2
add.d a1, a1, a2
vld vr5, a1, 0
vldx vr6, a1, t7
vilvl.b vr5, vr6, vr5
vmulwev.h.bu vr6, vr7, vr5
vmaddwod.h.bu vr6, vr7, vr5
vsrarni.b.h vr6, vr6, 6
vstelm.d vr6, a0, 0, 0
add.d a0, a0, a2
add.d a1, a1, a2
vld vr5, a1, 0
vldx vr6, a1, t7
vilvl.b vr5, vr6, vr5
vmulwev.h.bu vr6, vr7, vr5
vmaddwod.h.bu vr6, vr7, vr5
vsrarni.b.h vr6, vr6, 6
vstelm.d vr6, a0, 0, 0
add.d a0, a0, a2
add.d a1, a1, a2
addi.d t1, t1, -4
blt zero, t1, .LOOP_E
b .ENDLOOP
.ENDLOOP_E:
move t1, a3
.LOOP:
vld vr5, a1, 0
vmulwev.h.bu vr6, vr0, vr5
vmulwod.h.bu vr7, vr0, vr5
vsrarni.b.h vr6, vr6, 6
vsrarni.b.h vr7, vr7, 6
vilvl.b vr6, vr7, vr6
vstelm.d vr6, a0, 0, 0
add.d a0, a0, a2
vldx vr5, a1, a2
vmulwev.h.bu vr6, vr0, vr5
vmulwod.h.bu vr7, vr0, vr5
vsrarni.b.h vr6, vr6, 6
vsrarni.b.h vr7, vr7, 6
vilvl.b vr6, vr7, vr6
vstelm.d vr6, a0, 0, 0
add.d a0, a0, a2
vldx vr5, a1, t2
vmulwev.h.bu vr6, vr0, vr5
vmulwod.h.bu vr7, vr0, vr5
vsrarni.b.h vr6, vr6, 6
vsrarni.b.h vr7, vr7, 6
vilvl.b vr6, vr7, vr6
vstelm.d vr6, a0, 0, 0
add.d a0, a0, a2
vldx vr5, a1, t3
vmulwev.h.bu vr6, vr0, vr5
vmulwod.h.bu vr7, vr0, vr5
vsrarni.b.h vr6, vr6, 6
vsrarni.b.h vr7, vr7, 6
vilvl.b vr6, vr7, vr6
vstelm.d vr6, a0, 0, 0
add.d a0, a0, a2
add.d a1, a1, t4
addi.d t1, t1, -4
blt zero, t1, .LOOP
.ENDLOOP:
endfunc
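
For reference, the routine above vectorizes the standard H.264 1/8-pel chroma interpolation with the A/B/C/D weights noted in the register comments; the branches to .ENDLOOP_D and .ENDLOOP_E select simpler paths when D is zero or when both B and C are zero. A minimal scalar C sketch of the same computation (reference only, not part of the patch; the function name is hypothetical):

/* Scalar form of what ff_put_h264_chroma_mc8_lsx computes.
 * The rounding shift by 6 corresponds to the vsrarni.b.h ..., 6 above. */
static void put_h264_chroma_mc8_ref(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t stride, int h, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B =      x  * (8 - y);
    const int C = (8 - x) *      y;
    const int D =      x  *      y;

    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 8; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[j + stride] + D * src[j + stride + 1] + 32) >> 6;
        dst += stride;
        src += stride;
    }
}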
/* void ff_avg_h264_chroma_mc8_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y) */
function ff_avg_h264_chroma_mc8_lsx
li.d t8, 8
sub.d t1, t8, a4 // 8-x
sub.d t2, t8, a5 // 8-y
mul.d t3, t1, t2 // A
mul.d t4, a4, t2 // B
mul.d t5, t1, a5 // C
mul.d t6, a4, a5 // D
add.d t0, t4, t5 // E
vreplgr2vr.b vr0, t3
vreplgr2vr.b vr1, t4
vreplgr2vr.b vr2, t5
vreplgr2vr.b vr3, t6
vreplgr2vr.b vr4, t0
slli.d t2, a2, 1
add.d t3, t2, a2
slli.d t4, a2, 2
bge zero, t6, .ENDLOOPD
move t1, a3
vilvl.b vr9, vr1, vr0
vilvl.b vr10, vr3, vr2
.LOOPD:
vld vr5, a1, 0
vld vr6, a1, 1
add.d a1, a1, a2
vld vr7, a1, 0
vld vr8, a1, 1
vld vr11, a0, 0
vilvl.b vr12, vr6, vr5
vilvl.b vr13, vr8, vr7
vmulwev.h.bu vr14, vr9, vr12
vmaddwod.h.bu vr14, vr9, vr12
vmulwev.h.bu vr15, vr10, vr13
vmaddwod.h.bu vr15, vr10, vr13
vadd.h vr14, vr14, vr15
vsrari.h vr14, vr14, 6
vsllwil.hu.bu vr11, vr11, 0
vadd.h vr11, vr14, vr11
vsrarni.b.h vr11, vr11, 1
vstelm.d vr11, a0, 0, 0
add.d a0, a0, a2
add.d a1, a1, a2
vld vr5, a1, 0
vld vr6, a1, 1
vld vr11, a0, 0
vilvl.b vr12, vr8, vr7
vilvl.b vr13, vr6, vr5
vmulwev.h.bu vr14, vr9, vr12
vmaddwod.h.bu vr14, vr9, vr12
vmulwev.h.bu vr15, vr10, vr13
vmaddwod.h.bu vr15, vr10, vr13
vadd.h vr14, vr14, vr15
vsrari.h vr14, vr14, 6
vsllwil.hu.bu vr11, vr11, 0
vadd.h vr11, vr14, vr11
vsrarni.b.h vr11, vr11, 1
vstelm.d vr11, a0, 0, 0
add.d a0, a0, a2
add.d a1, a1, a2
vld vr7, a1, 0
vld vr8, a1, 1
vld vr11, a0, 0
vilvl.b vr12, vr6, vr5
vilvl.b vr13, vr8, vr7
vmulwev.h.bu vr14, vr9, vr12
vmaddwod.h.bu vr14, vr9, vr12
vmulwev.h.bu vr15, vr10, vr13
vmaddwod.h.bu vr15, vr10, vr13
vadd.h vr14, vr14, vr15
vsrari.h vr14, vr14, 6
vsllwil.hu.bu vr11, vr11, 0
vadd.h vr11, vr14, vr11
vsrarni.b.h vr11, vr11, 1
vstelm.d vr11, a0, 0, 0
add.d a0, a0, a2
add.d a1, a1, a2
vld vr5, a1, 0
vld vr6, a1, 1
vld vr11, a0, 0
vilvl.b vr12, vr8, vr7
vilvl.b vr13, vr6, vr5
vmulwev.h.bu vr14, vr9, vr12
vmaddwod.h.bu vr14, vr9, vr12
vmulwev.h.bu vr15, vr10, vr13
vmaddwod.h.bu vr15, vr10, vr13
vadd.h vr14, vr14, vr15
vsrari.h vr14, vr14, 6
vsllwil.hu.bu vr11, vr11, 0
vadd.h vr11, vr14, vr11
vsrarni.b.h vr11, vr11, 1
vstelm.d vr11, a0, 0, 0
add.d a0, a0, a2
addi.d t1, t1, -4
blt zero, t1, .LOOPD
b .ENDLOOPELSE
.ENDLOOPD:
bge zero, t0, .ENDLOOPE
move t1, a3
li.d t7, 1
slt t8, zero, t5
maskeqz t5, a2, t8
masknez t7, t7, t8
or t7, t7, t5
vilvl.b vr7, vr4, vr0
.LOOPE:
vld vr5, a1, 0
vldx vr6, a1, t7
vld vr8, a0, 0
vilvl.b vr5, vr6, vr5
vmulwev.h.bu vr6, vr7, vr5
vmaddwod.h.bu vr6, vr7, vr5
vsrari.h vr6, vr6, 6
vsllwil.hu.bu vr8, vr8, 0
vadd.h vr8, vr6, vr8
vsrarni.b.h vr8, vr8, 1
vstelm.d vr8, a0, 0, 0
add.d a0, a0, a2
add.d a1, a1, a2
vld vr5, a1, 0
vldx vr6, a1, t7
vld vr8, a0, 0
vilvl.b vr5, vr6, vr5
vmulwev.h.bu vr6, vr7, vr5
vmaddwod.h.bu vr6, vr7, vr5
vsrari.h vr6, vr6, 6
vsllwil.hu.bu vr8, vr8, 0
vadd.h vr8, vr6, vr8
vsrarni.b.h vr8, vr8, 1
vstelm.d vr8, a0, 0, 0
add.d a0, a0, a2
add.d a1, a1, a2
vld vr5, a1, 0
vldx vr6, a1, t7
vld vr8, a0, 0
vilvl.b vr5, vr6, vr5
vmulwev.h.bu vr6, vr7, vr5
vmaddwod.h.bu vr6, vr7, vr5
vsrari.h vr6, vr6, 6
vsllwil.hu.bu vr8, vr8, 0
vadd.h vr8, vr6, vr8
vsrarni.b.h vr8, vr8, 1
vstelm.d vr8, a0, 0, 0
add.d a0, a0, a2
add.d a1, a1, a2
vld vr5, a1, 0
vldx vr6, a1, t7
vld vr8, a0, 0
vilvl.b vr5, vr6, vr5
vmulwev.h.bu vr6, vr7, vr5
vmaddwod.h.bu vr6, vr7, vr5
vsrari.h vr6, vr6, 6
vsllwil.hu.bu vr8, vr8, 0
vadd.h vr8, vr6, vr8
vsrarni.b.h vr8, vr8, 1
vstelm.d vr8, a0, 0, 0
add.d a0, a0, a2
add.d a1, a1, a2
addi.d t1, t1, -4
blt zero, t1, .LOOPE
b .ENDLOOPELSE
.ENDLOOPE:
move t1, a3
.LOOPELSE:
vld vr5, a1, 0
vld vr8, a0, 0
vmulwev.h.bu vr6, vr0, vr5
vmulwod.h.bu vr7, vr0, vr5
vilvl.h vr6, vr7, vr6
vsrari.h vr6, vr6, 6
vsllwil.hu.bu vr8, vr8, 0
vadd.h vr8, vr6, vr8
vsrarni.b.h vr8, vr8, 1
vstelm.d vr8, a0, 0, 0
add.d a0, a0, a2
vldx vr5, a1, a2
vld vr8, a0, 0
vmulwev.h.bu vr6, vr0, vr5
vmulwod.h.bu vr7, vr0, vr5
vilvl.h vr6, vr7, vr6
vsrari.h vr6, vr6, 6
vsllwil.hu.bu vr8, vr8, 0
vadd.h vr8, vr6, vr8
vsrarni.b.h vr8, vr8, 1
vstelm.d vr8, a0, 0, 0
add.d a0, a0, a2
vldx vr5, a1, t2
vld vr8, a0, 0
vmulwev.h.bu vr6, vr0, vr5
vmulwod.h.bu vr7, vr0, vr5
vilvl.h vr6, vr7, vr6
vsrari.h vr6, vr6, 6
vsllwil.hu.bu vr8, vr8, 0
vadd.h vr8, vr6, vr8
vsrarni.b.h vr8, vr8, 1
vstelm.d vr8, a0, 0, 0
add.d a0, a0, a2
vldx vr5, a1, t3
vld vr8, a0, 0
vmulwev.h.bu vr6, vr0, vr5
vmulwod.h.bu vr7, vr0, vr5
vilvl.h vr6, vr7, vr6
vsrari.h vr6, vr6, 6
vsllwil.hu.bu vr8, vr8, 0
vadd.h vr8, vr6, vr8
vsrarni.b.h vr8, vr8, 1
vstelm.d vr8, a0, 0, 0
add.d a0, a0, a2
add.d a1, a1, t4
addi.d t1, t1, -4
blt zero, t1, .LOOPELSE
.ENDLOOPELSE:
endfunc
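
The avg variant performs the same interpolation and then averages with the bytes already in dst: the extra loads from a0, the widening vsllwil.hu.bu, and the final rounding shift by 1 implement, per pixel (same assumptions and weights as the sketch above):

/* avg: round the filtered value to 8 bits, then average with the old dst pixel */
dst[j] = (dst[j] + ((A * src[j]          + B * src[j + 1] +
                     C * src[j + stride] + D * src[j + stride + 1] + 32) >> 6) + 1) >> 1;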
/* void ff_put_h264_chroma_mc4_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y) */
function ff_put_h264_chroma_mc4_lsx
li.d t8, 8
sub.d t1, t8, a4 // 8-x
sub.d t2, t8, a5 // 8-y
mul.d t3, t1, t2 // A
mul.d t4, a4, t2 // B
mul.d t5, t1, a5 // C
mul.d t6, a4, a5 // D
add.d t0, t4, t5 // E
slli.d t8, a2, 1
vreplgr2vr.b vr0, t3
vreplgr2vr.b vr1, t4
vreplgr2vr.b vr2, t5
vreplgr2vr.b vr3, t6
vreplgr2vr.b vr4, t0
bge zero, t6, .ENDPUT_D
move t1, a3
vilvl.b vr9, vr1, vr0
vilvl.b vr10, vr3, vr2
.PUT_D:
vld vr5, a1, 0
vld vr6, a1, 1
add.d a1, a1, a2
vld vr7, a1, 0
vld vr8, a1, 1
add.d a1, a1, a2
vld vr11, a1, 0
vld vr12, a1, 1
vilvl.b vr5, vr6, vr5
vilvl.b vr7, vr8, vr7
vilvl.b vr13, vr12, vr11
vilvl.d vr5, vr7, vr5
vilvl.d vr13, vr13, vr7
vmulwev.h.bu vr14, vr9, vr5
vmaddwod.h.bu vr14, vr9, vr5
vmulwev.h.bu vr15, vr10, vr13
vmaddwod.h.bu vr15, vr10, vr13
vadd.h vr14, vr14, vr15
vsrarni.b.h vr14, vr14, 6
vstelm.w vr14, a0, 0, 0
add.d a0, a0, a2
vstelm.w vr14, a0, 0, 1
add.d a0, a0, a2
addi.d t1, t1, -2
blt zero, t1, .PUT_D
b .ENDPUT
.ENDPUT_D:
bge zero, t0, .ENDPUT_E
move t1, a3
li.d t7, 1
slt t8, zero, t5
maskeqz t5, a2, t8
masknez t7, t7, t8
or t7, t7, t5
vilvl.b vr7, vr4, vr0
.PUT_E:
vld vr5, a1, 0
vldx vr6, a1, t7
vilvl.b vr5, vr6, vr5
add.d a1, a1, a2
vld vr8, a1, 0
vldx vr9, a1, t7
vilvl.b vr8, vr9, vr8
vilvl.d vr5, vr8, vr5
vmulwev.h.bu vr6, vr7, vr5
vmaddwod.h.bu vr6, vr7, vr5
vsrarni.b.h vr6, vr6, 6
vstelm.w vr6, a0, 0, 0
add.d a0, a0, a2
vstelm.w vr6, a0, 0, 1
add.d a0, a0, a2
add.d a1, a1, a2
addi.d t1, t1, -2
blt zero, t1, .PUT_E
b .ENDPUT
.ENDPUT_E:
move t1, a3
.PUT:
vld vr5, a1, 0
vldx vr8, a1, a2
vilvl.w vr5, vr8, vr5
vmulwev.h.bu vr6, vr0, vr5
vmulwod.h.bu vr7, vr0, vr5
vsrarni.b.h vr6, vr6, 6
vsrarni.b.h vr7, vr7, 6
vilvl.b vr6, vr7, vr6
vstelm.w vr6, a0, 0, 0
add.d a0, a0, a2
vstelm.w vr6, a0, 0, 1
add.d a0, a0, a2
add.d a1, a1, t8
addi.d t1, t1, -2
blt zero, t1, .PUT
.ENDPUT:
endfunc
/* void ff_put_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y) */
function ff_put_h264_chroma_mc8_lasx
li.d t8, 8
sub.d t1, t8, a4 // 8-x
sub.d t2, t8, a5 // 8-y
mul.d t3, t1, t2 // A
mul.d t4, a4, t2 // B
mul.d t5, t1, a5 // C
mul.d t6, a4, a5 // D
add.d t0, t4, t5 // E
xvreplgr2vr.b xr0, t3
xvreplgr2vr.b xr1, t4
xvreplgr2vr.b xr2, t5
xvreplgr2vr.b xr3, t6
xvreplgr2vr.b xr4, t0
slli.d t2, a2, 1
add.d t3, t2, a2
slli.d t4, a2, 2
bge zero, t6, .ENDLOOP_DA
move t1, a3
xvilvl.b xr9, xr1, xr0
xvilvl.b xr10, xr3, xr2
.LOOP_DA:
fld.d f5, a1, 0
fld.d f6, a1, 1
add.d a1, a1, a2
fld.d f7, a1, 0
fld.d f8, a1, 1
add.d a1, a1, a2
fld.d f13, a1, 0
fld.d f14, a1, 1
add.d a1, a1, a2
fld.d f15, a1, 0
fld.d f16, a1, 1
add.d a1, a1, a2
fld.d f17, a1, 0
fld.d f18, a1, 1
vilvl.b vr11, vr6, vr5
vilvl.b vr12, vr8, vr7
vilvl.b vr14, vr14, vr13
vilvl.b vr15, vr16, vr15
vilvl.b vr16, vr18, vr17
xvpermi.q xr11, xr12, 0x02
xvpermi.q xr12, xr14, 0x02
xvpermi.q xr14, xr15, 0x02
xvpermi.q xr15, xr16, 0x02
xvmulwev.h.bu xr19, xr9, xr11
xvmaddwod.h.bu xr19, xr9, xr11
xvmulwev.h.bu xr20, xr10, xr12
xvmaddwod.h.bu xr20, xr10, xr12
xvadd.h xr21, xr19, xr20
xvsrarni.b.h xr21, xr21, 6
vstelm.d vr21, a0, 0, 0
add.d a0, a0, a2
xvstelm.d xr21, a0, 0, 2
add.d a0, a0, a2
xvmulwev.h.bu xr13, xr9, xr14
xvmaddwod.h.bu xr13, xr9, xr14
xvmulwev.h.bu xr14, xr10, xr15
xvmaddwod.h.bu xr14, xr10, xr15
xvadd.h xr13, xr13, xr14
xvsrarni.b.h xr13, xr13, 6
vstelm.d vr13, a0, 0, 0
add.d a0, a0, a2
xvstelm.d xr13, a0, 0, 2
add.d a0, a0, a2
addi.d t1, t1, -4
blt zero, t1, .LOOP_DA
b .ENDLOOPA
.ENDLOOP_DA:
bge zero, t0, .ENDLOOP_EA
move t1, a3
li.d t7, 1
slt t8, zero, t5
maskeqz t5, a2, t8
masknez t7, t7, t8
or t7, t7, t5
xvilvl.b xr7, xr4, xr0
.LOOP_EA:
fld.d f5, a1, 0
fldx.d f6, a1, t7
add.d a1, a1, a2
fld.d f9, a1, 0
fldx.d f10, a1, t7
add.d a1, a1, a2
fld.d f11, a1, 0
fldx.d f12, a1, t7
add.d a1, a1, a2
fld.d f13, a1, 0
fldx.d f14, a1, t7
vilvl.b vr5, vr6, vr5
vilvl.b vr9, vr10, vr9
vilvl.b vr11, vr12, vr11
vilvl.b vr13, vr14, vr13
xvpermi.q xr5, xr9, 0x02
xvpermi.q xr11, xr13, 0x02
xvmulwev.h.bu xr8, xr7, xr5
xvmaddwod.h.bu xr8, xr7, xr5
xvmulwev.h.bu xr6, xr7, xr11
xvmaddwod.h.bu xr6, xr7, xr11
xvsrarni.b.h xr8, xr8, 6
vstelm.d vr8, a0, 0, 0
add.d a0, a0, a2
xvstelm.d xr8, a0, 0, 2
add.d a0, a0, a2
xvsrarni.b.h xr6, xr6, 6
vstelm.d vr6, a0, 0, 0
add.d a0, a0, a2
xvstelm.d xr6, a0, 0, 2
add.d a0, a0, a2
add.d a1, a1, a2
addi.d t1, t1, -4
blt zero, t1, .LOOP_EA
b .ENDLOOPA
.ENDLOOP_EA:
move t1, a3
.LOOPA:
fld.d f5, a1, 0
fldx.d f6, a1, a2
fldx.d f7, a1, t2
fldx.d f8, a1, t3
vilvl.d vr5, vr6, vr5
vilvl.d vr7, vr8, vr7
xvpermi.q xr5, xr7, 0x02
xvmulwev.h.bu xr6, xr0, xr5
xvmulwod.h.bu xr7, xr0, xr5
xvilvl.h xr8, xr7, xr6
xvilvh.h xr9, xr7, xr6
xvsrarni.b.h xr9, xr8, 6
vstelm.d vr9, a0, 0, 0
add.d a0, a0, a2
vstelm.d vr9, a0, 0, 1
add.d a0, a0, a2
xvstelm.d xr9, a0, 0, 2
add.d a0, a0, a2
xvstelm.d xr9, a0, 0, 3
add.d a0, a0, a2
add.d a1, a1, t4
addi.d t1, t1, -4
blt zero, t1, .LOOPA
.ENDLOOPA:
endfunc
/* void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y) */
function ff_avg_h264_chroma_mc8_lasx
li.d t8, 8
sub.d t1, t8, a4 // 8-x
sub.d t2, t8, a5 // 8-y
mul.d t3, t1, t2 // A
mul.d t4, a4, t2 // B
mul.d t5, t1, a5 // C
mul.d t6, a4, a5 // D
add.d t0, t4, t5 // E
xvreplgr2vr.b xr0, t3
xvreplgr2vr.b xr1, t4
xvreplgr2vr.b xr2, t5
xvreplgr2vr.b xr3, t6
xvreplgr2vr.b xr4, t0
slli.d t2, a2, 1
add.d t3, t2, a2
slli.d t4, a2, 2
bge zero, t6, .ENDLOOPDA
move t1, a3
xvilvl.b xr9, xr1, xr0
xvilvl.b xr10, xr3, xr2
.LOOPDA:
fld.d f5, a1, 0
fld.d f6, a1, 1
add.d a1, a1, a2
fld.d f7, a1, 0
fld.d f8, a1, 1
add.d a1, a1, a2
fld.d f11, a1, 0
fld.d f12, a1, 1
add.d a1, a1, a2
fld.d f13, a1, 0
fld.d f14, a1, 1
add.d a1, a1, a2
fld.d f15, a1, 0
fld.d f16, a1, 1
fld.d f17, a0, 0
fldx.d f18, a0, a2
fldx.d f19, a0, t2
fldx.d f20, a0, t3
vilvl.b vr5, vr6, vr5
vilvl.b vr7, vr8, vr7
vilvl.b vr11, vr12, vr11
vilvl.b vr13, vr14, vr13
vilvl.b vr16, vr16, vr15
xvpermi.q xr5, xr7, 0x02
xvpermi.q xr7, xr11, 0x02
xvpermi.q xr11, xr13, 0x02
xvpermi.q xr13, xr16, 0x02
xvpermi.q xr17, xr18, 0x02
xvpermi.q xr19, xr20, 0x02
xvmulwev.h.bu xr14, xr9, xr5
xvmaddwod.h.bu xr14, xr9, xr5
xvmulwev.h.bu xr15, xr10, xr7
xvmaddwod.h.bu xr15, xr10, xr7
xvadd.h xr14, xr14, xr15
xvsrari.h xr14, xr14, 6
xvsllwil.hu.bu xr17, xr17, 0
xvadd.h xr20, xr14, xr17
xvsrarni.b.h xr20, xr20, 1
xvstelm.d xr20, a0, 0, 0
add.d a0, a0, a2
xvstelm.d xr20, a0, 0, 2
add.d a0, a0, a2
xvmulwev.h.bu xr14, xr9, xr11
xvmaddwod.h.bu xr14, xr9, xr11
xvmulwev.h.bu xr15, xr10, xr13
xvmaddwod.h.bu xr15, xr10, xr13
xvadd.h xr14, xr14, xr15
xvsrari.h xr14, xr14, 6
xvsllwil.hu.bu xr19, xr19, 0
xvadd.h xr21, xr14, xr19
xvsrarni.b.h xr21, xr21, 1
xvstelm.d xr21, a0, 0, 0
add.d a0, a0, a2
xvstelm.d xr21, a0, 0, 2
add.d a0, a0, a2
addi.d t1, t1, -4
blt zero, t1, .LOOPDA
b .ENDLOOPELSEA
.ENDLOOPDA:
bge zero, t0, .ENDLOOPEA
move t1, a3
li.d t7, 1
slt t8, zero, t5
maskeqz t5, a2, t8
masknez t7, t7, t8
or t7, t7, t5
xvilvl.b xr7, xr4, xr0
.LOOPEA:
fld.d f5, a1, 0
fldx.d f6, a1, t7
add.d a1, a1, a2
fld.d f8, a1, 0
fldx.d f9, a1, t7
add.d a1, a1, a2
fld.d f10, a1, 0
fldx.d f11, a1, t7
add.d a1, a1, a2
fld.d f12, a1, 0
fldx.d f13, a1, t7
add.d a1, a1, a2
fld.d f14, a0, 0
fldx.d f15, a0, a2
fldx.d f16, a0, t2
fldx.d f17, a0, t3
vilvl.b vr5, vr6, vr5
vilvl.b vr8, vr9, vr8
vilvl.b vr10, vr11, vr10
vilvl.b vr12, vr13, vr12
xvpermi.q xr5, xr8, 0x02
xvpermi.q xr10, xr12, 0x02
xvpermi.q xr14, xr15, 0x02
xvpermi.q xr16, xr17, 0x02
xvmulwev.h.bu xr6, xr7, xr5
xvmaddwod.h.bu xr6, xr7, xr5
xvsrari.h xr6, xr6, 6
xvsllwil.hu.bu xr14, xr14, 0
xvadd.h xr8, xr6, xr14
xvsrarni.b.h xr8, xr8, 1
xvstelm.d xr8, a0, 0, 0
add.d a0, a0, a2
xvstelm.d xr8, a0, 0, 2
add.d a0, a0, a2
xvmulwev.h.bu xr6, xr7, xr10
xvmaddwod.h.bu xr6, xr7, xr10
xvsrari.h xr6, xr6, 6
xvsllwil.hu.bu xr16, xr16, 0
xvadd.h xr8, xr6, xr16
xvsrarni.b.h xr8, xr8, 1
xvstelm.d xr8, a0, 0, 0
add.d a0, a0, a2
xvstelm.d xr8, a0, 0, 2
add.d a0, a0, a2
addi.d t1, t1, -4
blt zero, t1, .LOOPEA
b .ENDLOOPELSEA
.ENDLOOPEA:
move t1, a3
.LOOPELSEA:
fld.d f5, a1, 0
fldx.d f6, a1, a2
fldx.d f7, a1, t2
fldx.d f8, a1, t3
fld.d f9, a0, 0
fldx.d f10, a0, a2
fldx.d f11, a0, t2
fldx.d f12, a0, t3
xvpermi.q xr5, xr6, 0x02
xvpermi.q xr7, xr8, 0x02
xvpermi.q xr9, xr10, 0x02
xvpermi.q xr11, xr12, 0x02
xvmulwev.h.bu xr12, xr0, xr5
xvmulwod.h.bu xr13, xr0, xr5
xvilvl.h xr12, xr13, xr12
xvsrari.h xr12, xr12, 6
xvsllwil.hu.bu xr9, xr9, 0
xvadd.h xr9, xr12, xr9
xvsrarni.b.h xr9, xr9, 1
xvstelm.d xr9, a0, 0, 0
add.d a0, a0, a2
xvstelm.d xr9, a0, 0, 2
add.d a0, a0, a2
xvmulwev.h.bu xr12, xr0, xr7
xvmulwod.h.bu xr13, xr0, xr7
xvilvl.h xr12, xr13, xr12
xvsrari.h xr12, xr12, 6
xvsllwil.hu.bu xr11, xr11, 0
xvadd.h xr13, xr12, xr11
xvsrarni.b.h xr13, xr13, 1
xvstelm.d xr13, a0, 0, 0
add.d a0, a0, a2
xvstelm.d xr13, a0, 0, 2
add.d a0, a0, a2
add.d a1, a1, t4
addi.d t1, t1, -4
blt zero, t1, .LOOPELSEA
.ENDLOOPELSEA:
endfunc
/* void ff_put_h264_chroma_mc4_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y) */
function ff_put_h264_chroma_mc4_lasx
li.d t8, 8
sub.d t1, t8, a4 // 8-x
sub.d t2, t8, a5 // 8-y
mul.d t3, t1, t2 // A
mul.d t4, a4, t2 // B
mul.d t5, t1, a5 // C
mul.d t6, a4, a5 // D
add.d t0, t4, t5 // E
slli.d t8, a2, 1
vreplgr2vr.b vr0, t3
vreplgr2vr.b vr1, t4
vreplgr2vr.b vr2, t5
vreplgr2vr.b vr3, t6
vreplgr2vr.b vr4, t0
bge zero, t6, .ENDPUT_DA
move t1, a3
vilvl.b vr9, vr1, vr0
vilvl.b vr10, vr3, vr2
.PUT_DA:
fld.d f5, a1, 0
fld.d f6, a1, 1
add.d a1, a1, a2
fld.d f7, a1, 0
fld.d f8, a1, 1
add.d a1, a1, a2
fld.d f11, a1, 0
fld.d f12, a1, 1
vilvl.b vr5, vr6, vr5
vilvl.b vr7, vr8, vr7
vilvl.b vr13, vr12, vr11
vilvl.d vr5, vr7, vr5
vilvl.d vr13, vr13, vr7
vmulwev.h.bu vr14, vr9, vr5
vmaddwod.h.bu vr14, vr9, vr5
vmulwev.h.bu vr15, vr10, vr13
vmaddwod.h.bu vr15, vr10, vr13
xvadd.h xr14, xr14, xr15
vsrarni.b.h vr16, vr14, 6
vstelm.w vr16, a0, 0, 0
add.d a0, a0, a2
vstelm.w vr16, a0, 0, 1
add.d a0, a0, a2
addi.d t1, t1, -2
blt zero, t1, .PUT_DA
b .ENDPUTA
.ENDPUT_DA:
bge zero, t0, .ENDPUT_EA
move t1, a3
li.d t7, 1
slt t8, zero, t5
maskeqz t5, a2, t8
masknez t7, t7, t8
or t7, t7, t5
vilvl.b vr7, vr4, vr0
.PUT_EA:
fld.d f5, a1, 0
fldx.d f6, a1, t7
vilvl.b vr5, vr6, vr5
add.d a1, a1, a2
fld.d f8, a1, 0
fldx.d f9, a1, t7
vilvl.b vr8, vr9, vr8
vilvl.d vr5, vr8, vr5
vmulwev.h.bu vr6, vr7, vr5
vmaddwod.h.bu vr6, vr7, vr5
vsrarni.b.h vr6, vr6, 6
vstelm.w vr6, a0, 0, 0
add.d a0, a0, a2
vstelm.w vr6, a0, 0, 1
add.d a0, a0, a2
add.d a1, a1, a2
addi.d t1, t1, -2
blt zero, t1, .PUT_EA
b .ENDPUTA
.ENDPUT_EA:
move t1, a3
.PUTA:
fld.d f5, a1, 0
fldx.d f8, a1, a2
vilvl.w vr5, vr8, vr5
vmulwev.h.bu vr6, vr0, vr5
vmulwod.h.bu vr7, vr0, vr5
vilvl.h vr6, vr7, vr6
vsrarni.b.h vr6, vr6, 6
vstelm.w vr6, a0, 0, 0
add.d a0, a0, a2
vstelm.w vr6, a0, 0, 1
add.d a0, a0, a2
add.d a1, a1, t8
addi.d t1, t1, -2
blt zero, t1, .PUTA
.ENDPUTA:
endfunc

libavcodec/loongarch/h264chroma_init_loongarch.c
@@ -19,7 +19,7 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "h264chroma_lasx.h"
#include "h264chroma_loongarch.h"
#include "libavutil/attributes.h"
#include "libavutil/loongarch/cpu.h"
#include "libavcodec/h264chroma.h"
@@ -27,6 +27,14 @@
av_cold void ff_h264chroma_init_loongarch(H264ChromaContext *c, int bit_depth)
{
int cpu_flags = av_get_cpu_flags();
if (have_lsx(cpu_flags)) {
if (bit_depth <= 8) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_lsx;
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_lsx;
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_lsx;
}
}
if (have_lasx(cpu_flags)) {
if (bit_depth <= 8) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_lasx;

libavcodec/loongarch/h264chroma_lasx.c (file diff suppressed because it is too large)

libavcodec/loongarch/h264chroma_lasx.h
@@ -1,36 +0,0 @@
/*
* Copyright (c) 2020 Loongson Technology Corporation Limited
* Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_LOONGARCH_H264CHROMA_LASX_H
#define AVCODEC_LOONGARCH_H264CHROMA_LASX_H
#include <stdint.h>
#include <stddef.h>
#include "libavcodec/h264.h"
void ff_put_h264_chroma_mc4_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
void ff_put_h264_chroma_mc8_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
void ff_avg_h264_chroma_mc8_lasx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
#endif /* AVCODEC_LOONGARCH_H264CHROMA_LASX_H */

libavcodec/loongarch/h264chroma_loongarch.h
@@ -0,0 +1,41 @@
/*
* Copyright (c) 2023 Loongson Technology Corporation Limited
* Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_LOONGARCH_H264CHROMA_LOONGARCH_H
#define AVCODEC_LOONGARCH_H264CHROMA_LOONGARCH_H
#include "libavcodec/h264.h"
void ff_put_h264_chroma_mc8_lsx(unsigned char *dst, const unsigned char *src,
long int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_lsx(unsigned char *dst, const unsigned char *src,
long int stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_lsx(unsigned char *dst, const unsigned char *src,
long int stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_lasx(unsigned char *dst, const unsigned char *src,
long int stride, int h, int x, int y);
void ff_put_h264_chroma_mc8_lasx(unsigned char *dst, const unsigned char *src,
long int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_lasx(unsigned char *dst, const unsigned char *src,
long int stride, int h, int x, int y);
#endif /* AVCODEC_LOONGARCH_H264CHROMA_LOONGARCH_H */

libavcodec/loongarch/h264intrapred.S
@@ -0,0 +1,299 @@
/*
* Loongson LSX optimized h264intrapred
*
* Copyright (c) 2023 Loongson Technology Corporation Limited
* Contributed by Lu Wang <wanglu@loongson.cn>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "loongson_asm.S"
const shufa
.byte 6, 5, 4, 3, 2, 1, 0
endconst
const mulk
.byte 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0
endconst
const mulh
.byte 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0
.byte 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 0
endconst
.macro PRED16X16_PLANE
slli.d t6, a1, 1
slli.d t4, a1, 3
addi.d t0, a0, 7
sub.d t0, t0, a1
add.d t1, a0, t4
addi.d t1, t1, -1
sub.d t2, t1, t6
ld.bu t3, t0, 1
ld.bu t4, t0, -1
ld.bu t5, t1, 0
ld.bu t7, t2, 0
sub.d t3, t3, t4
sub.d t4, t5, t7
la.local t5, mulk
vld vr0, t5, 0
fld.d f1, t0, 2
fld.d f2, t0, -8
la.local t5, shufa
fld.d f3, t5, 0
vshuf.b vr2, vr2, vr2, vr3
vilvl.b vr1, vr1, vr2
vhsubw.hu.bu vr1, vr1, vr1
vmul.h vr0, vr0, vr1
vhaddw.w.h vr1, vr0, vr0
vhaddw.d.w vr0, vr1, vr1
vhaddw.q.d vr1, vr0, vr0
vpickve2gr.w t5, vr1, 0
add.d t3, t3, t5
//2
sub.d t2, t2, a1
ld.bu t8, t2, 0
ldx.bu t7, t1, a1
sub.d t5, t7, t8
slli.d t5, t5, 1
//3&4
add.d t1, t1, t6
sub.d t2, t2, a1
ld.bu t8, t2, 0
ld.bu t7, t1, 0
sub.d t7, t7, t8
slli.d t8, t7, 1
add.d t7, t7, t8
add.d t5, t5, t7
sub.d t2, t2, a1
ld.bu t8, t2, 0
ldx.bu t7, t1, a1
sub.d t7, t7, t8
slli.d t7, t7, 2
add.d t5, t5, t7
//5&6
add.d t1, t1, t6
sub.d t2, t2, a1
ld.bu t8, t2, 0
ld.bu t7, t1, 0
sub.d t7, t7, t8
slli.d t8, t7, 2
add.d t7, t7, t8
add.d t5, t5, t7
sub.d t2, t2, a1
ld.bu t8, t2, 0
ldx.bu t7, t1, a1
sub.d t7, t7, t8
slli.d t8, t7, 1
slli.d t7, t7, 2
add.d t7, t7, t8
add.d t5, t5, t7
//7&8
add.d t1, t1, t6
sub.d t2, t2, a1
ld.bu t8, t2, 0
ld.bu t7, t1, 0
sub.d t7, t7, t8
slli.d t8, t7, 3
sub.d t7, t8, t7
add.d t5, t5, t7
sub.d t2, t2, a1
ld.bu t8, t2, 0
ldx.bu t7, t1, a1
sub.d t7, t7, t8
slli.d t7, t7, 3
add.d t5, t5, t7
add.d t4, t4, t5
add.d t1, t1, a1
.endm
.macro PRED16X16_PLANE_END
ld.bu t7, t1, 0
ld.bu t8, t2, 16
add.d t5, t7, t8
addi.d t5, t5, 1
slli.d t5, t5, 4
add.d t7, t3, t4
slli.d t8, t7, 3
sub.d t7, t8, t7
sub.d t5, t5, t7
la.local t8, mulh
vld vr3, t8, 0
slli.d t8, t3, 3
vreplgr2vr.h vr4, t3
vreplgr2vr.h vr9, t8
vmul.h vr5, vr3, vr4
.rept 16
move t7, t5
add.d t5, t5, t4
vreplgr2vr.h vr6, t7
vadd.h vr7, vr6, vr5
vadd.h vr8, vr9, vr7
vssrani.bu.h vr8, vr7, 5
vst vr8, a0, 0
add.d a0, a0, a1
.endr
.endm
.macro PRED16X16_PLANE_END_LASX
ld.bu t7, t1, 0
ld.bu t8, t2, 16
add.d t5, t7, t8
addi.d t5, t5, 1
slli.d t5, t5, 4
add.d t7, t3, t4
slli.d t8, t7, 3
sub.d t7, t8, t7
sub.d t5, t5, t7
la.local t8, mulh
xvld xr3, t8, 0
xvreplgr2vr.h xr4, t3
xvmul.h xr5, xr3, xr4
.rept 8
move t7, t5
add.d t5, t5, t4
xvreplgr2vr.h xr6, t7
xvreplgr2vr.h xr8, t5
add.d t5, t5, t4
xvadd.h xr7, xr6, xr5
xvadd.h xr9, xr8, xr5
xvssrani.bu.h xr9, xr7, 5
vstelm.d vr9, a0, 0, 0
xvstelm.d xr9, a0, 8, 2
add.d a0, a0, a1
vstelm.d vr9, a0, 0, 1
xvstelm.d xr9, a0, 8, 3
add.d a0, a0, a1
.endr
.endm
/* void ff_h264_pred16x16_plane_h264_8_lsx(uint8_t *src, ptrdiff_t stride)
*/
function ff_h264_pred16x16_plane_h264_8_lsx
PRED16X16_PLANE
slli.d t7, t3, 2
add.d t3, t3, t7
addi.d t3, t3, 32
srai.d t3, t3, 6
slli.d t7, t4, 2
add.d t4, t4, t7
addi.d t4, t4, 32
srai.d t4, t4, 6
PRED16X16_PLANE_END
endfunc
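
The PRED16X16_PLANE macro accumulates the horizontal and vertical gradients of H.264 16x16 plane prediction into t3 and t4, and PRED16X16_PLANE_END fills the block with the clipped plane value row by row (t5 advances by t4 per row, the mulh constant supplies the per-column term, and vssrani.bu.h ..., 5 performs the shift and the 0..255 clamp). Below is a plain-C sketch of the H.264 flavour (reference only, not part of the patch; the function name is hypothetical). The rv40 and svq3 functions differ only in how the two gradients are scaled, as in the scalar code between PRED16X16_PLANE and PRED16X16_PLANE_END in each function, with svq3 additionally swapping them:

/* Reference-only sketch of what ff_h264_pred16x16_plane_h264_8_lsx computes */
static void pred16x16_plane_h264_ref(uint8_t *src, ptrdiff_t stride)
{
    const uint8_t *top  = src - stride;   /* row above the block, p[x, -1]      */
    const uint8_t *left = src - 1;        /* column left of the block, p[-1, y] */
    int H = 0, V = 0;

    for (int k = 1; k <= 8; k++) {
        H += k * (top[7 + k]             - top[7 - k]);
        V += k * (left[(7 + k) * stride] - left[(7 - k) * stride]);
    }
    H = (5 * H + 32) >> 6;   /* rv40: (H + (H >> 2)) >> 4;  svq3: (5 * (H / 4)) / 16, then swap H and V */
    V = (5 * V + 32) >> 6;

    /* the +1 folds the final +16 rounding term into the base value */
    int a = 16 * (top[15] + left[15 * stride] + 1);

    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++) {
            int v = (a + H * (x - 7) + V * (y - 7)) >> 5;
            src[y * stride + x] = v < 0 ? 0 : (v > 255 ? 255 : v);
        }
}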
/* void ff_h264_pred16x16_plane_rv40_8_lsx(uint8_t *src, ptrdiff_t stride)
*/
function ff_h264_pred16x16_plane_rv40_8_lsx
PRED16X16_PLANE
srai.d t7, t3, 2
add.d t3, t3, t7
srai.d t3, t3, 4
srai.d t7, t4, 2
add.d t4, t4, t7
srai.d t4, t4, 4
PRED16X16_PLANE_END
endfunc
/* void ff_h264_pred16x16_plane_svq3_8_lsx(uint8_t *src, ptrdiff_t stride)
*/
function ff_h264_pred16x16_plane_svq3_8_lsx
PRED16X16_PLANE
li.d t6, 4
li.d t7, 5
li.d t8, 16
div.d t3, t3, t6
mul.d t3, t3, t7
div.d t3, t3, t8
div.d t4, t4, t6
mul.d t4, t4, t7
div.d t4, t4, t8
move t7, t3
move t3, t4
move t4, t7
PRED16X16_PLANE_END
endfunc
/* void ff_h264_pred16x16_plane_h264_8_lasx(uint8_t *src, ptrdiff_t stride)
*/
function ff_h264_pred16x16_plane_h264_8_lasx
PRED16X16_PLANE
slli.d t7, t3, 2
add.d t3, t3, t7
addi.d t3, t3, 32
srai.d t3, t3, 6
slli.d t7, t4, 2
add.d t4, t4, t7
addi.d t4, t4, 32
srai.d t4, t4, 6
PRED16X16_PLANE_END_LASX
endfunc
/* void ff_h264_pred16x16_plane_rv40_8_lasx(uint8_t *src, ptrdiff_t stride)
*/
function ff_h264_pred16x16_plane_rv40_8_lasx
PRED16X16_PLANE
srai.d t7, t3, 2
add.d t3, t3, t7
srai.d t3, t3, 4
srai.d t7, t4, 2
add.d t4, t4, t7
srai.d t4, t4, 4
PRED16X16_PLANE_END_LASX
endfunc
/* void ff_h264_pred16x16_plane_svq3_8_lasx(uint8_t *src, ptrdiff_t stride)
*/
function ff_h264_pred16x16_plane_svq3_8_lasx
PRED16X16_PLANE
li.d t5, 4
li.d t7, 5
li.d t8, 16
div.d t3, t3, t5
mul.d t3, t3, t7
div.d t3, t3, t8
div.d t4, t4, t5
mul.d t4, t4, t7
div.d t4, t4, t8
move t7, t3
move t3, t4
move t4, t7
PRED16X16_PLANE_END_LASX
endfunc