swscale/la: Optimize the functions of the swscale series with lsx.

./configure --disable-lasx
ffmpeg -i ~/media/1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -s 640x480
-pix_fmt bgra -y /dev/null -an
before: 91fps
after:  160fps

Reviewed-by: Shiyou Yin <yinshiyou-hf@loongson.cn>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
pull/389/head
Lu Wang 2 years ago committed by Michael Niedermayer
parent f6077cc666
commit 4501b1dfd7
No known key found for this signature in database
GPG Key ID: B18E8928B3948D64
  1. 5
      libswscale/loongarch/Makefile
  2. 285
      libswscale/loongarch/input.S
  3. 138
      libswscale/loongarch/output.S
  4. 4
      libswscale/loongarch/output_lasx.c
  5. 1828
      libswscale/loongarch/output_lsx.c
  6. 1868
      libswscale/loongarch/swscale.S
  7. 32
      libswscale/loongarch/swscale_init_loongarch.c
  8. 43
      libswscale/loongarch/swscale_loongarch.h
  9. 57
      libswscale/loongarch/swscale_lsx.c
  10. 3
      libswscale/utils.c

@ -4,3 +4,8 @@ LASX-OBJS-$(CONFIG_SWSCALE) += loongarch/swscale_lasx.o \
loongarch/yuv2rgb_lasx.o \
loongarch/rgb2rgb_lasx.o \
loongarch/output_lasx.o
LSX-OBJS-$(CONFIG_SWSCALE) += loongarch/swscale.o \
loongarch/swscale_lsx.o \
loongarch/input.o \
loongarch/output.o \
loongarch/output_lsx.o

@ -0,0 +1,285 @@
/*
* Loongson LSX optimized swscale
*
* Copyright (c) 2023 Loongson Technology Corporation Limited
* Contributed by Lu Wang <wanglu@loongson.cn>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/loongarch/loongson_asm.S"
/* void planar_rgb_to_y_lsx(uint8_t *_dst, const uint8_t *src[4],
* int width, int32_t *rgb2yuv)
*/
function planar_rgb_to_y_lsx
ld.d a5, a1, 0
ld.d a6, a1, 8
ld.d a7, a1, 16
ld.w t1, a3, 0 // ry
ld.w t2, a3, 4 // gy
ld.w t3, a3, 8 // by
li.w t4, 9
li.w t5, 524544
li.w t7, 4
li.w t8, 8
vldi vr7, 0
vreplgr2vr.w vr1, t1
vreplgr2vr.w vr2, t2
vreplgr2vr.w vr3, t3
vreplgr2vr.w vr4, t4
vreplgr2vr.w vr5, t5
bge a2, t8, .WIDTH8
bge a2, t7, .WIDTH4
blt zero, a2, .WIDTH
b .END
.WIDTH8:
vld vr8, a5, 0
vld vr9, a6, 0
vld vr10, a7, 0
vilvl.b vr11, vr7, vr8
vilvl.b vr12, vr7, vr9
vilvl.b vr13, vr7, vr10
vilvl.h vr14, vr7, vr11
vilvl.h vr15, vr7, vr12
vilvl.h vr16, vr7, vr13
vilvh.h vr17, vr7, vr11
vilvh.h vr18, vr7, vr12
vilvh.h vr19, vr7, vr13
vmul.w vr20, vr1, vr16
vmul.w vr21, vr1, vr19
vmadd.w vr20, vr2, vr14
vmadd.w vr20, vr3, vr15
vmadd.w vr21, vr2, vr17
vmadd.w vr21, vr3, vr18
vadd.w vr20, vr20, vr5
vadd.w vr21, vr21, vr5
vsra.w vr20, vr20, vr4
vsra.w vr21, vr21, vr4
vpickev.h vr20, vr21, vr20
vst vr20, a0, 0
addi.d a2, a2, -8
addi.d a5, a5, 8
addi.d a6, a6, 8
addi.d a7, a7, 8
addi.d a0, a0, 16
bge a2, t8, .WIDTH8
bge a2, t7, .WIDTH4
blt zero, a2, .WIDTH
b .END
.WIDTH4:
vld vr8, a5, 0
vld vr9, a6, 0
vld vr10, a7, 0
vilvl.b vr11, vr7, vr8
vilvl.b vr12, vr7, vr9
vilvl.b vr13, vr7, vr10
vilvl.h vr14, vr7, vr11
vilvl.h vr15, vr7, vr12
vilvl.h vr16, vr7, vr13
vmul.w vr17, vr1, vr16
vmadd.w vr17, vr2, vr14
vmadd.w vr17, vr3, vr15
vadd.w vr17, vr17, vr5
vsra.w vr17, vr17, vr4
vpickev.h vr17, vr17, vr17
vstelm.d vr17, a0, 0, 0
addi.d a2, a2, -4
addi.d a5, a5, 4
addi.d a6, a6, 4
addi.d a7, a7, 4
addi.d a0, a0, 8
bge a2, t7, .WIDTH4
blt zero, a2, .WIDTH
b .END
.WIDTH:
ld.bu t0, a5, 0
ld.bu t4, a6, 0
ld.bu t6, a7, 0
mul.w t8, t6, t1
mul.w t7, t0, t2
add.w t8, t8, t7
mul.w t7, t4, t3
add.w t8, t8, t7
add.w t8, t8, t5
srai.w t8, t8, 9
st.h t8, a0, 0
addi.d a2, a2, -1
addi.d a5, a5, 1
addi.d a6, a6, 1
addi.d a7, a7, 1
addi.d a0, a0, 2
blt zero, a2, .WIDTH
.END:
endfunc
/* void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
* int width, int32_t *rgb2yuv)
*/
function planar_rgb_to_uv_lsx
addi.d sp, sp, -24
st.d s1, sp, 0
st.d s2, sp, 8
st.d s3, sp, 16
ld.d a5, a2, 0
ld.d a6, a2, 8
ld.d a7, a2, 16
ld.w t1, a4, 12 // ru
ld.w t2, a4, 16 // gu
ld.w t3, a4, 20 // bu
ld.w s1, a4, 24 // rv
ld.w s2, a4, 28 // gv
ld.w s3, a4, 32 // bv
li.w t4, 9
li.w t5, 4194560
li.w t7, 4
li.w t8, 8
vldi vr0, 0
vreplgr2vr.w vr1, t1
vreplgr2vr.w vr2, t2
vreplgr2vr.w vr3, t3
vreplgr2vr.w vr4, s1
vreplgr2vr.w vr5, s2
vreplgr2vr.w vr6, s3
vreplgr2vr.w vr7, t4
vreplgr2vr.w vr8, t5
bge a2, t8, .LOOP_WIDTH8
bge a2, t7, .LOOP_WIDTH4
blt zero, a2, .LOOP_WIDTH
b .LOOP_END
.LOOP_WIDTH8:
vld vr9, a5, 0
vld vr10, a6, 0
vld vr11, a7, 0
vilvl.b vr9, vr0, vr9
vilvl.b vr10, vr0, vr10
vilvl.b vr11, vr0, vr11
vilvl.h vr12, vr0, vr9
vilvl.h vr13, vr0, vr10
vilvl.h vr14, vr0, vr11
vilvh.h vr15, vr0, vr9
vilvh.h vr16, vr0, vr10
vilvh.h vr17, vr0, vr11
vmul.w vr18, vr1, vr14
vmul.w vr19, vr1, vr17
vmul.w vr20, vr4, vr14
vmul.w vr21, vr4, vr17
vmadd.w vr18, vr2, vr12
vmadd.w vr18, vr3, vr13
vmadd.w vr19, vr2, vr15
vmadd.w vr19, vr3, vr16
vmadd.w vr20, vr5, vr12
vmadd.w vr20, vr6, vr13
vmadd.w vr21, vr5, vr15
vmadd.w vr21, vr6, vr16
vadd.w vr18, vr18, vr8
vadd.w vr19, vr19, vr8
vadd.w vr20, vr20, vr8
vadd.w vr21, vr21, vr8
vsra.w vr18, vr18, vr7
vsra.w vr19, vr19, vr7
vsra.w vr20, vr20, vr7
vsra.w vr21, vr21, vr7
vpickev.h vr18, vr19, vr18
vpickev.h vr20, vr21, vr20
vst vr18, a0, 0
vst vr20, a1, 0
addi.d a3, a3, -8
addi.d a5, a5, 8
addi.d a6, a6, 8
addi.d a7, a7, 8
addi.d a0, a0, 16
addi.d a1, a1, 16
bge a3, t8, .LOOP_WIDTH8
bge a3, t7, .LOOP_WIDTH4
blt zero, a3, .LOOP_WIDTH
b .LOOP_END
.LOOP_WIDTH4:
vld vr9, a5, 0
vld vr10, a6, 0
vld vr11, a7, 0
vilvl.b vr9, vr0, vr9
vilvl.b vr10, vr0, vr10
vilvl.b vr11, vr0, vr11
vilvl.h vr12, vr0, vr9
vilvl.h vr13, vr0, vr10
vilvl.h vr14, vr0, vr11
vmul.w vr18, vr1, vr14
vmul.w vr19, vr4, vr14
vmadd.w vr18, vr2, vr12
vmadd.w vr18, vr3, vr13
vmadd.w vr19, vr5, vr12
vmadd.w vr19, vr6, vr13
vadd.w vr18, vr18, vr8
vadd.w vr19, vr19, vr8
vsra.w vr18, vr18, vr7
vsra.w vr19, vr19, vr7
vpickev.h vr18, vr18, vr18
vpickev.h vr19, vr19, vr19
vstelm.d vr18, a0, 0, 0
vstelm.d vr19, a1, 0, 0
addi.d a3, a3, -4
addi.d a5, a5, 4
addi.d a6, a6, 4
addi.d a7, a7, 4
addi.d a0, a0, 8
addi.d a1, a1, 8
bge a3, t7, .LOOP_WIDTH4
blt zero, a3, .LOOP_WIDTH
b .LOOP_END
.LOOP_WIDTH:
ld.bu t0, a5, 0
ld.bu t4, a6, 0
ld.bu t6, a7, 0
mul.w t8, t6, t1
mul.w t7, t0, t2
add.w t8, t8, t7
mul.w t7, t4, t3
add.w t8, t8, t7
add.w t8, t8, t5
srai.w t8, t8, 9
st.h t8, a0, 0
mul.w t8, t6, s1
mul.w t7, t0, s2
add.w t8, t8, t7
mul.w t7, t4, s3
add.w t8, t8, t7
add.w t8, t8, t5
srai.w t8, t8, 9
st.h t8, a1, 0
addi.d a3, a3, -1
addi.d a5, a5, 1
addi.d a6, a6, 1
addi.d a7, a7, 1
addi.d a0, a0, 2
addi.d a1, a1, 2
blt zero, a3, .LOOP_WIDTH
.LOOP_END:
ld.d s1, sp, 0
ld.d s2, sp, 8
ld.d s3, sp, 16
addi.d sp, sp, 24
endfunc

@ -0,0 +1,138 @@
/*
* Loongson LSX optimized swscale
*
* Copyright (c) 2023 Loongson Technology Corporation Limited
* Contributed by Lu Wang <wanglu@loongson.cn>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/loongarch/loongson_asm.S"
/* static void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
* const int16_t **src, uint8_t *dest, int dstW,
* const uint8_t *dither, int offset)
*/
function ff_yuv2planeX_8_lsx
addi.w t1, a6, 1
addi.w t2, a6, 2
addi.w t3, a6, 3
addi.w t4, a6, 4
addi.w t5, a6, 5
addi.w t6, a6, 6
addi.w t7, a6, 7
andi t0, a6, 7
andi t1, t1, 7
andi t2, t2, 7
andi t3, t3, 7
andi t4, t4, 7
andi t5, t5, 7
andi t6, t6, 7
andi t7, t7, 7
ldx.bu t0, a5, t0
ldx.bu t1, a5, t1
ldx.bu t2, a5, t2
ldx.bu t3, a5, t3
ldx.bu t4, a5, t4
ldx.bu t5, a5, t5
ldx.bu t6, a5, t6
ldx.bu t7, a5, t7
vreplgr2vr.w vr0, t0
vreplgr2vr.w vr1, t1
vreplgr2vr.w vr2, t2
vreplgr2vr.w vr3, t3
vreplgr2vr.w vr4, t4
vreplgr2vr.w vr5, t5
vreplgr2vr.w vr6, t6
vreplgr2vr.w vr7, t7
vilvl.w vr0, vr2, vr0
vilvl.w vr4, vr6, vr4
vilvl.w vr1, vr3, vr1
vilvl.w vr5, vr7, vr5
vilvl.d vr12, vr4, vr0
vilvl.d vr13, vr5, vr1
li.w t5, 0
li.w t8, 8
bge a4, t8, .WIDTH8
blt zero, a4, .WIDTH
b .END
.WIDTH8:
li.d t1, 0
li.d t4, 0
vslli.w vr2, vr12, 12
vslli.w vr3, vr13, 12
move t3, a0
.FILTERSIZE8:
ldx.d t2, a2, t1
vldx vr4, t2, t5
vldrepl.h vr5, t3, 0
vmaddwev.w.h vr2, vr4, vr5
vmaddwod.w.h vr3, vr4, vr5
addi.d t1, t1, 8
addi.d t3, t3, 2
addi.d t4, t4, 1
blt t4, a1, .FILTERSIZE8
vsrai.w vr2, vr2, 19
vsrai.w vr3, vr3, 19
vclip255.w vr2, vr2
vclip255.w vr3, vr3
vpickev.h vr2, vr3, vr2
vpickev.b vr2, vr2, vr2
vbsrl.v vr3, vr2, 4
vilvl.b vr2, vr3, vr2
fst.d f2, a3, 0
addi.d t5, t5, 16
addi.d a4, a4, -8
addi.d a3, a3, 8
bge a4, t8, .WIDTH8
blt zero, a4, .WIDTH
b .END
.WIDTH:
li.d t1, 0
li.d t4, 0
vslli.w vr2, vr12, 12
vslli.w vr3, vr13, 12
.FILTERSIZE:
ldx.d t2, a2, t1
vldx vr4, t2, t5
vldrepl.h vr5, a0, 0
vmaddwev.w.h vr2, vr4, vr5
vmaddwod.w.h vr3, vr4, vr5
addi.d t1, t1, 8
addi.d a0, a0, 2
addi.d t4, t4, 1
blt t4, a1, .FILTERSIZE
vsrai.w vr2, vr2, 19
vsrai.w vr3, vr3, 19
vclip255.w vr2, vr2
vclip255.w vr3, vr3
vpickev.h vr2, vr3, vr2
vpickev.b vr2, vr2, vr2
vbsrl.v vr3, vr2, 4
vilvl.b vr2, vr3, vr2
.DEST:
vstelm.b vr2, a3, 0, 0
vbsrl.v vr2, vr2, 1
addi.d a4, a4, -1
addi.d a3, a3, 1
blt zero, a4, .DEST
.END:
endfunc

@ -1773,11 +1773,9 @@ YUV2RGBWRAPPER(yuv2, rgb_full, bgr4_byte_full, AV_PIX_FMT_BGR4_BYTE, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, rgb4_byte_full, AV_PIX_FMT_RGB4_BYTE, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full, AV_PIX_FMT_BGR8, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full, AV_PIX_FMT_RGB8, 0)
#undef yuvTorgb
#undef yuvTorgb_setup
av_cold void ff_sws_init_output_loongarch(SwsContext *c)
av_cold void ff_sws_init_output_lasx(SwsContext *c)
{
if(c->flags & SWS_FULL_CHR_H_INT) {

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -27,8 +27,33 @@
av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (have_lsx(cpu_flags)) {
ff_sws_init_output_lsx(c);
if (c->srcBpc == 8) {
if (c->dstBpc <= 14) {
c->hyScale = c->hcScale = ff_hscale_8_to_15_lsx;
} else {
c->hyScale = c->hcScale = ff_hscale_8_to_19_lsx;
}
} else {
c->hyScale = c->hcScale = c->dstBpc > 14 ? ff_hscale_16_to_19_lsx
: ff_hscale_16_to_15_lsx;
}
switch (c->srcFormat) {
case AV_PIX_FMT_GBRAP:
case AV_PIX_FMT_GBRP:
{
c->readChrPlanar = planar_rgb_to_uv_lsx;
c->readLumPlanar = planar_rgb_to_y_lsx;
}
break;
}
if (c->dstBpc == 8)
c->yuv2planeX = ff_yuv2planeX_8_lsx;
}
#if HAVE_LASX
if (have_lasx(cpu_flags)) {
ff_sws_init_output_loongarch(c);
ff_sws_init_output_lasx(c);
if (c->srcBpc == 8) {
if (c->dstBpc <= 14) {
c->hyScale = c->hcScale = ff_hscale_8_to_15_lasx;
@ -51,17 +76,21 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
if (c->dstBpc == 8)
c->yuv2planeX = ff_yuv2planeX_8_lasx;
}
#endif // #if HAVE_LASX
}
av_cold void rgb2rgb_init_loongarch(void)
{
#if HAVE_LASX
int cpu_flags = av_get_cpu_flags();
if (have_lasx(cpu_flags))
interleaveBytes = ff_interleave_bytes_lasx;
#endif // #if HAVE_LASX
}
av_cold SwsFunc ff_yuv2rgb_init_loongarch(SwsContext *c)
{
#if HAVE_LASX
int cpu_flags = av_get_cpu_flags();
if (have_lasx(cpu_flags)) {
switch (c->dstFormat) {
@ -91,5 +120,6 @@ av_cold SwsFunc ff_yuv2rgb_init_loongarch(SwsContext *c)
return yuv420_abgr32_lasx;
}
}
#endif // #if HAVE_LASX
return NULL;
}

@ -24,7 +24,45 @@
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "config.h"
void ff_hscale_8_to_15_lsx(SwsContext *c, int16_t *dst, int dstW,
const uint8_t *src, const int16_t *filter,
const int32_t *filterPos, int filterSize);
void ff_hscale_8_to_19_lsx(SwsContext *c, int16_t *_dst, int dstW,
const uint8_t *src, const int16_t *filter,
const int32_t *filterPos, int filterSize);
void ff_hscale_16_to_15_lsx(SwsContext *c, int16_t *_dst, int dstW,
const uint8_t *_src, const int16_t *filter,
const int32_t *filterPos, int filterSize);
void ff_hscale_16_to_15_sub_lsx(SwsContext *c, int16_t *_dst, int dstW,
const uint8_t *_src, const int16_t *filter,
const int32_t *filterPos, int filterSize, int sh);
void ff_hscale_16_to_19_lsx(SwsContext *c, int16_t *_dst, int dstW,
const uint8_t *_src, const int16_t *filter,
const int32_t *filterPos, int filterSize);
void ff_hscale_16_to_19_sub_lsx(SwsContext *c, int16_t *_dst, int dstW,
const uint8_t *_src, const int16_t *filter,
const int32_t *filterPos, int filterSize, int sh);
void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
int width, int32_t *rgb2yuv, void *opq);
void planar_rgb_to_y_lsx(uint8_t *_dst, const uint8_t *src[4], int width,
int32_t *rgb2yuv, void *opq);
void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset);
av_cold void ff_sws_init_output_lsx(SwsContext *c);
#if HAVE_LASX
void ff_hscale_8_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
const uint8_t *src, const int16_t *filter,
const int32_t *filterPos, int filterSize);
@ -69,10 +107,11 @@ void ff_interleave_bytes_lasx(const uint8_t *src1, const uint8_t *src2,
uint8_t *dest, int width, int height,
int src1Stride, int src2Stride, int dstStride);
av_cold void ff_sws_init_output_loongarch(SwsContext *c);
void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset);
av_cold void ff_sws_init_output_lasx(SwsContext *c);
#endif // #if HAVE_LASX
#endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */

@ -0,0 +1,57 @@
/*
* Loongson LSX optimized swscale
*
* Copyright (c) 2023 Loongson Technology Corporation Limited
* Contributed by Lu Wang <wanglu@loongson.cn>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "swscale_loongarch.h"
void ff_hscale_16_to_15_lsx(SwsContext *c, int16_t *_dst, int dstW,
const uint8_t *_src, const int16_t *filter,
const int32_t *filterPos, int filterSize)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
int sh = desc->comp[0].depth - 1;
if (sh < 15) {
sh = isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8 ? 13 :
(desc->comp[0].depth - 1);
} else if (desc->flags && AV_PIX_FMT_FLAG_FLOAT) {
sh = 15;
}
ff_hscale_16_to_15_sub_lsx(c, _dst, dstW, _src, filter, filterPos, filterSize, sh);
}
void ff_hscale_16_to_19_lsx(SwsContext *c, int16_t *_dst, int dstW,
const uint8_t *_src, const int16_t *filter,
const int32_t *filterPos, int filterSize)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
int bits = desc->comp[0].depth - 1;
int sh = bits - 4;
if ((isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8) && desc->comp[0].depth<16) {
sh = 9;
} else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* float input are process like uint 16bpc */
sh = 16 - 1 - 4;
}
ff_hscale_16_to_19_sub_lsx(c, _dst, dstW, _src, filter, filterPos, filterSize, sh);
}

@ -653,7 +653,7 @@ static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos,
filterAlign = 1;
}
if (have_lasx(cpu_flags)) {
if (have_lasx(cpu_flags) || have_lsx(cpu_flags)) {
int reNum = minFilterSize & (0x07);
if (minFilterSize < 5)
@ -1806,6 +1806,7 @@ static av_cold int sws_init_single_context(SwsContext *c, SwsFilter *srcFilter,
const int filterAlign = X86_MMX(cpu_flags) ? 4 :
PPC_ALTIVEC(cpu_flags) ? 8 :
have_neon(cpu_flags) ? 4 :
have_lsx(cpu_flags) ? 8 :
have_lasx(cpu_flags) ? 8 : 1;
if ((ret = initFilter(&c->hLumFilter, &c->hLumFilterPos,

Loading…
Cancel
Save