@@ -1,4 +1,4 @@
-; /*
+; *****************************************************************************
 ; * Provide SIMD optimizations for add_residual functions for HEVC decoding
 ; * Copyright (c) 2014 Pierre-Edouard LEPERE
 ; *
@@ -17,7 +17,8 @@
 ; * You should have received a copy of the GNU Lesser General Public
 ; * License along with FFmpeg; if not, write to the Free Software
 ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-; */
+; ******************************************************************************
 %include "libavutil/x86/x86util.asm"
 SECTION .text
@@ -25,9 +26,8 @@ SECTION .text
 cextern pw_1023
 %define max_pixels_10 pw_1023
-;the tr_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
-%macro TR_ADD_MMX_4_8 0
+; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
+%macro ADD_RES_MMX_4_8 0
     mova        m2, [r1]
     mova        m4, [r1+8]
     pxor        m3, m3
@@ -39,27 +39,27 @@ cextern pw_1023
     packuswb    m4, m4
     packuswb    m5, m5
-    movh        m0, [r0]
-    movh        m1, [r0+r2]
+    movh        m0, [r0]
+    movh        m1, [r0+r2]
     paddusb     m0, m2
     paddusb     m1, m4
     psubusb     m0, m3
     psubusb     m1, m5
-    movh        [r0], m0
-    movh        [r0+r2], m1
+    movh        [r0], m0
+    movh        [r0+r2], m1
 %endmacro
 INIT_MMX mmxext
-; void ff_hevc_tranform_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_add_residual4_8, 3, 4, 6
-    TR_ADD_MMX_4_8
+; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_4_8, 3, 4, 6
+    ADD_RES_MMX_4_8
     add         r1, 16
     lea         r0, [r0+r2*2]
-    TR_ADD_MMX_4_8
+    ADD_RES_MMX_4_8
     RET
-%macro TR_ADD_SSE_8_8 0
+%macro ADD_RES_SSE_8_8 0
     pxor        m3, m3
     mova        m4, [r1]
     mova        m6, [r1+16]
@@ -74,22 +74,22 @@ cglobal hevc_add_residual4_8, 3, 4, 6
     packuswb    m6, m2
     packuswb    m7, m3
-    movq        m0, [r0]
-    movq        m1, [r0+r2]
-    movhps      m0, [r0+r2*2]
-    movhps      m1, [r0+r3]
-    paddusb     m0, m4
-    paddusb     m1, m6
-    psubusb     m0, m5
-    psubusb     m1, m7
-    movq        [r0], m0
-    movq        [r0+r2], m1
-    movhps      [r0+2*r2], m0
-    movhps      [r0+r3], m1
+    movq        m0, [r0]
+    movq        m1, [r0+r2]
+    movhps      m0, [r0+r2*2]
+    movhps      m1, [r0+r3]
+    paddusb     m0, m4
+    paddusb     m1, m6
+    psubusb     m0, m5
+    psubusb     m1, m7
+    movq        [r0], m0
+    movq        [r0+r2], m1
+    movhps      [r0+2*r2], m0
+    movhps      [r0+r3], m1
 %endmacro
-%macro TR_ADD_SSE_16_32_8 3
-    mova        xm2, [r1+%1]
+%macro ADD_RES_SSE_16_32_8 3
+    mova        xm2, [r1+%1]
     mova        xm6, [r1+%1+16]
 %if cpuflag(avx2)
     vinserti128 m2, m2, [r1+%1+32], 1
@@ -107,7 +107,7 @@ cglobal hevc_add_residual4_8, 3, 4, 6
     packuswb    m2, m6
     packuswb    m1, m5
-    mova        xm4, [r1+%1+mmsize*2]
+    mova        xm4, [r1+%1+mmsize*2]
     mova        xm6, [r1+%1+mmsize*2+16]
 %if cpuflag(avx2)
     vinserti128 m4, m4, [r1+%1+96], 1
@@ -135,39 +135,39 @@ cglobal hevc_add_residual4_8, 3, 4, 6
 %macro TRANSFORM_ADD_8 0
-; void ff_hevc_add_residual8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_add_residual8_8, 3, 4, 8
+; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_8_8, 3, 4, 8
     lea         r3, [r2*3]
-    TR_ADD_SSE_8_8
+    ADD_RES_SSE_8_8
     add         r1, 64
     lea         r0, [r0+r2*4]
-    TR_ADD_SSE_8_8
+    ADD_RES_SSE_8_8
     RET
-; void ff_hevc_add_residual16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_add_residual16_8, 3, 4, 7
-    pxor        m0, m0
-    lea         r3, [r2*3]
-    TR_ADD_SSE_16_32_8 0, r0, r0+r2
-    TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
+; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_16_8, 3, 4, 7
+    pxor        m0, m0
+    lea         r3, [r2*3]
+    ADD_RES_SSE_16_32_8 0, r0, r0+r2
+    ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
 %rep 3
-    add         r1, 128
-    lea         r0, [r0+r2*4]
-    TR_ADD_SSE_16_32_8 0, r0, r0+r2
-    TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
+    add         r1, 128
+    lea         r0, [r0+r2*4]
+    ADD_RES_SSE_16_32_8 0, r0, r0+r2
+    ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
 %endrep
     RET
-; void ff_hevc_add_residual32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_add_residual32_8, 3, 4, 7
-    pxor        m0, m0
-    TR_ADD_SSE_16_32_8 0, r0, r0+16
-    TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
+; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_32_8, 3, 4, 7
+    pxor        m0, m0
+    ADD_RES_SSE_16_32_8 0, r0, r0+16
+    ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16
 %rep 15
-    add         r1, 128
-    lea         r0, [r0+r2*2]
-    TR_ADD_SSE_16_32_8 0, r0, r0+16
-    TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
+    add         r1, 128
+    lea         r0, [r0+r2*2]
+    ADD_RES_SSE_16_32_8 0, r0, r0+16
+    ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16
 %endrep
     RET
 %endmacro
@@ -179,80 +179,77 @@ TRANSFORM_ADD_8
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
-; void ff_hevc_add_residual32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_add_residual32_8, 3, 4, 7
-    pxor        m0, m0
-    lea         r3, [r2*3]
-    TR_ADD_SSE_16_32_8 0, r0, r0+r2
-    TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
+; void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
+cglobal hevc_add_residual_32_8, 3, 4, 7
+    pxor        m0, m0
+    lea         r3, [r2*3]
+    ADD_RES_SSE_16_32_8 0, r0, r0+r2
+    ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
 %rep 7
-    add         r1, 256
-    lea         r0, [r0+r2*4]
-    TR_ADD_SSE_16_32_8 0, r0, r0+r2
-    TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
+    add         r1, 256
+    lea         r0, [r0+r2*4]
+    ADD_RES_SSE_16_32_8 0, r0, r0+r2
+    ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
 %endrep
     RET
 %endif
-;-----------------------------------------------------------------------------
-; void ff_hevc_add_residual_10(pixel *dst, int16_t *block, int stride)
-;-----------------------------------------------------------------------------
-%macro TR_ADD_SSE_8_10 4
+%macro ADD_RES_SSE_8_10 4
     mova        m0, [%4]
     mova        m1, [%4+16]
     mova        m2, [%4+32]
     mova        m3, [%4+48]
-    paddw       m0, [%1+0]
-    paddw       m1, [%1+%2]
+    paddw       m0, [%1+0]
+    paddw       m1, [%1+%2]
     paddw       m2, [%1+%2*2]
-    paddw       m3, [%1+%3]
+    paddw       m3, [%1+%3]
     CLIPW       m0, m4, m5
     CLIPW       m1, m4, m5
     CLIPW       m2, m4, m5
     CLIPW       m3, m4, m5
-    mova        [%1+0], m0
-    mova        [%1+%2], m1
+    mova        [%1+0], m0
+    mova        [%1+%2], m1
     mova        [%1+%2*2], m2
-    mova        [%1+%3], m3
+    mova        [%1+%3], m3
 %endmacro
-%macro TR_ADD_MMX4_10 3
-    mova        m0, [%1+0]
-    mova        m1, [%1+%2]
+%macro ADD_RES_MMX_4_10 3
+    mova        m0, [%1+0]
+    mova        m1, [%1+%2]
     paddw       m0, [%3]
     paddw       m1, [%3+8]
     CLIPW       m0, m2, m3
     CLIPW       m1, m2, m3
-    mova        [%1+0], m0
-    mova        [%1+%2], m1
+    mova        [%1+0], m0
+    mova        [%1+%2], m1
 %endmacro
-%macro TRANS_ADD_SSE_16_10 3
+%macro ADD_RES_SSE_16_10 3
     mova        m0, [%3]
     mova        m1, [%3+16]
     mova        m2, [%3+32]
     mova        m3, [%3+48]
-    paddw       m0, [%1]
-    paddw       m1, [%1+16]
-    paddw       m2, [%1+%2]
+    paddw       m0, [%1]
+    paddw       m1, [%1+16]
+    paddw       m2, [%1+%2]
     paddw       m3, [%1+%2+16]
     CLIPW       m0, m4, m5
     CLIPW       m1, m4, m5
     CLIPW       m2, m4, m5
     CLIPW       m3, m4, m5
-    mova        [%1], m0
-    mova        [%1+16], m1
-    mova        [%1+%2], m2
+    mova        [%1], m0
+    mova        [%1+16], m1
+    mova        [%1+%2], m2
     mova        [%1+%2+16], m3
 %endmacro
-%macro TRANS_ADD_SSE_32_10 2
+%macro ADD_RES_SSE_32_10 2
     mova        m0, [%2]
     mova        m1, [%2+16]
     mova        m2, [%2+32]
     mova        m3, [%2+48]
-    paddw       m0, [%1]
+    paddw       m0, [%1]
     paddw       m1, [%1+16]
     paddw       m2, [%1+32]
     paddw       m3, [%1+48]
@@ -260,129 +257,125 @@ cglobal hevc_add_residual32_8, 3, 4, 7
     CLIPW       m1, m4, m5
     CLIPW       m2, m4, m5
     CLIPW       m3, m4, m5
-    mova        [%1], m0
+    mova        [%1], m0
     mova        [%1+16], m1
     mova        [%1+32], m2
     mova        [%1+48], m3
 %endmacro
-%macro TRANS_ADD16_AVX2 4
+%macro ADD_RES_AVX2_16_10 4
     mova        m0, [%4]
     mova        m1, [%4+32]
     mova        m2, [%4+64]
     mova        m3, [%4+96]
-    paddw       m0, [%1+0]
-    paddw       m1, [%1+%2]
+    paddw       m0, [%1+0]
+    paddw       m1, [%1+%2]
     paddw       m2, [%1+%2*2]
-    paddw       m3, [%1+%3]
+    paddw       m3, [%1+%3]
     CLIPW       m0, m4, m5
     CLIPW       m1, m4, m5
     CLIPW       m2, m4, m5
     CLIPW       m3, m4, m5
-    mova        [%1+0], m0
-    mova        [%1+%2], m1
+    mova        [%1+0], m0
+    mova        [%1+%2], m1
     mova        [%1+%2*2], m2
-    mova        [%1+%3], m3
+    mova        [%1+%3], m3
 %endmacro
-%macro TRANS_ADD32_AVX2 3
+%macro ADD_RES_AVX2_32_10 3
     mova        m0, [%3]
     mova        m1, [%3+32]
     mova        m2, [%3+64]
     mova        m3, [%3+96]
-    paddw       m0, [%1]
-    paddw       m1, [%1+32]
-    paddw       m2, [%1+%2]
+    paddw       m0, [%1]
+    paddw       m1, [%1+32]
+    paddw       m2, [%1+%2]
     paddw       m3, [%1+%2+32]
     CLIPW       m0, m4, m5
     CLIPW       m1, m4, m5
     CLIPW       m2, m4, m5
     CLIPW       m3, m4, m5
-    mova        [%1], m0
-    mova        [%1+32], m1
-    mova        [%1+%2], m2
+    mova        [%1], m0
+    mova        [%1+32], m1
+    mova        [%1+%2], m2
     mova        [%1+%2+32], m3
 %endmacro
+; void ff_hevc_add_residual_<4|8|16|32>_10(pixel *dst, int16_t *block, ptrdiff_t stride)
 INIT_MMX mmxext
-cglobal hevc_add_residual4_10, 3, 4, 6
+cglobal hevc_add_residual_4_10, 3, 4, 6
     pxor        m2, m2
     mova        m3, [max_pixels_10]
-    TR_ADD_MMX4_10 r0, r2, r1
+    ADD_RES_MMX_4_10 r0, r2, r1
     add         r1, 16
     lea         r0, [r0+2*r2]
-    TR_ADD_MMX4_10 r0, r2, r1
+    ADD_RES_MMX_4_10 r0, r2, r1
     RET
-;-----------------------------------------------------------------------------
-; void ff_hevc_add_residual_10(pixel *dst, int16_t *block, int stride)
-;-----------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal hevc_add_residual8_10, 3, 4, 6
+cglobal hevc_add_residual_8_10, 3, 4, 6
     pxor        m4, m4
     mova        m5, [max_pixels_10]
     lea         r3, [r2*3]
-    TR_ADD_SSE_8_10 r0, r2, r3, r1
+    ADD_RES_SSE_8_10 r0, r2, r3, r1
     lea         r0, [r0+r2*4]
     add         r1, 64
-    TR_ADD_SSE_8_10 r0, r2, r3, r1
+    ADD_RES_SSE_8_10 r0, r2, r3, r1
     RET
-cglobal hevc_add_residual16_10, 3, 4, 6
+cglobal hevc_add_residual_16_10, 3, 4, 6
     pxor        m4, m4
     mova        m5, [max_pixels_10]
-    TRANS_ADD_SSE_16_10 r0, r2, r1
+    ADD_RES_SSE_16_10 r0, r2, r1
 %rep 7
-    lea         r0, [r0+r2*2]
-    add         r1, 64
-    TRANS_ADD_SSE_16_10 r0, r2, r1
+    lea         r0, [r0+r2*2]
+    add         r1, 64
+    ADD_RES_SSE_16_10 r0, r2, r1
 %endrep
     RET
-cglobal hevc_add_residual32_10, 3, 4, 6
+cglobal hevc_add_residual_32_10, 3, 4, 6
     pxor        m4, m4
     mova        m5, [max_pixels_10]
-    TRANS_ADD_SSE_32_10 r0, r1
+    ADD_RES_SSE_32_10 r0, r1
 %rep 31
-    lea         r0, [r0+r2]
-    add         r1, 64
-    TRANS_ADD_SSE_32_10 r0, r1
+    lea         r0, [r0+r2]
+    add         r1, 64
+    ADD_RES_SSE_32_10 r0, r1
 %endrep
     RET
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
+cglobal hevc_add_residual_16_10, 3, 4, 6
+    pxor        m4, m4
+    mova        m5, [max_pixels_10]
+    lea         r3, [r2*3]
-cglobal hevc_add_residual16_10, 3, 4, 6
-    pxor        m4, m4
-    mova        m5, [max_pixels_10]
-    lea         r3, [r2*3]
-    TRANS_ADD16_AVX2 r0, r2, r3, r1
+    ADD_RES_AVX2_16_10 r0, r2, r3, r1
 %rep 3
-    lea         r0, [r0+r2*4]
-    add         r1, 128
-    TRANS_ADD16_AVX2 r0, r2, r3, r1
+    lea         r0, [r0+r2*4]
+    add         r1, 128
+    ADD_RES_AVX2_16_10 r0, r2, r3, r1
 %endrep
     RET
-cglobal hevc_add_residual32_10, 3, 4, 6
-    pxor        m4, m4
-    mova        m5, [max_pixels_10]
+cglobal hevc_add_residual_32_10, 3, 4, 6
+    pxor        m4, m4
+    mova        m5, [max_pixels_10]
-    TRANS_ADD32_AVX2 r0, r2, r1
+    ADD_RES_AVX2_32_10 r0, r2, r1
 %rep 15
-    lea         r0, [r0+r2*2]
-    add         r1, 128
-    TRANS_ADD32_AVX2 r0, r2, r1
+    lea         r0, [r0+r2*2]
+    add         r1, 128
+    ADD_RES_AVX2_32_10 r0, r2, r1
 %endrep
     RET
-%endif ; HAVE_AVX_EXTERNAL
+%endif ; HAVE_AVX2_EXTERNAL
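
For reference, every function touched by this patch performs the same operation: it adds the inverse-transform residual block to the pixels already in dst and clips the result to the valid sample range (0..255 for the 8-bit paths, 0..1023 for the 10-bit paths, hence the pw_1023 constant and CLIPW). The 8-bit kernels get the clipping for free by splitting the signed residual into a positive and a negative half and applying unsigned saturating paddusb/psubusb. The scalar C sketch below illustrates the 8-bit case only; it is not FFmpeg source, and the function name add_residual_8_c is made up for this example.

#include <stdint.h>
#include <stddef.h>

/* Illustrative scalar equivalent of the 8-bit kernels above:
 * dst[x] = clip_uint8(dst[x] + res[x]) over a size x size block. */
static void add_residual_8_c(uint8_t *dst, const int16_t *res,
                             ptrdiff_t stride, int size)
{
    for (int y = 0; y < size; y++) {
        for (int x = 0; x < size; x++) {
            int v = dst[x] + res[x];
            dst[x] = v < 0 ? 0 : v > 255 ? 255 : v; /* clip to 8-bit range */
        }
        dst += stride; /* destination rows are stride bytes apart      */
        res += size;   /* residuals are stored contiguously, row-major */
    }
}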