@@ -1,4 +1,4 @@
; /*
; ******************************************************************************
; * Provide SIMD optimizations for add_residual functions for HEVC decoding
; * Copyright (c) 2014 Pierre-Edouard LEPERE
; *
@@ -17,7 +17,8 @@
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; */
; ******************************************************************************
% include "libavutil/x86/x86util.asm"
SECTION .text
@@ -25,9 +26,8 @@ SECTION .text
cextern pw_1023
%define max_pixels_10 pw_1023
;the tr_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
%macro TR_ADD_MMX_4_8 0
; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
%macro ADD_RES_MMX_4_8 0
    mova m2, [r1]
    mova m4, [r1+8]
    pxor m3, m3
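For reference, every 8-bit hevc_add_residual function touched below computes the same operation: it adds a contiguous block of int16_t residuals to the destination pixels and clips the result to 0..255 before storing it back. A minimal scalar sketch of that operation in C (illustrative names, not the FFmpeg prototypes; it only shows the semantics the SIMD macros implement):

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar reference for the 8-bit case: res holds size*size coefficients
     * stored row after row, dst points into the reconstructed frame. */
    static void add_residual_8bit(uint8_t *dst, const int16_t *res,
                                  ptrdiff_t stride, int size)
    {
        for (int y = 0; y < size; y++) {
            for (int x = 0; x < size; x++) {
                int v = dst[x] + res[x];
                dst[x] = v < 0 ? 0 : (v > 255 ? 255 : v);
            }
            dst += stride;
            res += size;
        }
    }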
@@ -51,15 +51,15 @@ cextern pw_1023
INIT_MMX mmxext
; void ff_hevc_tranform_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
cglobal hevc_add_residual4_8, 3, 4, 6
    TR_ADD_MMX_4_8
; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_4_8, 3, 4, 6
    ADD_RES_MMX_4_8
    add r1, 16
    lea r0, [r0+r2*2]
    TR_ADD_MMX_4_8
    ADD_RES_MMX_4_8
    RET
%macro TR_ADD_SSE_8_8 0
%macro ADD_RES_SSE_8_8 0
    pxor m3, m3
    mova m4, [r1]
    mova m6, [r1+16]
@@ -88,7 +88,7 @@ cglobal hevc_add_residual4_8, 3, 4, 6
    movhps [r0+r3], m1
%endmacro
%macro TR_ADD_SSE_16_32_8 3
%macro ADD_RES_SSE_16_32_8 3
    mova xm2, [r1+%1]
    mova xm6, [r1+%1+16]
%if cpuflag(avx2)
@@ -135,39 +135,39 @@ cglobal hevc_add_residual4_8, 3, 4, 6
%macro TRANSFORM_ADD_8 0
; void ff_hevc_add_residual8_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
cglobal hevc_add_residual8_8, 3, 4, 8
; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_8_8, 3, 4, 8
    lea r3, [r2*3]
    TR_ADD_SSE_8_8
    ADD_RES_SSE_8_8
    add r1, 64
    lea r0, [r0+r2*4]
    TR_ADD_SSE_8_8
    ADD_RES_SSE_8_8
    RET
; void ff_hevc_add_residual16_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
cglobal hevc_add_residual16_8, 3, 4, 7
; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_16_8, 3, 4, 7
    pxor m0, m0
    lea r3, [r2*3]
    TR_ADD_SSE_16_32_8 0, r0, r0+r2
    TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
    ADD_RES_SSE_16_32_8 0, r0, r0+r2
    ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
%rep 3
    add r1, 128
    lea r0, [r0+r2*4]
    TR_ADD_SSE_16_32_8 0, r0, r0+r2
    TR_ADD_SSE_16_32_8 64, r0+r2*2, r0+r3
    ADD_RES_SSE_16_32_8 0, r0, r0+r2
    ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
%endrep
    RET
; void ff_hevc_add_residual32_8_<opt>(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
cglobal hevc_add_residual32_8, 3, 4, 7
; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_32_8, 3, 4, 7
    pxor m0, m0
    TR_ADD_SSE_16_32_8 0, r0, r0+16
    TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
    ADD_RES_SSE_16_32_8 0, r0, r0+16
    ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16
%rep 15
    add r1, 128
    lea r0, [r0+r2*2]
    TR_ADD_SSE_16_32_8 0, r0, r0+16
    TR_ADD_SSE_16_32_8 64, r0+r2, r0+r2+16
    ADD_RES_SSE_16_32_8 0, r0, r0+16
    ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16
%endrep
    RET
%endmacro
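As a worked example of the block walk above, the 16x16 8-bit function handles four rows per step: each ADD_RES_SSE_16_32_8 (TR_ADD_SSE_16_32_8 before the rename) invocation covers two 16-pixel rows, the coefficient pointer then advances by 128 bytes (4 rows x 16 int16_t) and the destination by four lines, and %rep 3 repeats that for the remaining 12 rows. A loop-structure sketch of the same walk (scalar inner loop, illustrative only):

    #include <stddef.h>
    #include <stdint.h>

    /* Structure of the 16x16 8-bit walk: one initial pass plus "%rep 3",
     * four rows per step, coefficients consumed linearly, destination
     * advanced by four lines of the frame stride each step. */
    static void add_residual_16x16_sketch(uint8_t *dst, const int16_t *res,
                                          ptrdiff_t stride)
    {
        for (int step = 0; step < 4; step++) {
            for (int row = 0; row < 4; row++) {
                uint8_t       *d = dst + row * stride;
                const int16_t *r = res + row * 16;
                for (int x = 0; x < 16; x++) {
                    int v = d[x] + r[x];
                    d[x] = v < 0 ? 0 : (v > 255 ? 255 : v);
                }
            }
            dst += stride * 4; /* lea r0, [r0+r2*4] */
            res += 64;         /* add r1, 128 bytes = 64 coefficients */
        }
    }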
@@ -179,25 +179,22 @@ TRANSFORM_ADD_8
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
; void ff_hevc_add_residual32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
cglobal hevc_add_residual32_8, 3, 4, 7
; void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_32_8, 3, 4, 7
    pxor m0, m0
    lea r3, [r2*3]
    TR_ADD_SSE_16_32_8 0, r0, r0+r2
    TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
    ADD_RES_SSE_16_32_8 0, r0, r0+r2
    ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
%rep 7
    add r1, 256
    lea r0, [r0+r2*4]
    TR_ADD_SSE_16_32_8 0, r0, r0+r2
    TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
    ADD_RES_SSE_16_32_8 0, r0, r0+r2
    ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
%endrep
    RET
%endif
;-----------------------------------------------------------------------------
; void ff_hevc_add_residual_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
%macro TR_ADD_SSE_8_10 4
%macro ADD_RES_SSE_8_10 4
    mova m0, [%4]
    mova m1, [%4+16]
    mova m2, [%4+32]
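The 10-bit variants below follow the same structure but operate on 16-bit pixels and clamp the sum against max_pixels_10, the pw_1023 constant (1023) that the functions load into a register. A scalar sketch of the 10-bit operation (illustrative names; stride is counted in uint16_t elements in this sketch to keep it simple):

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar reference for the 10-bit case: pixels are uint16_t and the sum
     * is clipped to [0, 1023], matching the max_pixels_10 (pw_1023) bound
     * used by the SIMD code. */
    static void add_residual_10bit(uint16_t *dst, const int16_t *res,
                                   ptrdiff_t stride, int size)
    {
        for (int y = 0; y < size; y++) {
            for (int x = 0; x < size; x++) {
                int v = dst[x] + res[x];
                dst[x] = v < 0 ? 0 : (v > 1023 ? 1023 : v);
            }
            dst += stride;
            res += size;
        }
    }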
@@ -216,7 +213,7 @@ cglobal hevc_add_residual32_8, 3, 4, 7
    mova [%1+%3], m3
%endmacro
%macro TR_ADD_MMX4_10 3
%macro ADD_RES_MMX_4_10 3
    mova m0, [%1+0]
    mova m1, [%1+%2]
    paddw m0, [%3]
@@ -227,7 +224,7 @@ cglobal hevc_add_residual32_8, 3, 4, 7
    mova [%1+%2], m1
%endmacro
%macro TRANS_ADD_SSE_16_10 3
%macro ADD_RES_SSE_16_10 3
    mova m0, [%3]
    mova m1, [%3+16]
    mova m2, [%3+32]
@@ -246,7 +243,7 @@ cglobal hevc_add_residual32_8, 3, 4, 7
    mova [%1+%2+16], m3
%endmacro
%macro TRANS_ADD_SSE_32_10 2
%macro ADD_RES_SSE_32_10 2
    mova m0, [%2]
    mova m1, [%2+16]
    mova m2, [%2+32]
@@ -266,7 +263,7 @@ cglobal hevc_add_residual32_8, 3, 4, 7
    mova [%1+48], m3
%endmacro
%macro TRANS_ADD16_AVX2 4
%macro ADD_RES_AVX2_16_10 4
    mova m0, [%4]
    mova m1, [%4+32]
    mova m2, [%4+64]
@@ -287,7 +284,7 @@ cglobal hevc_add_residual32_8, 3, 4, 7
    mova [%1+%3], m3
%endmacro
%macro TRANS_ADD32_AVX2 3
%macro ADD_RES_AVX2_32_10 3
    mova m0, [%3]
    mova m1, [%3+32]
    mova m2, [%3+64]
@@ -308,81 +305,77 @@ cglobal hevc_add_residual32_8, 3, 4, 7
    mova [%1+%2+32], m3
%endmacro
; void ff_hevc_add_residual_<4|8|16|32>_10(pixel *dst, int16_t *block, ptrdiff_t stride)
INIT_MMX mmxext
cglobal hevc_add_residual4_10, 3, 4, 6
cglobal hevc_add_residual_4_10, 3, 4, 6
    pxor m2, m2
    mova m3, [max_pixels_10]
    TR_ADD_MMX4_10 r0, r2, r1
    ADD_RES_MMX_4_10 r0, r2, r1
    add r1, 16
    lea r0, [r0+2*r2]
    TR_ADD_MMX4_10 r0, r2, r1
    ADD_RES_MMX_4_10 r0, r2, r1
    RET
;-----------------------------------------------------------------------------
; void ff_hevc_add_residual_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal hevc_add_residual8_10, 3, 4, 6
cglobal hevc_add_residual_8_10, 3, 4, 6
    pxor m4, m4
    mova m5, [max_pixels_10]
    lea r3, [r2*3]
    TR_ADD_SSE_8_10 r0, r2, r3, r1
    ADD_RES_SSE_8_10 r0, r2, r3, r1
    lea r0, [r0+r2*4]
    add r1, 64
    TR_ADD_SSE_8_10 r0, r2, r3, r1
    ADD_RES_SSE_8_10 r0, r2, r3, r1
    RET
cglobal hevc_add_residual16_10, 3, 4, 6
cglobal hevc_add_residual_16_10, 3, 4, 6
    pxor m4, m4
    mova m5, [max_pixels_10]
    TRANS_ADD_SSE_16_10 r0, r2, r1
    ADD_RES_SSE_16_10 r0, r2, r1
%rep 7
    lea r0, [r0+r2*2]
    add r1, 64
    TRANS_ADD_SSE_16_10 r0, r2, r1
    ADD_RES_SSE_16_10 r0, r2, r1
%endrep
    RET
cglobal hevc_add_residual32_10, 3, 4, 6
cglobal hevc_add_residual_32_10, 3, 4, 6
    pxor m4, m4
    mova m5, [max_pixels_10]
    TRANS_ADD_SSE_32_10 r0, r1
    ADD_RES_SSE_32_10 r0, r1
%rep 31
    lea r0, [r0+r2]
    add r1, 64
    TRANS_ADD_SSE_32_10 r0, r1
    ADD_RES_SSE_32_10 r0, r1
%endrep
    RET
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal hevc_add_residual16_10, 3, 4, 6
cglobal hevc_add_residual_16_10, 3, 4, 6
    pxor m4, m4
    mova m5, [max_pixels_10]
    lea r3, [r2*3]
    TRANS_ADD16_AVX2 r0, r2, r3, r1
    ADD_RES_AVX2_16_10 r0, r2, r3, r1
%rep 3
    lea r0, [r0+r2*4]
    add r1, 128
    TRANS_ADD16_AVX2 r0, r2, r3, r1
    ADD_RES_AVX2_16_10 r0, r2, r3, r1
%endrep
    RET
cglobal hevc_add_residual32_10, 3, 4, 6
cglobal hevc_add_residual_32_10, 3, 4, 6
    pxor m4, m4
    mova m5, [max_pixels_10]
    TRANS_ADD32_AVX2 r0, r2, r1
    ADD_RES_AVX2_32_10 r0, r2, r1
%rep 15
    lea r0, [r0+r2*2]
    add r1, 128
    TRANS_ADD32_AVX2 r0, r2, r1
    ADD_RES_AVX2_32_10 r0, r2, r1
%endrep
    RET
%endif ; HAVE_AVX_EXTERNAL
%endif ; HAVE_AVX2_EXTERNAL