;******************************************************************************
;* x86-optimized functions for the CFHD decoder
;* Copyright (c) 2020 Paul B Mahol
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Multiplier pairs for pmaddwd: with two signed words (a, b) interleaved in a
; 32-bit lane, the lane result is a * first + b * second.
factor_p1_n1:  times 4 dw  1, -1
factor_n1_p1:  times 4 dw -1,  1
factor_p11_n4: times 4 dw 11, -4
factor_p5_p4:  times 4 dw  5,  4
pd_4:          times 4 dd 4       ; rounding term added before the >> 3
pw_1:          times 8 dw 1       ; pmaddwd by this sums each word pair
pw_0:          times 8 dw 0       ; lower clip bound
pw_1023:       times 8 dw 1023    ; upper clip bound of the clip10 variant
pw_4095:       times 8 dw 4095    ; upper clip bound of the clip12 variant

SECTION .text

;------------------------------------------------------------------------------
; CFHD_HORIZ_FILTER clip_max
;
; Emits one horizontal reconstruction filter combining a low-pass and a
; high-pass row of signed 16-bit samples into an output row twice as long.
;
;   clip_max == 0    -> cfhd_horiz_filter
;                       (output, ostride, low, lwidth, high, hwidth, width,
;                        height); loops over height rows, no clipping.
;   clip_max == 1023 -> cfhd_horiz_filter_clip10 (output, low, high, width)
;   clip_max == 4095 -> cfhd_horiz_filter_clip12 (output, low, high, width)
;                       one row only, every result clamped to [0, clip_max].
;
; width and the strides are doubled on entry and from then on used as byte
; offsets into the word arrays.
;
; Per row, with n = width (in samples):
;   out[0]      = (((11*low[0] - 4*low[1] + low[2] + 4) >> 3) + high[0]) >> 1
;   out[1]      = (((5*low[0] + 4*low[1] - low[2] + 4) >> 3) - high[0]) >> 1
;   out[2i + 2] = (low[i+1] + high[i+1] + ((low[i] - low[i+2] + 4) >> 3)) >> 1
;   out[2i + 3] = (low[i+1] - high[i+1] + ((low[i+2] - low[i] + 4) >> 3)) >> 1
;   out[2n - 2] = (((5*low[n-1] + 4*low[n-2] - low[n-3] + 4) >> 3)
;                  + high[n-1]) >> 1
;   out[2n - 1] = (((11*low[n-1] - 4*low[n-2] + low[n-3] + 4) >> 3)
;                  - high[n-1]) >> 1
;
; The vector loop handles 8 values of i per iteration and always runs in
; whole vectors, so it may read and write past the nominal row end; the last
; two outputs are then rewritten by the scalar tail.
; NOTE(review): this presumably relies on padded buffers - confirm with the
; callers.
;------------------------------------------------------------------------------
%macro CFHD_HORIZ_FILTER 1
%if %1 == 1023
cglobal cfhd_horiz_filter_clip10, 5, 6, 8 + 4 * ARCH_X86_64, output, low, high, width, x, temp
    shl        widthd, 1
%define ostrideq widthq
%define lwidthq  widthq
%define hwidthq  widthq
%elif %1 == 4095
cglobal cfhd_horiz_filter_clip12, 5, 6, 8 + 4 * ARCH_X86_64, output, low, high, width, x, temp
    shl        widthd, 1
%define ostrideq widthq
%define lwidthq  widthq
%define hwidthq  widthq
%else
%if ARCH_X86_64
cglobal cfhd_horiz_filter, 8, 11, 12, output, ostride, low, lwidth, high, hwidth, width, height, x, y, temp
    shl  ostrided, 1
    shl   lwidthd, 1
    shl   hwidthd, 1
    shl    widthd, 1

    ; y counts upwards from -height to 0
    mov        yd, heightd
    neg        yq
%else
    ; Only 7 registers here: the argument registers of ostride, lwidth and
    ; hwidth are named x, y and temp. The doubled strides are written back to
    ; their stack slots and the registers are reused as scratch.
cglobal cfhd_horiz_filter, 7, 7, 8, output, x, low, y, high, temp, width, height
    shl        xd, 1
    shl        yd, 1
    shl     tempd, 1
    shl    widthd, 1

    mov       xmp, xq
    mov       ymp, yq
    mov    tempmp, tempq

    ; y counts upwards from -height (8th argument, on the stack) to 0
    mov        yd, r7m
    neg        yq

%define ostrideq xm
%define lwidthq  ym
%define hwidthq  tempm
%endif
%endif

%if ARCH_X86_64
    ; keep the constants in registers; 32-bit builds use memory operands
    mova       m8, [factor_p1_n1]
    mova       m9, [factor_n1_p1]
    mova      m10, [pw_1]
    mova      m11, [pd_4]
%endif

%if %1 == 0
.looph:
%endif
    ; ---- scalar head: out[0] ----
    movsx          xq, word [lowq]
    imul           xq, 11

    movsx       tempq, word [lowq + 2]
    imul        tempq, -4
    add         tempq, xq

    movsx          xq, word [lowq + 4]
    add         tempq, xq
    add         tempq, 4
    sar         tempq, 3

    movsx          xq, word [highq]
    add         tempq, xq
    sar         tempq, 1

%if %1
    ; clamp the low word through a vector register
    movd          xm0, tempd
    CLIPW          m0, [pw_0], [pw_%1]
    pextrw      tempd, xm0, 0
%endif
    mov  word [outputq], tempw

    ; ---- scalar head: out[1] ----
    movsx          xq, word [lowq]
    imul           xq, 5

    movsx       tempq, word [lowq + 2]
    imul        tempq, 4
    add         tempq, xq

    movsx          xq, word [lowq + 4]
    sub         tempq, xq
    add         tempq, 4
    sar         tempq, 3

    movsx          xq, word [highq]
    sub         tempq, xq
    sar         tempq, 1

%if %1
    movd          xm0, tempd
    CLIPW          m0, [pw_0], [pw_%1]
    pextrw      tempd, xm0, 0
%endif
    mov  word [outputq + 2], tempw

    ; x = byte offset into low/high; flags are not live here, so use the
    ; zeroing idiom
    xor            xd, xd

.loop:
    ; interleave low[i] with low[i+2]
    movu           m4, [lowq + xq]
    movu           m1, [lowq + xq + 4]

    mova           m5, m4
    punpcklwd      m4, m1
    punpckhwd      m5, m1

    mova           m6, m4
    mova           m7, m5

    ; m4/m5 = low[i] - low[i+2] + 4,  m6/m7 = low[i+2] - low[i] + 4
%if ARCH_X86_64
    pmaddwd        m4, m8
    pmaddwd        m5, m8
    pmaddwd        m6, m9
    pmaddwd        m7, m9

    paddd          m4, m11
    paddd          m5, m11
    paddd          m6, m11
    paddd          m7, m11
%else
    pmaddwd        m4, [factor_p1_n1]
    pmaddwd        m5, [factor_p1_n1]
    pmaddwd        m6, [factor_n1_p1]
    pmaddwd        m7, [factor_n1_p1]

    paddd          m4, [pd_4]
    paddd          m5, [pd_4]
    paddd          m6, [pd_4]
    paddd          m7, [pd_4]
%endif

    psrad          m4, 3
    psrad          m5, 3
    psrad          m6, 3
    psrad          m7, 3

    ; interleave low[i+1] with high[i+1]
    movu           m2, [lowq + xq + 2]
    movu           m3, [highq + xq + 2]

    mova           m0, m2
    punpcklwd      m2, m3
    punpckhwd      m0, m3

    mova           m1, m2
    mova           m3, m0

    ; m2/m0 = low[i+1] + high[i+1],  m1/m3 = low[i+1] - high[i+1]
%if ARCH_X86_64
    pmaddwd        m2, m10
    pmaddwd        m0, m10
    pmaddwd        m1, m8
    pmaddwd        m3, m8
%else
    pmaddwd        m2, [pw_1]
    pmaddwd        m0, [pw_1]
    pmaddwd        m1, [factor_p1_n1]
    pmaddwd        m3, [factor_p1_n1]
%endif

    paddd          m2, m4
    paddd          m0, m5
    paddd          m1, m6
    paddd          m3, m7

    psrad          m2, 1
    psrad          m0, 1
    psrad          m1, 1
    psrad          m3, 1

    ; back to words with signed saturation: m2 = even outputs, m1 = odd ones
    packssdw       m2, m0
    packssdw       m1, m3

    ; interleave even and odd outputs into store order
    mova           m0, m2
    punpcklwd      m2, m1
    punpckhwd      m0, m1

%if %1
    CLIPW          m2, [pw_0], [pw_%1]
    CLIPW          m0, [pw_0], [pw_%1]
%endif

    ; two outputs per input sample, shifted past the two head outputs
    movu  [outputq + xq * 2 + 4], m2
    movu  [outputq + xq * 2 + mmsize + 4], m0

    add            xq, mmsize
    cmp            xq, widthq
    jl .loop

    ; move the pointers to the row end so the tail can use negative offsets
    add          lowq, widthq
    add         highq, widthq
    add       outputq, widthq
    add       outputq, widthq

    ; ---- scalar tail: second to last output ----
    movsx          xq, word [lowq - 2]
    imul           xq, 5

    movsx       tempq, word [lowq - 4]
    imul        tempq, 4
    add         tempq, xq

    movsx          xq, word [lowq - 6]
    sub         tempq, xq
    add         tempq, 4
    sar         tempq, 3

    movsx          xq, word [highq - 2]
    add         tempq, xq
    sar         tempq, 1

%if %1
    movd          xm0, tempd
    CLIPW          m0, [pw_0], [pw_%1]
    pextrw      tempd, xm0, 0
%endif
    mov  word [outputq - 4], tempw

    ; ---- scalar tail: last output ----
    movsx          xq, word [lowq - 2]
    imul           xq, 11

    movsx       tempq, word [lowq - 4]
    imul        tempq, -4
    add         tempq, xq

    movsx          xq, word [lowq - 6]
    add         tempq, xq
    add         tempq, 4
    sar         tempq, 3

    movsx          xq, word [highq - 2]
    sub         tempq, xq
    sar         tempq, 1

%if %1
    movd          xm0, tempd
    CLIPW          m0, [pw_0], [pw_%1]
    pextrw      tempd, xm0, 0
%endif
    mov  word [outputq - 2], tempw

%if %1 == 0
    ; rewind to the row start ...
    sub          lowq, widthq
    sub         highq, widthq
    sub       outputq, widthq
    sub       outputq, widthq

    ; ... then step to the next row; output advances by two output strides
    add          lowq, lwidthq
    add         highq, hwidthq
    add       outputq, ostrideq
    add       outputq, ostrideq
    add            yq, 1
    jl .looph
%endif

    RET
%endmacro

; Instantiate the SSE2 horizontal filters: the unclipped multi-row version
; (argument 0) and the single-row versions clamping to [0, 1023] and [0, 4095].
INIT_XMM sse2
CFHD_HORIZ_FILTER 0
CFHD_HORIZ_FILTER 1023
CFHD_HORIZ_FILTER 4095

;------------------------------------------------------------------------------
; cfhd_vert_filter(output, ostride, low, lwidth, high, hwidth, width, height)
;
; Vertical counterpart of the horizontal filter: combines low-pass and
; high-pass rows of signed 16-bit samples into twice as many output rows,
; 8 columns (one XMM register) at a time.
;
; The strides and width are doubled on entry and then used as byte offsets.
; With L[r] / H[r] / O[r] denoting row r of low / high / output, taken at the
; current column offset, and h = height:
;
;   O[0]      = (((11*L[0] - 4*L[1] + L[2] + 4) >> 3) + H[0]) >> 1
;   O[1]      = (((5*L[0] + 4*L[1] - L[2] + 4) >> 3) - H[0]) >> 1
;   for y = 1 .. h-2:
;   O[2y]     = (L[y] + H[y] + ((L[y-1] - L[y+1] + 4) >> 3)) >> 1
;   O[2y + 1] = (L[y] - H[y] + ((L[y+1] - L[y-1] + 4) >> 3)) >> 1
;   O[2h - 2] = (((5*L[h-1] + 4*L[h-2] - L[h-3] + 4) >> 3) + H[h-1]) >> 1
;   O[2h - 1] = (((11*L[h-1] - 4*L[h-2] + L[h-3] + 4) >> 3) - H[h-1]) >> 1
;
; Results are packed back to words with signed saturation. The middle loop
; body always executes at least once, and columns are processed in whole
; vectors while x < width.
; NOTE(review): this presumably relies on height >= 3 and padded rows -
; confirm with the callers.
;------------------------------------------------------------------------------
INIT_XMM sse2
%if ARCH_X86_64
cglobal cfhd_vert_filter, 8, 11, 14, output, ostride, low, lwidth, high, hwidth, width, height, x, y, pos
    shl        ostrided, 1
    shl         lwidthd, 1
    shl         hwidthd, 1
    shl          widthd, 1

    ; heightq = height - 1: exclusive bound of the middle loop and index of
    ; the last input row
    dec   heightd

    ; keep the constants in registers; 32-bit builds use memory operands
    mova       m8, [factor_p1_n1]
    mova       m9, [factor_n1_p1]
    mova      m10, [pw_1]
    mova      m11, [pd_4]
    mova      m12, [factor_p11_n4]
    mova      m13, [factor_p5_p4]
%else
    ; Only 7 registers here: the argument registers of ostride, lwidth and
    ; hwidth are named x, y and pos. The doubled strides are written back to
    ; their stack slots and the registers are reused as scratch.
cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height
    shl        xd, 1
    shl        yd, 1
    shl      posd, 1
    shl    widthd, 1

    mov       xmp, xq
    mov       ymp, yq
    mov     posmp, posq

    ; height - 1 is stored in the stack slot of width; the doubled width
    ; itself stays in its register
    mov        xq, r7m
    dec        xq
    mov   widthmp, xq

%define ostrideq xm
%define lwidthq  ym
%define hwidthq  posm
%define heightq  widthm

%endif

    xor        xq, xq
.loopw:
    xor        yq, yq

    ; ---- O[0]: 11*L[0] - 4*L[1] ----
    mov      posq, xq
    movu       m0, [lowq + posq]
    add      posq, lwidthq
    movu       m1, [lowq + posq]
    mova       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

%if ARCH_X86_64
    pmaddwd    m0, m12
    pmaddwd    m2, m12
%else
    pmaddwd    m0, [factor_p11_n4]
    pmaddwd    m2, [factor_p11_n4]
%endif

    ; sign-extend L[2] to dwords: place each word in the high half of a
    ; dword, then shift right arithmetically by 16
    pxor       m4, m4
    add      posq, lwidthq
    movu       m1, [lowq + posq]
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    paddd      m0, m4
    paddd      m2, m3

    ; rounding term; x86-64 already has pd_4 in m11
%if ARCH_X86_64
    paddd      m0, m11
    paddd      m2, m11
%else
    paddd      m0, [pd_4]
    paddd      m2, [pd_4]
%endif

    psrad      m0, 3
    psrad      m2, 3

    ; + H[0], sign-extended the same way
    mov      posq, xq
    pxor       m4, m4
    movu       m1, [highq + posq]
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    paddd      m0, m4
    paddd      m2, m3

    psrad      m0, 1
    psrad      m2, 1

    packssdw   m0, m2

    movu    [outputq + posq], m0

    ; ---- O[1]: 5*L[0] + 4*L[1] ----
    movu       m0, [lowq + posq]
    add      posq, lwidthq
    movu       m1, [lowq + posq]
    mova       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

%if ARCH_X86_64
    pmaddwd    m0, m13
    pmaddwd    m2, m13
%else
    pmaddwd    m0, [factor_p5_p4]
    pmaddwd    m2, [factor_p5_p4]
%endif

    ; - L[2]
    pxor       m4, m4
    add      posq, lwidthq
    movu       m1, [lowq + posq]
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    psubd      m0, m4
    psubd      m2, m3

    ; rounding term; x86-64 already has pd_4 in m11
%if ARCH_X86_64
    paddd      m0, m11
    paddd      m2, m11
%else
    paddd      m0, [pd_4]
    paddd      m2, [pd_4]
%endif

    psrad      m0, 3
    psrad      m2, 3

    ; - H[0]
    mov      posq, xq
    pxor       m4, m4
    movu       m1, [highq + posq]
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    psubd      m0, m4
    psubd      m2, m3

    psrad      m0, 1
    psrad      m2, 1

    packssdw   m0, m2

    add      posq, ostrideq
    movu    [outputq + posq], m0

    ; ---- middle rows, starting at y = 1 ----
    add        yq, 1
.looph:
    ; posq -> L[y-1]
    mov      posq, lwidthq
    imul     posq, yq
    sub      posq, lwidthq
    add      posq, xq

    movu       m4, [lowq + posq]

    ; posq -> L[y+1]
    add      posq, lwidthq
    add      posq, lwidthq
    movu       m1, [lowq + posq]

    mova       m5, m4
    punpcklwd  m4, m1
    punpckhwd  m5, m1

    mova       m6, m4
    mova       m7, m5

    ; m4/m5 = L[y-1] - L[y+1] + 4,  m6/m7 = L[y+1] - L[y-1] + 4
%if ARCH_X86_64
    pmaddwd    m4, m8
    pmaddwd    m5, m8
    pmaddwd    m6, m9
    pmaddwd    m7, m9

    paddd      m4, m11
    paddd      m5, m11
    paddd      m6, m11
    paddd      m7, m11
%else
    pmaddwd    m4, [factor_p1_n1]
    pmaddwd    m5, [factor_p1_n1]
    pmaddwd    m6, [factor_n1_p1]
    pmaddwd    m7, [factor_n1_p1]

    paddd      m4, [pd_4]
    paddd      m5, [pd_4]
    paddd      m6, [pd_4]
    paddd      m7, [pd_4]
%endif

    psrad      m4, 3
    psrad      m5, 3
    psrad      m6, 3
    psrad      m7, 3

    ; posq -> L[y]
    sub      posq, lwidthq
    movu       m0, [lowq + posq]

    ; posq -> H[y]
    mov      posq, hwidthq
    imul     posq, yq
    add      posq, xq
    movu       m1, [highq + posq]

    mova       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

    mova       m1, m0
    mova       m3, m2

    ; m0/m2 = L[y] + H[y],  m1/m3 = L[y] - H[y]
%if ARCH_X86_64
    pmaddwd    m0, m10
    pmaddwd    m2, m10
    pmaddwd    m1, m8
    pmaddwd    m3, m8
%else
    pmaddwd    m0, [pw_1]
    pmaddwd    m2, [pw_1]
    pmaddwd    m1, [factor_p1_n1]
    pmaddwd    m3, [factor_p1_n1]
%endif

    paddd      m0, m4
    paddd      m2, m5
    paddd      m1, m6
    paddd      m3, m7

    psrad      m0, 1
    psrad      m2, 1
    psrad      m1, 1
    psrad      m3, 1

    packssdw   m0, m2
    packssdw   m1, m3

    ; posq -> O[2y]
    mov      posq, ostrideq
    imul     posq, 2
    imul     posq, yq
    add      posq, xq

    movu    [outputq + posq], m0
    add      posq, ostrideq
    movu    [outputq + posq], m1

    add        yq, 1
    cmp        yq, heightq
    jl .looph

    ; ---- second to last output row; y is now the last input row ----
    ; 5*L[y] + 4*L[y-1]
    mov      posq, lwidthq
    imul     posq, yq
    add      posq, xq
    movu       m0, [lowq + posq]
    sub      posq, lwidthq
    movu       m1, [lowq + posq]
    mova       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

%if ARCH_X86_64
    pmaddwd    m0, m13
    pmaddwd    m2, m13
%else
    pmaddwd    m0, [factor_p5_p4]
    pmaddwd    m2, [factor_p5_p4]
%endif

    ; - L[y-2]
    pxor       m4, m4
    sub      posq, lwidthq
    movu       m1, [lowq + posq]
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    psubd      m0, m4
    psubd      m2, m3

%if ARCH_X86_64
    paddd      m0, m11
    paddd      m2, m11
%else
    paddd      m0, [pd_4]
    paddd      m2, [pd_4]
%endif

    psrad      m0, 3
    psrad      m2, 3

    ; + H[y]
    mov      posq, hwidthq
    imul     posq, yq
    add      posq, xq
    pxor       m4, m4
    movu       m1, [highq + posq]
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    paddd      m0, m4
    paddd      m2, m3

    psrad      m0, 1
    psrad      m2, 1

    packssdw   m0, m2

    ; posq -> O[2y]
    mov      posq, ostrideq
    imul     posq, 2
    imul     posq, yq
    add      posq, xq
    movu    [outputq + posq], m0

    ; ---- last output row ----
    ; 11*L[y] - 4*L[y-1]
    mov      posq, lwidthq
    imul     posq, yq
    add      posq, xq
    movu       m0, [lowq + posq]
    sub      posq, lwidthq
    movu       m1, [lowq + posq]
    mova       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

%if ARCH_X86_64
    pmaddwd    m0, m12
    pmaddwd    m2, m12
%else
    pmaddwd    m0, [factor_p11_n4]
    pmaddwd    m2, [factor_p11_n4]
%endif

    ; + L[y-2]
    pxor       m4, m4
    sub      posq, lwidthq
    movu       m1, [lowq + posq]
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    paddd      m0, m4
    paddd      m2, m3

%if ARCH_X86_64
    paddd      m0, m11
    paddd      m2, m11
%else
    paddd      m0, [pd_4]
    paddd      m2, [pd_4]
%endif

    psrad      m0, 3
    psrad      m2, 3

    ; - H[y]
    mov      posq, hwidthq
    imul     posq, yq
    add      posq, xq
    pxor       m4, m4
    movu       m1, [highq + posq]
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    psubd      m0, m4
    psubd      m2, m3

    psrad      m0, 1
    psrad      m2, 1

    packssdw   m0, m2

    ; posq -> O[2y + 1]
    mov      posq, ostrideq
    imul     posq, 2
    imul     posq, yq
    add      posq, ostrideq
    add      posq, xq
    movu    [outputq + posq], m0

    ; next group of 8 columns
    add        xq, mmsize
    cmp        xq, widthq
    jl .loopw
    RET