/* -*-arm64-*-
 * vim: syntax=arm64asm
 *
 * Copyright (c) 2022 J. Dekker <jdek@itanimul.li>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#define MAX_PB_SIZE 64

// Signed 8-tap coefficient rows, one row of eight bytes per filter index.
// The load macros in this file address a row as base + (index << 3).
const qpel_filters, align=4
        .byte            0,  0,   0,  0,  0,   0,  0,  0
        .byte           -1,  4, -10, 58, 17,  -5,  1,  0
        .byte           -1,  4, -11, 40, 40, -11,  4, -1
        .byte            0,  1,  -5, 17, 58, -10,  4, -1
endconst

// Magnitudes of the rows of qpel_filters, in the same layout (eight bytes per
// filter index). Consumers apply the signs through their choice of
// multiply-add or multiply-subtract instruction.
const qpel_filters_abs, align=4
        .byte            0,  0,   0,  0,  0,   0,  0,  0
        .byte            1,  4,  10, 58, 17,   5,  1,  0
        .byte            1,  4,  11, 40, 40,  11,  4,  1
        .byte            0,  1,   5, 17, 58,  10,  4,  1
endconst

// Load the signed coefficient row selected by register m from qpel_filters
// and sign-extend it to 16 bit, leaving the eight taps in v0.8h.
// Clobbers x15.
.macro load_filter m
        movrel          x15, qpel_filters
        // each row is 8 bytes, hence index << 3
        add             x15, x15, \m, lsl #3
        ld1             {v0.8b}, [x15]
        sxtl            v0.8h, v0.8b
.endm

// Load the coefficient magnitudes of row freg of qpel_filters_abs and
// broadcast them: tap n ends up replicated in every byte lane of vn (v0-v7).
// xreg is used as the address register and is clobbered.
.macro load_qpel_filterb freg, xreg
        movrel          \xreg, qpel_filters_abs
        add             \xreg, \xreg, \freg, lsl #3
        // taps 0-3, then advance by four bytes to reach taps 4-7
        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg], #4
        ld4r            {v4.16b, v5.16b, v6.16b, v7.16b}, [\xreg]
.endm

// 8-tap filter over the low eight bytes of src0-src7, using the broadcast
// magnitudes in v0-v7 and writing eight 16-bit results to dst.
// Taps 0, 2, 5 and 7 are subtracted and taps 1, 3, 4 and 6 are added, which
// is the sign pattern of the non-zero entries of qpel_filters.
// The first instruction is a plain umull, so dst is overwritten and does not
// have to be initialised by the caller.
.macro calc_qpelb dst, src0, src1, src2, src3, src4, src5, src6, src7
        umull           \dst\().8h, \src1\().8b, v1.8b
        umlsl           \dst\().8h, \src0\().8b, v0.8b
        umlsl           \dst\().8h, \src2\().8b, v2.8b
        umlal           \dst\().8h, \src3\().8b, v3.8b
        umlal           \dst\().8h, \src4\().8b, v4.8b
        umlsl           \dst\().8h, \src5\().8b, v5.8b
        umlal           \dst\().8h, \src6\().8b, v6.8b
        umlsl           \dst\().8h, \src7\().8b, v7.8b
.endm

// Same filter as calc_qpelb, but operating on the high eight bytes of
// src0-src7 (the "2" instruction forms). dst is overwritten by the leading
// umull2 and receives eight 16-bit results.
.macro calc_qpelb2 dst, src0, src1, src2, src3, src4, src5, src6, src7
        umull2          \dst\().8h, \src1\().16b, v1.16b
        umlsl2          \dst\().8h, \src0\().16b, v0.16b
        umlsl2          \dst\().8h, \src2\().16b, v2.16b
        umlal2          \dst\().8h, \src3\().16b, v3.16b
        umlal2          \dst\().8h, \src4\().16b, v4.16b
        umlsl2          \dst\().8h, \src5\().16b, v5.16b
        umlal2          \dst\().8h, \src6\().16b, v6.16b
        umlsl2          \dst\().8h, \src7\().16b, v7.16b
.endm

// Load the signed coefficient row selected by freg from qpel_filters and
// sign-extend it to 16 bit, leaving the eight taps in v0.8h.
// Same operation as load_filter, but with a caller-chosen address register
// xreg, which is clobbered.
.macro load_qpel_filterh freg, xreg
        movrel          \xreg, qpel_filters
        add             \xreg, \xreg, \freg, lsl #3
        ld1             {v0.8b}, [\xreg]
        sxtl            v0.8h, v0.8b
.endm

// 8-tap filter over the low four 16-bit lanes of src0-src7 with the signed
// taps in v0.h[0..7], accumulated in 32 bit in dst.4s.
// The sum is then post-processed with op and shift (default 6):
//   op == sshr : the shifted result stays in dst.4s
//   otherwise  : op is emitted with a .4h destination and the .4s sum as
//                source, i.e. it has to be a narrowing shift instruction
.macro calc_qpelh dst, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
        smull           \dst\().4s, \src0\().4h, v0.h[0]
        smlal           \dst\().4s, \src1\().4h, v0.h[1]
        smlal           \dst\().4s, \src2\().4h, v0.h[2]
        smlal           \dst\().4s, \src3\().4h, v0.h[3]
        smlal           \dst\().4s, \src4\().4h, v0.h[4]
        smlal           \dst\().4s, \src5\().4h, v0.h[5]
        smlal           \dst\().4s, \src6\().4h, v0.h[6]
        smlal           \dst\().4s, \src7\().4h, v0.h[7]
.ifc \op, sshr
        sshr            \dst\().4s, \dst\().4s, \shift
.else
        \op             \dst\().4h, \dst\().4s, \shift
.endif
.endm

// High-half counterpart of calc_qpelh: filters the upper four 16-bit lanes of
// src0-src7 into the temporary dstt.4s, then applies op and shift (default 6):
//   op == sshr : the shifted result is written to dst.4s
//   otherwise  : op is emitted with dst.8h as destination and dstt.4s as
//                source, so op is expected to be the "2" (upper half) form of
//                a narrowing shift
.macro calc_qpelh2 dst, dstt, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
        smull2          \dstt\().4s, \src0\().8h, v0.h[0]
        smlal2          \dstt\().4s, \src1\().8h, v0.h[1]
        smlal2          \dstt\().4s, \src2\().8h, v0.h[2]
        smlal2          \dstt\().4s, \src3\().8h, v0.h[3]
        smlal2          \dstt\().4s, \src4\().8h, v0.h[4]
        smlal2          \dstt\().4s, \src5\().8h, v0.h[5]
        smlal2          \dstt\().4s, \src6\().8h, v0.h[6]
        smlal2          \dstt\().4s, \src7\().8h, v0.h[7]
.ifc \op, sshr
        sshr            \dst\().4s, \dstt\().4s, \shift
.else
        \op             \dst\().8h, \dstt\().4s, \shift
.endif
.endm

// Unrolled driver for the vertical filters: eight invocations of a
// caller-defined macro named calc, over an eight-register window v16-v23.
// Each invocation passes the window rotated by one register; the first
// argument (the register calc loads the new row into) is the same register
// as the last source argument, i.e. the oldest row gets replaced.
// Requirements on the expansion site:
//   - calc must leave the condition flags set from its row counter update
//   - local label 1 must precede the expansion (loop head)
//   - local label 2 must follow it (exit, taken when the counter hits zero)
.macro calc_all
        calc            v23, v16, v17, v18, v19, v20, v21, v22, v23
        b.eq            2f
        calc            v16, v17, v18, v19, v20, v21, v22, v23, v16
        b.eq            2f
        calc            v17, v18, v19, v20, v21, v22, v23, v16, v17
        b.eq            2f
        calc            v18, v19, v20, v21, v22, v23, v16, v17, v18
        b.eq            2f
        calc            v19, v20, v21, v22, v23, v16, v17, v18, v19
        b.eq            2f
        calc            v20, v21, v22, v23, v16, v17, v18, v19, v20
        b.eq            2f
        calc            v21, v22, v23, v16, v17, v18, v19, v20, v21
        b.eq            2f
        calc            v22, v23, v16, v17, v18, v19, v20, v21, v22
        // window is back in its initial order: loop, or fall through when done
        b.hi            1b
.endm

// Two-registers-per-row variant of calc_all: the window consists of the
// register pairs v16/v17, v18/v19, ... v30/v31.
// calc receives, in order: the pair to load the new row into, the eight
// even-numbered (first half) row registers, then the eight odd-numbered
// (second half) row registers, each list rotated by one row per invocation.
// Flag and label requirements are the same as for calc_all: calc leaves the
// flags from its counter update, label 1 precedes and label 2 follows.
.macro calc_all2
        calc            v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31
        b.eq            2f
        calc            v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17
        b.eq            2f
        calc            v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
        b.eq            2f
        calc            v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
        b.eq            2f
        calc            v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
        b.eq            2f
        calc            v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
        b.eq            2f
        calc            v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
        b.eq            2f
        calc            v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29
        b.hi            1b
.endm

// Emits the horizontal 8-bit qpel functions (widths 4, 6, 8, 12, 16 and 32)
// for one output flavour. The type argument is qpel, qpel_uni or qpel_bi.
// It selects the register aliases below, since the argument registers differ
// per prototype, and the store path inside every generated function.
// The aliases are released again at the end of the macro.
.macro put_hevc type
.ifc \type, qpel
        // void put_hevc_qpel_h(int16_t *dst,
        //                      uint8_t *_src, ptrdiff_t _srcstride,
        //                      int height, intptr_t mx, intptr_t my, int width)
        dst        .req x0
        // not an argument here: each function sets it to the fixed dst pitch
        dststride  .req x7
        src        .req x1
        srcstride  .req x2
        height     .req x3
        heightw    .req w3
        mx         .req x4
        width      .req w6
.endif
.ifc \type, qpel_uni
        // void put_hevc_qpel_uni_h(uint8_t *_dst,  ptrdiff_t _dststride,
        //                          uint8_t *_src, ptrdiff_t _srcstride,
        //                          int height, intptr_t mx, intptr_t my, int width)
        dst        .req x0
        dststride  .req x1
        src        .req x2
        srcstride  .req x3
        height     .req x4
        heightw    .req w4
        mx         .req x5
        width      .req w7
.endif
.ifc \type, qpel_bi
        // void put_hevc_qpel_bi_h(uint8_t *_dst, ptrdiff_t _dststride,
        //                         uint8_t *_src, ptrdiff_t _srcstride,
        //                         int16_t *src2, int height, intptr_t mx,
        //                         intptr_t my, int width)
        dst        .req x0
        dststride  .req x1
        src        .req x2
        srcstride  .req x3
        height     .req x5
        heightw    .req w5
        mx         .req x6
        // ninth argument: the functions that need it load it from the stack
        width      .req w8
.endif

.ifc \type, qpel
// Internal helper, emitted by the qpel expansion only and called with bl from
// the 4 pixel wide functions of every flavour.
// In:  v16/v17 = 16 source bytes of the first row, v18/v19 = 16 source bytes
//      of the second row, v0.8h = signed filter taps
// Out: v23.4h / v24.4h = four filtered 16-bit values for each row
// Clobbers v16-v21.
function ff_hevc_put_hevc_h4_8_neon, export=0
        uxtl            v16.8h,  v16.8b
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b
        uxtl            v19.8h,  v19.8b

        mul             v23.4h,  v16.4h, v0.h[0]
        mul             v24.4h,  v18.4h, v0.h[0]

        // taps 1-7: shift the widened row left by i pixels and accumulate
.irpc i, 1234567
        ext             v20.16b, v16.16b, v17.16b, #(2*\i)
        ext             v21.16b, v18.16b, v19.16b, #(2*\i)
        mla             v23.4h,  v20.4h, v0.h[\i]
        mla             v24.4h,  v21.4h, v0.h[\i]
.endr
        ret
endfunc
.endif

// Horizontal 8-tap filter, 4 pixels wide, two rows per loop iteration.
// x12 / x10 track the second source / destination row, x13 / x14 hold the
// doubled source / destination strides. For qpel_bi, x4 and x15 walk two
// consecutive rows of src2 with a doubled pitch in x16.
function ff_hevc_put_hevc_\type\()_h4_8_neon, export=1
        load_filter     mx
.ifc \type, qpel_bi
        mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
        add             x15, x4, #(MAX_PB_SIZE << 1) // src2b
.endif
        // the filter window starts three pixels to the left
        sub             src, src, #3
        // mx is no longer needed: reuse it to keep the return address, since
        // the bl below overwrites x30
        mov             mx, x30
.ifc \type, qpel
        mov             dststride, #(MAX_PB_SIZE << 1)
        lsl             x13, srcstride, #1 // srcstridel
        mov             x14, #(MAX_PB_SIZE << 2)
.else
        lsl             x14, dststride, #1 // dststridel
        lsl             x13, srcstride, #1 // srcstridel
.endif
        add             x10, dst, dststride // dstb
        add             x12, src, srcstride // srcb
0:      ld1             {v16.8b, v17.8b}, [src], x13
        ld1             {v18.8b, v19.8b}, [x12], x13
.ifc \type, qpel_bi
        ld1             {v25.8h}, [ x4], x16
        ld1             {v26.8h}, [x15], x16
.endif

        bl              ff_hevc_put_hevc_h4_8_neon
        // flags from this subs are consumed by the b.gt at the loop end
        subs            heightw, heightw, #2

.ifc \type, qpel
        // 16-bit intermediate output
        st1             {v23.4h}, [dst], x14
        st1             {v24.4h}, [x10], x14
.else
.ifc \type, qpel_bi
        // add src2, then round, shift by 7 and saturate to 8 bit
        sqadd           v23.4h, v23.4h, v25.4h
        sqadd           v24.4h, v24.4h, v26.4h
        sqrshrun        v23.8b, v23.8h, #7
        sqrshrun        v24.8b, v24.8h, #7
.else
        // round, shift by 6 and saturate to 8 bit
        sqrshrun        v23.8b, v23.8h, #6
        sqrshrun        v24.8b, v24.8h, #6
.endif
        st1             {v23.s}[0], [dst], x14
        st1             {v24.s}[0], [x10], x14
.endif
        b.gt            0b // double line
        ret             mx
endfunc

.ifc \type, qpel
// Internal helper, emitted by the qpel expansion only and called with bl from
// the 6 and 8 pixel wide functions of every flavour.
// In:  v16/v17 = 16 source bytes of the first row, v18/v19 = 16 source bytes
//      of the second row, v0.8h = signed filter taps
// Out: v23.8h / v24.8h = eight filtered 16-bit values for each row
// Clobbers v16-v21.
function ff_hevc_put_hevc_h8_8_neon, export=0
        uxtl            v16.8h,  v16.8b
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b
        uxtl            v19.8h,  v19.8b

        mul             v23.8h,  v16.8h, v0.h[0]
        mul             v24.8h,  v18.8h, v0.h[0]

        // taps 1-7: shift the widened row left by i pixels and accumulate
.irpc i, 1234567
        ext             v20.16b, v16.16b, v17.16b, #(2*\i)
        ext             v21.16b, v18.16b, v19.16b, #(2*\i)
        mla             v23.8h,  v20.8h, v0.h[\i]
        mla             v24.8h,  v21.8h, v0.h[\i]
.endr
        ret
endfunc
.endif

// Horizontal 8-tap filter, 6 pixels wide, two rows per loop iteration.
// Eight pixels are computed by the shared 8-wide helper and six are stored.
// Each row is written with two stores; x14 is the doubled destination stride
// minus the bytes already advanced by the first store.
function ff_hevc_put_hevc_\type\()_h6_8_neon, export=1
        load_filter     mx
.ifc \type, qpel_bi
        mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
        add             x15, x4, #(MAX_PB_SIZE << 1) // src2b
.endif
        sub             src, src, #3
        // keep the return address in mx: the bl below overwrites x30
        mov             mx, x30
.ifc \type, qpel
        mov             dststride, #(MAX_PB_SIZE << 1)
        lsl             x13, srcstride, #1 // srcstridel
        mov             x14, #((MAX_PB_SIZE << 2) - 8)
.else
        lsl             x14, dststride, #1 // dststridel
        lsl             x13, srcstride, #1 // srcstridel
        sub             x14, x14, #4
.endif
        add             x10, dst, dststride // dstb
        add             x12, src, srcstride // srcb
0:      ld1             {v16.8b, v17.8b}, [src], x13
        ld1             {v18.8b, v19.8b}, [x12], x13
.ifc \type, qpel_bi
        ld1             {v25.8h}, [ x4], x16
        ld1             {v26.8h}, [x15], x16
.endif

        bl              ff_hevc_put_hevc_h8_8_neon
        // flags from this subs are consumed by the b.gt at the loop end
        subs            heightw, heightw, #2

.ifc \type, qpel
        // four 16-bit values, then two more (32-bit lane 2)
        st1             {v23.4h},   [dst], #8
        st1             {v24.4h},   [x10], #8
        st1             {v23.s}[2], [dst], x14
        st1             {v24.s}[2], [x10], x14
.else
.ifc \type, qpel_bi
        sqadd           v23.8h, v23.8h, v25.8h
        sqadd           v24.8h, v24.8h, v26.8h
        sqrshrun        v23.8b, v23.8h, #7
        sqrshrun        v24.8b, v24.8h, #7
.else
        sqrshrun        v23.8b, v23.8h, #6
        sqrshrun        v24.8b, v24.8h, #6
.endif
        // four bytes, then two more (16-bit lane 2)
        st1             {v23.s}[0], [dst], #4
        st1             {v24.s}[0], [x10], #4
        st1             {v23.h}[2], [dst], x14
        st1             {v24.h}[2], [x10], x14
.endif
        b.gt            0b // double line
        ret             mx
endfunc

// Horizontal 8-tap filter, 8 pixels wide, two rows per loop iteration.
// x12 / x10 track the second source / destination row, x13 / x14 hold the
// doubled source / destination strides.
function ff_hevc_put_hevc_\type\()_h8_8_neon, export=1
        load_filter     mx
.ifc \type, qpel_bi
        mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
        add             x15, x4, #(MAX_PB_SIZE << 1) // src2b
.endif
        sub             src, src, #3
        // keep the return address in mx: the bl below overwrites x30
        mov             mx, x30
.ifc \type, qpel
        mov             dststride, #(MAX_PB_SIZE << 1)
        lsl             x13, srcstride, #1 // srcstridel
        mov             x14, #(MAX_PB_SIZE << 2)
.else
        lsl             x14, dststride, #1 // dststridel
        lsl             x13, srcstride, #1 // srcstridel
.endif
        add             x10, dst, dststride // dstb
        add             x12, src, srcstride // srcb
0:      ld1             {v16.8b, v17.8b}, [src], x13
        ld1             {v18.8b, v19.8b}, [x12], x13
.ifc \type, qpel_bi
        ld1             {v25.8h}, [ x4], x16
        ld1             {v26.8h}, [x15], x16
.endif

        bl              ff_hevc_put_hevc_h8_8_neon
        // flags from this subs are consumed by the b.gt at the loop end
        subs            heightw, heightw, #2

.ifc \type, qpel
        st1             {v23.8h}, [dst], x14
        st1             {v24.8h}, [x10], x14
.else
.ifc \type, qpel_bi
        // add src2, then round, shift by 7 and saturate to 8 bit
        sqadd           v23.8h, v23.8h, v25.8h
        sqadd           v24.8h, v24.8h, v26.8h
        sqrshrun        v23.8b, v23.8h, #7
        sqrshrun        v24.8b, v24.8h, #7
.else
        // round, shift by 6 and saturate to 8 bit
        sqrshrun        v23.8b, v23.8h, #6
        sqrshrun        v24.8b, v24.8h, #6
.endif
        st1             {v23.8b}, [dst], x14
        st1             {v24.8b}, [x10], x14
.endif
        b.gt            0b // double line
        ret             mx
endfunc

.ifc \type, qpel
// Internal helper, emitted by the qpel expansion only and called with bl from
// the 12, 16 and 32 pixel wide functions of every flavour.
// In:  v16.8h = first eight pixels of the first row, ALREADY widened to
//      16 bit by the caller; v17.8b, v18.8b = the following 16 bytes
//      v19.8h (widened) and v20.8b, v21.8b = the same for the second row
//      v0.8h = signed filter taps
// Out: v26.8h, v27.8h = 16 filtered values of the first row
//      v28.8h, v29.8h = 16 filtered values of the second row
// v17, v18, v20 and v21 are left widened to 16 bit; v22-v25 are clobbered.
function ff_hevc_put_hevc_h16_8_neon, export=0
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b

        uxtl            v20.8h,  v20.8b
        uxtl            v21.8h,  v21.8b

        mul             v26.8h,  v16.8h, v0.h[0]
        mul             v27.8h,  v17.8h, v0.h[0]
        mul             v28.8h,  v19.8h, v0.h[0]
        mul             v29.8h,  v20.8h, v0.h[0]
.irpc i, 1234567
        ext             v22.16b, v16.16b, v17.16b, #(2*\i)
        ext             v23.16b, v17.16b, v18.16b, #(2*\i)

        ext             v24.16b, v19.16b, v20.16b, #(2*\i)
        ext             v25.16b, v20.16b, v21.16b, #(2*\i)

        mla             v26.8h,  v22.8h, v0.h[\i]
        mla             v27.8h,  v23.8h, v0.h[\i]

        mla             v28.8h,  v24.8h, v0.h[\i]
        mla             v29.8h,  v25.8h, v0.h[\i]
.endr
        ret
endfunc
.endif

// Horizontal 8-tap filter in column blocks of 12 pixels.
// Outer loop (label 0): one 12 pixel wide column block per iteration, driven
// by width. Inner loop (label 1): two rows per iteration, counted in x9.
// Sixteen pixels are computed by the shared 16-wide helper and twelve stored.
function ff_hevc_put_hevc_\type\()_h12_8_neon, export=1
        load_filter     mx
        // height is used as a 64-bit operand below (mov, msub, lsl)
        sxtw            height, heightw
.ifc \type, qpel_bi
        ldrh            w8, [sp] // width
        mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
        lsl             x17, height, #7 // src2b reset (height * (MAX_PB_SIZE << 1))
        add             x15, x4, #(MAX_PB_SIZE << 1) // src2b
.endif
        sub             src, src, #3
        // keep the return address in mx: the bl below overwrites x30
        mov             mx, x30
.ifc \type, qpel
        mov             dststride, #(MAX_PB_SIZE << 1)
        lsl             x13, srcstride, #1 // srcstridel
        mov             x14, #((MAX_PB_SIZE << 2) - 16)
.else
        lsl             x14, dststride, #1 // dststridel
        lsl             x13, srcstride, #1 // srcstridel
        sub             x14, x14, #8
.endif
        add             x10, dst, dststride // dstb
        add             x12, src, srcstride // srcb
0:      mov             x9, height
1:      ld1             {v16.8b-v18.8b}, [src], x13
        ld1             {v19.8b-v21.8b}, [x12], x13

        // the helper expects the first register of each row already widened
        uxtl            v16.8h,  v16.8b
        uxtl            v19.8h,  v19.8b
        bl              ff_hevc_put_hevc_h16_8_neon
        // flags from this subs are consumed by the b.gt 1b below
        subs            x9, x9, #2

.ifc \type, qpel
        // eight 16-bit values, then four more; x14 completes the row advance
        st1             {v26.8h}, [dst], #16
        st1             {v28.8h}, [x10], #16
        st1             {v27.4h}, [dst], x14
        st1             {v29.4h}, [x10], x14
.else
.ifc \type, qpel_bi
        ld1             {v16.8h, v17.8h}, [ x4], x16
        ld1             {v18.8h, v19.8h}, [x15], x16
        sqadd           v26.8h, v26.8h, v16.8h
        sqadd           v27.8h, v27.8h, v17.8h
        sqadd           v28.8h, v28.8h, v18.8h
        sqadd           v29.8h, v29.8h, v19.8h
        sqrshrun        v26.8b, v26.8h, #7
        sqrshrun        v27.8b, v27.8h, #7
        sqrshrun        v28.8b, v28.8h, #7
        sqrshrun        v29.8b, v29.8h, #7
.else
        sqrshrun        v26.8b, v26.8h, #6
        sqrshrun        v27.8b, v27.8h, #6
        sqrshrun        v28.8b, v28.8h, #6
        sqrshrun        v29.8b, v29.8h, #6
.endif
        // eight bytes, then four more; x14 completes the row advance
        st1             {v26.8b},   [dst], #8
        st1             {v28.8b},   [x10], #8
        st1             {v27.s}[0], [dst], x14
        st1             {v29.s}[0], [x10], x14
.endif
        b.gt            1b // double line
        // flags from this subs stay live until the final b.gt 0b: none of the
        // pointer updates in between modify them
        subs            width, width, #12
        // reset src
        msub            src, srcstride, height, src
        msub            x12, srcstride, height, x12
        // reset dst
        msub            dst, dststride, height, dst
        msub            x10, dststride, height, x10
.ifc \type, qpel_bi
        // reset xsrc
        sub             x4,  x4,  x17
        sub             x15, x15, x17
        add             x4,  x4,  #24
        add             x15, x15, #24
.endif
        // step to the next column block: 12 pixels
        add             src, src, #12
        add             x12, x12, #12
.ifc \type, qpel
        add             dst, dst, #24
        add             x10, x10, #24
.else
        add             dst, dst, #12
        add             x10, x10, #12
.endif
        b.gt            0b
        ret             mx
endfunc

// Horizontal 8-tap filter, 16 pixels wide, two rows per loop iteration.
// x12 / x10 track the second source / destination row, x13 / x14 hold the
// doubled source / destination strides. For qpel_bi, x4 and x15 walk two
// consecutive rows of src2 with a doubled pitch in x16.
// The width argument is not needed: exactly 16 pixels per row are produced.
function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
        load_filter     mx
        // height is decremented as a 64-bit register below
        sxtw            height, heightw
.ifc \type, qpel_bi
        mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
        add             x15, x4, #(MAX_PB_SIZE << 1) // src2b
.endif
        // the filter window starts three pixels to the left
        sub             src, src, #3
        // mx is no longer needed: reuse it to keep the return address, since
        // the bl below overwrites x30
        mov             mx, x30
.ifc \type, qpel
        mov             dststride, #(MAX_PB_SIZE << 1)
        lsl             x13, srcstride, #1 // srcstridel
        mov             x14, #(MAX_PB_SIZE << 2)
.else
        lsl             x14, dststride, #1 // dststridel
        lsl             x13, srcstride, #1 // srcstridel
.endif
        add             x10, dst, dststride // dstb
        add             x12, src, srcstride // srcb

1:      ld1             {v16.8b-v18.8b}, [src], x13
        ld1             {v19.8b-v21.8b}, [x12], x13

        // the helper expects the first register of each row already widened
        uxtl            v16.8h,  v16.8b
        uxtl            v19.8h,  v19.8b
        bl              ff_hevc_put_hevc_h16_8_neon
        // flags from this subs are consumed by the b.gt at the loop end
        subs            height, height, #2

.ifc \type, qpel
        // 16-bit intermediate output
        st1             {v26.8h, v27.8h}, [dst], x14
        st1             {v28.8h, v29.8h}, [x10], x14
.else
.ifc \type, qpel_bi
        // add src2, then round, shift by 7 and saturate to 8 bit
        ld1             {v16.8h, v17.8h}, [ x4], x16
        ld1             {v18.8h, v19.8h}, [x15], x16
        sqadd           v26.8h, v26.8h, v16.8h
        sqadd           v27.8h, v27.8h, v17.8h
        sqadd           v28.8h, v28.8h, v18.8h
        sqadd           v29.8h, v29.8h, v19.8h
        sqrshrun        v26.8b, v26.8h, #7
        sqrshrun        v27.8b, v27.8h, #7
        sqrshrun        v28.8b, v28.8h, #7
        sqrshrun        v29.8b, v29.8h, #7
.else
        // round, shift by 6 and saturate to 8 bit
        sqrshrun        v26.8b, v26.8h, #6
        sqrshrun        v27.8b, v27.8h, #6
        sqrshrun        v28.8b, v28.8h, #6
        sqrshrun        v29.8b, v29.8h, #6
.endif
        st1             {v26.8b, v27.8b}, [dst], x14
        st1             {v28.8b, v29.8b}, [x10], x14
.endif
        b.gt            1b // double line
        ret             mx
endfunc

// Horizontal 8-tap filter for rows processed in chunks of 16 pixels.
// Outer loop (label 0): two rows per iteration, driven by height.
// Inner loop (label 1): one 16 pixel chunk of both rows per iteration,
// counted in w9 starting from width.
// All pointers are post-incremented inside the inner loop, so the per-row-pair
// strides in x13, x14 and x16 have the bytes consumed per row subtracted.
function ff_hevc_put_hevc_\type\()_h32_8_neon, export=1
        load_filter     mx
        // height is decremented as a 64-bit register below
        sxtw            height, heightw
.ifc \type, qpel_bi
        ldrh            w8, [sp] // width
        mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
        add             x15, x4, #(MAX_PB_SIZE << 1) // src2b
        sub             x16, x16, width, uxtw #1
.endif
        // the filter window starts three pixels to the left
        sub             src, src, #3
        // mx is no longer needed: reuse it to keep the return address, since
        // the bl below overwrites x30
        mov             mx, x30
.ifc \type, qpel
        mov             dststride, #(MAX_PB_SIZE << 1)
        lsl             x13, srcstride, #1 // srcstridel
        mov             x14, #(MAX_PB_SIZE << 2)
        sub             x14, x14, width, uxtw #1
.else
        lsl             x14, dststride, #1 // dststridel
        lsl             x13, srcstride, #1 // srcstridel
        sub             x14, x14, width, uxtw
.endif
        // each row consumes 8 bytes before the inner loop plus width inside it
        sub             x13, x13, width, uxtw
        sub             x13, x13, #8
        add             x10, dst, dststride // dstb
        add             x12, src, srcstride // srcb
0:      mov             w9, width
        // prime the window with the first eight pixels of both rows
        ld1             {v16.8b}, [src], #8
        ld1             {v19.8b}, [x12], #8
        uxtl            v16.8h, v16.8b
        uxtl            v19.8h, v19.8b
1:
        ld1             {v17.8b-v18.8b}, [src], #16
        ld1             {v20.8b-v21.8b}, [x12], #16

        bl              ff_hevc_put_hevc_h16_8_neon
        // flags from this subs are consumed by the b.gt 1b below
        subs            w9, w9, #16

        // the helper left v18 / v21 widened: they start the next chunk
        mov             v16.16b, v18.16b
        mov             v19.16b, v21.16b
.ifc \type, qpel
        st1             {v26.8h, v27.8h}, [dst], #32
        st1             {v28.8h, v29.8h}, [x10], #32
.else
.ifc \type, qpel_bi
        // add src2, then round, shift by 7 and saturate to 8 bit
        ld1             {v20.8h, v21.8h}, [ x4], #32
        ld1             {v22.8h, v23.8h}, [x15], #32
        sqadd           v26.8h, v26.8h, v20.8h
        sqadd           v27.8h, v27.8h, v21.8h
        sqadd           v28.8h, v28.8h, v22.8h
        sqadd           v29.8h, v29.8h, v23.8h
        sqrshrun        v26.8b, v26.8h, #7
        sqrshrun        v27.8b, v27.8h, #7
        sqrshrun        v28.8b, v28.8h, #7
        sqrshrun        v29.8b, v29.8h, #7
.else
        // round, shift by 6 and saturate to 8 bit
        sqrshrun        v26.8b, v26.8h, #6
        sqrshrun        v27.8b, v27.8h, #6
        sqrshrun        v28.8b, v28.8h, #6
        sqrshrun        v29.8b, v29.8h, #6
.endif
        st1             {v26.8b, v27.8b}, [dst], #16
        st1             {v28.8b, v29.8b}, [x10], #16
.endif
        b.gt            1b // double line
        // flags from this subs stay live until the final b.gt 0b: the plain
        // add instructions in between do not modify them
        subs            height, height, #2
        add             src, src, x13
        add             x12, x12, x13
        add             dst, dst, x14
        add             x10, x10, x14
.ifc \type, qpel_bi
        add             x4,  x4,  x16
        add             x15, x15, x16
.endif
        b.gt            0b
        ret             mx
endfunc

// Release the register aliases so that the next expansion of this macro can
// bind the same names to different registers.
.unreq height
.unreq heightw
.unreq width
.unreq src
.unreq dst
.unreq srcstride
.unreq dststride
.unreq mx
.endm

// Emit the horizontal functions for all three flavours. The internal helper
// functions that every flavour calls are only emitted by the qpel expansion.
put_hevc qpel
put_hevc qpel_uni
put_hevc qpel_bi

// Vertical 8-tap filter, 4 pixels wide, 16-bit output.
// Registers as used below: x0 = dst, x1 = src, x2 = srcstride, w3 = number of
// rows, x5 = filter index. x4 and x9 are scratch.
// The destination pitch is fixed at MAX_PB_SIZE * 2 bytes.
function ff_hevc_put_hevc_qpel_v4_8_neon, export=1
        load_qpel_filterb x5, x4
        sub             x1, x1, x2, lsl #1
        mov             x9, #(MAX_PB_SIZE * 2)
        sub             x1, x1, x2              // src -= 3 * srcstride
        // preload the seven rows above the first row fetched by the loop
        ldr             s16, [x1]
        ldr             s17, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             s18, [x1]
        ldr             s19, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             s20, [x1]
        ldr             s21, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             s22, [x1]
        add             x1, x1, x2
        // One output row: load the newest source row into tmp, filter the
        // eight-row window and store four 16-bit values. v24 needs no zeroing
        // because calc_qpelb starts with umull. The subs leaves the flags that
        // calc_all branches on after every invocation.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().s}[0], [x1], x2
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        st1             {v24.4h}, [x0], x9
        subs            w3, w3, #1
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc

// Vertical 8-tap filter, 6 pixels wide, 16-bit output.
// Registers as used below: x0 = dst, x1 = src, x2 = srcstride, w3 = number of
// rows, x5 = filter index. x4 and x9 are scratch.
// Eight source bytes are loaded per row and six results are stored. Every
// output row is written with two stores, so x9 holds the fixed pitch
// (MAX_PB_SIZE * 2) minus the 8 bytes advanced by the first one.
function ff_hevc_put_hevc_qpel_v6_8_neon, export=1
        load_qpel_filterb x5, x4
        sub             x1, x1, x2, lsl #1
        mov             x9, #(MAX_PB_SIZE * 2 - 8)
        sub             x1, x1, x2              // src -= 3 * srcstride
        // preload the seven rows above the first row fetched by the loop
        ldr             d16, [x1]
        ldr             d17, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             d18, [x1]
        ldr             d19, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             d20, [x1]
        ldr             d21, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             d22, [x1]
        add             x1, x1, x2
        // One output row. v24 needs no zeroing because calc_qpelb starts with
        // umull. The subs leaves the flags that calc_all branches on.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8b}, [x1], x2
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        st1             {v24.4h}, [x0], #8
        st1             {v24.s}[2], [x0], x9
        subs            w3, w3, #1
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc

// Vertical 8-tap filter, 8 pixels wide, 16-bit output.
// Registers as used below: x0 = dst, x1 = src, x2 = srcstride, w3 = number of
// rows, x5 = filter index. x4 and x9 are scratch.
// The destination pitch is fixed at MAX_PB_SIZE * 2 bytes.
function ff_hevc_put_hevc_qpel_v8_8_neon, export=1
        load_qpel_filterb x5, x4
        sub             x1, x1, x2, lsl #1
        mov             x9, #(MAX_PB_SIZE * 2)
        sub             x1, x1, x2              // src -= 3 * srcstride
        // preload the seven rows above the first row fetched by the loop
        ldr             d16, [x1]
        ldr             d17, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             d18, [x1]
        ldr             d19, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             d20, [x1]
        ldr             d21, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             d22, [x1]
        add             x1, x1, x2
        // One output row. v24 needs no zeroing because calc_qpelb starts with
        // umull. The subs leaves the flags that calc_all branches on.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8b}, [x1], x2
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        st1             {v24.8h}, [x0], x9
        subs            w3, w3, #1
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc

// Vertical 8-tap filter, 12 pixels wide, 16-bit output.
// Registers as used below: x0 = dst, x1 = src, x2 = srcstride, w3 = number of
// rows, x5 = filter index. x4 and x9 are scratch.
// Sixteen source bytes are loaded per row and twelve results are stored.
// Every output row is written with two stores, so x9 holds the fixed pitch
// (MAX_PB_SIZE * 2) minus the 16 bytes advanced by the first one.
function ff_hevc_put_hevc_qpel_v12_8_neon, export=1
        load_qpel_filterb x5, x4
        sub             x1, x1, x2, lsl #1
        mov             x9, #(MAX_PB_SIZE * 2 - 16)
        sub             x1, x1, x2              // src -= 3 * srcstride
        // preload the seven rows above the first row fetched by the loop
        ldr             q16, [x1]
        ldr             q17, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             q18, [x1]
        ldr             q19, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             q20, [x1]
        ldr             q21, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             q22, [x1]
        add             x1, x1, x2
        // One output row: low half into v24, high half into v25. Neither needs
        // zeroing because calc_qpelb / calc_qpelb2 start with umull / umull2.
        // The stores do not touch the flags set by subs, which calc_all
        // branches on.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().16b}, [x1], x2
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        st1             {v24.8h}, [x0], #16
        subs            w3, w3, #1
        st1             {v25.4h}, [x0], x9
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc

// Vertical 8-tap filter, 16 pixels wide, 16-bit output.
// Registers as used below: x0 = dst, x1 = src, x2 = srcstride, w3 = number of
// rows, x5 = filter index. x4 and x9 are scratch.
// The destination pitch is fixed at MAX_PB_SIZE * 2 bytes.
function ff_hevc_put_hevc_qpel_v16_8_neon, export=1
        load_qpel_filterb x5, x4
        sub             x1, x1, x2, lsl #1
        mov             x9, #(MAX_PB_SIZE * 2)
        sub             x1, x1, x2              // src -= 3 * srcstride
        // preload the seven rows above the first row fetched by the loop
        ldr             q16, [x1]
        ldr             q17, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             q18, [x1]
        ldr             q19, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             q20, [x1]
        ldr             q21, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             q22, [x1]
        add             x1, x1, x2
        // One output row: low half into v24, high half into v25. Neither needs
        // zeroing because calc_qpelb / calc_qpelb2 start with umull / umull2.
        // The store does not touch the flags set by subs, which calc_all
        // branches on.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().16b}, [x1], x2
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        subs            w3, w3, #1
        st1             {v24.8h, v25.8h}, [x0], x9
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc

// Vertical 8-tap filter, 24 pixels wide, 16-bit output.
// Registers as used below: x0 = dst, x1 = src, x2 = srcstride, w3 = number of
// rows, x5 = filter index. x4 and x9 are scratch.
// The low 64 bits of v8-v10 are saved on the stack and restored on exit,
// because these registers are used as accumulators.
// TODO: every row load fetches 32 source bytes although only the first 24
// are used by the filter.
function ff_hevc_put_hevc_qpel_v24_8_neon, export=1
        sub             sp, sp, #32
        st1             {v8.8b, v9.8b, v10.8b}, [sp]
        load_qpel_filterb x5, x4
        sub             x1, x1, x2, lsl #1
        sub             x1, x1, x2              // src -= 3 * srcstride
        mov             x9, #(MAX_PB_SIZE * 2)
        // preload the seven rows above the first row fetched by the loop
        ld1             {v16.16b, v17.16b}, [x1], x2
        ld1             {v18.16b, v19.16b}, [x1], x2
        ld1             {v20.16b, v21.16b}, [x1], x2
        ld1             {v22.16b, v23.16b}, [x1], x2
        ld1             {v24.16b, v25.16b}, [x1], x2
        ld1             {v26.16b, v27.16b}, [x1], x2
        ld1             {v28.16b, v29.16b}, [x1], x2
        // One output row: pixels 0-7 into v8, 8-15 into v9, 16-23 into v10.
        // The accumulators need no zeroing because calc_qpelb / calc_qpelb2
        // start with umull / umull2. The store does not touch the flags set
        // by subs, which calc_all2 branches on.
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
        ld1             {\tmp0\().16b, \tmp1\().16b}, [x1], x2
        calc_qpelb      v8,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
        calc_qpelb2     v9,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
        calc_qpelb      v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
        subs            w3, w3, #1
        st1             {v8.8h, v9.8h, v10.8h}, [x0], x9
.endm
1:      calc_all2
.purgem calc
2:      ld1             {v8.8b, v9.8b, v10.8b}, [sp]
        add             sp, sp, #32
        ret
endfunc

// Vertical 8-tap filter, 32 pixels wide, 16-bit output.
// Registers as used below: x0 = dst, x1 = src, x2 = srcstride, w3 = number of
// rows, x5 = filter index. x4 and x9 are scratch.
// The low 64 bits of v8-v11 are saved on the stack and restored on exit,
// because these registers are used as accumulators.
function ff_hevc_put_hevc_qpel_v32_8_neon, export=1
        sub             sp, sp, #32
        st1             {v8.8b-v11.8b}, [sp]
        load_qpel_filterb x5, x4
        sub             x1, x1, x2, lsl #1
        mov             x9, #(MAX_PB_SIZE * 2)
        sub             x1, x1, x2              // src -= 3 * srcstride
        // preload the seven rows above the first row fetched by the loop
        ld1             {v16.16b, v17.16b}, [x1], x2
        ld1             {v18.16b, v19.16b}, [x1], x2
        ld1             {v20.16b, v21.16b}, [x1], x2
        ld1             {v22.16b, v23.16b}, [x1], x2
        ld1             {v24.16b, v25.16b}, [x1], x2
        ld1             {v26.16b, v27.16b}, [x1], x2
        ld1             {v28.16b, v29.16b}, [x1], x2
        // One output row: four groups of eight pixels into v8-v11. The
        // accumulators need no zeroing because calc_qpelb / calc_qpelb2 start
        // with umull / umull2. The store does not touch the flags set by
        // subs, which calc_all2 branches on.
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
        ld1             {\tmp0\().16b, \tmp1\().16b}, [x1], x2
        calc_qpelb      v8,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
        calc_qpelb2     v9,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
        calc_qpelb      v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
        calc_qpelb2     v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
        subs            w3, w3, #1
        st1             {v8.8h-v11.8h}, [x0], x9
.endm
1:      calc_all2
.purgem calc
2:      ld1             {v8.8b-v11.8b}, [sp], #32
        ret
endfunc

// 48-wide vertical filter implemented as two calls to the 24-wide routine.
// x0-x3 and x5 are parked on the stack so they can be reloaded for the second
// call; x30 is saved because bl overwrites it.
// Stack layout after the first stp: [sp] = x2/x3, [sp+16] = x0/x1, [sp+32] = x5/x30.
function ff_hevc_put_hevc_qpel_v48_8_neon, export=1
        stp             x2, x3, [sp, #-48]!
        stp             x0, x1, [sp, #16]
        stp             x5, x30, [sp, #32]
        bl              X(ff_hevc_put_hevc_qpel_v24_8_neon)
        ldr             x5, [sp, #32]
        ldp             x0, x1, [sp, #16]
        // Popping 32 bytes leaves only the x5/x30 pair on the stack.
        ldp             x2, x3, [sp], #32
        // Second half: advance x0 by 48 bytes and x1 by 24 bytes.
        add             x0, x0, #48
        add             x1, x1, #24
        bl              X(ff_hevc_put_hevc_qpel_v24_8_neon)
        // x30 sits in the upper half of the remaining 16-byte slot.
        ldr             x30, [sp, #8]
        add             sp, sp, #16
        ret
endfunc

// Vertical 8-tap filter processed in 32-pixel-wide column strips.
// Register use as seen in this body:
//   x0 = output pointer (advanced by 64 bytes per strip, MAX_PB_SIZE * 2 per row)
//   x1 = input pointer (advanced by 32 bytes per strip), x2 = input row stride
//   w3 = number of output rows per strip
//   x5 = index selecting an 8-byte row of qpel_filters_abs; x4 is scratch
//   w6 = remaining width; 32 is subtracted per strip and the loop repeats
//        while the result is above zero
// x8, x10 and x11 are per-strip working copies of the input pointer, output
// pointer and row count. v8-v11 low halves are saved on the stack.
function ff_hevc_put_hevc_qpel_v64_8_neon, export=1
        sub             sp, sp, #32
        st1             {v8.8b-v11.8b}, [sp]
        load_qpel_filterb x5, x4
        // Step back three rows so the filter window starts above the first output row.
        sub             x1, x1, x2, lsl #1
        sub             x1, x1, x2
        mov             x9, #(MAX_PB_SIZE * 2)
        // Strip loop entry: preload seven rows of the current 32-byte strip.
0:      mov             x8, x1          // src
        ld1             {v16.16b, v17.16b}, [x8], x2
        mov             w11, w3         // height
        ld1             {v18.16b, v19.16b}, [x8], x2
        mov             x10, x0         // dst
        ld1             {v20.16b, v21.16b}, [x8], x2
        ld1             {v22.16b, v23.16b}, [x8], x2
        ld1             {v24.16b, v25.16b}, [x8], x2
        ld1             {v26.16b, v27.16b}, [x8], x2
        ld1             {v28.16b, v29.16b}, [x8], x2
// One output row of the current strip: load the newest input row, filter
// both 16-byte halves, store 32 halfwords.
// NOTE(review): calc_qpelb2 and calc_all2 are defined outside this chunk;
// presumably calc_all2 expands calc per row and leaves via label 2 based on
// the flags set by subs - confirm.
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
        ld1             {\tmp0\().16b, \tmp1\().16b}, [x8], x2
        movi            v8.8h, #0
        movi            v9.8h, #0
        movi            v10.8h, #0
        movi            v11.8h, #0
        calc_qpelb      v8,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
        calc_qpelb2     v9,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
        calc_qpelb      v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
        calc_qpelb2     v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
        subs            x11, x11, #1
        st1             {v8.8h-v11.8h}, [x10], x9
.endm
1:      calc_all2
.purgem calc
        // Move to the next strip: 32 output samples (64 bytes) and 32 input bytes.
2:      add             x0, x0, #64
        add             x1, x1, #32
        subs            w6, w6, #32
        b.hi            0b
        ld1             {v8.8b-v11.8b}, [sp], #32
        ret
endfunc

// Vertical 8-tap filter on a 4-pixel-wide column, added to a second 16-bit
// source and narrowed to 8-bit output.
// Register use as seen in this body:
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   x4 = second source (16-bit samples), advanced by MAX_PB_SIZE * 2 bytes per row
//   w5 = number of output rows
//   x7 = index selecting an 8-byte row of qpel_filters_abs; x6 is scratch
function ff_hevc_put_hevc_qpel_bi_v4_8_neon, export=1
        load_qpel_filterb x7, x6
        // Step back three rows so the filter window starts above the first output row.
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3
        mov             x12, #(MAX_PB_SIZE * 2)
        // Preload seven rows; the eighth row of the window is loaded inside calc.
        ld1             {v16.s}[0], [x2], x3
        ld1             {v17.s}[0], [x2], x3
        ld1             {v18.s}[0], [x2], x3
        ld1             {v19.s}[0], [x2], x3
        ld1             {v20.s}[0], [x2], x3
        ld1             {v21.s}[0], [x2], x3
        ld1             {v22.s}[0], [x2], x3
// One output row. NOTE(review): calc_all is defined outside this chunk;
// presumably it expands calc per row and leaves via label 2 based on the
// flags set by subs - confirm.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().s}[0], [x2], x3
        // v24 needs no zeroing: calc_qpelb starts with umull, which overwrites
        // the whole destination register.
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        ld1             {v25.4h}, [x4], x12 // src2
        sqadd           v24.8h, v24.8h, v25.8h
        // Rounding right shift by 7 with saturation to unsigned 8 bits.
        sqrshrun        v25.8b, v24.8h, #7
        subs            w5, w5, #1
        st1             {v25.s}[0], [x0], x1
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc

// Vertical 8-tap filter on a 6-pixel-wide column, added to a second 16-bit
// source and narrowed to 8-bit output.
// Register use as seen in this body:
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   x4 = second source (16-bit samples), advanced by MAX_PB_SIZE * 2 bytes per row
//   w5 = number of output rows
//   x7 = index selecting an 8-byte row of qpel_filters_abs; x6 is scratch
function ff_hevc_put_hevc_qpel_bi_v6_8_neon, export=1
        load_qpel_filterb x7, x6
        // Step back three rows so the filter window starts above the first output row.
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3
        ld1             {v16.8b}, [x2], x3
        // The first store of each row post-increments x0 by 4, so the stride
        // applied by the second store is reduced by the same amount.
        sub             x1, x1, #4
        ld1             {v17.8b}, [x2], x3
        mov             x12, #(MAX_PB_SIZE * 2)
        ld1             {v18.8b}, [x2], x3
        ld1             {v19.8b}, [x2], x3
        ld1             {v20.8b}, [x2], x3
        ld1             {v21.8b}, [x2], x3
        ld1             {v22.8b}, [x2], x3
// One output row. NOTE(review): calc_all is defined outside this chunk;
// presumably it expands calc per row and leaves via label 2 based on the
// flags set by subs - confirm.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8b}, [x2], x3
        // v24 needs no zeroing: calc_qpelb starts with umull, which overwrites
        // the whole destination register.
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        ld1             {v25.8h}, [x4], x12 // src2
        sqadd           v24.8h, v24.8h, v25.8h
        // Rounding right shift by 7 with saturation to unsigned 8 bits.
        sqrshrun        v25.8b, v24.8h, #7
        // Six bytes are written as a 4-byte store followed by a 2-byte store.
        st1             {v25.s}[0], [x0], #4
        subs            w5, w5, #1
        st1             {v25.h}[2], [x0], x1
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc

// Vertical 8-tap filter on an 8-pixel-wide column, added to a second 16-bit
// source and narrowed to 8-bit output.
// Register use as seen in this body:
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   x4 = second source (16-bit samples), advanced by MAX_PB_SIZE * 2 bytes per row
//   w5 = number of output rows
//   x7 = index selecting an 8-byte row of qpel_filters_abs; x6 is scratch
function ff_hevc_put_hevc_qpel_bi_v8_8_neon, export=1
        load_qpel_filterb x7, x6
        // Step back three rows so the filter window starts above the first output row.
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3
        mov             x12, #(MAX_PB_SIZE * 2)
        // Preload seven rows; the eighth row of the window is loaded inside calc.
        ld1             {v16.8b}, [x2], x3
        ld1             {v17.8b}, [x2], x3
        ld1             {v18.8b}, [x2], x3
        ld1             {v19.8b}, [x2], x3
        ld1             {v20.8b}, [x2], x3
        ld1             {v21.8b}, [x2], x3
        ld1             {v22.8b}, [x2], x3
// One output row. NOTE(review): calc_all is defined outside this chunk;
// presumably it expands calc per row and leaves via label 2 based on the
// flags set by subs - confirm.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8b}, [x2], x3
        // v24 needs no zeroing: calc_qpelb starts with umull, which overwrites
        // the whole destination register.
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        ld1             {v25.8h}, [x4], x12   // src2
        sqadd           v24.8h, v24.8h, v25.8h
        // Rounding right shift by 7 with saturation to unsigned 8 bits.
        sqrshrun        v25.8b, v24.8h, #7
        subs            w5, w5, #1
        st1             {v25.8b}, [x0], x1
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc

// Vertical 8-tap filter on a 12-pixel-wide column, added to a second 16-bit
// source and narrowed to 8-bit output.
// Register use as seen in this body:
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   x4 = second source (16-bit samples), advanced by MAX_PB_SIZE * 2 bytes per row
//   w5 = number of output rows
//   x7 = index selecting an 8-byte row of qpel_filters_abs; x6 is scratch
// Each row loads 16 input bytes and 16 second-source samples but stores 12 bytes.
function ff_hevc_put_hevc_qpel_bi_v12_8_neon, export=1
        load_qpel_filterb x7, x6
        // Step back three rows so the filter window starts above the first output row.
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3
        // The first store of each row post-increments x0 by 8, so the stride
        // applied by the second store is reduced by the same amount.
        sub             x1, x1, #8
        ld1             {v16.16b}, [x2], x3
        mov             x12, #(MAX_PB_SIZE * 2)
        ld1             {v17.16b}, [x2], x3
        ld1             {v18.16b}, [x2], x3
        ld1             {v19.16b}, [x2], x3
        ld1             {v20.16b}, [x2], x3
        ld1             {v21.16b}, [x2], x3
        ld1             {v22.16b}, [x2], x3
// One output row. NOTE(review): calc_qpelb2 and calc_all are defined outside
// this chunk; presumably calc_qpelb2 handles the upper eight bytes and
// calc_all expands calc per row, leaving via label 2 based on the flags set
// by subs - confirm.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().16b}, [x2], x3
        movi            v24.8h, #0
        movi            v25.8h, #0
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        ld1             {v26.8h, v27.8h}, [x4], x12   // src2
        sqadd           v24.8h, v24.8h, v26.8h
        sqadd           v25.8h, v25.8h, v27.8h
        // Rounding right shift by 7 with saturation to unsigned 8 bits.
        sqrshrun        v26.8b, v24.8h, #7
        sqrshrun2       v26.16b, v25.8h, #7
        // Twelve bytes are written as an 8-byte store followed by a 4-byte store.
        st1             {v26.8b}, [x0], #8
        subs            w5, w5, #1
        st1             {v26.s}[2], [x0], x1
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc

// Vertical 8-tap filter on a 16-pixel-wide column, added to a second 16-bit
// source and narrowed to 8-bit output.
// Register use as seen in this body:
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   x4 = second source (16-bit samples), advanced by MAX_PB_SIZE * 2 bytes per row
//   w5 = number of output rows
//   x7 = index selecting an 8-byte row of qpel_filters_abs; x6 is scratch
function ff_hevc_put_hevc_qpel_bi_v16_8_neon, export=1
        load_qpel_filterb x7, x6
        // Step back three rows so the filter window starts above the first output row.
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3
        mov             x12, #(MAX_PB_SIZE * 2)
        // Preload seven rows; the eighth row of the window is loaded inside calc.
        ld1             {v16.16b}, [x2], x3
        ld1             {v17.16b}, [x2], x3
        ld1             {v18.16b}, [x2], x3
        ld1             {v19.16b}, [x2], x3
        ld1             {v20.16b}, [x2], x3
        ld1             {v21.16b}, [x2], x3
        ld1             {v22.16b}, [x2], x3
// One output row. NOTE(review): calc_qpelb2 and calc_all are defined outside
// this chunk; presumably calc_qpelb2 handles the upper eight bytes and
// calc_all expands calc per row, leaving via label 2 based on the flags set
// by subs - confirm.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().16b}, [x2], x3
        movi            v24.8h, #0
        movi            v25.8h, #0
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        ld1             {v26.8h, v27.8h}, [x4], x12   // src2
        sqadd           v24.8h, v24.8h, v26.8h
        sqadd           v25.8h, v25.8h, v27.8h
        // Rounding right shift by 7 with saturation to unsigned 8 bits.
        sqrshrun        v26.8b, v24.8h, #7
        subs            w5, w5, #1
        sqrshrun2       v26.16b, v25.8h, #7
        st1             {v26.16b}, [x0], x1
.endm
1:      calc_all
.purgem calc
2:      ret
endfunc

// 24-wide bi vertical filter implemented as a 16-wide call followed by an
// 8-wide call. x0-x5 and x7 are parked on the stack so they can be reloaded
// for the second call; x30 is saved because bl overwrites it.
// Stack layout after the first stp:
//   [sp] = x4/x5, [sp+16] = x2/x3, [sp+32] = x0/x1, [sp+48] = x7/x30
function ff_hevc_put_hevc_qpel_bi_v24_8_neon, export=1
        stp             x4, x5, [sp, #-64]!
        stp             x2, x3, [sp, #16]
        stp             x0, x1, [sp, #32]
        stp             x7, x30, [sp, #48]
        bl              X(ff_hevc_put_hevc_qpel_bi_v16_8_neon)
        ldp             x2, x3, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldr             x7, [sp, #48]
        // Popping 48 bytes leaves only the x7/x30 pair on the stack.
        ldp             x4, x5, [sp], #48
        // Remaining 8 columns: 16 bytes further in x0 and x2, 32 bytes
        // (16 halfwords) further in x4.
        add             x0, x0, #16
        add             x2, x2, #16
        add             x4, x4, #32
        bl              X(ff_hevc_put_hevc_qpel_bi_v8_8_neon)
        // x30 sits in the upper half of the remaining 16-byte slot.
        ldr             x30, [sp, #8]
        add             sp, sp, #16
        ret
endfunc

// Vertical 8-tap bi filter processed in 32-pixel-wide column strips: the
// filtered rows are added to a second 16-bit source and narrowed to 8 bits.
// Register use as seen in this body:
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   x4 = second source (16-bit samples), MAX_PB_SIZE * 2 bytes per row
//   w5 = number of output rows per strip
//   x7 = index selecting an 8-byte row of qpel_filters_abs; x6 is scratch
//   width is read from the first stack slot of the caller into w6; 32 is
//   subtracted per strip and the loop ends only when it reaches exactly zero
// x8, x9, x10 and x11 are per-strip working copies of src, src2, dst and the
// row count. d8-d15 are used as temporaries and therefore saved on the stack.
function ff_hevc_put_hevc_qpel_bi_v32_8_neon, export=1
        stp             d8, d9, [sp, #-64]!
        stp             d10, d11, [sp, #16]
        stp             d12, d13, [sp, #32]
        stp             d14, d15, [sp, #48]
        // Step back three rows so the filter window starts above the first output row.
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3
        load_qpel_filterb x7, x6
        // 64 bytes were pushed above, so this is the word at the entry sp.
        ldr             w6, [sp, #64]
        mov             x12, #(MAX_PB_SIZE * 2)
        // Strip loop entry: preload seven rows of the current 32-byte strip.
0:      mov             x8, x2          // src
        ld1             {v16.16b, v17.16b}, [x8], x3
        mov             w11, w5         // height
        ld1             {v18.16b, v19.16b}, [x8], x3
        mov             x10, x0         // dst
        ld1             {v20.16b, v21.16b}, [x8], x3
        mov             x9, x4          // src2
        ld1             {v22.16b, v23.16b}, [x8], x3
        ld1             {v24.16b, v25.16b}, [x8], x3
        ld1             {v26.16b, v27.16b}, [x8], x3
        ld1             {v28.16b, v29.16b}, [x8], x3
// One output row of the current strip.
// NOTE(review): calc_qpelb2 and calc_all2 are defined outside this chunk;
// presumably calc_all2 expands calc per row and leaves via label 2 based on
// the flags set by subs - confirm.
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
        // Pixel rows are byte data: use the same byte arrangement as the
        // preloads above so lane order does not depend on target endianness.
        ld1             {\tmp0\().16b, \tmp1\().16b}, [x8], x3
        movi            v8.8h, #0
        movi            v9.8h, #0
        movi            v10.8h, #0
        movi            v11.8h, #0
        calc_qpelb      v8,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
        calc_qpelb2     v9,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
        calc_qpelb      v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
        calc_qpelb2     v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
        ld1             {v12.8h, v13.8h, v14.8h, v15.8h}, [x9], x12   // src2
        sqadd           v8.8h, v8.8h, v12.8h
        sqadd           v9.8h, v9.8h, v13.8h
        sqadd           v10.8h, v10.8h, v14.8h
        sqadd           v11.8h, v11.8h, v15.8h
        // Rounding right shift by 7 with saturation to unsigned 8 bits.
        sqrshrun        v12.8b, v8.8h, #7
        sqrshrun2       v12.16b, v9.8h, #7
        sqrshrun        v13.8b, v10.8h, #7
        sqrshrun2       v13.16b, v11.8h, #7
        subs            x11, x11, #1
        st1             {v12.16b, v13.16b}, [x10], x1
.endm
1:      calc_all2
.purgem calc
2:      add             x0, x0, #32 // dst
        add             x2, x2, #32 // src
        add             x4, x4, #64 // src2
        subs            w6, w6, #32
        b.ne            0b
        ldp             d10, d11, [sp, #16]
        ldp             d12, d13, [sp, #32]
        ldp             d14, d15, [sp, #48]
        ldp             d8, d9, [sp], #64
        ret
endfunc

// 48-wide bi vertical filter implemented as a 32-wide call followed by a
// 16-wide call. The 32-wide routine reads its width from the word at its
// entry sp, so the value 32 is stored at the bottom of this frame.
// Stack layout after the str:
//   [sp] = 32, [sp+16] = x4/x5, [sp+32] = x2/x3, [sp+48] = x0/x1, [sp+64] = x7/x30
function ff_hevc_put_hevc_qpel_bi_v48_8_neon, export=1
        mov             x8, #32
        str             x8, [sp, #-80]!
        stp             x4, x5, [sp, #16]
        stp             x2, x3, [sp, #32]
        stp             x0, x1, [sp, #48]
        stp             x7, x30, [sp, #64]
        bl              X(ff_hevc_put_hevc_qpel_bi_v32_8_neon)
        ldp             x4, x5, [sp, #16]
        ldp             x2, x3, [sp, #32]
        ldp             x0, x1, [sp, #48]
        ldr             x7, [sp, #64]
        // Releasing 64 bytes leaves only the x7/x30 pair on the stack.
        add             sp, sp, #64
        // Remaining 16 columns: 32 bytes further in x0 and x2, 64 bytes
        // (32 halfwords) further in x4.
        add             x0, x0, #32
        add             x2, x2, #32
        add             x4, x4, #64
        bl              X(ff_hevc_put_hevc_qpel_bi_v16_8_neon)
        // x30 sits in the upper half of the remaining 16-byte slot.
        ldr             x30, [sp, #8]
        add             sp, sp, #16
        ret
endfunc

// Tail-branches to the 32-wide routine with all registers and the stack
// untouched; that routine reads the width from the stack and loops over
// 32-wide strips until it is used up.
function ff_hevc_put_hevc_qpel_bi_v64_8_neon, export=1
        b               X(ff_hevc_put_hevc_qpel_bi_v32_8_neon)
endfunc

// Plain copy of a 4-byte-wide block, two rows per pass.
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   w4 = row count; the loop repeats while the count stays above zero
function ff_hevc_put_hevc_pel_uni_pixels4_8_neon, export=1
0:
        // Fetch two consecutive rows, then step the input by two strides.
        ldr             s0, [x2]
        ldr             s1, [x2, x3]
        add             x2, x2, x3, lsl #1
        // Write both rows, then step the output by two strides.
        str             s0, [x0]
        str             s1, [x0, x1]
        add             x0, x0, x1, lsl #1
        subs            w4, w4, #2
        b.hi            0b
        ret
endfunc

// Plain copy of a 6-byte-wide block, two rows per pass.
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   w4 = row count; the loop repeats while the count stays above zero
function ff_hevc_put_hevc_pel_uni_pixels6_8_neon, export=1
        // The 4-byte store post-increments x0 by 4, so the stride applied by
        // the following 2-byte store is reduced by the same amount.
        sub             x1, x1, #4
0:
        // Eight bytes are read per row; only six are written back.
        ldr             d0, [x2]
        ldr             d1, [x2, x3]
        add             x2, x2, x3, lsl #1
        // First row: bytes 0-3, then bytes 4-5.
        str             s0, [x0], #4
        st1             {v0.h}[2], [x0], x1
        // Second row: bytes 0-3, then bytes 4-5.
        str             s1, [x0], #4
        st1             {v1.h}[2], [x0], x1
        subs            w4, w4, #2
        b.hi            0b
        ret
endfunc

// Plain copy of an 8-byte-wide block, two rows per pass.
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   w4 = row count; the loop repeats while the count stays above zero
function ff_hevc_put_hevc_pel_uni_pixels8_8_neon, export=1
0:
        // Fetch two consecutive rows, then step the input by two strides.
        ldr             d0, [x2]
        ldr             d1, [x2, x3]
        add             x2, x2, x3, lsl #1
        // Write both rows, then step the output by two strides.
        str             d0, [x0]
        str             d1, [x0, x1]
        add             x0, x0, x1, lsl #1
        subs            w4, w4, #2
        b.hi            0b
        ret
endfunc

// Plain copy of a 12-byte-wide block, two rows per pass.
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   w4 = row count; the loop repeats while the count stays above zero
function ff_hevc_put_hevc_pel_uni_pixels12_8_neon, export=1
        // The 8-byte store post-increments x0 by 8, so the stride applied by
        // the following 4-byte store is reduced by the same amount.
        sub             x1, x1, #8
0:
        // Sixteen bytes are read per row; only twelve are written back.
        ldr             q0, [x2]
        ldr             q1, [x2, x3]
        add             x2, x2, x3, lsl #1
        // First row: bytes 0-7, then bytes 8-11.
        str             d0, [x0], #8
        st1             {v0.s}[2], [x0], x1
        // Second row: bytes 0-7, then bytes 8-11.
        str             d1, [x0], #8
        st1             {v1.s}[2], [x0], x1
        subs            w4, w4, #2
        b.hi            0b
        ret
endfunc

// Plain copy of a 16-byte-wide block, two rows per pass.
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   w4 = row count; the loop repeats while the count stays above zero
function ff_hevc_put_hevc_pel_uni_pixels16_8_neon, export=1
0:
        // Fetch two consecutive rows, then step the input by two strides.
        ldr             q0, [x2]
        ldr             q1, [x2, x3]
        add             x2, x2, x3, lsl #1
        // Write both rows, then step the output by two strides.
        str             q0, [x0]
        str             q1, [x0, x1]
        add             x0, x0, x1, lsl #1
        subs            w4, w4, #2
        b.hi            0b
        ret
endfunc

// Plain copy of a 24-byte-wide block, one row per pass.
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   w4 = row count; the loop repeats while the count stays above zero
function ff_hevc_put_hevc_pel_uni_pixels24_8_neon, export=1
0:
        // Three 8-byte registers carry one row; both pointers post-increment
        // by their stride.
        ld1             {v0.8b, v1.8b, v2.8b}, [x2], x3
        st1             {v0.8b, v1.8b, v2.8b}, [x0], x1
        subs            w4, w4, #1
        b.hi            0b
        ret
endfunc

// Plain copy of a 32-byte-wide block, one row per pass.
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   w4 = row count; the loop repeats while the count stays above zero
function ff_hevc_put_hevc_pel_uni_pixels32_8_neon, export=1
0:
        // Two 16-byte registers carry one row; both pointers post-increment
        // by their stride.
        ld1             {v0.16b, v1.16b}, [x2], x3
        st1             {v0.16b, v1.16b}, [x0], x1
        subs            w4, w4, #1
        b.hi            0b
        ret
endfunc

// Plain copy of a 48-byte-wide block, one row per pass.
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   w4 = row count; the loop repeats while the count stays above zero
function ff_hevc_put_hevc_pel_uni_pixels48_8_neon, export=1
0:
        // Three 16-byte registers carry one row; both pointers post-increment
        // by their stride.
        ld1             {v0.16b, v1.16b, v2.16b}, [x2], x3
        st1             {v0.16b, v1.16b, v2.16b}, [x0], x1
        subs            w4, w4, #1
        b.hi            0b
        ret
endfunc

// Plain copy of a 64-byte-wide block, one row per pass.
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   w4 = row count; the loop repeats while the count stays above zero
function ff_hevc_put_hevc_pel_uni_pixels64_8_neon, export=1
0:
        // Four 16-byte registers carry one row; both pointers post-increment
        // by their stride.
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        subs            w4, w4, #1
        b.hi            0b
        ret
endfunc

// Vertical 8-tap filter on a 4-pixel-wide column with 8-bit output.
// Register use as seen in this body:
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   w4 = number of output rows
//   x6 = index selecting an 8-byte row of qpel_filters_abs; x5 is scratch
function ff_hevc_put_hevc_qpel_uni_v4_8_neon, export=1
        load_qpel_filterb x6, x5
        // Step back three rows so the filter window starts above the first output row.
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3
        // Preload seven rows, two per pointer update.
        ldr             s16, [x2]
        ldr             s17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             s18, [x2]
        ldr             s19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             s20, [x2]
        ldr             s21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             s22, [x2]
        add             x2, x2, x3
// One output row. NOTE(review): calc_all is defined outside this chunk;
// presumably it expands calc per row and leaves via label 2 based on the
// flags set by subs - confirm.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().s}[0], [x2], x3
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        // Rounding right shift by 6 with saturation to unsigned 8 bits.
        sqrshrun        v24.8b, v24.8h, #6
        subs            w4, w4, #1
        st1             {v24.s}[0], [x0], x1
.endm
1:      calc_all
.purgem calc
2: ret
endfunc

// Vertical 8-tap filter on a 6-pixel-wide column with 8-bit output.
// Register use as seen in this body:
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   w4 = number of output rows
//   x6 = index selecting an 8-byte row of qpel_filters_abs; x5 is scratch
function ff_hevc_put_hevc_qpel_uni_v6_8_neon, export=1
        load_qpel_filterb x6, x5
        // Step back three rows so the filter window starts above the first output row.
        sub             x2, x2, x3, lsl #1
        // The first store of each row post-increments x0 by 4, so the stride
        // applied by the second store is reduced by the same amount.
        sub             x1, x1, #4
        sub             x2, x2, x3
        // Preload seven rows, two per pointer update.
        ldr             d16, [x2]
        ldr             d17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d18, [x2]
        ldr             d19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d20, [x2]
        ldr             d21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d22, [x2]
        add             x2, x2, x3
// One output row. NOTE(review): calc_all is defined outside this chunk;
// presumably it expands calc per row and leaves via label 2 based on the
// flags set by subs - confirm.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8b}, [x2], x3
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        // Rounding right shift by 6 with saturation to unsigned 8 bits.
        sqrshrun        v24.8b, v24.8h, #6
        // Six bytes are written as a 4-byte store followed by a 2-byte store.
        st1             {v24.s}[0], [x0], #4
        subs            w4, w4, #1
        st1             {v24.h}[2], [x0], x1
.endm
1:      calc_all
.purgem calc
2: ret
endfunc

// Vertical 8-tap filter on an 8-pixel-wide column with 8-bit output.
// Register use as seen in this body:
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   w4 = number of output rows
//   x6 = index selecting an 8-byte row of qpel_filters_abs; x5 is scratch
function ff_hevc_put_hevc_qpel_uni_v8_8_neon, export=1
        load_qpel_filterb x6, x5
        // Step back three rows so the filter window starts above the first output row.
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3
        // Preload seven rows, two per pointer update.
        ldr             d16, [x2]
        ldr             d17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d18, [x2]
        ldr             d19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d20, [x2]
        ldr             d21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d22, [x2]
        add             x2, x2, x3
// One output row. NOTE(review): calc_all is defined outside this chunk;
// presumably it expands calc per row and leaves via label 2 based on the
// flags set by subs - confirm.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8b}, [x2], x3
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        // Rounding right shift by 6 with saturation to unsigned 8 bits.
        sqrshrun        v24.8b, v24.8h, #6
        subs            w4, w4, #1
        st1             {v24.8b}, [x0], x1
.endm
1:      calc_all
.purgem calc
2: ret
endfunc

// Vertical 8-tap filter with 8-bit output, processed in 12-pixel-wide column
// strips.
// Register use as seen in this body:
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   w4 = number of output rows per strip
//   x6 = index selecting an 8-byte row of qpel_filters_abs; x5 is scratch
//   w7 = remaining width; 12 is subtracted per strip and the loop ends only
//        when it reaches exactly zero, so the width must be a multiple of 12
// x8, x10 and x11 are per-strip working copies of src, dst and the row count.
// Each row loads 16 input bytes but stores only 12.
function ff_hevc_put_hevc_qpel_uni_v12_8_neon, export=1
        load_qpel_filterb x6, x5
        // Step back three rows so the filter window starts above the first output row.
        sub             x2, x2, x3, lsl #1
        // The first store of each row post-increments the output by 8, so the
        // stride applied by the second store is reduced by the same amount.
        sub             x1, x1, #8
        sub             x2, x2, x3
0:      mov             x8, x2          // src
        mov             w11, w4         // height
        mov             x10, x0         // dst
        // Preload seven rows of the current strip, two per pointer update.
        ldr             q16, [x8]
        ldr             q17, [x8, x3]
        add             x8, x8, x3, lsl #1
        ldr             q18, [x8]
        ldr             q19, [x8, x3]
        add             x8, x8, x3, lsl #1
        ldr             q20, [x8]
        ldr             q21, [x8, x3]
        add             x8, x8, x3, lsl #1
        ldr             q22, [x8]
        add             x8, x8, x3
// One output row of the current strip. NOTE(review): calc_qpelb2 and calc_all
// are defined outside this chunk; presumably calc_qpelb2 handles the upper
// eight bytes and calc_all expands calc per row, leaving via label 2 based on
// the flags set by subs - confirm.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().16b}, [x8], x3
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        // Rounding right shift by 6 with saturation to unsigned 8 bits.
        sqrshrun        v24.8b, v24.8h, #6
        sqrshrun2       v24.16b, v25.8h, #6
        // Twelve bytes are written as an 8-byte store followed by a 4-byte store.
        st1             {v24.8b}, [x10], #8
        subs            x11, x11, #1
        st1             {v24.s}[2], [x10], x1
.endm
1:      calc_all
.purgem calc
        // Move both base pointers to the next 12-column strip.
2:      add             x0, x0, #12
        add             x2, x2, #12
        subs            w7, w7, #12
        b.ne            0b
        ret
endfunc

// Vertical 8-tap filter with 8-bit output, processed in 16-pixel-wide column
// strips.
// Register use as seen in this body:
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   w4 = number of output rows per strip
//   x6 = index selecting an 8-byte row of qpel_filters_abs; x5 is scratch
//   w7 = remaining width; 16 is subtracted per strip and the loop ends only
//        when it reaches exactly zero, so the width must be a multiple of 16
// x8, x10 and x11 are per-strip working copies of src, dst and the row count.
function ff_hevc_put_hevc_qpel_uni_v16_8_neon, export=1
        load_qpel_filterb x6, x5
        // Step back three rows so the filter window starts above the first output row.
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3
0:      mov             x8, x2          // src
        mov             w11, w4         // height
        mov             x10, x0         // dst
        // Preload seven rows of the current strip, two per pointer update.
        ldr             q16, [x8]
        ldr             q17, [x8, x3]
        add             x8, x8, x3, lsl #1
        ldr             q18, [x8]
        ldr             q19, [x8, x3]
        add             x8, x8, x3, lsl #1
        ldr             q20, [x8]
        ldr             q21, [x8, x3]
        add             x8, x8, x3, lsl #1
        ldr             q22, [x8]
        add             x8, x8, x3
// One output row of the current strip. NOTE(review): calc_qpelb2 and calc_all
// are defined outside this chunk; presumably calc_qpelb2 handles the upper
// eight bytes and calc_all expands calc per row, leaving via label 2 based on
// the flags set by subs - confirm.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().16b}, [x8], x3
        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
        // Rounding right shift by 6 with saturation to unsigned 8 bits.
        sqrshrun        v24.8b, v24.8h, #6
        sqrshrun2       v24.16b, v25.8h, #6
        subs            x11, x11, #1
        st1             {v24.16b}, [x10], x1
.endm
1:      calc_all
.purgem calc
        // Move both base pointers to the next 16-column strip.
2:      add             x0, x0, #16
        add             x2, x2, #16
        subs            w7, w7, #16
        b.ne            0b
        ret
endfunc

// Tail-branches to the 12-wide routine with all registers untouched; that
// routine loops over 12-column strips until the width in w7 is used up.
function ff_hevc_put_hevc_qpel_uni_v24_8_neon, export=1
        b               X(ff_hevc_put_hevc_qpel_uni_v12_8_neon)
endfunc

// Tail-branches to the 16-wide routine with all registers untouched; that
// routine loops over 16-column strips until the width in w7 is used up.
function ff_hevc_put_hevc_qpel_uni_v32_8_neon, export=1
        b               X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
endfunc

// Tail-branches to the 16-wide routine with all registers untouched; that
// routine loops over 16-column strips until the width in w7 is used up.
function ff_hevc_put_hevc_qpel_uni_v48_8_neon, export=1
        b               X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
endfunc

// Tail-branches to the 16-wide routine with all registers untouched; that
// routine loops over 16-column strips until the width in w7 is used up.
function ff_hevc_put_hevc_qpel_uni_v64_8_neon, export=1
        b               X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
endfunc

// Weighted copy of a 4-pixel-wide block, two rows per iteration.
// Register use as seen in this body:
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   w4 = row count; 2 is subtracted per pass and the loop ends only at
//        exactly zero, so the count must be even
//   w5 = extra shift amount, w6 = multiplier (low 16 bits are used),
//   w7 = value added after the shift
// Per pixel: (pixel << 6) * w6, shifted by (-6 - w5) with rounding and
// saturation (a negative sqrshl count shifts right), plus w7, then saturated
// down to an unsigned byte.
function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
        mov             w10, #-6
        sub             w10, w10, w5
        dup             v30.8h, w6
        dup             v31.4s, w10
        dup             v29.4s, w7
1:
        ldr             s0, [x2]
        ldr             s1, [x2, x3]
        add             x2, x2, x3, lsl #1
        // The two rows are processed as interleaved, independent chains.
        ushll           v0.8h, v0.8b, #6
        ushll           v1.8h, v1.8b, #6
        smull           v0.4s, v0.4h, v30.4h
        smull           v1.4s, v1.4h, v30.4h
        sqrshl          v0.4s, v0.4s, v31.4s
        sqrshl          v1.4s, v1.4s, v31.4s
        sqadd           v0.4s, v0.4s, v29.4s
        sqadd           v1.4s, v1.4s, v29.4s
        // Narrow 32 -> 16 bits (signed saturation), then 16 -> 8 bits (unsigned saturation).
        sqxtn           v0.4h, v0.4s
        sqxtn           v1.4h, v1.4s
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        str             s0, [x0]
        str             s1, [x0, x1]
        add             x0, x0, x1, lsl #1
        subs            w4, w4, #2
        b.ne            1b
        ret
endfunc

// Weighted copy of a 6-pixel-wide block, two rows per iteration.
// Register use as seen in this body:
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   w4 = row count; 2 is subtracted per pass and the loop ends only at
//        exactly zero, so the count must be even
//   w5 = extra shift amount, w6 = multiplier (low 16 bits are used),
//   w7 = value added after the shift
// Per pixel: (pixel << 6) * w6, shifted by (-6 - w5) with rounding and
// saturation (a negative sqrshl count shifts right), plus w7, then saturated
// down to an unsigned byte. Eight pixels are computed per row, six are stored.
function ff_hevc_put_hevc_pel_uni_w_pixels6_8_neon, export=1
        mov             w10, #-6
        sub             w10, w10, w5
        dup             v30.8h, w6
        dup             v31.4s, w10
        dup             v29.4s, w7
        // The 4-byte store post-increments x0 by 4, so the stride applied by
        // the following 2-byte store is reduced by the same amount.
        sub             x1, x1, #4
1:
        ldr             d0, [x2]
        ldr             d1, [x2, x3]
        add             x2, x2, x3, lsl #1
        ushll           v0.8h, v0.8b, #6
        ushll           v1.8h, v1.8b, #6
        // v4/v5 hold the products of the first row, v6/v7 those of the second row.
        smull           v4.4s, v0.4h, v30.4h
        smull2          v5.4s, v0.8h, v30.8h
        smull           v6.4s, v1.4h, v30.4h
        smull2          v7.4s, v1.8h, v30.8h
        sqrshl          v4.4s, v4.4s, v31.4s
        sqrshl          v5.4s, v5.4s, v31.4s
        sqrshl          v6.4s, v6.4s, v31.4s
        sqrshl          v7.4s, v7.4s, v31.4s
        sqadd           v4.4s, v4.4s, v29.4s
        sqadd           v5.4s, v5.4s, v29.4s
        sqadd           v6.4s, v6.4s, v29.4s
        sqadd           v7.4s, v7.4s, v29.4s
        // Narrow 32 -> 16 bits (signed saturation), then 16 -> 8 bits (unsigned saturation).
        sqxtn           v0.4h, v4.4s
        sqxtn2          v0.8h, v5.4s
        sqxtn           v1.4h, v6.4s
        sqxtn2          v1.8h, v7.4s
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        // Each row is written as bytes 0-3 followed by bytes 4-5.
        str             s0, [x0], #4
        st1             {v0.h}[2], [x0], x1
        str             s1, [x0], #4
        st1             {v1.h}[2], [x0], x1
        subs            w4, w4, #2
        b.ne            1b
        ret
endfunc

// Weighted copy of an 8-pixel-wide block, two rows per iteration.
// Register use as seen in this body:
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   w4 = row count; 2 is subtracted per pass and the loop ends only at
//        exactly zero, so the count must be even
//   w5 = extra shift amount, w6 = multiplier (low 16 bits are used),
//   w7 = value added after the shift
// Per pixel: (pixel << 6) * w6, shifted by (-6 - w5) with rounding and
// saturation (a negative sqrshl count shifts right), plus w7, then saturated
// down to an unsigned byte.
function ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon, export=1
        mov             w10, #-6
        sub             w10, w10, w5
        dup             v30.8h, w6
        dup             v31.4s, w10
        dup             v29.4s, w7
1:
        ldr             d0, [x2]
        ldr             d1, [x2, x3]
        add             x2, x2, x3, lsl #1
        ushll           v0.8h, v0.8b, #6
        ushll           v1.8h, v1.8b, #6
        // v4/v5 hold the products of the first row, v6/v7 those of the second row.
        smull           v4.4s, v0.4h, v30.4h
        smull2          v5.4s, v0.8h, v30.8h
        smull           v6.4s, v1.4h, v30.4h
        smull2          v7.4s, v1.8h, v30.8h
        sqrshl          v4.4s, v4.4s, v31.4s
        sqrshl          v5.4s, v5.4s, v31.4s
        sqrshl          v6.4s, v6.4s, v31.4s
        sqrshl          v7.4s, v7.4s, v31.4s
        sqadd           v4.4s, v4.4s, v29.4s
        sqadd           v5.4s, v5.4s, v29.4s
        sqadd           v6.4s, v6.4s, v29.4s
        sqadd           v7.4s, v7.4s, v29.4s
        // Narrow 32 -> 16 bits (signed saturation), then 16 -> 8 bits (unsigned saturation).
        sqxtn           v0.4h, v4.4s
        sqxtn2          v0.8h, v5.4s
        sqxtn           v1.4h, v6.4s
        sqxtn2          v1.8h, v7.4s
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        str             d0, [x0]
        str             d1, [x0, x1]
        add             x0, x0, x1, lsl #1
        subs            w4, w4, #2
        b.ne            1b
        ret
endfunc

// Weighted copy of a 12-pixel-wide block, two rows per iteration.
// Register use as seen in this body:
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   w4 = row count; 2 is subtracted per pass and the loop ends only at
//        exactly zero, so the count must be even
//   w5 = extra shift amount, w6 = multiplier (low 16 bits are used),
//   w7 = value added after the shift
// Per pixel: (pixel << 6) * w6, shifted by (-6 - w5) with rounding and
// saturation (a negative sqrshl count shifts right), plus w7, then saturated
// down to an unsigned byte. Sixteen pixels are computed per row, twelve are
// stored.
function ff_hevc_put_hevc_pel_uni_w_pixels12_8_neon, export=1
        mov             w10, #-6
        sub             w10, w10, w5
        dup             v30.8h, w6
        dup             v31.4s, w10
        dup             v29.4s, w7
        // The 8-byte store post-increments x0 by 8, so the stride applied by
        // the following 4-byte store is reduced by the same amount.
        sub             x1, x1, #8
1:
        ldr             q0, [x2]
        ldr             q1, [x2, x3]
        add             x2, x2, x3, lsl #1
        // v4/v5 = widened first row, v6/v7 = widened second row.
        ushll           v4.8h, v0.8b, #6
        ushll2          v5.8h, v0.16b, #6
        ushll           v6.8h, v1.8b, #6
        ushll2          v7.8h, v1.16b, #6
        // v16-v19 = products of the first row, v20-v23 = products of the second row.
        smull           v16.4s, v4.4h, v30.4h
        smull2          v17.4s, v4.8h, v30.8h
        smull           v18.4s, v5.4h, v30.4h
        smull2          v19.4s, v5.8h, v30.8h
        smull           v20.4s, v6.4h, v30.4h
        smull2          v21.4s, v6.8h, v30.8h
        smull           v22.4s, v7.4h, v30.4h
        smull2          v23.4s, v7.8h, v30.8h

        sqrshl          v16.4s, v16.4s, v31.4s
        sqrshl          v17.4s, v17.4s, v31.4s
        sqrshl          v18.4s, v18.4s, v31.4s
        sqrshl          v19.4s, v19.4s, v31.4s
        sqrshl          v20.4s, v20.4s, v31.4s
        sqrshl          v21.4s, v21.4s, v31.4s
        sqrshl          v22.4s, v22.4s, v31.4s
        sqrshl          v23.4s, v23.4s, v31.4s
        sqadd           v16.4s, v16.4s, v29.4s
        sqadd           v17.4s, v17.4s, v29.4s
        sqadd           v18.4s, v18.4s, v29.4s
        sqadd           v19.4s, v19.4s, v29.4s
        sqadd           v20.4s, v20.4s, v29.4s
        sqadd           v21.4s, v21.4s, v29.4s
        sqadd           v22.4s, v22.4s, v29.4s
        sqadd           v23.4s, v23.4s, v29.4s
        // Narrow 32 -> 16 bits (signed saturation), then 16 -> 8 bits (unsigned saturation).
        sqxtn           v0.4h, v16.4s
        sqxtn2          v0.8h, v17.4s
        sqxtn           v1.4h, v18.4s
        sqxtn2          v1.8h, v19.4s
        sqxtn           v2.4h, v20.4s
        sqxtn2          v2.8h, v21.4s
        sqxtn           v3.4h, v22.4s
        sqxtn2          v3.8h, v23.4s
        // v0 ends up holding the first output row, v2 the second.
        sqxtun          v0.8b, v0.8h
        sqxtun2         v0.16b, v1.8h
        sqxtun          v2.8b, v2.8h
        sqxtun2         v2.16b, v3.8h
        // Each row is written as bytes 0-7 followed by bytes 8-11.
        str             d0, [x0], #8
        st1             {v0.s}[2], [x0], x1
        str             d2, [x0], #8
        st1             {v2.s}[2], [x0], x1
        subs            w4, w4, #2
        b.ne            1b
        ret
endfunc

// Applies the weighting steps to the 16 pixels held in s0 and writes the 16
// resulting bytes back into s0.
//   s0      in/out vector (16 bytes)
//   t0, t1  temporaries for the widened low and high eight pixels
//   d0-d3   temporaries for the 32-bit products
// Reads v30 (multiplier), v31 (shift count for sqrshl) and v29 (addend),
// which the functions using this macro set up before their loops.
.macro PEL_UNI_W_PIXEL_CALC     s0, t0, t1, d0, d1, d2, d3
        // Widen to 16 bits with a left shift by 6.
        ushll           \t0\().8h, \s0\().8b, #6
        ushll2          \t1\().8h, \s0\().16b, #6
        // Multiply into 32-bit lanes.
        smull           \d0\().4s, \t0\().4h, v30.4h
        smull2          \d1\().4s, \t0\().8h, v30.8h
        smull           \d2\().4s, \t1\().4h, v30.4h
        smull2          \d3\().4s, \t1\().8h, v30.8h
        // Rounding, saturating shift by the per-lane count in v31.
        sqrshl          \d0\().4s, \d0\().4s, v31.4s
        sqrshl          \d1\().4s, \d1\().4s, v31.4s
        sqrshl          \d2\().4s, \d2\().4s, v31.4s
        sqrshl          \d3\().4s, \d3\().4s, v31.4s
        sqadd           \d0\().4s, \d0\().4s, v29.4s
        sqadd           \d1\().4s, \d1\().4s, v29.4s
        sqadd           \d2\().4s, \d2\().4s, v29.4s
        sqadd           \d3\().4s, \d3\().4s, v29.4s
        // Narrow 32 -> 16 bits (signed saturation), then 16 -> 8 bits (unsigned saturation).
        sqxtn           \t0\().4h, \d0\().4s
        sqxtn2          \t0\().8h, \d1\().4s
        sqxtn           \t1\().4h, \d2\().4s
        sqxtn2          \t1\().8h, \d3\().4s
        sqxtun          \s0\().8b,  \t0\().8h
        sqxtun2         \s0\().16b, \t1\().8h
.endm


// Weighted copy of a 16-pixel-wide block, two rows per iteration.
// Register use as seen in this body:
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   w4 = row count; 2 is subtracted per pass and the loop ends only at
//        exactly zero, so the count must be even
//   w5 = extra shift amount, w6 = multiplier (low 16 bits are used),
//   w7 = value added after the shift
function ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon, export=1
        // v31 = shift count (-6 - w5), v30 = multiplier, v29 = addend.
        mov             w10, #-6
        sub             w10, w10, w5
        dup             v30.8h, w6
        dup             v31.4s, w10
        dup             v29.4s, w7
1:
        ldr             q0, [x2]
        ldr             q1, [x2, x3]
        add             x2, x2, x3, lsl #1
        // Each row uses its own set of temporaries.
        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
        str             q0, [x0]
        str             q1, [x0, x1]
        add             x0, x0, x1, lsl #1
        subs            w4, w4, #2
        b.ne            1b
        ret
endfunc



// Weighted copy of a 24-pixel-wide block, one row per iteration.
// Register use as seen in this body:
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   w4 = row count; the loop ends only when it reaches exactly zero
//   w5 = extra shift amount, w6 = multiplier (low 16 bits are used),
//   w7 = value added after the shift
// Per pixel: (pixel << 6) * w6, shifted by (-6 - w5) with rounding and
// saturation (a negative sqrshl count shifts right), plus w7, then saturated
// down to an unsigned byte. 32 bytes are loaded per row; the first 24 are used.
function ff_hevc_put_hevc_pel_uni_w_pixels24_8_neon, export=1
        mov             w10, #-6
        sub             w10, w10, w5
        dup             v30.8h, w6
        dup             v31.4s, w10
        dup             v29.4s, w7
1:
        ld1             {v0.16b, v1.16b}, [x2], x3
        // Widen pixels 0-7, 8-15 and 16-23; the upper half of v1 is ignored.
        ushll           v4.8h, v0.8b, #6
        ushll2          v5.8h, v0.16b, #6
        ushll           v6.8h, v1.8b, #6
        smull           v16.4s, v4.4h, v30.4h
        smull2          v17.4s, v4.8h, v30.8h
        smull           v18.4s, v5.4h, v30.4h
        smull2          v19.4s, v5.8h, v30.8h
        smull           v20.4s, v6.4h, v30.4h
        smull2          v21.4s, v6.8h, v30.8h
        sqrshl          v16.4s, v16.4s, v31.4s
        sqrshl          v17.4s, v17.4s, v31.4s
        sqrshl          v18.4s, v18.4s, v31.4s
        sqrshl          v19.4s, v19.4s, v31.4s
        sqrshl          v20.4s, v20.4s, v31.4s
        sqrshl          v21.4s, v21.4s, v31.4s
        sqadd           v16.4s, v16.4s, v29.4s
        sqadd           v17.4s, v17.4s, v29.4s
        sqadd           v18.4s, v18.4s, v29.4s
        sqadd           v19.4s, v19.4s, v29.4s
        sqadd           v20.4s, v20.4s, v29.4s
        sqadd           v21.4s, v21.4s, v29.4s
        // Narrow 32 -> 16 bits (signed saturation), then 16 -> 8 bits (unsigned saturation).
        sqxtn           v0.4h, v16.4s
        sqxtn2          v0.8h, v17.4s
        sqxtn           v1.4h, v18.4s
        sqxtn2          v1.8h, v19.4s
        sqxtn           v2.4h, v20.4s
        sqxtn2          v2.8h, v21.4s
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        sqxtun          v2.8b, v2.8h
        // Three 8-byte registers make up the 24-byte output row.
        st1             {v0.8b, v1.8b, v2.8b}, [x0], x1
        subs            w4, w4, #1
        b.ne            1b
        ret
endfunc

// Weighted copy of a 32-pixel-wide block, one row per iteration.
// Register use as seen in this body:
//   x0 = output pointer, x1 = output row stride in bytes
//   x2 = input pointer,  x3 = input row stride in bytes
//   w4 = row count; the loop ends only when it reaches exactly zero
//   w5 = extra shift amount, w6 = multiplier (low 16 bits are used),
//   w7 = value added after the shift
function ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon, export=1
        // v31 = shift count (-6 - w5), v30 = multiplier, v29 = addend.
        mov             w10, #-6
        sub             w10, w10, w5
        dup             v30.8h, w6
        dup             v31.4s, w10
        dup             v29.4s, w7
1:
        ld1             {v0.16b, v1.16b}, [x2], x3
        // Each 16-pixel half uses its own set of temporaries.
        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
        st1             {v0.16b, v1.16b}, [x0], x1
        subs            w4, w4, #1
        b.ne            1b
        ret
endfunc


// Weighted uni-directional prediction of full-pel samples, 48 pixels per row
// (8-bit samples). Registers as consumed below:
//   x0 = destination pointer, x1 = destination stride
//   x2 = source pointer,      x3 = source stride
//   w4 = number of rows (tested after the body, so it must be non-zero)
//   w5 = value folded into the shift count, w6 and w7 = the values broadcast
//        below (the identical setup in QPEL_UNI_W_V_HEADER labels them wx and ox)
// PEL_UNI_W_PIXEL_CALC is defined outside this function; its first argument is
// the 16-byte vector that is transformed in place.
function ff_hevc_put_hevc_pel_uni_w_pixels48_8_neon, export=1
        // v31 = -6 - w5, v30 = w6, v29 = w7, each broadcast to all lanes
        mov             w10, #-6
        sub             w10, w10, w5
        dup             v30.8h, w6
        dup             v31.4s, w10
        dup             v29.4s, w7
1:
        // one row per iteration: three 16-byte vectors. The third invocation
        // passes the same six helper registers as the first one.
        ld1             {v0.16b, v1.16b, v2.16b}, [x2], x3
        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
        st1             {v0.16b, v1.16b, v2.16b}, [x0], x1
        subs            w4, w4, #1
        b.ne            1b
        ret
endfunc

// Weighted uni-directional prediction of full-pel samples, 64 pixels per row
// (8-bit samples). Registers as consumed below:
//   x0 = destination pointer, x1 = destination stride
//   x2 = source pointer,      x3 = source stride
//   w4 = number of rows (tested after the body, so it must be non-zero)
//   w5 = value folded into the shift count, w6 and w7 = the values broadcast
//        below (the identical setup in QPEL_UNI_W_V_HEADER labels them wx and ox)
// PEL_UNI_W_PIXEL_CALC is defined outside this function; its first argument is
// the 16-byte vector that is transformed in place.
function ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon, export=1
        // v31 = -6 - w5, v30 = w6, v29 = w7, each broadcast to all lanes
        mov             w10, #-6
        sub             w10, w10, w5
        dup             v30.8h, w6
        dup             v31.4s, w10
        dup             v29.4s, w7
1:
        // one row per iteration: four 16-byte vectors, alternating between two
        // sets of helper registers
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
        PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        subs            w4, w4, #1
        b.ne            1b
        ret
endfunc

// Common prologue of the ff_hevc_put_hevc_qpel_uni_w_v*_8_neon functions.
// On exit:
//   x2      = source pointer moved back by three rows (x3 = source stride)
//   v0-v7   = the eight bytes of row x12 of qpel_filters_abs, each broadcast
//             to all 16 byte lanes. These are tap magnitudes only, the signs
//             are applied by the umlal/umlsl choice in QPEL_FILTER_B and
//             QPEL_FILTER_B2.
//   v30.8h  = wx, v31.4s = -6 - w5 (shift count), v29.4s = ox
// Uses x9, x10, x12 and v28 as temporaries.
.macro QPEL_UNI_W_V_HEADER
        ldur            x12, [sp, #8]          // my
        // x2 -= 3 * x3
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, x3
        // each table row is 8 bytes, hence the shift by 3
        movrel          x9, qpel_filters_abs
        add             x9, x9, x12, lsl #3
        ldr             d28, [x9]
        dup             v0.16b, v28.b[0]
        dup             v1.16b, v28.b[1]
        dup             v2.16b, v28.b[2]
        dup             v3.16b, v28.b[3]
        dup             v4.16b, v28.b[4]
        dup             v5.16b, v28.b[5]
        dup             v6.16b, v28.b[6]
        dup             v7.16b, v28.b[7]

        mov             w10, #-6
        sub             w10, w10, w5
        dup             v30.8h, w6              // wx
        dup             v31.4s, w10             // shift
        dup             v29.4s, w7              // ox
.endm

// 8-tap filter over the low eight bytes of src0..src7, producing eight 16-bit
// sums in dst. v0-v7 must hold the broadcast tap magnitudes (see
// QPEL_UNI_W_V_HEADER). Taps 1, 3, 4 and 6 are added and taps 0, 2, 5 and 7
// are subtracted, which matches the sign pattern of the rows of qpel_filters.
// The accumulation is done with unsigned widening multiplies in 16-bit lanes,
// and the result is consumed as a signed value by the smull in the
// QPEL_UNI_W_V_* macros.
.macro QPEL_FILTER_B dst, src0, src1, src2, src3, src4, src5, src6, src7
        // start from tap 1 (umull initialises dst) so that no zeroing is needed
        umull           \dst\().8h, \src1\().8b, v1.8b
        umlsl           \dst\().8h, \src0\().8b, v0.8b
        umlsl           \dst\().8h, \src2\().8b, v2.8b
        umlal           \dst\().8h, \src3\().8b, v3.8b
        umlal           \dst\().8h, \src4\().8b, v4.8b
        umlsl           \dst\().8h, \src5\().8b, v5.8b
        umlal           \dst\().8h, \src6\().8b, v6.8b
        umlsl           \dst\().8h, \src7\().8b, v7.8b
.endm

// Counterpart of QPEL_FILTER_B for the high eight bytes of src0..src7: the
// same tap order and signs, using the second-half forms of the widening
// multiplies. dst receives eight 16-bit sums for pixels 8 to 15.
.macro QPEL_FILTER_B2 dst, src0, src1, src2, src3, src4, src5, src6, src7
        umull2          \dst\().8h, \src1\().16b, v1.16b
        umlsl2          \dst\().8h, \src0\().16b, v0.16b
        umlsl2          \dst\().8h, \src2\().16b, v2.16b
        umlal2          \dst\().8h, \src3\().16b, v3.16b
        umlal2          \dst\().8h, \src4\().16b, v4.16b
        umlsl2          \dst\().8h, \src5\().16b, v5.16b
        umlal2          \dst\().8h, \src6\().16b, v6.16b
        umlsl2          \dst\().8h, \src7\().16b, v7.16b
.endm

// Weighting and store step for four pixels. Input: v24.4h = filtered 16-bit
// values. Uses v30 (wx), v31 (shift count) and v29 (ox) as set up by
// QPEL_UNI_W_V_HEADER. Writes 4 bytes to [x0] and advances x0 by x1.
// v24 is overwritten.
.macro  QPEL_UNI_W_V_4
        smull           v24.4s, v24.4h, v30.4h
        // v31 holds -6 - w5, so for non-negative w5 this saturating rounding
        // shift acts as a rounding right shift
        sqrshl          v24.4s, v24.4s, v31.4s
        sqadd           v24.4s, v24.4s, v29.4s
        // saturate to signed 16-bit, then to unsigned 8-bit
        sqxtn           v24.4h, v24.4s
        sqxtun          v24.8b, v24.8h
        st1             {v24.s}[0], [x0], x1
.endm

// Vertical 8-tap qpel filter with weighted uni-prediction, 4 pixels wide.
// Registers: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
// w4 = number of output rows (must be non-zero, it is tested after each row),
// w5/w6/w7 and the stack word at sp + 8 are consumed by QPEL_UNI_W_V_HEADER.
//
// v16-v23 act as a ring of the eight most recent source rows. Seven rows are
// preloaded, then every step loads exactly one new row into the oldest slot
// and the argument order given to QPEL_FILTER_B rotates by one. The loop body
// is unrolled eight times so that no register moves are needed. x2 advances by
// two rows at a time, with the odd rows fetched at [x2, x3].
function ff_hevc_put_hevc_qpel_uni_w_v4_8_neon, export=1
        QPEL_UNI_W_V_HEADER
        // preload rows 0 to 6 (4 bytes each), x2 ends up on row 6
        ldr             s16, [x2]
        ldr             s17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             s18, [x2]
        ldr             s19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             s20, [x2]
        ldr             s21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             s22, [x2]

        // step 1 of 8: newest row goes to v23
1:      ldr             s23, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v24, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_UNI_W_V_4
        subs            w4, w4, #1
        b.eq            2f

        // step 2 of 8: newest row goes to v16
        ldr             s16, [x2]
        QPEL_FILTER_B   v24, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_UNI_W_V_4
        subs            w4, w4, #1
        b.eq            2f

        // step 3 of 8: newest row goes to v17
        ldr             s17, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v24, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_UNI_W_V_4
        subs            w4, w4, #1
        b.eq            2f

        // step 4 of 8: newest row goes to v18
        ldr             s18, [x2]
        QPEL_FILTER_B   v24, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_UNI_W_V_4
        subs            w4, w4, #1
        b.eq            2f

        // step 5 of 8: newest row goes to v19
        ldr             s19, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v24, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_UNI_W_V_4
        subs            w4, w4, #1
        b.eq            2f

        // step 6 of 8: newest row goes to v20
        ldr             s20, [x2]
        QPEL_FILTER_B   v24, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_UNI_W_V_4
        subs            w4, w4, #1
        b.eq            2f

        // step 7 of 8: newest row goes to v21
        ldr             s21, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v24, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_UNI_W_V_4
        subs            w4, w4, #1
        b.eq            2f

        // step 8 of 8: newest row goes to v22, the ring is back in its
        // initial orientation for the next pass through label 1
        ldr             s22, [x2]
        QPEL_FILTER_B   v24, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_UNI_W_V_4
        subs            w4, w4, #1
        b.ne            1b
2:
        ret
endfunc

// Weighting and store step for eight pixels. Input: v26.8h = filtered 16-bit
// values. Uses v30 (wx), v31 (shift count) and v29 (ox) as set up by
// QPEL_UNI_W_V_HEADER. Writes 8 bytes to [x0] and advances x0 by x1.
// v24 and v25 are overwritten, v26 is left intact.
.macro QPEL_UNI_W_V_8
        // widen to 32-bit while multiplying by wx: low half to v24, high to v25
        smull           v24.4s, v26.4h, v30.4h
        smull2          v25.4s, v26.8h, v30.8h
        // v31 holds -6 - w5, so for non-negative w5 this is a rounding right shift
        sqrshl          v24.4s, v24.4s, v31.4s
        sqrshl          v25.4s, v25.4s, v31.4s
        sqadd           v24.4s, v24.4s, v29.4s
        sqadd           v25.4s, v25.4s, v29.4s
        // saturate to signed 16-bit, then to unsigned 8-bit
        sqxtn           v24.4h, v24.4s
        sqxtn2          v24.8h, v25.4s
        sqxtun          v24.8b, v24.8h
        st1             {v24.d}[0], [x0], x1
.endm

// Vertical 8-tap qpel filter with weighted uni-prediction, 8 pixels wide.
// Registers: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
// w4 = number of output rows (must be non-zero, it is tested after each row),
// w5/w6/w7 and the stack word at sp + 8 are consumed by QPEL_UNI_W_V_HEADER.
//
// v16-v23 form a ring of the eight most recent source rows (8 bytes each).
// Every step loads one new row into the oldest slot, filters into v26 with a
// rotated argument order and lets QPEL_UNI_W_V_8 weight and store the row.
// The body is unrolled eight times, one step per ring position.
function ff_hevc_put_hevc_qpel_uni_w_v8_8_neon, export=1
        QPEL_UNI_W_V_HEADER
        // preload rows 0 to 6, x2 ends up on row 6
        ldr             d16, [x2]
        ldr             d17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d18, [x2]
        ldr             d19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d20, [x2]
        ldr             d21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             d22, [x2]

        // step 1 of 8: newest row goes to v23
1:      ldr             d23, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_UNI_W_V_8
        subs            w4, w4, #1
        b.eq            2f

        // step 2 of 8: newest row goes to v16
        ldr             d16, [x2]
        QPEL_FILTER_B   v26, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_UNI_W_V_8
        subs            w4, w4, #1
        b.eq            2f

        // step 3 of 8: newest row goes to v17
        ldr             d17, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_UNI_W_V_8
        subs            w4, w4, #1
        b.eq            2f

        // step 4 of 8: newest row goes to v18
        ldr             d18, [x2]
        QPEL_FILTER_B   v26, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_UNI_W_V_8
        subs            w4, w4, #1
        b.eq            2f

        // step 5 of 8: newest row goes to v19
        ldr             d19, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_UNI_W_V_8
        subs            w4, w4, #1
        b.eq            2f

        // step 6 of 8: newest row goes to v20
        ldr             d20, [x2]
        QPEL_FILTER_B   v26, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_UNI_W_V_8
        subs            w4, w4, #1
        b.eq            2f

        // step 7 of 8: newest row goes to v21
        ldr             d21, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_UNI_W_V_8
        subs            w4, w4, #1
        b.eq            2f

        // step 8 of 8: newest row goes to v22, ring is back at its start
        ldr             d22, [x2]
        QPEL_FILTER_B   v26, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_UNI_W_V_8
        subs            w4, w4, #1
        b.ne            1b
2:
        ret
endfunc

// Weighting and store step for sixteen pixels. Input: v26.8h = filtered values
// of pixels 0 to 7, v27.8h = filtered values of pixels 8 to 15. Uses v30 (wx),
// v31 (shift count) and v29 (ox) as set up by QPEL_UNI_W_V_HEADER. Writes 16
// bytes to [x0] and advances x0 by x1. v24-v27 are overwritten.
.macro QPEL_UNI_W_V_16
        // The order matters: v26 is fully consumed by the first two multiplies
        // before it is reused as a destination, and the low half of v27 is
        // read before v27 itself is overwritten.
        smull           v24.4s, v26.4h, v30.4h
        smull2          v25.4s, v26.8h, v30.8h
        smull           v26.4s, v27.4h, v30.4h
        smull2          v27.4s, v27.8h, v30.8h
        // v31 holds -6 - w5, so for non-negative w5 this is a rounding right shift
        sqrshl          v24.4s, v24.4s, v31.4s
        sqrshl          v25.4s, v25.4s, v31.4s
        sqrshl          v26.4s, v26.4s, v31.4s
        sqrshl          v27.4s, v27.4s, v31.4s
        sqadd           v24.4s, v24.4s, v29.4s
        sqadd           v25.4s, v25.4s, v29.4s
        sqadd           v26.4s, v26.4s, v29.4s
        sqadd           v27.4s, v27.4s, v29.4s
        // saturate to signed 16-bit, then to unsigned 8-bit
        sqxtn           v24.4h, v24.4s
        sqxtn2          v24.8h, v25.4s
        sqxtn           v26.4h, v26.4s
        sqxtn2          v26.8h, v27.4s
        sqxtun          v24.8b, v24.8h
        sqxtun2         v24.16b, v26.8h
        st1             {v24.16b}, [x0], x1
.endm

// Vertical 8-tap qpel filter with weighted uni-prediction, 16 pixels wide.
// Registers: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
// w4 = number of output rows (must be non-zero, it is tested after each row),
// w5/w6/w7 and the stack word at sp + 8 are consumed by QPEL_UNI_W_V_HEADER.
//
// v16-v23 form a ring of the eight most recent source rows (16 bytes each).
// Every step loads one new row into the oldest slot, filters the low and high
// halves into v26 and v27 with a rotated argument order, and lets
// QPEL_UNI_W_V_16 weight and store the row. The body is unrolled eight times.
function ff_hevc_put_hevc_qpel_uni_w_v16_8_neon, export=1
        QPEL_UNI_W_V_HEADER
        // preload rows 0 to 6, x2 ends up on row 6
        ldr             q16, [x2]
        ldr             q17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q18, [x2]
        ldr             q19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q20, [x2]
        ldr             q21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q22, [x2]

        // step 1 of 8: newest row goes to v23
1:      ldr             q23, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_FILTER_B2  v27, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        // step 2 of 8: newest row goes to v16
        ldr             q16, [x2]
        QPEL_FILTER_B   v26, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_FILTER_B2  v27, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        // step 3 of 8: newest row goes to v17
        ldr             q17, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_FILTER_B2  v27, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        // step 4 of 8: newest row goes to v18
        ldr             q18, [x2]
        QPEL_FILTER_B   v26, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_FILTER_B2  v27, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        // step 5 of 8: newest row goes to v19
        ldr             q19, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_FILTER_B2  v27, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        // step 6 of 8: newest row goes to v20
        ldr             q20, [x2]
        QPEL_FILTER_B   v26, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_FILTER_B2  v27, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        // step 7 of 8: newest row goes to v21
        ldr             q21, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_FILTER_B2  v27, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        // step 8 of 8: newest row goes to v22, ring is back at its start
        ldr             q22, [x2]
        QPEL_FILTER_B   v26, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_FILTER_B2  v27, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.ne            1b
2:
        ret
endfunc

// Vertical 8-tap qpel filter with weighted uni-prediction for wide blocks.
// The block is processed as a sequence of 16-pixel-wide column strips, each
// strip using the same unrolled row ring as the 16-wide function.
// Registers: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
// w4 = number of output rows (must be non-zero), w5/w6/w7 and the stack word
// at sp + 8 are consumed by QPEL_UNI_W_V_HEADER. The 32-bit word at sp + 16
// is the number of columns still to be produced (presumably the width
// argument, confirm against the C prototype). Despite the 64 in the name,
// the strip loop is driven by that value and runs until it has been reduced
// to zero or below in steps of 16.
// Bookkeeping registers: w13 = remaining columns, x14 = dst of the current
// strip, x15 = src of the current strip (already moved back three rows by the
// header), w11 = saved row count.
function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
        QPEL_UNI_W_V_HEADER
        ldur            w13, [sp, #16]
        mov             x14, x0
        mov             x15, x2
        mov             w11, w4

        // start of one 16-column strip: preload rows 0 to 6
3:
        ldr             q16, [x2]
        ldr             q17, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q18, [x2]
        ldr             q19, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q20, [x2]
        ldr             q21, [x2, x3]
        add             x2, x2, x3, lsl #1
        ldr             q22, [x2]


        // step 1 of 8: newest row goes to v23
1:      ldr             q23, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_FILTER_B2  v27, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        // step 2 of 8: newest row goes to v16
        ldr             q16, [x2]
        QPEL_FILTER_B   v26, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_FILTER_B2  v27, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        // step 3 of 8: newest row goes to v17
        ldr             q17, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_FILTER_B2  v27, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        // step 4 of 8: newest row goes to v18
        ldr             q18, [x2]
        QPEL_FILTER_B   v26, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_FILTER_B2  v27, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        // step 5 of 8: newest row goes to v19
        ldr             q19, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_FILTER_B2  v27, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        // step 6 of 8: newest row goes to v20
        ldr             q20, [x2]
        QPEL_FILTER_B   v26, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_FILTER_B2  v27, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        // step 7 of 8: newest row goes to v21
        ldr             q21, [x2, x3]
        add             x2, x2, x3, lsl #1
        QPEL_FILTER_B   v26, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_FILTER_B2  v27, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.eq            2f

        // step 8 of 8: newest row goes to v22, ring is back at its start
        ldr             q22, [x2]
        QPEL_FILTER_B   v26, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_FILTER_B2  v27, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_UNI_W_V_16
        subs            w4, w4, #1
        b.ne            1b
        // strip finished: move both base pointers 16 bytes to the right and
        // reload the row count. The flags from the subs on w13 stay live
        // across the add and mov instructions (none of them sets flags) and
        // are consumed by the b.hi at the end.
2:
        subs            w13, w13, #16
        add             x14, x14, #16
        add             x15, x15, #16
        mov             x0, x14
        mov             x2, x15
        mov             w4, w11
        b.hi            3b
        ret
endfunc

// Vertical pass of the 4-wide uni hv filter. Not exported: it is reached by a
// tail branch from the ff_hevc_put_hevc_qpel_uni_hv4_8_* wrappers, which
// leave the machine in this state:
//   sp    = first row of the temporary array of 16-bit samples
//   x14   = stack pointer value to restore before returning
//   x0/x1 = dst / dst stride, w4 = number of output rows
//   x6    = first argument of load_qpel_filterh (x5 is its second argument)
// sp itself is used as the read cursor through the temporary array, with a
// row pitch of x9 = MAX_PB_SIZE * 2 bytes. load_qpel_filterh, calc_qpelh and
// calc_all are defined outside this function.
function hevc_put_hevc_qpel_uni_hv4_8_end_neon
        mov             x9, #(MAX_PB_SIZE * 2)
        load_qpel_filterh x6, x5
        // preload seven rows of four 16-bit samples into v16-v22
        ldr             d16, [sp]
        ldr             d17, [sp, x9]
        add             sp, sp, x9, lsl #1
        ldr             d18, [sp]
        ldr             d19, [sp, x9]
        add             sp, sp, x9, lsl #1
        ldr             d20, [sp]
        ldr             d21, [sp, x9]
        add             sp, sp, x9, lsl #1
        ldr             d22, [sp]
        add             sp, sp, x9
// One output row: load the next temporary row into tmp, filter the eight rows
// src0..src7 into v1 (narrowing with sqrshrn by 12), saturate to unsigned
// 8-bit and store four bytes. The subs sets the flags for the loop branches,
// which are presumably emitted by calc_all (confirm against its definition).
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().4h}, [sp], x9
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
        sqxtun          v1.8b, v1.8h
        subs            w4, w4, #1
        st1             {v1.s}[0], [x0], x1
.endm
1:      calc_all
.purgem calc
        // release the temporary array by restoring the saved stack pointer
2:      mov             sp, x14
        ret
endfunc

// Vertical pass of the 6-wide uni hv filter. Not exported: it is reached by a
// tail branch from the ff_hevc_put_hevc_qpel_uni_hv6_8_* wrappers, which
// leave the machine in this state:
//   sp    = first row of the temporary array of 16-bit samples
//   x14   = stack pointer value to restore before returning
//   x0/x1 = dst / dst stride, w4 = number of output rows
//   x6    = first argument of load_qpel_filterh (x5 is its second argument)
// sp itself is used as the read cursor (row pitch x9 = MAX_PB_SIZE * 2 bytes).
// load_qpel_filterh, calc_qpelh, calc_qpelh2 and calc_all are defined outside
// this function.
function hevc_put_hevc_qpel_uni_hv6_8_end_neon
        mov             x9, #(MAX_PB_SIZE * 2)
        load_qpel_filterh x6, x5
        // each row is stored as 4 + 2 bytes, the first store already advances
        // x0 by 4, so the second one only has to add the remainder
        sub             x1, x1, #4
        // preload seven rows of eight 16-bit samples into v16-v22
        ldr             q16, [sp]
        ldr             q17, [sp, x9]
        add             sp, sp, x9, lsl #1
        ldr             q18, [sp]
        ldr             q19, [sp, x9]
        add             sp, sp, x9, lsl #1
        ldr             q20, [sp]
        ldr             q21, [sp, x9]
        add             sp, sp, x9, lsl #1
        ldr             q22, [sp]
        add             sp, sp, x9
// One output row: load the next temporary row into tmp, filter both halves of
// the eight rows src0..src7 into v1, saturate to unsigned 8-bit and store
// bytes 0-3 followed by bytes 4-5. The subs sets the flags for the loop
// branches, which are presumably emitted by calc_all (confirm against its
// definition).
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8h}, [sp], x9
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
        sqxtun          v1.8b, v1.8h
        st1             {v1.s}[0], [x0], #4
        subs            w4, w4, #1
        st1             {v1.h}[2], [x0], x1
.endm
1:      calc_all
.purgem calc
        // release the temporary array by restoring the saved stack pointer
2:      mov             sp, x14
        ret
endfunc

// Vertical pass of the 8-wide uni hv filter. Not exported: it is reached by a
// tail branch from the ff_hevc_put_hevc_qpel_uni_hv8_8_* wrappers, which
// leave the machine in this state:
//   sp    = first row of the temporary array of 16-bit samples
//   x14   = stack pointer value to restore before returning
//   x0/x1 = dst / dst stride, w4 = number of output rows
//   x6    = first argument of load_qpel_filterh (x5 is its second argument)
// sp itself is used as the read cursor (row pitch x9 = MAX_PB_SIZE * 2 bytes).
// load_qpel_filterh, calc_qpelh, calc_qpelh2 and calc_all are defined outside
// this function.
function hevc_put_hevc_qpel_uni_hv8_8_end_neon
        mov             x9, #(MAX_PB_SIZE * 2)
        load_qpel_filterh x6, x5
        // preload seven rows of eight 16-bit samples into v16-v22
        ldr             q16, [sp]
        ldr             q17, [sp, x9]
        add             sp, sp, x9, lsl #1
        ldr             q18, [sp]
        ldr             q19, [sp, x9]
        add             sp, sp, x9, lsl #1
        ldr             q20, [sp]
        ldr             q21, [sp, x9]
        add             sp, sp, x9, lsl #1
        ldr             q22, [sp]
        add             sp, sp, x9
// One output row: load the next temporary row into tmp, filter both halves of
// the eight rows src0..src7 into v1, saturate to unsigned 8-bit and store
// eight bytes. The subs sets the flags for the loop branches, which are
// presumably emitted by calc_all (confirm against its definition).
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8h}, [sp], x9
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
        sqxtun          v1.8b, v1.8h
        subs            w4, w4, #1
        st1             {v1.8b}, [x0], x1
.endm
1:      calc_all
.purgem calc
        // release the temporary array by restoring the saved stack pointer
2:      mov             sp, x14
        ret
endfunc

// Vertical pass of the 12-wide uni hv filter. Not exported: it is reached by a
// tail branch from the ff_hevc_put_hevc_qpel_uni_hv12_8_* wrappers, which
// leave the machine in this state:
//   sp    = first row of the temporary array of 16-bit samples
//   x14   = stack pointer value to restore before returning
//   x0/x1 = dst / dst stride, w4 = number of output rows
//   x6    = first argument of load_qpel_filterh (x5 is its second argument)
// Each temporary row is read as two vectors of eight 16-bit samples, so the
// row ring occupies register pairs v16/v17 up to v30/v31 (the eighth pair is
// presumably supplied by calc_all2, confirm against its definition). sp is the
// read cursor with a row pitch of x9 = MAX_PB_SIZE * 2 bytes.
// load_qpel_filterh, calc_qpelh, calc_qpelh2 and calc_all2 are defined outside
// this function.
function hevc_put_hevc_qpel_uni_hv12_8_end_neon
        mov             x9, #(MAX_PB_SIZE * 2)
        load_qpel_filterh x6, x5
        // each row is stored as 8 + 4 bytes, the first store already advances
        // x0 by 8, so the second one only has to add the remainder
        sub             x1, x1, #8
        // preload seven rows, two vectors per row
        ld1             {v16.8h, v17.8h}, [sp], x9
        ld1             {v18.8h, v19.8h}, [sp], x9
        ld1             {v20.8h, v21.8h}, [sp], x9
        ld1             {v22.8h, v23.8h}, [sp], x9
        ld1             {v24.8h, v25.8h}, [sp], x9
        ld1             {v26.8h, v27.8h}, [sp], x9
        ld1             {v28.8h, v29.8h}, [sp], x9
// One output row: load the next temporary row into tmp0/tmp1. src0..src7 are
// the first vectors of the eight rows (columns 0-7, both halves filtered into
// v1), src8..src15 are the second vectors (only the low half, columns 8-11, is
// filtered into v2). The result is packed to bytes and stored as 8 + 4 bytes.
// The subs sets the flags for the loop branches, which are presumably emitted
// by calc_all2.
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
        ld1             {\tmp0\().8h, \tmp1\().8h}, [sp], x9
        calc_qpelh      v1,     \src0,  \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqrshrn, #12
        calc_qpelh2     v1, v2, \src0, \src1,  \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqrshrn2, #12
        calc_qpelh      v2,     \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn, #12
        sqxtun          v1.8b, v1.8h
        sqxtun2         v1.16b, v2.8h
        st1             {v1.8b}, [x0], #8
        subs            w4, w4, #1
        st1             {v1.s}[2], [x0], x1
.endm
1:      calc_all2
.purgem calc
        // release the temporary array by restoring the saved stack pointer
2:      mov             sp, x14
        ret
endfunc

// Vertical pass of the uni hv filter for widths that are multiples of 16.
// Not exported: it is reached by a tail branch from the hv16, hv32, hv48 and
// hv64 wrappers, which leave the machine in this state:
//   sp    = first row of the temporary array of 16-bit samples
//   x14   = stack pointer value to restore before returning
//   x0/x1 = dst / dst stride, w4 = number of output rows
//   x6    = first argument of load_qpel_filterh (x5 is its second argument)
//   w7    = number of columns to produce
// The block is processed in 16-column strips. For every strip x8 walks down
// the temporary array, x10 walks down dst and w11 counts rows, while sp and x0
// are only advanced between strips. The strip loop ends when w7 reaches
// exactly zero, so w7 has to be a non-zero multiple of 16.
// load_qpel_filterh, calc_qpelh, calc_qpelh2 and calc_all2 are defined outside
// this function.
function hevc_put_hevc_qpel_uni_hv16_8_end_neon
        mov             x9, #(MAX_PB_SIZE * 2)
        load_qpel_filterh x6, x5
        // w12 = row pitch minus 2 * width. NOTE(review): w12 is not referenced
        // again in the code visible in this function, check whether one of the
        // helper macros depends on it before touching this line.
        sub             w12, w9, w7, lsl #1
0:      mov             x8, sp          // src
        ld1             {v16.8h, v17.8h}, [x8], x9
        mov             w11, w4         // height
        ld1             {v18.8h, v19.8h}, [x8], x9
        mov             x10, x0         // dst
        ld1             {v20.8h, v21.8h}, [x8], x9
        ld1             {v22.8h, v23.8h}, [x8], x9
        ld1             {v24.8h, v25.8h}, [x8], x9
        ld1             {v26.8h, v27.8h}, [x8], x9
        ld1             {v28.8h, v29.8h}, [x8], x9
// One output row of the current strip: load the next temporary row into
// tmp0/tmp1, filter columns 0-7 (src0..src7) into v1 and columns 8-15
// (src8..src15) into v2, pack to bytes and store 16 bytes. The row counter is
// decremented as x11, its upper half is zero because it was written through
// w11. The flags from the subs are used by the loop branches, which are
// presumably emitted by calc_all2.
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
        ld1             {\tmp0\().8h, \tmp1\().8h}, [x8], x9
        calc_qpelh      v1,     \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7,  sqrshrn,  #12
        calc_qpelh2     v1, v2, \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7,  sqrshrn2, #12
        calc_qpelh      v2,     \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn,  #12
        calc_qpelh2     v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn2, #12
        sqxtun          v1.8b, v1.8h
        subs            x11, x11, #1
        sqxtun2         v1.16b, v2.8h
        st1             {v1.16b}, [x10], x1
.endm
1:      calc_all2
.purgem calc
        // next strip: 16 output bytes and 16 temporary samples (32 bytes) to
        // the right
2:      add             x0, x0, #16
        add             sp, sp, #32
        subs            w7, w7, #16
        b.ne            0b
        // release the temporary array by restoring the saved stack pointer
        mov             sp, x14
        ret
endfunc

// Entry points of the 8-bit uni-directional hv (two-pass) qpel filter, one set
// per instruction-set suffix. Every wrapper except hv24 follows the same plan:
//   1. reserve (height + 8) * 128 bytes of stack for a temporary array of
//      16-bit rows (128 bytes is the row pitch the end routines use,
//      MAX_PB_SIZE * 2) and remember the incoming sp in x14,
//   2. save the registers that are needed after the call in a small frame
//      below that array,
//   3. call the matching horizontal filter with x0 = temporary array,
//      x1 = src - 3 * src stride, x2 = src stride, w3 = height + 7,
//      x4 = incoming x5 (and w6 = width where it is set explicitly),
//   4. restore the saved registers, pop the frame and tail-branch to the
//      shared vertical routine hevc_put_hevc_qpel_uni_hvN_8_end_neon, which
//      reads the temporary array through sp, restores sp from x14 and returns
//      to the original caller through the restored x30.
// Incoming registers as used here: x0 = dst, x1 = dst stride, x2 = src,
// x3 = src stride, w4 = height, x5 = filter selector handed to the horizontal
// pass, x6 = filter selector handed to the vertical pass, x7 = width (only
// preserved by the wrappers that end in the 16-column strip routine or in the
// 12-wide routine).
.macro qpel_uni_hv suffix
function ff_hevc_put_hevc_qpel_uni_hv4_8_\suffix, export=1
        add             w10, w4, #8
        lsl             x10, x10, #7
        mov             x14, sp
        sub             sp, sp, x10         // tmp_array
        stp             x30, x14,[sp, #-48]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
        sub             x1, x2, x3, lsl #1
        sub             x1, x1, x3
        add             x0, sp, #48
        mov             x2, x3
        // 32-bit add like in the sibling wrappers: height arrives in w4 and
        // the upper half of x4 must not leak into the row count
        add             w3, w4, #7
        mov             x4, x5
        bl              X(ff_hevc_put_hevc_qpel_h4_8_\suffix)
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x30, x14, [sp], #48
        b               hevc_put_hevc_qpel_uni_hv4_8_end_neon
endfunc

function ff_hevc_put_hevc_qpel_uni_hv6_8_\suffix, export=1
        add             w10, w4, #8
        lsl             x10, x10, #7
        mov             x14, sp
        sub             sp, sp, x10         // tmp_array
        stp             x30, x14,[sp, #-48]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
        sub             x1, x2, x3, lsl #1
        sub             x1, x1, x3
        add             x0, sp, #48
        mov             x2, x3
        add             w3, w4, #7
        mov             x4, x5
        bl              X(ff_hevc_put_hevc_qpel_h6_8_\suffix)
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x30, x14, [sp], #48
        b               hevc_put_hevc_qpel_uni_hv6_8_end_neon
endfunc

function ff_hevc_put_hevc_qpel_uni_hv8_8_\suffix, export=1
        add             w10, w4, #8
        lsl             x10, x10, #7
        mov             x14, sp
        sub             sp, sp, x10         // tmp_array
        stp             x30, x14,[sp, #-48]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
        sub             x1, x2, x3, lsl #1
        sub             x1, x1, x3
        add             x0, sp, #48
        mov             x2, x3
        add             w3, w4, #7
        mov             x4, x5
        bl              X(ff_hevc_put_hevc_qpel_h8_8_\suffix)
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x30, x14, [sp], #48
        b               hevc_put_hevc_qpel_uni_hv8_8_end_neon
endfunc

// From here on the frame is 64 bytes: x7 is saved as well and the original
// sp (x14) gets its own slot at offset 48.
function ff_hevc_put_hevc_qpel_uni_hv12_8_\suffix, export=1
        add             w10, w4, #8
        lsl             x10, x10, #7
        mov             x14, sp
        sub             sp, sp, x10         // tmp_array
        stp             x7, x30, [sp, #-64]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
        str             x14,    [sp, #48]
        sub             x1, x2, x3, lsl #1
        sub             x1, x1, x3
        mov             x2, x3
        add             x0, sp, #64
        add             w3, w4, #7
        mov             x4, x5
        mov             w6, #12             // width for the horizontal pass
        bl              X(ff_hevc_put_hevc_qpel_h12_8_\suffix)
        ldr             x14,    [sp, #48]
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #64
        b               hevc_put_hevc_qpel_uni_hv12_8_end_neon
endfunc

function ff_hevc_put_hevc_qpel_uni_hv16_8_\suffix, export=1
        add             w10, w4, #8
        lsl             x10, x10, #7
        mov             x14, sp
        sub             sp, sp, x10         // tmp_array
        stp             x7, x30, [sp, #-64]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
        str             x14,    [sp, #48]
        add             x0, sp, #64
        sub             x1, x2, x3, lsl #1
        sub             x1, x1, x3
        mov             x2, x3
        add             w3, w4, #7
        mov             x4, x5
        bl              X(ff_hevc_put_hevc_qpel_h16_8_\suffix)
        ldr             x14,    [sp, #48]
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #64
        b               hevc_put_hevc_qpel_uni_hv16_8_end_neon
endfunc

// 24 wide = one 16-wide call followed by one 8-wide call on the columns to the
// right of it. Frame layout (64 bytes): x4/x5 at 0, x2/x3 at 16, x0/x1 at 32,
// x6/x30 at 48. The frame is popped in two steps so that x6 and x30 can be
// reloaded relative to the partly popped sp.
function ff_hevc_put_hevc_qpel_uni_hv24_8_\suffix, export=1
        stp             x4, x5, [sp, #-64]!
        stp             x2, x3, [sp, #16]
        stp             x0, x1, [sp, #32]
        stp             x6, x30, [sp, #48]
        mov             x7, #16
        bl              X(ff_hevc_put_hevc_qpel_uni_hv16_8_\suffix)
        ldp             x2, x3, [sp, #16]
        add             x2, x2, #16
        ldp             x0, x1, [sp, #32]
        ldp             x4, x5, [sp], #48   // sp now points at the x6/x30 pair
        mov             x7, #8
        add             x0, x0, #16
        ldr             x6, [sp]
        bl              X(ff_hevc_put_hevc_qpel_uni_hv8_8_\suffix)
        ldr             x30, [sp, #8]
        add             sp, sp, #16
        ret
endfunc

function ff_hevc_put_hevc_qpel_uni_hv32_8_\suffix, export=1
        add             w10, w4, #8
        lsl             x10, x10, #7
        mov             x14, sp
        sub             sp, sp, x10         // tmp_array
        stp             x7, x30, [sp, #-64]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
        str             x14,    [sp, #48]
        sub             x1, x2, x3, lsl #1
        add             x0, sp, #64
        sub             x1, x1, x3
        mov             x2, x3
        add             w3, w4, #7
        mov             x4, x5
        mov             w6, #32             // width for the horizontal pass
        bl              X(ff_hevc_put_hevc_qpel_h32_8_\suffix)
        ldr             x14,    [sp, #48]
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #64
        b               hevc_put_hevc_qpel_uni_hv16_8_end_neon
endfunc

function ff_hevc_put_hevc_qpel_uni_hv48_8_\suffix, export=1
        add             w10, w4, #8
        lsl             x10, x10, #7
        mov             x14, sp
        sub             sp, sp, x10         // tmp_array
        stp             x7, x30, [sp, #-64]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
        str             x14,    [sp, #48]
        sub             x1, x2, x3, lsl #1
        sub             x1, x1, x3
        mov             x2, x3
        add             x0, sp, #64
        add             w3, w4, #7
        mov             x4, x5
        // the plain neon build reuses the h32 routine with w6 = 48, any other
        // suffix calls a dedicated h48 routine
.ifc \suffix, neon
        mov             w6, #48
        bl              X(ff_hevc_put_hevc_qpel_h32_8_\suffix)
.else
        bl              X(ff_hevc_put_hevc_qpel_h48_8_\suffix)
.endif
        ldr             x14,    [sp, #48]
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #64
        b               hevc_put_hevc_qpel_uni_hv16_8_end_neon
endfunc

function ff_hevc_put_hevc_qpel_uni_hv64_8_\suffix, export=1
        add             w10, w4, #8
        lsl             x10, x10, #7
        mov             x14, sp
        sub             sp, sp, x10         // tmp_array
        stp             x7, x30, [sp, #-64]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
        str             x14,    [sp, #48]
        add             x0, sp, #64
        sub             x1, x2, x3, lsl #1
        mov             x2, x3
        sub             x1, x1, x3
        add             w3, w4, #7
        mov             x4, x5
        // the plain neon build reuses the h32 routine with w6 = 64, any other
        // suffix calls a dedicated h64 routine
.ifc \suffix, neon
        mov             w6, #64
        bl              X(ff_hevc_put_hevc_qpel_h32_8_\suffix)
.else
        bl              X(ff_hevc_put_hevc_qpel_h64_8_\suffix)
.endif
        ldr             x14,    [sp, #48]
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldp             x7, x30, [sp], #64
        b               hevc_put_hevc_qpel_uni_hv16_8_end_neon
endfunc
.endm

qpel_uni_hv neon

#if HAVE_I8MM
ENABLE_I8MM

qpel_uni_hv neon_i8mm

DISABLE_I8MM
#endif

// Common prologue of the ff_hevc_put_hevc_qpel_uni_w_h*_8_neon functions.
// The parameter elems selects the lane arrangement used to broadcast w6
// (default 4s, the narrow functions in this file pass 4h or 8h).
// On exit:
//   x2     = source pointer moved three pixels to the left
//   v28    = the 8-byte row x12 of the signed table qpel_filters, replicated
//            into both 64-bit halves. x12 is the first stack argument
//            (presumably mx, confirm against the C prototype).
//   v30    = wx, v31.4s = -6 - w5 (shift count), v29.4s = ox
// Uses x9, x10 and x12 as temporaries.
.macro QPEL_UNI_W_H_HEADER elems=4s
        ldr             x12, [sp]
        sub             x2, x2, #3
        // each table row is 8 bytes, hence the shift by 3
        movrel          x9, qpel_filters
        add             x9, x9, x12, lsl #3
        ld1r            {v28.2d}, [x9]
        mov             w10, #-6
        sub             w10, w10, w5
        dup             v30.\elems, w6          // wx
        dup             v31.4s, w10             // shift
        dup             v29.4s, w7              // ox
.endm

// Horizontal 8-tap qpel filter with weighted uni-prediction, 4 pixels wide.
// Registers: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
// w4 = number of rows (must be non-zero, the loop tests after the body),
// w5/w6/w7 and the first stack word are consumed by QPEL_UNI_W_H_HEADER.
// Every row reads 16 source bytes starting three pixels left of x2.
function ff_hevc_put_hevc_qpel_uni_w_h4_8_neon, export=1
        QPEL_UNI_W_H_HEADER 4h
        // v0.8h = the eight signed taps, widened to 16-bit
        sxtl            v0.8h,   v28.8b
1:
        ld1             {v1.8b, v2.8b}, [x2], x3
        // the flags set here are consumed by the b.hi at the end of the loop,
        // nothing in between modifies them
        subs            w4,  w4,  #1
        uxtl            v1.8h,   v1.8b
        uxtl            v2.8h,   v2.8b
        // v3..v17: the widened row shifted left by 1..7 pixels (2 bytes each)
        ext             v3.16b,  v1.16b,  v2.16b,  #2
        ext             v4.16b,  v1.16b,  v2.16b,  #4
        ext             v5.16b,  v1.16b,  v2.16b,  #6
        ext             v6.16b,  v1.16b,  v2.16b,  #8
        ext             v7.16b,  v1.16b,  v2.16b,  #10
        ext             v16.16b, v1.16b,  v2.16b,  #12
        ext             v17.16b, v1.16b,  v2.16b,  #14
        // v18.4h = sum over k of tap[k] * pixel[x + k], kept in 16-bit lanes
        mul             v18.4h,  v1.4h,   v0.h[0]
        mla             v18.4h,  v3.4h,   v0.h[1]
        mla             v18.4h,  v4.4h,   v0.h[2]
        mla             v18.4h,  v5.4h,   v0.h[3]
        mla             v18.4h,  v6.4h,   v0.h[4]
        mla             v18.4h,  v7.4h,   v0.h[5]
        mla             v18.4h,  v16.4h,  v0.h[6]
        mla             v18.4h,  v17.4h,  v0.h[7]
        // weighting: multiply by wx, rounding shift by v31, add ox, then
        // saturate to signed 16-bit and to unsigned 8-bit
        smull           v16.4s,  v18.4h,  v30.4h
        sqrshl          v16.4s,  v16.4s,  v31.4s
        sqadd           v16.4s,  v16.4s,  v29.4s
        sqxtn           v16.4h,  v16.4s
        sqxtun          v16.8b,  v16.8h
        str             s16, [x0]
        add             x0,  x0,  x1
        b.hi            1b
        ret
endfunc

// Weighted horizontal 8-tap qpel filter, 6 output bytes per row (NEON).
//
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
//   w5, w6, w7 and [sp] are consumed by QPEL_UNI_W_H_HEADER.
//
// Eight pixels are computed per row; only the first six are stored, as a
// 4-byte store followed by a 2-byte store.
function ff_hevc_put_hevc_qpel_uni_w_h6_8_neon, export=1
        QPEL_UNI_W_H_HEADER 8h
        sub             x1,  x1,  #4                    // first store below already advances dst by 4
        sxtl            v0.8h,   v28.8b                 // taps widened to s16
1:
        ld1             {v1.8b, v2.8b}, [x2], x3        // 16 source bytes, then step to next row
        subs            w4,  w4,  #1                    // flags are consumed by b.hi at loop end
        uxtl            v1.8h,   v1.8b
        uxtl            v2.8h,   v2.8b
        // v3..v17 = source window shifted by 1..7 pixels
        ext             v3.16b,  v1.16b,  v2.16b,  #2
        ext             v4.16b,  v1.16b,  v2.16b,  #4
        ext             v5.16b,  v1.16b,  v2.16b,  #6
        ext             v6.16b,  v1.16b,  v2.16b,  #8
        ext             v7.16b,  v1.16b,  v2.16b,  #10
        ext             v16.16b, v1.16b,  v2.16b,  #12
        ext             v17.16b, v1.16b,  v2.16b,  #14
        // v18.8h = 8-tap filter result for pixels 0..7
        mul             v18.8h,  v1.8h,   v0.h[0]
        mla             v18.8h,  v3.8h,   v0.h[1]
        mla             v18.8h,  v4.8h,   v0.h[2]
        mla             v18.8h,  v5.8h,   v0.h[3]
        mla             v18.8h,  v6.8h,   v0.h[4]
        mla             v18.8h,  v7.8h,   v0.h[5]
        mla             v18.8h,  v16.8h,  v0.h[6]
        mla             v18.8h,  v17.8h,  v0.h[7]
        // weight: * wx, rounding right shift, + ox, all in 32 bit
        smull           v16.4s,  v18.4h,  v30.4h
        smull2          v17.4s,  v18.8h,  v30.8h
        sqrshl          v16.4s,  v16.4s,  v31.4s
        sqrshl          v17.4s,  v17.4s,  v31.4s
        sqadd           v16.4s,  v16.4s,  v29.4s
        sqadd           v17.4s,  v17.4s,  v29.4s
        sqxtn           v16.4h,  v16.4s
        sqxtn2          v16.8h,  v17.4s
        sqxtun          v16.8b,  v16.8h                 // saturate to unsigned 8 bit
        str             s16, [x0], #4                   // pixels 0..3
        st1             {v16.h}[2], [x0], x1            // pixels 4..5, then move to next row
        b.hi            1b
        ret
endfunc

// Weighted horizontal 8-tap qpel filter, 8 output bytes per row (NEON).
//
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
//   w5, w6, w7 and [sp] are consumed by QPEL_UNI_W_H_HEADER.
function ff_hevc_put_hevc_qpel_uni_w_h8_8_neon, export=1
        QPEL_UNI_W_H_HEADER 8h
        sxtl            v0.8h,   v28.8b                 // taps widened to s16
1:
        ld1             {v1.8b, v2.8b}, [x2], x3        // 16 source bytes, then step to next row
        subs            w4,  w4,  #1                    // flags are consumed by b.hi at loop end
        uxtl            v1.8h,   v1.8b
        uxtl            v2.8h,   v2.8b
        // v3..v17 = source window shifted by 1..7 pixels
        ext             v3.16b,  v1.16b,  v2.16b,  #2
        ext             v4.16b,  v1.16b,  v2.16b,  #4
        ext             v5.16b,  v1.16b,  v2.16b,  #6
        ext             v6.16b,  v1.16b,  v2.16b,  #8
        ext             v7.16b,  v1.16b,  v2.16b,  #10
        ext             v16.16b, v1.16b,  v2.16b,  #12
        ext             v17.16b, v1.16b,  v2.16b,  #14
        // v18.8h = 8-tap filter result for pixels 0..7
        mul             v18.8h,  v1.8h,   v0.h[0]
        mla             v18.8h,  v3.8h,   v0.h[1]
        mla             v18.8h,  v4.8h,   v0.h[2]
        mla             v18.8h,  v5.8h,   v0.h[3]
        mla             v18.8h,  v6.8h,   v0.h[4]
        mla             v18.8h,  v7.8h,   v0.h[5]
        mla             v18.8h,  v16.8h,  v0.h[6]
        mla             v18.8h,  v17.8h,  v0.h[7]
        // weight: * wx, rounding right shift, + ox, all in 32 bit
        smull           v16.4s,  v18.4h,  v30.4h
        smull2          v17.4s,  v18.8h,  v30.8h
        sqrshl          v16.4s,  v16.4s,  v31.4s
        sqrshl          v17.4s,  v17.4s,  v31.4s
        sqadd           v16.4s,  v16.4s,  v29.4s
        sqadd           v17.4s,  v17.4s,  v29.4s
        sqxtn           v16.4h,  v16.4s
        sqxtn2          v16.8h,  v17.4s
        sqxtun          v16.8b,  v16.8h                 // saturate to unsigned 8 bit
        st1             {v16.8b}, [x0], x1              // 8 output bytes, then next row
        b.hi            1b
        ret
endfunc

// Weighted horizontal 8-tap qpel filter, 12 output bytes per row (NEON).
//
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
//   w5, w6, w7 and [sp] are consumed by QPEL_UNI_W_H_HEADER.
//
// Pixels 0..7 are written through x0 and pixels 8..11 through x13, which
// tracks dst + 8.
function ff_hevc_put_hevc_qpel_uni_w_h12_8_neon, export=1
        QPEL_UNI_W_H_HEADER 8h
        add             x13, x0,  #8                    // second store pointer: dst + 8
        sxtl            v0.8h,   v28.8b                 // taps widened to s16
1:
        ld1             {v1.8b, v2.8b, v3.8b}, [x2], x3 // 24 source bytes, then step to next row
        subs            w4,  w4,  #1                    // flags are consumed by b.hi at loop end
        uxtl            v1.8h,   v1.8b
        uxtl            v2.8h,   v2.8b
        uxtl            v3.8h,   v3.8b
        // shifted windows for pixels 0..7
        ext             v4.16b,  v1.16b,  v2.16b,  #2
        ext             v5.16b,  v1.16b,  v2.16b,  #4
        ext             v6.16b,  v1.16b,  v2.16b,  #6
        ext             v7.16b,  v1.16b,  v2.16b,  #8
        ext             v16.16b, v1.16b,  v2.16b,  #10
        ext             v17.16b, v1.16b,  v2.16b,  #12
        ext             v18.16b, v1.16b,  v2.16b,  #14
        // v19.8h = filter result for pixels 0..7
        mul             v19.8h,  v1.8h,   v0.h[0]
        mla             v19.8h,  v4.8h,   v0.h[1]
        mla             v19.8h,  v5.8h,   v0.h[2]
        mla             v19.8h,  v6.8h,   v0.h[3]
        mla             v19.8h,  v7.8h,   v0.h[4]
        mla             v19.8h,  v16.8h,  v0.h[5]
        mla             v19.8h,  v17.8h,  v0.h[6]
        mla             v19.8h,  v18.8h,  v0.h[7]
        // shifted windows for pixels 8..11 (temporaries v4..v18 are reused)
        ext             v4.16b,  v2.16b,  v3.16b,  #2
        ext             v5.16b,  v2.16b,  v3.16b,  #4
        ext             v6.16b,  v2.16b,  v3.16b,  #6
        ext             v7.16b,  v2.16b,  v3.16b,  #8
        ext             v16.16b, v2.16b,  v3.16b,  #10
        ext             v17.16b, v2.16b,  v3.16b,  #12
        ext             v18.16b, v2.16b,  v3.16b,  #14
        // v20.4h = filter result for pixels 8..11
        mul             v20.4h,  v2.4h,   v0.h[0]
        mla             v20.4h,  v4.4h,   v0.h[1]
        mla             v20.4h,  v5.4h,   v0.h[2]
        mla             v20.4h,  v6.4h,   v0.h[3]
        mla             v20.4h,  v7.4h,   v0.h[4]
        mla             v20.4h,  v16.4h,  v0.h[5]
        mla             v20.4h,  v17.4h,  v0.h[6]
        mla             v20.4h,  v18.4h,  v0.h[7]
        // weight: * wx, rounding right shift, + ox, all in 32 bit
        smull           v16.4s,  v19.4h,  v30.4h
        smull2          v17.4s,  v19.8h,  v30.8h
        smull           v18.4s,  v20.4h,  v30.4h
        sqrshl          v16.4s,  v16.4s,  v31.4s
        sqrshl          v17.4s,  v17.4s,  v31.4s
        sqrshl          v18.4s,  v18.4s,  v31.4s
        sqadd           v16.4s,  v16.4s,  v29.4s
        sqadd           v17.4s,  v17.4s,  v29.4s
        sqadd           v18.4s,  v18.4s,  v29.4s
        sqxtn           v16.4h,  v16.4s
        sqxtn2          v16.8h,  v17.4s
        sqxtn           v17.4h,  v18.4s
        sqxtun          v16.8b,  v16.8h
        sqxtun          v17.8b,  v17.8h
        st1             {v16.8b},   [x0],  x1           // pixels 0..7
        st1             {v17.s}[0], [x13], x1           // pixels 8..11
        b.hi            1b
        ret
endfunc

// Weighted horizontal 8-tap qpel filter, 16 output bytes per row (NEON).
//
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
//   w5, w6, w7 and [sp] are consumed by QPEL_UNI_W_H_HEADER.
function ff_hevc_put_hevc_qpel_uni_w_h16_8_neon, export=1
        QPEL_UNI_W_H_HEADER 8h
        sxtl            v0.8h,   v28.8b                 // taps widened to s16
1:
        ld1             {v1.8b, v2.8b, v3.8b}, [x2], x3 // 24 source bytes, then step to next row
        subs            w4,  w4,  #1                    // flags are consumed by b.hi at loop end
        uxtl            v1.8h,   v1.8b
        uxtl            v2.8h,   v2.8b
        uxtl            v3.8h,   v3.8b
        // shifted windows for pixels 0..7
        ext             v4.16b,  v1.16b,  v2.16b,  #2
        ext             v5.16b,  v1.16b,  v2.16b,  #4
        ext             v6.16b,  v1.16b,  v2.16b,  #6
        ext             v7.16b,  v1.16b,  v2.16b,  #8
        ext             v16.16b, v1.16b,  v2.16b,  #10
        ext             v17.16b, v1.16b,  v2.16b,  #12
        ext             v18.16b, v1.16b,  v2.16b,  #14
        // v19.8h = filter result for pixels 0..7
        mul             v19.8h,  v1.8h,   v0.h[0]
        mla             v19.8h,  v4.8h,   v0.h[1]
        mla             v19.8h,  v5.8h,   v0.h[2]
        mla             v19.8h,  v6.8h,   v0.h[3]
        mla             v19.8h,  v7.8h,   v0.h[4]
        mla             v19.8h,  v16.8h,  v0.h[5]
        mla             v19.8h,  v17.8h,  v0.h[6]
        mla             v19.8h,  v18.8h,  v0.h[7]
        // shifted windows for pixels 8..15 (temporaries v4..v18 are reused)
        ext             v4.16b,  v2.16b,  v3.16b,  #2
        ext             v5.16b,  v2.16b,  v3.16b,  #4
        ext             v6.16b,  v2.16b,  v3.16b,  #6
        ext             v7.16b,  v2.16b,  v3.16b,  #8
        ext             v16.16b, v2.16b,  v3.16b,  #10
        ext             v17.16b, v2.16b,  v3.16b,  #12
        ext             v18.16b, v2.16b,  v3.16b,  #14
        // v20.8h = filter result for pixels 8..15
        mul             v20.8h,  v2.8h,   v0.h[0]
        mla             v20.8h,  v4.8h,   v0.h[1]
        mla             v20.8h,  v5.8h,   v0.h[2]
        mla             v20.8h,  v6.8h,   v0.h[3]
        mla             v20.8h,  v7.8h,   v0.h[4]
        mla             v20.8h,  v16.8h,  v0.h[5]
        mla             v20.8h,  v17.8h,  v0.h[6]
        mla             v20.8h,  v18.8h,  v0.h[7]
        // weight: * wx, rounding right shift, + ox, all in 32 bit.
        // v19 is overwritten by the last smull2 only after its own products
        // were taken in the first two multiplies.
        smull           v16.4s,  v19.4h,  v30.4h
        smull2          v17.4s,  v19.8h,  v30.8h
        smull           v18.4s,  v20.4h,  v30.4h
        smull2          v19.4s,  v20.8h,  v30.8h
        sqrshl          v16.4s,  v16.4s,  v31.4s
        sqrshl          v17.4s,  v17.4s,  v31.4s
        sqrshl          v18.4s,  v18.4s,  v31.4s
        sqrshl          v19.4s,  v19.4s,  v31.4s
        sqadd           v16.4s,  v16.4s,  v29.4s
        sqadd           v17.4s,  v17.4s,  v29.4s
        sqadd           v18.4s,  v18.4s,  v29.4s
        sqadd           v19.4s,  v19.4s,  v29.4s
        sqxtn           v16.4h,  v16.4s
        sqxtn2          v16.8h,  v17.4s
        sqxtn           v17.4h,  v18.4s
        sqxtn2          v17.8h,  v19.4s
        sqxtun          v16.8b,  v16.8h
        sqxtun          v17.8b,  v17.8h
        st1             {v16.8b, v17.8b}, [x0], x1      // 16 output bytes, then next row
        b.hi            1b
        ret
endfunc

// Weighted horizontal 8-tap qpel filter, 24 output bytes per row (NEON).
//
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
//   w5, w6, w7 and [sp] are consumed by QPEL_UNI_W_H_HEADER.
//
// The row is processed as three groups of 8 pixels whose 16-bit filter
// sums land in v20, v21 and v22 before the common weighting stage.
function ff_hevc_put_hevc_qpel_uni_w_h24_8_neon, export=1
        QPEL_UNI_W_H_HEADER 8h
        sxtl            v0.8h,   v28.8b                 // taps widened to s16
1:
        ld1             {v1.8b, v2.8b, v3.8b, v4.8b}, [x2], x3 // 32 source bytes, then next row
        subs            w4,  w4,  #1                    // flags are consumed by b.hi at loop end
        uxtl            v1.8h,   v1.8b
        uxtl            v2.8h,   v2.8b
        uxtl            v3.8h,   v3.8b
        uxtl            v4.8h,   v4.8b
        // shifted windows for pixels 0..7
        ext             v5.16b,  v1.16b,  v2.16b,  #2
        ext             v6.16b,  v1.16b,  v2.16b,  #4
        ext             v7.16b,  v1.16b,  v2.16b,  #6
        ext             v16.16b, v1.16b,  v2.16b,  #8
        ext             v17.16b, v1.16b,  v2.16b,  #10
        ext             v18.16b, v1.16b,  v2.16b,  #12
        ext             v19.16b, v1.16b,  v2.16b,  #14
        // v20.8h = filter result for pixels 0..7
        mul             v20.8h,  v1.8h,   v0.h[0]
        mla             v20.8h,  v5.8h,   v0.h[1]
        mla             v20.8h,  v6.8h,   v0.h[2]
        mla             v20.8h,  v7.8h,   v0.h[3]
        mla             v20.8h,  v16.8h,  v0.h[4]
        mla             v20.8h,  v17.8h,  v0.h[5]
        mla             v20.8h,  v18.8h,  v0.h[6]
        mla             v20.8h,  v19.8h,  v0.h[7]
        // shifted windows for pixels 8..15
        ext             v5.16b,  v2.16b,  v3.16b,  #2
        ext             v6.16b,  v2.16b,  v3.16b,  #4
        ext             v7.16b,  v2.16b,  v3.16b,  #6
        ext             v16.16b, v2.16b,  v3.16b,  #8
        ext             v17.16b, v2.16b,  v3.16b,  #10
        ext             v18.16b, v2.16b,  v3.16b,  #12
        ext             v19.16b, v2.16b,  v3.16b,  #14
        // v21.8h = filter result for pixels 8..15
        mul             v21.8h,  v2.8h,   v0.h[0]
        mla             v21.8h,  v5.8h,   v0.h[1]
        mla             v21.8h,  v6.8h,   v0.h[2]
        mla             v21.8h,  v7.8h,   v0.h[3]
        mla             v21.8h,  v16.8h,  v0.h[4]
        mla             v21.8h,  v17.8h,  v0.h[5]
        mla             v21.8h,  v18.8h,  v0.h[6]
        mla             v21.8h,  v19.8h,  v0.h[7]
        // shifted windows for pixels 16..23
        ext             v5.16b,  v3.16b,  v4.16b,  #2
        ext             v6.16b,  v3.16b,  v4.16b,  #4
        ext             v7.16b,  v3.16b,  v4.16b,  #6
        ext             v16.16b, v3.16b,  v4.16b,  #8
        ext             v17.16b, v3.16b,  v4.16b,  #10
        ext             v18.16b, v3.16b,  v4.16b,  #12
        ext             v19.16b, v3.16b,  v4.16b,  #14
        // v22.8h = filter result for pixels 16..23
        mul             v22.8h,  v3.8h,   v0.h[0]
        mla             v22.8h,  v5.8h,   v0.h[1]
        mla             v22.8h,  v6.8h,   v0.h[2]
        mla             v22.8h,  v7.8h,   v0.h[3]
        mla             v22.8h,  v16.8h,  v0.h[4]
        mla             v22.8h,  v17.8h,  v0.h[5]
        mla             v22.8h,  v18.8h,  v0.h[6]
        mla             v22.8h,  v19.8h,  v0.h[7]
        // weight: * wx, rounding right shift, + ox, all in 32 bit.
        // v20 and v21 are overwritten only after their own products were
        // consumed by the earlier multiplies.
        smull           v16.4s,  v20.4h,  v30.4h
        smull2          v17.4s,  v20.8h,  v30.8h
        smull           v18.4s,  v21.4h,  v30.4h
        smull2          v19.4s,  v21.8h,  v30.8h
        smull           v20.4s,  v22.4h,  v30.4h
        smull2          v21.4s,  v22.8h,  v30.8h
        sqrshl          v16.4s,  v16.4s,  v31.4s
        sqrshl          v17.4s,  v17.4s,  v31.4s
        sqrshl          v18.4s,  v18.4s,  v31.4s
        sqrshl          v19.4s,  v19.4s,  v31.4s
        sqrshl          v20.4s,  v20.4s,  v31.4s
        sqrshl          v21.4s,  v21.4s,  v31.4s
        sqadd           v16.4s,  v16.4s,  v29.4s
        sqadd           v17.4s,  v17.4s,  v29.4s
        sqadd           v18.4s,  v18.4s,  v29.4s
        sqadd           v19.4s,  v19.4s,  v29.4s
        sqadd           v20.4s,  v20.4s,  v29.4s
        sqadd           v21.4s,  v21.4s,  v29.4s
        sqxtn           v16.4h,  v16.4s
        sqxtn2          v16.8h,  v17.4s
        sqxtn           v17.4h,  v18.4s
        sqxtn2          v17.8h,  v19.4s
        sqxtn           v18.4h,  v20.4s
        sqxtn2          v18.8h,  v21.4s
        sqxtun          v16.8b,  v16.8h
        sqxtun          v17.8b,  v17.8h
        sqxtun          v18.8b,  v18.8h
        st1             {v16.8b, v17.8b, v18.8b}, [x0], x1 // 24 output bytes, then next row
        b.hi            1b
        ret
endfunc

// Weighted horizontal 8-tap qpel filter for rows processed in chunks of
// 16 pixels (NEON). The row width is read from the stack at [sp, #16].
//
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
//   w5, w6, w7 and [sp] are consumed by QPEL_UNI_W_H_HEADER.
//
// Loop structure: label 1 is the per-chunk loop; it runs until the
// remaining width in w10 drops to zero or below, producing 16 pixels per
// pass. After a row is finished the code falls through to the per-row
// bookkeeping and either returns via label 9 or primes the next row and
// jumps back to label 1. v1 always carries the 8 widened source pixels
// that precede the next 16-byte load.
function ff_hevc_put_hevc_qpel_uni_w_h32_8_neon, export=1
        QPEL_UNI_W_H_HEADER 8h
        ldr             w10, [sp, #16]        // width
        ld1             {v1.8b}, [x2], #8     // prime v1 with the first 8 source bytes
        sub             x3,  x3,  w10, uxtw   // src stride minus the bytes consumed by the chunk loop
        mov             w11, w10              // original width
        sub             x3,  x3,  #8          // ... and minus the 8 primed bytes
        sub             x1,  x1,  w10, uxtw   // dst stride minus the bytes written per row
        sxtl            v0.8h,   v28.8b       // taps widened to s16
        uxtl            v1.8h,   v1.8b
1:
        ld1             {v2.8b, v3.8b}, [x2], #16
        subs            w10, w10, #16         // width
        uxtl            v2.8h,   v2.8b
        uxtl            v3.8h,   v3.8b
        // shifted windows for pixels 0..7 of this chunk
        ext             v4.16b,  v1.16b,  v2.16b,  #2
        ext             v5.16b,  v1.16b,  v2.16b,  #4
        ext             v6.16b,  v1.16b,  v2.16b,  #6
        ext             v7.16b,  v1.16b,  v2.16b,  #8
        ext             v16.16b, v1.16b,  v2.16b,  #10
        ext             v17.16b, v1.16b,  v2.16b,  #12
        ext             v18.16b, v1.16b,  v2.16b,  #14
        mul             v19.8h,  v1.8h,   v0.h[0]
        mla             v19.8h,  v4.8h,   v0.h[1]
        mla             v19.8h,  v5.8h,   v0.h[2]
        mla             v19.8h,  v6.8h,   v0.h[3]
        mla             v19.8h,  v7.8h,   v0.h[4]
        mla             v19.8h,  v16.8h,  v0.h[5]
        mla             v19.8h,  v17.8h,  v0.h[6]
        mla             v19.8h,  v18.8h,  v0.h[7]
        // shifted windows for pixels 8..15 of this chunk
        ext             v4.16b,  v2.16b,  v3.16b,  #2
        ext             v5.16b,  v2.16b,  v3.16b,  #4
        ext             v6.16b,  v2.16b,  v3.16b,  #6
        ext             v7.16b,  v2.16b,  v3.16b,  #8
        ext             v16.16b, v2.16b,  v3.16b,  #10
        ext             v17.16b, v2.16b,  v3.16b,  #12
        ext             v18.16b, v2.16b,  v3.16b,  #14
        mul             v20.8h,  v2.8h,   v0.h[0]
        mla             v20.8h,  v4.8h,   v0.h[1]
        mla             v20.8h,  v5.8h,   v0.h[2]
        mla             v20.8h,  v6.8h,   v0.h[3]
        mla             v20.8h,  v7.8h,   v0.h[4]
        mla             v20.8h,  v16.8h,  v0.h[5]
        mla             v20.8h,  v17.8h,  v0.h[6]
        mla             v20.8h,  v18.8h,  v0.h[7]
        // weight: * wx, rounding right shift, + ox, all in 32 bit
        smull           v16.4s,  v19.4h,  v30.4h
        smull2          v17.4s,  v19.8h,  v30.8h
        smull           v18.4s,  v20.4h,  v30.4h
        smull2          v19.4s,  v20.8h,  v30.8h
        sqrshl          v16.4s,  v16.4s,  v31.4s
        sqrshl          v17.4s,  v17.4s,  v31.4s
        sqrshl          v18.4s,  v18.4s,  v31.4s
        sqrshl          v19.4s,  v19.4s,  v31.4s
        sqadd           v16.4s,  v16.4s,  v29.4s
        sqadd           v17.4s,  v17.4s,  v29.4s
        sqadd           v18.4s,  v18.4s,  v29.4s
        sqadd           v19.4s,  v19.4s,  v29.4s
        sqxtn           v16.4h,  v16.4s
        sqxtn2          v16.8h,  v17.4s
        sqxtn           v17.4h,  v18.4s
        sqxtn2          v17.8h,  v19.4s
        sqxtun          v16.8b,  v16.8h
        sqxtun          v17.8b,  v17.8h
        st1             {v16.8b, v17.8b}, [x0], #16
        mov             v1.16b,  v3.16b       // last 8 pixels become the head of the next chunk
        b.gt            1b                    // more chunks left in this row (flags from subs w10)
        subs            w4,  w4,  #1          // height
        add             x2,  x2,  x3          // advance src to the start of the next row
        b.le            9f                    // no rows left
        ld1             {v1.8b}, [x2], #8     // prime v1 for the next row
        mov             w10, w11              // reload the per-row width counter
        add             x0,  x0,  x1          // advance dst to the start of the next row
        uxtl            v1.8h,   v1.8b
        b               1b
9:
        ret
endfunc

#if HAVE_I8MM
ENABLE_I8MM
// Weighted horizontal 8-tap qpel filter, 4 output bytes per row (I8MM).
//
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
//   w5, w6, w7 and [sp] are consumed by QPEL_UNI_W_H_HEADER.
//
// Two 8-byte source windows are packed into each vector, so one usdot
// yields the two 4-tap partial sums of two pixels; addp then folds the
// partial sums into the four final filter results.
function ff_hevc_put_hevc_qpel_uni_w_h4_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
1:
        ld1             {v0.16b}, [x2], x3              // 16 source bytes, then next row
        subs            w4, w4, #1                      // flags are consumed by b.hi below
        movi            v16.16b, #0                     // clear both accumulators
        movi            v17.16b, #0
        ext             v1.16b, v0.16b, v0.16b, #1      // windows starting at byte 1, 2, 3
        ext             v2.16b, v0.16b, v0.16b, #2
        ext             v3.16b, v0.16b, v0.16b, #3
        zip1            v0.2d, v0.2d, v1.2d             // windows of pixel 0 | pixel 1
        zip1            v2.2d, v2.2d, v3.2d             // windows of pixel 2 | pixel 3
        usdot           v16.4s, v0.16b, v28.16b
        usdot           v17.4s, v2.16b, v28.16b
        addp            v16.4s, v16.4s, v17.4s          // filter sums of pixels 0..3
        mul             v16.4s, v16.4s, v30.4s          // * wx
        sqrshl          v16.4s, v16.4s, v31.4s          // rounding right shift
        sqadd           v16.4s, v16.4s, v29.4s          // + ox
        sqxtn           v16.4h, v16.4s
        sqxtun          v16.8b, v16.8h                  // saturate to unsigned 8 bit
        st1             {v16.s}[0], [x0], x1            // 4 output bytes, then next row
        b.hi            1b
        ret
endfunc

// Weighted horizontal 8-tap qpel filter, 6 output bytes per row (I8MM).
//
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
//   w5, w6, w7 and [sp] are consumed by QPEL_UNI_W_H_HEADER.
//
// Pixels 0..3 are weighted as a full 4s vector in v16; pixels 4..5 are
// weighted as a 2s vector in v18. Each row is stored as 4 + 2 bytes.
function ff_hevc_put_hevc_qpel_uni_w_h6_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
        sub             x1, x1, #4                      // first store below already advances dst by 4
1:
        ld1             {v0.16b}, [x2], x3              // 16 source bytes, then next row
        ext             v1.16b, v0.16b, v0.16b, #1      // windows starting at byte 1..5
        ext             v2.16b, v0.16b, v0.16b, #2
        ext             v3.16b, v0.16b, v0.16b, #3
        ext             v4.16b, v0.16b, v0.16b, #4
        ext             v5.16b, v0.16b, v0.16b, #5
        zip1            v0.2d, v0.2d, v1.2d             // windows of pixel 0 | pixel 1
        zip1            v2.2d, v2.2d, v3.2d             // windows of pixel 2 | pixel 3
        zip1            v4.2d, v4.2d, v5.2d             // windows of pixel 4 | pixel 5
        movi            v16.16b, #0
        movi            v17.16b, #0
        movi            v18.16b, #0
        usdot           v16.4s, v0.16b, v28.16b
        usdot           v17.4s, v2.16b, v28.16b
        usdot           v18.4s, v4.16b, v28.16b
        addp            v16.4s, v16.4s, v17.4s          // filter sums of pixels 0..3
        addp            v18.4s, v18.4s, v18.4s          // filter sums of pixels 4..5 (in both halves)
        mul             v16.4s, v16.4s, v30.4s
        mul             v18.2s, v18.2s, v30.2s
        sqrshl          v16.4s, v16.4s, v31.4s
        sqrshl          v18.2s, v18.2s, v31.2s
        sqadd           v16.4s, v16.4s, v29.4s
        sqadd           v18.2s, v18.2s, v29.2s
        sqxtn           v16.4h, v16.4s
        sqxtn2          v16.8h, v18.4s
        sqxtun          v16.8b, v16.8h
        str             s16, [x0], #4                   // pixels 0..3
        st1             {v16.h}[2], [x0], x1            // pixels 4..5, then move to next row
        subs            w4, w4, #1
        b.hi            1b
        ret
endfunc


// Filter-and-weight core of the I8MM functions.
//
//   s0..s3: vectors of 16 unsigned source bytes each
//   d0..d3: destination/scratch vectors; the results are left in d0 and d2,
//           while d1 and d3 only hold intermediate sums
//
// Expects v28 = filter taps, v30.4s = wx, v31.4s = negative shift count and
// v29.4s = ox, as set up by QPEL_UNI_W_H_HEADER.
//
// usdot produces one 4-byte dot product per 32-bit lane; addp then adds
// adjacent lanes, so d0 receives the pairwise sums of (d0, d1) and d2 the
// pairwise sums of (d2, d3). Both are then multiplied by wx, shifted right
// with rounding and offset by ox using saturating arithmetic.
.macro  QPEL_UNI_W_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
        movi            \d0\().16b, #0
        movi            \d1\().16b, #0
        movi            \d2\().16b, #0
        movi            \d3\().16b, #0
        usdot           \d0\().4s, \s0\().16b, v28.16b
        usdot           \d1\().4s, \s1\().16b, v28.16b
        usdot           \d2\().4s, \s2\().16b, v28.16b
        usdot           \d3\().4s, \s3\().16b, v28.16b
        addp            \d0\().4s, \d0\().4s, \d1\().4s
        addp            \d2\().4s, \d2\().4s, \d3\().4s
        mul             \d0\().4s, \d0\().4s, v30.4s
        mul             \d2\().4s, \d2\().4s, v30.4s
        sqrshl          \d0\().4s, \d0\().4s, v31.4s
        sqrshl          \d2\().4s, \d2\().4s, v31.4s
        sqadd           \d0\().4s, \d0\().4s, v29.4s
        sqadd           \d2\().4s, \d2\().4s, v29.4s
.endm

// Half-width variant of QPEL_UNI_W_H_CALC: two source vectors in, one
// result vector out.
//
//   s0, s1: vectors of 16 unsigned source bytes each
//   d0, d1: destination/scratch vectors; the result is left in d0
//
// Uses the same v28/v29/v30/v31 constants as QPEL_UNI_W_H_CALC.
.macro  QPEL_UNI_W_H_CALC_HALF s0, s1, d0, d1
        movi            \d0\().16b, #0
        movi            \d1\().16b, #0
        usdot           \d0\().4s, \s0\().16b, v28.16b
        usdot           \d1\().4s, \s1\().16b, v28.16b
        addp            \d0\().4s, \d0\().4s, \d1\().4s
        mul             \d0\().4s, \d0\().4s, v30.4s
        sqrshl          \d0\().4s, \d0\().4s, v31.4s
        sqadd           \d0\().4s, \d0\().4s, v29.4s
.endm


// Weighted horizontal 8-tap qpel filter, 8 output bytes per row (I8MM).
//
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
//   w5, w6, w7 and [sp] are consumed by QPEL_UNI_W_H_HEADER.
//
// An 8-pixel row only needs source bytes 0..14 counted from src - 3, so a
// single 16-byte load is sufficient. Only the low 64 bits of every shifted
// window are consumed (zip1 on .2d lanes), and for shifts of 1..7 bytes
// those low 64 bits come entirely from the first ext operand. Loading one
// register instead of two keeps the read past the last needed sample at
// one byte.
function ff_hevc_put_hevc_qpel_uni_w_h8_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
1:
        ld1             {v16.16b}, [x2], x3             // 16 source bytes, then next row
        ext             v1.16b, v16.16b, v16.16b, #1    // windows starting at byte 1..7
        ext             v2.16b, v16.16b, v16.16b, #2
        ext             v3.16b, v16.16b, v16.16b, #3
        ext             v4.16b, v16.16b, v16.16b, #4
        ext             v5.16b, v16.16b, v16.16b, #5
        ext             v6.16b, v16.16b, v16.16b, #6
        ext             v7.16b, v16.16b, v16.16b, #7
        zip1            v0.2d, v16.2d, v1.2d            // windows of pixel 0 | pixel 1
        zip1            v2.2d, v2.2d, v3.2d             // windows of pixel 2 | pixel 3
        zip1            v4.2d, v4.2d, v5.2d             // windows of pixel 4 | pixel 5
        zip1            v6.2d, v6.2d, v7.2d             // windows of pixel 6 | pixel 7
        QPEL_UNI_W_H_CALC  v0, v2, v4, v6,  v18, v19, v20, v21  // v18: 0..3, v20: 4..7
        sqxtn           v18.4h, v18.4s
        sqxtn2          v18.8h, v20.4s
        sqxtun          v18.8b, v18.8h                  // saturate to unsigned 8 bit
        str             d18, [x0]                       // 8 output bytes
        add             x0, x0, x1
        subs            w4, w4, #1
        b.hi            1b
        ret
endfunc

// Weighted horizontal 8-tap qpel filter, 12 output bytes per row (I8MM).
//
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
//   w5, w6, w7 and [sp] are consumed by QPEL_UNI_W_H_HEADER.
//
// Pixels 0..7 are built from the low halves of the shifted windows (zip1)
// and pixels 8..11 from the high halves (zip2). They are written through
// x0 and x13 respectively, where x13 tracks dst + 8.
function ff_hevc_put_hevc_qpel_uni_w_h12_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
        add             x13, x0, #8                     // second store pointer: dst + 8
1:
        ld1             {v16.16b, v17.16b}, [x2], x3    // 32 source bytes, then next row
        subs            w4, w4, #1                      // flags are consumed by b.hi below
        ext             v1.16b, v16.16b, v17.16b, #1
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        zip1            v18.2d, v16.2d, v1.2d           // windows of pixel 0 | pixel 1
        zip2            v22.2d, v16.2d, v1.2d           // windows of pixel 8 | pixel 9
        zip1            v19.2d, v2.2d, v3.2d            // windows of pixel 2 | pixel 3
        zip2            v23.2d, v2.2d, v3.2d            // windows of pixel 10 | pixel 11
        zip1            v20.2d, v4.2d, v5.2d            // windows of pixel 4 | pixel 5
        zip1            v21.2d, v6.2d, v7.2d            // windows of pixel 6 | pixel 7
        QPEL_UNI_W_H_CALC  v18, v19, v20, v21, v0, v2, v4, v6   // v0: 0..3, v4: 4..7
        QPEL_UNI_W_H_CALC_HALF v22, v23, v24, v25               // v24: 8..11
        sqxtn           v0.4h, v0.4s
        sqxtn2          v0.8h, v4.4s
        sqxtn           v1.4h, v24.4s
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        st1             {v0.8b}, [x0], x1               // pixels 0..7, then next row
        st1             {v1.s}[0], [x13], x1            // pixels 8..11, then next row
        b.hi            1b
        ret
endfunc

// Weighted horizontal 8-tap qpel filter, 16 output bytes per row (I8MM).
//
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
//   w5, w6, w7 and [sp] are consumed by QPEL_UNI_W_H_HEADER.
//
// Full 16-byte windows are fed to usdot, so each vector yields the sums of
// pixel k (low half) and pixel k + 8 (high half). The results therefore
// come out interleaved and are put back in order with trn1/trn2.
function ff_hevc_put_hevc_qpel_uni_w_h16_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
1:
        ld1             {v16.16b, v17.16b}, [x2], x3    // 32 source bytes, then next row
        ext             v1.16b, v16.16b, v17.16b, #1    // windows starting at byte 1..7
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v18, v19, v20, v21   // v18: 0, 8, 2, 10 v20: 1, 9, 3, 11
        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25    // v22: 4, 12, 6, 14 v24: 5, 13, 7, 15
        sqxtn           v0.4h, v18.4s
        sqxtn2          v0.8h, v22.4s                   // v0: 0, 8, 2, 10, 4, 12, 6, 14
        sqxtn           v1.4h, v20.4s
        sqxtn2          v1.8h, v24.4s                   // v1: 1, 9, 3, 11, 5, 13, 7, 15
        trn1            v2.8h, v0.8h, v1.8h             // v2: 0..7
        trn2            v3.8h, v0.8h, v1.8h             // v3: 8..15
        sqxtun          v0.8b, v2.8h
        sqxtun2         v0.16b, v3.8h
        st1             {v0.16b}, [x0], x1              // 16 output bytes, then next row
        subs            w4, w4, #1
        b.hi            1b
        ret
endfunc

// Weighted horizontal 8-tap qpel filter, 24 output bytes per row (I8MM).
//
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
//   w5, w6, w7 and [sp] are consumed by QPEL_UNI_W_H_HEADER.
//
// Pixels 0..15 use the full-window scheme with trn1/trn2 reordering;
// pixels 16..23 use the packed low-half scheme (zip1) on the second source
// register only.
function ff_hevc_put_hevc_qpel_uni_w_h24_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
        sub             x1, x1, #16                     // first store below already advances dst by 16
1:
        ld1             {v16.16b, v17.16b}, [x2], x3    // 32 source bytes, then next row
        ext             v1.16b, v16.16b, v17.16b, #1    // windows starting at byte 1..7
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v18, v19, v20, v21
        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25
        sqxtn           v18.4h, v18.4s
        sqxtn2          v18.8h, v22.4s
        sqxtn           v19.4h, v20.4s
        sqxtn2          v19.8h, v24.4s
        trn1            v20.8h, v18.8h, v19.8h          // de-interleave into pixel order
        trn2            v21.8h, v18.8h, v19.8h
        sqxtun          v26.8b, v20.8h
        sqxtun2         v26.16b, v21.8h                         // 0-15
        // windows starting at byte 17..23; only their low 8 bytes are used
        ext             v1.16b, v17.16b, v17.16b, #1
        ext             v2.16b, v17.16b, v17.16b, #2
        ext             v3.16b, v17.16b, v17.16b, #3
        ext             v4.16b, v17.16b, v17.16b, #4
        ext             v5.16b, v17.16b, v17.16b, #5
        ext             v6.16b, v17.16b, v17.16b, #6
        ext             v7.16b, v17.16b, v17.16b, #7
        zip1            v0.2d, v17.2d, v1.2d            // windows of pixel 16 | pixel 17
        zip1            v2.2d, v2.2d, v3.2d             // windows of pixel 18 | pixel 19
        zip1            v4.2d, v4.2d, v5.2d             // windows of pixel 20 | pixel 21
        zip1            v6.2d, v6.2d, v7.2d             // windows of pixel 22 | pixel 23
        QPEL_UNI_W_H_CALC  v0, v2, v4, v6, v18, v19, v20, v21
        sqxtn           v18.4h, v18.4s
        sqxtn2          v18.8h, v20.4s
        sqxtun          v27.8b, v18.8h                  // 16-23

        st1             {v26.16b}, [x0], #16
        st1             {v27.8b}, [x0], x1              // then move to next row
        subs            w4, w4, #1
        b.hi            1b
        ret
endfunc


// Weighted horizontal 8-tap qpel filter, 32 output bytes per row (I8MM).
//
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
//   w5, w6, w7 and [sp] are consumed by QPEL_UNI_W_H_HEADER.
//
// The row is processed as two groups of 16 pixels, each using the
// full-window usdot scheme followed by trn1/trn2 reordering. v16..v18 hold
// the source bytes for the whole row, so v0 is used as a result register
// in place of v18.
function ff_hevc_put_hevc_qpel_uni_w_h32_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
1:
        ld1             {v16.16b, v17.16b, v18.16b}, [x2], x3 // 48 source bytes, then next row
        ext             v1.16b, v16.16b, v17.16b, #1    // windows starting at byte 1..7
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v0, v19, v20, v21
        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25
        sqxtn           v0.4h, v0.4s
        sqxtn2          v0.8h, v22.4s
        sqxtn           v19.4h, v20.4s
        sqxtn2          v19.8h, v24.4s
        trn1            v20.8h, v0.8h, v19.8h           // de-interleave into pixel order
        trn2            v21.8h, v0.8h, v19.8h
        sqxtun          v26.8b, v20.8h
        sqxtun2         v26.16b, v21.8h                         // 0-15
        ext             v1.16b, v17.16b, v18.16b, #1    // windows starting at byte 17..23
        ext             v2.16b, v17.16b, v18.16b, #2
        ext             v3.16b, v17.16b, v18.16b, #3
        ext             v4.16b, v17.16b, v18.16b, #4
        ext             v5.16b, v17.16b, v18.16b, #5
        ext             v6.16b, v17.16b, v18.16b, #6
        ext             v7.16b, v17.16b, v18.16b, #7
        QPEL_UNI_W_H_CALC  v17, v2, v1, v3, v0, v19, v20, v21
        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25
        sqxtn           v0.4h, v0.4s
        sqxtn2          v0.8h, v22.4s
        sqxtn           v19.4h, v20.4s
        sqxtn2          v19.8h, v24.4s
        trn1            v20.8h, v0.8h, v19.8h
        trn2            v21.8h, v0.8h, v19.8h
        sqxtun          v27.8b, v20.8h
        sqxtun2         v27.16b, v21.8h                         // 16-31
        st1             {v26.16b, v27.16b}, [x0], x1    // 32 output bytes, then next row
        subs            w4, w4, #1
        b.hi            1b
        ret
endfunc

// Weighted horizontal 8-tap qpel filter, 48 output bytes per row (I8MM).
//
//   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride, w4 = row count;
//   w5, w6, w7 and [sp] are consumed by QPEL_UNI_W_H_HEADER.
//
// The row is processed as three groups of 16 pixels. v16..v19 hold the
// source bytes for the whole row, so every QPEL_UNI_W_H_CALC invocation
// uses v24 and v0 as throw-away scratch registers and leaves its results
// in v20/v21 or v22/v23.
function ff_hevc_put_hevc_qpel_uni_w_h48_8_neon_i8mm, export=1
        QPEL_UNI_W_H_HEADER
1:
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3 // 64 source bytes, then next row
        ext             v1.16b, v16.16b, v17.16b, #1    // windows starting at byte 1..7
        ext             v2.16b, v16.16b, v17.16b, #2
        ext             v3.16b, v16.16b, v17.16b, #3
        ext             v4.16b, v16.16b, v17.16b, #4
        ext             v5.16b, v16.16b, v17.16b, #5
        ext             v6.16b, v16.16b, v17.16b, #6
        ext             v7.16b, v16.16b, v17.16b, #7
        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v20, v24, v21, v0
        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
        sqxtn           v20.4h, v20.4s
        sqxtn2          v20.8h, v22.4s
        sqxtn           v21.4h, v21.4s
        sqxtn2          v21.8h, v23.4s
        trn1            v22.8h, v20.8h, v21.8h          // de-interleave into pixel order
        trn2            v23.8h, v20.8h, v21.8h
        sqxtun          v25.8b, v22.8h
        sqxtun2         v25.16b, v23.8h                         // 0-15
        ext             v1.16b, v17.16b, v18.16b, #1    // windows starting at byte 17..23
        ext             v2.16b, v17.16b, v18.16b, #2
        ext             v3.16b, v17.16b, v18.16b, #3
        ext             v4.16b, v17.16b, v18.16b, #4
        ext             v5.16b, v17.16b, v18.16b, #5
        ext             v6.16b, v17.16b, v18.16b, #6
        ext             v7.16b, v17.16b, v18.16b, #7
        QPEL_UNI_W_H_CALC  v17, v2, v1, v3, v20, v24, v21, v0
        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
        sqxtn           v20.4h, v20.4s
        sqxtn2          v20.8h, v22.4s
        sqxtn           v21.4h, v21.4s
        sqxtn2          v21.8h, v23.4s
        trn1            v22.8h, v20.8h, v21.8h
        trn2            v23.8h, v20.8h, v21.8h
        sqxtun          v26.8b, v22.8h
        sqxtun2         v26.16b, v23.8h                         // 16-31
        ext             v1.16b, v18.16b, v19.16b, #1    // windows starting at byte 33..39
        ext             v2.16b, v18.16b, v19.16b, #2
        ext             v3.16b, v18.16b, v19.16b, #3
        ext             v4.16b, v18.16b, v19.16b, #4
        ext             v5.16b, v18.16b, v19.16b, #5
        ext             v6.16b, v18.16b, v19.16b, #6
        ext             v7.16b, v18.16b, v19.16b, #7
        QPEL_UNI_W_H_CALC  v18, v2, v1, v3, v20, v24, v21, v0
        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
        sqxtn           v20.4h, v20.4s
        sqxtn2          v20.8h, v22.4s
        sqxtn           v21.4h, v21.4s
        sqxtn2          v21.8h, v23.4s
        trn1            v22.8h, v20.8h, v21.8h
        trn2            v23.8h, v20.8h, v21.8h
        sqxtun          v27.8b, v22.8h
        sqxtun2         v27.16b, v23.8h                         // 32-47
        st1             {v25.16b, v26.16b, v27.16b}, [x0], x1   // 48 output bytes, then next row
        subs            w4, w4, #1
        b.hi            1b
        ret
endfunc



function ff_hevc_put_hevc_qpel_uni_w_h64_8_neon_i8mm, export=1
        // Weighted horizontal qpel filter, 64 output bytes per row.
        // Register roles as used by the code below:
        //   x0 = dst, x1 = dst stride (post-increment of the row store)
        //   x2 = src, x3 = src stride; 64 is taken off x3 because the main
        //        load already advances x2 by 64 bytes
        //   w4 = number of rows
        // QPEL_UNI_W_H_HEADER and QPEL_UNI_W_H_CALC are defined earlier in
        // this file; presumably they set up and apply the filter, weight,
        // shift and offset -- confirm against their definitions.

        // Build the seven byte-shifted windows of lo:hi in v1-v7 and run the
        // two filter passes; results are left in v20-v23.
.macro uni_w_h64_filter lo, hi
        ext             v1.16b, \lo\().16b, \hi\().16b, #1
        ext             v2.16b, \lo\().16b, \hi\().16b, #2
        ext             v3.16b, \lo\().16b, \hi\().16b, #3
        ext             v4.16b, \lo\().16b, \hi\().16b, #4
        ext             v5.16b, \lo\().16b, \hi\().16b, #5
        ext             v6.16b, \lo\().16b, \hi\().16b, #6
        ext             v7.16b, \lo\().16b, \hi\().16b, #7
        QPEL_UNI_W_H_CALC  \lo, v2, v1, v3, v20, v24, v21, v0
        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
.endm

        // Narrow v20-v23 to 16 bit with signed saturation, interleave the
        // two halves with trn1/trn2 and narrow to 16 unsigned bytes in dst.
.macro uni_w_h64_pack dst
        sqxtn           v20.4h, v20.4s
        sqxtn2          v20.8h, v22.4s
        sqxtn           v21.4h, v21.4s
        sqxtn2          v21.8h, v23.4s
        trn1            v22.8h, v20.8h, v21.8h
        trn2            v23.8h, v20.8h, v21.8h
        sqxtun          \dst\().8b, v22.8h
        sqxtun2         \dst\().16b, v23.8h
.endm

        QPEL_UNI_W_H_HEADER
        sub             x3, x3, #64
1:
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
        uni_w_h64_filter v16, v17
        uni_w_h64_pack  v16                                     // 0-15
        uni_w_h64_filter v17, v18
        uni_w_h64_pack  v17                                     // 16-31
        uni_w_h64_filter v18, v19
        // v0 is free once the filter passes above are done: fetch the bytes
        // that follow v19 and step x2 to the next source row.
        ld1             {v0.16b}, [x2], x3
        uni_w_h64_pack  v18                                     // 32-47
        uni_w_h64_filter v19, v0
        uni_w_h64_pack  v19                                     // 48-63

        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
        subs            w4, w4, #1
        b.hi            1b
        ret
.purgem uni_w_h64_filter
.purgem uni_w_h64_pack
endfunc

.macro QPEL_H_HEADER
        // Common prologue of the qpel_h*_8_neon_i8mm kernels in this file.
        //   in:     x1 = src, x4 = filter index (qpel_filters rows are 8 bytes)
        //   out:    x1 = src - 3, v31 = the selected 8 taps in both 64-bit halves
        //   writes: x9 (address of the selected row), x11 (the raw 8 tap bytes)
        sub             x1, x1, #3
        movrel          x9, qpel_filters
        add             x9, x9, x4, lsl #3
        ldr             x11, [x9]
        dup             v31.2d, x11
.endm

function ff_hevc_put_hevc_qpel_h4_8_neon_i8mm, export=1
        // 8-tap horizontal filter, 4 outputs per row, using usdot.
        //   x0 = dst (16-bit results), stepped MAX_PB_SIZE * 2 bytes per row
        //   x1 = src (8-bit), x2 = src stride, w3 = number of rows
        //   x4 = filter index, consumed by QPEL_H_HEADER
        QPEL_H_HEADER
        mov             x10, #MAX_PB_SIZE * 2
1:
        ld1             {v0.16b}, [x1], x2
        // usdot accumulates into its destination, so clear the sums first
        movi            v16.16b, #0
        movi            v17.16b, #0
        // v1-v3 = the source row starting 1, 2 and 3 bytes later
.irp i, 1, 2, 3
        ext             v\i\().16b, v0.16b, v0.16b, #\i
.endr
        // pack two 8-byte windows per register: v0 = {win0, win1}, v2 = {win2, win3}
        zip1            v0.2d, v0.2d, v1.2d
        zip1            v2.2d, v2.2d, v3.2d
        usdot           v16.4s, v0.16b, v31.16b
        usdot           v17.4s, v2.16b, v31.16b
        subs            w3, w3, #1
        // each output is the sum of two adjacent 4-tap partial sums
        addp            v16.4s, v16.4s, v17.4s
        sqxtn           v16.4h, v16.4s
        str             d16, [x0]
        add             x0, x0, x10
        b.ne            1b
        ret
endfunc

function ff_hevc_put_hevc_qpel_h6_8_neon_i8mm, export=1
        // 8-tap horizontal filter, 6 outputs per row, using usdot.
        //   x0 = dst (16-bit results), x15 = dst + 8 bytes for outputs 4-5;
        //        both step MAX_PB_SIZE * 2 bytes per row
        //   x1 = src (8-bit), x2 = src stride, w3 = number of rows
        //   x4 = filter index, consumed by QPEL_H_HEADER
        QPEL_H_HEADER
        add             x15, x0, #8
        mov             x10, #MAX_PB_SIZE * 2
1:
        ld1             {v0.16b}, [x1], x2
        // usdot accumulates into its destination, so clear the sums first
.irp i, 16, 17, 18
        movi            v\i\().16b, #0
.endr
        // v1-v5 = the source row starting 1 to 5 bytes later
.irp i, 1, 2, 3, 4, 5
        ext             v\i\().16b, v0.16b, v0.16b, #\i
.endr
        // two 8-byte windows per register, one usdot per window pair
        zip1            v0.2d, v0.2d, v1.2d
        usdot           v16.4s, v0.16b, v31.16b
        zip1            v2.2d, v2.2d, v3.2d
        usdot           v17.4s, v2.16b, v31.16b
        zip1            v4.2d, v4.2d, v5.2d
        usdot           v18.4s, v4.16b, v31.16b
        subs            w3, w3, #1
        // outputs 0-3
        addp            v16.4s, v16.4s, v17.4s
        sqxtn           v16.4h, v16.4s
        str             d16, [x0]
        add             x0, x0, x10
        // outputs 4-5 (only the low 32 bits of v18 are stored)
        addp            v18.4s, v18.4s, v18.4s
        sqxtn           v18.4h, v18.4s
        str             s18, [x15]
        add             x15, x15, x10
        b.ne            1b
        ret
endfunc

function ff_hevc_put_hevc_qpel_h8_8_neon_i8mm, export=1
        // 8-tap horizontal filter, 8 outputs per row, using usdot.
        //   x0 = dst (16-bit results), stepped MAX_PB_SIZE * 2 bytes per row
        //   x1 = src (8-bit), x2 = src stride, w3 = number of rows
        //   x4 = filter index, consumed by QPEL_H_HEADER
        QPEL_H_HEADER
        mov             x10, #MAX_PB_SIZE * 2
1:
        ld1             {v0.16b}, [x1], x2
        // usdot accumulates into its destination, so clear the sums first
.irp i, 16, 17, 18, 19
        movi            v\i\().16b, #0
.endr
        // v1-v7 = the source row starting 1 to 7 bytes later
.irp i, 1, 2, 3, 4, 5, 6, 7
        ext             v\i\().16b, v0.16b, v0.16b, #\i
.endr
        // two 8-byte windows per register, one usdot per window pair
        zip1            v0.2d, v0.2d, v1.2d
        usdot           v16.4s, v0.16b, v31.16b
        zip1            v2.2d, v2.2d, v3.2d
        usdot           v17.4s, v2.16b, v31.16b
        zip1            v4.2d, v4.2d, v5.2d
        usdot           v18.4s, v4.16b, v31.16b
        zip1            v6.2d, v6.2d, v7.2d
        usdot           v19.4s, v6.16b, v31.16b
        subs            w3, w3, #1
        // pairwise adds join the two 4-tap halves of every output
        addp            v16.4s, v16.4s, v17.4s
        addp            v18.4s, v18.4s, v19.4s
        sqxtn           v16.4h, v16.4s
        sqxtn2          v16.8h, v18.4s
        str             q16, [x0]
        add             x0, x0, x10
        b.ne            1b
        ret
endfunc

.macro QPEL_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
        // Four usdot products against the taps held in v31: destination k
        // receives the per-lane 4-byte dot products of source k with v31.
        // usdot accumulates, so all four destinations are zeroed first, and
        // every zeroing is issued before the first usdot.
.irp acc, \d0\().16b, \d1\().16b, \d2\().16b, \d3\().16b
        movi            \acc, #0
.endr
        usdot           \d0\().4s, \s0\().16b, v31.16b
        usdot           \d1\().4s, \s1\().16b, v31.16b
        usdot           \d2\().4s, \s2\().16b, v31.16b
        usdot           \d3\().4s, \s3\().16b, v31.16b
.endm

function ff_hevc_put_hevc_qpel_h12_8_neon_i8mm, export=1
        // 8-tap horizontal filter, 12 outputs per row, using usdot.
        //   x0 = dst (16-bit results), x15 = dst + 16 bytes for outputs 8-11;
        //        both step MAX_PB_SIZE * 2 bytes per row
        //   x1 = src (8-bit), x2 = src stride, w3 = number of rows
        //   x4 = filter index, consumed by QPEL_H_HEADER
        QPEL_H_HEADER
        add             x15, x0, #16
        mov             x10, #MAX_PB_SIZE * 2
1:
        ld1             {v16.16b, v17.16b}, [x1], x2
        // v1-v7 = the source row starting 1 to 7 bytes later
.irp i, 1, 2, 3, 4, 5, 6, 7
        ext             v\i\().16b, v16.16b, v17.16b, #\i
.endr
        // full-width windows 0-3 give outputs 0-3 (low half) and 8-11 (high half)
        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
        addp            v20.4s, v20.4s, v22.4s
        addp            v21.4s, v21.4s, v23.4s
        trn1            v26.4s, v20.4s, v21.4s                  // outputs 0-3
        trn2            v27.4s, v20.4s, v21.4s                  // outputs 8-11
        // outputs 4-7 from the low 8 bytes of windows 4-7
        zip1            v18.2d, v4.2d, v5.2d
        zip1            v19.2d, v6.2d, v7.2d
        movi            v24.16b, #0
        movi            v25.16b, #0
        usdot           v24.4s, v18.16b, v31.16b
        usdot           v25.4s, v19.16b, v31.16b
        addp            v24.4s, v24.4s, v25.4s
        subs            w3, w3, #1
        sqxtn           v26.4h, v26.4s
        sqxtn2          v26.8h, v24.4s
        sqxtn           v27.4h, v27.4s

        str             q26, [x0]
        str             d27, [x15]
        add             x0, x0, x10
        add             x15, x15, x10
        b.ne            1b
        ret
endfunc

function ff_hevc_put_hevc_qpel_h16_8_neon_i8mm, export=1
        // 8-tap horizontal filter, 16 outputs per row, using usdot.
        //   x0 = dst (16-bit results), stepped MAX_PB_SIZE * 2 bytes per row
        //   x1 = src (8-bit), x2 = src stride, w3 = number of rows
        //   x4 = filter index, consumed by QPEL_H_HEADER
        QPEL_H_HEADER
        mov             x10, #MAX_PB_SIZE * 2
1:
        ld1             {v16.16b, v17.16b}, [x1], x2
        // v1-v7 = the source row starting 1 to 7 bytes later
.irp i, 1, 2, 3, 4, 5, 6, 7
        ext             v\i\().16b, v16.16b, v17.16b, #\i
.endr

        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
        subs            w3, w3, #1

        // windows 0-3: outputs 0-3 end up in v22, outputs 8-11 in v23
        addp            v20.4s, v20.4s, v22.4s
        addp            v21.4s, v21.4s, v23.4s
        trn1            v22.4s, v20.4s, v21.4s
        trn2            v23.4s, v20.4s, v21.4s

        // windows 4-7: outputs 4-7 end up in v26, outputs 12-15 in v27
        addp            v24.4s, v24.4s, v26.4s
        addp            v25.4s, v25.4s, v27.4s
        trn1            v26.4s, v24.4s, v25.4s
        trn2            v27.4s, v24.4s, v25.4s

        // saturating narrow to 16 bit: v18 = outputs 0-7, v19 = outputs 8-15
        sqxtn           v18.4h, v22.4s
        sqxtn2          v18.8h, v26.4s
        sqxtn           v19.4h, v23.4s
        sqxtn2          v19.8h, v27.4s

        stp             q18, q19, [x0]
        add             x0, x0, x10
        b.ne            1b
        ret
endfunc

function ff_hevc_put_hevc_qpel_h24_8_neon_i8mm, export=1
        // 8-tap horizontal filter, 24 outputs per row, using usdot.
        //   x0 = dst (16-bit results), x15 = dst + 32 bytes for outputs 16-23;
        //        both step MAX_PB_SIZE * 2 bytes per row
        //   x1 = src (8-bit), x2 = src stride, w3 = number of rows
        //   x4 = filter index, consumed by QPEL_H_HEADER
        QPEL_H_HEADER
        add             x15, x0, #32
        mov             x10, #MAX_PB_SIZE * 2
1:
        ld1             {v16.16b, v17.16b}, [x1], x2

        // ---- outputs 0-15: windows taken across v16:v17 ----
.irp i, 1, 2, 3, 4, 5, 6, 7
        ext             v\i\().16b, v16.16b, v17.16b, #\i
.endr
        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
        addp            v20.4s, v20.4s, v22.4s
        addp            v21.4s, v21.4s, v23.4s
        trn1            v22.4s, v20.4s, v21.4s
        trn2            v23.4s, v20.4s, v21.4s
        addp            v24.4s, v24.4s, v26.4s
        addp            v25.4s, v25.4s, v27.4s
        trn1            v26.4s, v24.4s, v25.4s
        trn2            v27.4s, v24.4s, v25.4s
        sqxtn           v18.4h, v22.4s
        sqxtn2          v18.8h, v26.4s
        sqxtn           v19.4h, v23.4s
        sqxtn2          v19.8h, v27.4s
        stp             q18, q19, [x0]
        add             x0, x0, x10

        // ---- outputs 16-23: only the low 8 bytes of each v17 window are used ----
.irp i, 1, 2, 3, 4, 5, 6, 7
        ext             v\i\().16b, v17.16b, v17.16b, #\i
.endr
        zip1            v0.2d, v17.2d, v1.2d
        zip1            v2.2d, v2.2d, v3.2d
        zip1            v4.2d, v4.2d, v5.2d
        zip1            v6.2d, v6.2d, v7.2d
        QPEL_H_CALC     v0, v2, v4, v6, v20, v21, v22, v23
        subs            w3, w3, #1
        addp            v20.4s, v20.4s, v21.4s
        addp            v22.4s, v22.4s, v23.4s
        sqxtn           v20.4h, v20.4s
        sqxtn2          v20.8h, v22.4s
        str             q20, [x15]
        add             x15, x15, x10
        b.ne            1b
        ret
endfunc

function ff_hevc_put_hevc_qpel_h32_8_neon_i8mm, export=1
        // 8-tap horizontal filter, 32 outputs per row, using usdot.
        //   x0 = dst (16-bit results), x15 = dst + 32 bytes for outputs 16-31;
        //        both step MAX_PB_SIZE * 2 bytes per row
        //   x1 = src (8-bit), x2 = src stride, w3 = number of rows
        //   x4 = filter index, consumed by QPEL_H_HEADER

        // Filter the 16 outputs whose windows start inside lo (hi supplies the
        // bytes that follow). Results: v20 = first 8, v21 = last 8, as 16 bit.
        // Uses v1-v7 and v20-v27 as working registers.
.macro qpel_h32_16px lo, hi
        ext             v1.16b, \lo\().16b, \hi\().16b, #1
        ext             v2.16b, \lo\().16b, \hi\().16b, #2
        ext             v3.16b, \lo\().16b, \hi\().16b, #3
        ext             v4.16b, \lo\().16b, \hi\().16b, #4
        ext             v5.16b, \lo\().16b, \hi\().16b, #5
        ext             v6.16b, \lo\().16b, \hi\().16b, #6
        ext             v7.16b, \lo\().16b, \hi\().16b, #7
        QPEL_H_CALC     \lo, v1, v2, v3, v20, v21, v22, v23
        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
        addp            v20.4s, v20.4s, v22.4s
        addp            v21.4s, v21.4s, v23.4s
        addp            v24.4s, v24.4s, v26.4s
        addp            v25.4s, v25.4s, v27.4s
        trn1            v22.4s, v20.4s, v21.4s
        trn2            v23.4s, v20.4s, v21.4s
        trn1            v26.4s, v24.4s, v25.4s
        trn2            v27.4s, v24.4s, v25.4s
        sqxtn           v20.4h, v22.4s
        sqxtn2          v20.8h, v26.4s
        sqxtn           v21.4h, v23.4s
        sqxtn2          v21.8h, v27.4s
.endm

        QPEL_H_HEADER
        mov             x10, #MAX_PB_SIZE * 2
        add             x15, x0, #32
1:
        ld1             {v16.16b, v17.16b, v18.16b}, [x1], x2
        qpel_h32_16px   v16, v17                                // outputs 0-15
        stp             q20, q21, [x0]
        add             x0, x0, x10
        qpel_h32_16px   v17, v18                                // outputs 16-31
        stp             q20, q21, [x15]
        add             x15, x15, x10
        subs            w3, w3, #1
        b.ne            1b
        ret
.purgem qpel_h32_16px
endfunc

function ff_hevc_put_hevc_qpel_h48_8_neon_i8mm, export=1
        // 8-tap horizontal filter, 48 outputs per row, using usdot.
        //   x0 = dst (16-bit results); the first two stores post-increment it
        //        by 32 bytes each, so x10 holds the rest of the
        //        MAX_PB_SIZE * 2 byte row pitch
        //   x1 = src (8-bit), x2 = src stride, w3 = number of rows
        //   x4 = filter index, consumed by QPEL_H_HEADER

        // Filter the 16 outputs whose windows start inside lo (hi supplies the
        // bytes that follow). Results: v20 = first 8, v21 = last 8, as 16 bit.
        // Uses v1-v7 and v20-v27 as working registers.
.macro qpel_h48_16px lo, hi
        ext             v1.16b, \lo\().16b, \hi\().16b, #1
        ext             v2.16b, \lo\().16b, \hi\().16b, #2
        ext             v3.16b, \lo\().16b, \hi\().16b, #3
        ext             v4.16b, \lo\().16b, \hi\().16b, #4
        ext             v5.16b, \lo\().16b, \hi\().16b, #5
        ext             v6.16b, \lo\().16b, \hi\().16b, #6
        ext             v7.16b, \lo\().16b, \hi\().16b, #7
        QPEL_H_CALC     \lo, v1, v2, v3, v20, v21, v22, v23
        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
        addp            v20.4s, v20.4s, v22.4s
        addp            v21.4s, v21.4s, v23.4s
        addp            v24.4s, v24.4s, v26.4s
        addp            v25.4s, v25.4s, v27.4s
        trn1            v22.4s, v20.4s, v21.4s
        trn2            v23.4s, v20.4s, v21.4s
        trn1            v26.4s, v24.4s, v25.4s
        trn2            v27.4s, v24.4s, v25.4s
        sqxtn           v20.4h, v22.4s
        sqxtn2          v20.8h, v26.4s
        sqxtn           v21.4h, v23.4s
        sqxtn2          v21.8h, v27.4s
.endm

        QPEL_H_HEADER
        mov             x10, #MAX_PB_SIZE * 2 - 64
1:
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2
        qpel_h48_16px   v16, v17                                // outputs 0-15
        stp             q20, q21, [x0], #32

        qpel_h48_16px   v17, v18                                // outputs 16-31
        stp             q20, q21, [x0], #32
        qpel_h48_16px   v18, v19                                // outputs 32-47
        stp             q20, q21, [x0]
        add             x0, x0, x10
        subs            w3, w3, #1
        b.ne            1b
        ret
.purgem qpel_h48_16px
endfunc

function ff_hevc_put_hevc_qpel_h64_8_neon_i8mm, export=1
        // 8-tap horizontal filter, 64 outputs per row, using usdot.
        //   x0 = dst (16-bit results); four post-incrementing 32-byte stores
        //        advance it by 128 bytes per row
        //   x1 = src (8-bit), x2 = src stride; 64 is taken off x2 because the
        //        main load already advances x1 by 64 bytes
        //   w3 = number of rows
        //   x4 = filter index, consumed by QPEL_H_HEADER

        // Filter the 16 outputs whose windows start inside lo (hi supplies the
        // bytes that follow). Results: v20 = first 8, v21 = last 8, as 16 bit.
        // Uses v1-v7 and v20-v27 as working registers.
.macro qpel_h64_16px lo, hi
        ext             v1.16b, \lo\().16b, \hi\().16b, #1
        ext             v2.16b, \lo\().16b, \hi\().16b, #2
        ext             v3.16b, \lo\().16b, \hi\().16b, #3
        ext             v4.16b, \lo\().16b, \hi\().16b, #4
        ext             v5.16b, \lo\().16b, \hi\().16b, #5
        ext             v6.16b, \lo\().16b, \hi\().16b, #6
        ext             v7.16b, \lo\().16b, \hi\().16b, #7
        QPEL_H_CALC     \lo, v1, v2, v3, v20, v21, v22, v23
        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
        addp            v20.4s, v20.4s, v22.4s
        addp            v21.4s, v21.4s, v23.4s
        addp            v24.4s, v24.4s, v26.4s
        addp            v25.4s, v25.4s, v27.4s
        trn1            v22.4s, v20.4s, v21.4s
        trn2            v23.4s, v20.4s, v21.4s
        trn1            v26.4s, v24.4s, v25.4s
        trn2            v27.4s, v24.4s, v25.4s
        sqxtn           v20.4h, v22.4s
        sqxtn2          v20.8h, v26.4s
        sqxtn           v21.4h, v23.4s
        sqxtn2          v21.8h, v27.4s
.endm

        QPEL_H_HEADER
        sub             x2, x2, #64
1:
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64
        qpel_h64_16px   v16, v17                                // outputs 0-15
        stp             q20, q21, [x0], #32

        qpel_h64_16px   v17, v18                                // outputs 16-31
        stp             q20, q21, [x0], #32
        qpel_h64_16px   v18, v19                                // outputs 32-47
        stp             q20, q21, [x0], #32
        // 8 more source bytes feed the windows that run past v19; the same
        // load steps x1 to the next source row
        ld1             {v28.8b}, [x1], x2
        qpel_h64_16px   v19, v28                                // outputs 48-63
        stp             q20, q21, [x0], #32
        subs            w3, w3, #1
        b.ne            1b
        ret
.purgem qpel_h64_16px
endfunc
DISABLE_I8MM
#endif


function hevc_put_hevc_qpel_hv4_8_end_neon
        // Vertical stage of the 4-wide hv path. It is entered by a tail branch
        // from ff_hevc_put_hevc_qpel_hv4_8_*, which leaves:
        //   sp  = start of the temporary array of 16-bit rows
        //   x7  = 128, used here as the byte pitch of both that array and dst
        //   x14 = the stack pointer value to restore before returning
        //   x0  = dst, w3 = number of output rows
        //   x5  = operand of load_qpel_filterh (presumably the vertical
        //         filter index -- confirm against the macro)
        // load_qpel_filterh, calc_qpelh and calc_all are macros defined
        // elsewhere in this file; their bodies are not visible here.
        load_qpel_filterh x5, x4
        // Preload the first seven rows (4 halfwords each) into v16-v22,
        // advancing sp past them.
        ldr             d16, [sp]
        ldr             d17, [sp, x7]
        add             sp, sp, x7, lsl #1
        ldr             d18, [sp]
        ldr             d19, [sp, x7]
        add             sp, sp, x7, lsl #1
        ldr             d20, [sp]
        ldr             d21, [sp, x7]
        add             sp, sp, x7, lsl #1
        ldr             d22, [sp]
        add             sp, sp, x7
        // One output row: load the next input row into tmp, filter the eight
        // rows named by src0-src7 into v1 and store 4 halfwords to dst.
        // The flags set by subs are left for the code that follows each
        // expansion (presumably the branches emitted by calc_all).
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().4h}, [sp], x7
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
        subs            w3, w3, #1
        st1             {v1.4h}, [x0], x7
.endm
        // calc_all presumably expands calc once per register rotation and
        // uses labels 1 and 2 as loop head and exit -- confirm in its definition.
1:      calc_all
.purgem calc
        // Drop the temporary array by restoring the saved stack pointer.
2:      mov             sp, x14
        ret
endfunc

function hevc_put_hevc_qpel_hv6_8_end_neon
        // Vertical stage of the 6-wide hv path. It is entered by a tail branch
        // from ff_hevc_put_hevc_qpel_hv6_8_*, which leaves:
        //   sp  = start of the temporary array of 16-bit rows
        //   x7  = 128, the byte pitch of that array
        //   x14 = the stack pointer value to restore before returning
        //   x0  = dst, w3 = number of output rows
        //   x5  = operand of load_qpel_filterh (presumably the vertical
        //         filter index -- confirm against the macro)
        // load_qpel_filterh, calc_qpelh, calc_qpelh2 and calc_all are macros
        // defined elsewhere in this file; their bodies are not visible here.

        // Each dst row is written as 8 + 4 bytes; the first store advances x0
        // by 8, so 120 more bytes complete a 128-byte step.
        mov             x8, #120
        load_qpel_filterh x5, x4
        // Preload the first seven rows (8 halfwords each) into v16-v22,
        // advancing sp past them.
        ldr             q16, [sp]
        ldr             q17, [sp, x7]
        add             sp, sp, x7, lsl #1
        ldr             q18, [sp]
        ldr             q19, [sp, x7]
        add             sp, sp, x7, lsl #1
        ldr             q20, [sp]
        ldr             q21, [sp, x7]
        add             sp, sp, x7, lsl #1
        ldr             q22, [sp]
        add             sp, sp, x7
        // One output row: load the next input row into tmp, filter the eight
        // rows named by src0-src7 into v1 (v2 is handed to calc_qpelh2 as a
        // second register), then store halfwords 0-3 and halfwords 4-5 of v1.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8h}, [sp], x7
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
        st1             {v1.4h}, [x0], #8
        subs            w3, w3, #1
        st1             {v1.s}[2], [x0], x8
.endm
        // calc_all presumably expands calc once per register rotation and
        // uses labels 1 and 2 as loop head and exit -- confirm in its definition.
1:      calc_all
.purgem calc
        // Drop the temporary array by restoring the saved stack pointer.
2:      mov             sp, x14
        ret
endfunc

function hevc_put_hevc_qpel_hv8_8_end_neon
        // Vertical stage of the 8-wide hv path. It is entered by a tail branch
        // from ff_hevc_put_hevc_qpel_hv8_8_*, which leaves:
        //   sp  = start of the temporary array of 16-bit rows
        //   x14 = the stack pointer value to restore before returning
        //   x0  = dst, w3 = number of output rows
        //   x5  = operand of load_qpel_filterh (presumably the vertical
        //         filter index -- confirm against the macro)
        // load_qpel_filterh, calc_qpelh, calc_qpelh2 and calc_all are macros
        // defined elsewhere in this file; their bodies are not visible here.

        // x7 = byte pitch used for both the temporary array and dst.
        mov             x7, #128
        load_qpel_filterh x5, x4
        // Preload the first seven rows (8 halfwords each) into v16-v22,
        // advancing sp past them.
        ldr             q16, [sp]
        ldr             q17, [sp, x7]
        add             sp, sp, x7, lsl #1
        ldr             q18, [sp]
        ldr             q19, [sp, x7]
        add             sp, sp, x7, lsl #1
        ldr             q20, [sp]
        ldr             q21, [sp, x7]
        add             sp, sp, x7, lsl #1
        ldr             q22, [sp]
        add             sp, sp, x7
        // One output row: load the next input row into tmp, filter the eight
        // rows named by src0-src7 into v1 (v2 is handed to calc_qpelh2 as a
        // second register) and store all 8 halfwords to dst.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8h}, [sp], x7
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
        subs            w3, w3, #1
        st1             {v1.8h}, [x0], x7
.endm
        // calc_all presumably expands calc once per register rotation and
        // uses labels 1 and 2 as loop head and exit -- confirm in its definition.
1:      calc_all
.purgem calc
        // Drop the temporary array by restoring the saved stack pointer.
2:      mov             sp, x14
        ret
endfunc

function hevc_put_hevc_qpel_hv12_8_end_neon
        // Vertical stage of the 12-wide hv path. It is entered by a tail branch
        // from ff_hevc_put_hevc_qpel_hv12_8_*, which leaves:
        //   sp  = start of the temporary array of 16-bit rows
        //   x14 = the stack pointer value to restore before returning
        //   x0  = dst, w3 = number of output rows
        //   x5  = operand of load_qpel_filterh (presumably the vertical
        //         filter index -- confirm against the macro)
        // load_qpel_filterh, calc_qpelh, calc_qpelh2 and calc_all2 are macros
        // defined elsewhere in this file; their bodies are not visible here.

        // x7 = byte pitch of the temporary array.
        mov             x7, #128
        load_qpel_filterh x5, x4
        // Each dst row is written as 16 + 8 bytes; the first store advances
        // x0 by 16, so 112 more bytes complete a 128-byte step.
        mov             x8, #112
        // Preload the first seven rows, 16 halfwords each, as register pairs
        // v16/v17 ... v28/v29.
        ld1             {v16.8h, v17.8h}, [sp], x7
        ld1             {v18.8h, v19.8h}, [sp], x7
        ld1             {v20.8h, v21.8h}, [sp], x7
        ld1             {v22.8h, v23.8h}, [sp], x7
        ld1             {v24.8h, v25.8h}, [sp], x7
        ld1             {v26.8h, v27.8h}, [sp], x7
        ld1             {v28.8h, v29.8h}, [sp], x7
        // One output row: load the next input row into tmp0/tmp1, filter
        // src0-src7 into v1 and src8-src15 into the low half of v2, then
        // store 8 + 4 halfwords to dst.
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
        ld1             {\tmp0\().8h, \tmp1\().8h}, [sp], x7
        calc_qpelh      v1,     \src0,  \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqshrn
        calc_qpelh2     v1, v2, \src0, \src1,  \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqshrn2
        calc_qpelh      v2,     \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
        st1             {v1.8h}, [x0], #16
        subs            w3, w3, #1
        st1             {v2.4h}, [x0], x8
.endm
        // calc_all2 presumably expands calc once per register rotation and
        // uses labels 1 and 2 as loop head and exit -- confirm in its definition.
1:      calc_all2
.purgem calc
        // Drop the temporary array by restoring the saved stack pointer.
2:      mov             sp, x14
        ret
endfunc

function hevc_put_hevc_qpel_hv16_8_end_neon
        // Vertical stage of the 16-wide hv path (presumably reached from the
        // hv16 wrappers; the branch itself is outside this view).
        // Registers read on entry:
        //   sp  = source of 16-bit rows, read at a 128-byte pitch
        //   x14 = value written back to sp before returning
        //   x0  = dst, w3 = number of output rows
        //   x5  = operand of load_qpel_filterh (presumably the vertical
        //         filter index -- confirm against the macro)
        // load_qpel_filterh, calc_qpelh, calc_qpelh2 and calc_all2 are macros
        // defined elsewhere in this file; their bodies are not visible here.

        // x7 = byte pitch used for both the source rows and dst.
        mov             x7, #128
        load_qpel_filterh x5, x4
        // Preload the first seven rows, 16 halfwords each, as register pairs
        // v16/v17 ... v28/v29.
        ld1             {v16.8h, v17.8h}, [sp], x7
        ld1             {v18.8h, v19.8h}, [sp], x7
        ld1             {v20.8h, v21.8h}, [sp], x7
        ld1             {v22.8h, v23.8h}, [sp], x7
        ld1             {v24.8h, v25.8h}, [sp], x7
        ld1             {v26.8h, v27.8h}, [sp], x7
        ld1             {v28.8h, v29.8h}, [sp], x7
        // One output row: load the next input row into tmp0/tmp1, filter
        // src0-src7 into v1 and src8-src15 into v2 (v2 and v3 are handed to
        // calc_qpelh2 as second registers), then store 16 halfwords to dst.
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
        ld1             {\tmp0\().8h, \tmp1\().8h}, [sp], x7
        calc_qpelh      v1,     \src0,  \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqshrn
        calc_qpelh2     v1, v2, \src0, \src1,  \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqshrn2
        calc_qpelh      v2,     \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
        calc_qpelh2     v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn2
        subs            w3, w3, #1
        st1             {v1.8h, v2.8h}, [x0], x7
.endm
        // calc_all2 presumably expands calc once per register rotation and
        // uses labels 1 and 2 as loop head and exit -- confirm in its definition.
1:      calc_all2
.purgem calc
        // Restore the stack pointer saved in x14.
2:      mov             sp, x14
        ret
endfunc

function hevc_put_hevc_qpel_hv32_8_end_neon
        // Vertical stage of the wide hv paths, processed in strips of 16
        // columns (the callers are outside this view).
        // Registers read on entry:
        //   sp  = source of 16-bit rows, read at a 128-byte pitch
        //   x14 = value written back to sp before returning
        //   x0  = dst, w3 = number of output rows
        //   w6  = remaining width in columns, reduced by 16 per strip
        //         (set by the caller -- not visible here)
        //   x5  = operand of load_qpel_filterh (presumably the vertical
        //         filter index); x5 is reused below as the dst cursor
        // load_qpel_filterh, calc_qpelh, calc_qpelh2 and calc_all2 are macros
        // defined elsewhere in this file; their bodies are not visible here.

        // x7 = byte pitch used for both the source rows and dst.
        mov             x7, #128
        load_qpel_filterh x5, x4
        // Strip loop head: take per-strip copies of the source pointer, the
        // row count and the dst pointer, and preload the first seven rows
        // into v16/v17 ... v28/v29.
0:      mov             x8, sp          // src
        ld1             {v16.8h, v17.8h}, [x8], x7
        mov             w9, w3          // height
        ld1             {v18.8h, v19.8h}, [x8], x7
        mov             x5, x0          // dst
        ld1             {v20.8h, v21.8h}, [x8], x7
        ld1             {v22.8h, v23.8h}, [x8], x7
        ld1             {v24.8h, v25.8h}, [x8], x7
        ld1             {v26.8h, v27.8h}, [x8], x7
        ld1             {v28.8h, v29.8h}, [x8], x7
        // One output row of the current strip: load the next input row into
        // tmp0/tmp1, filter src0-src7 into v1 and src8-src15 into v2, then
        // store 16 halfwords through the strip dst cursor x5. The row counter
        // is x9, whose upper half was cleared by the 32-bit move above.
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
        ld1             {\tmp0\().8h, \tmp1\().8h}, [x8], x7
        calc_qpelh      v1,     \src0,  \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqshrn
        calc_qpelh2     v1, v2, \src0, \src1,  \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqshrn2
        calc_qpelh      v2,     \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
        calc_qpelh2     v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn2
        subs            x9, x9, #1
        st1             {v1.8h, v2.8h}, [x5], x7
.endm
        // calc_all2 presumably expands calc once per register rotation and
        // uses labels 1 and 2 as loop head and exit -- confirm in its definition.
1:      calc_all2
.purgem calc
        // Next strip: 16 columns of 16-bit samples are 32 bytes in both dst
        // and the source rows.
2:      add             x0, x0, #32
        add             sp, sp, #32
        subs            w6, w6, #16
        b.hi            0b
        // Restore the stack pointer saved in x14.
        mov             sp, x14
        ret
endfunc

// Emits the exported 8-bit put_hevc_qpel_hv entry points for block widths
// 4, 6, 8, 12, 16, 24, 32, 48 and 64, built on the horizontal-pass functions
// whose name ends in the given suffix.
//
// Widths 4/6/8/12/16/32 share one shape:
//   1. reserve (w3 + 8) * 128 bytes of stack as the intermediate array,
//   2. push x5, x30, x0, x3 and the entry sp (copied to x14) in a 48-byte frame
//      just below that array,
//   3. move the source pointer x1 back by three strides (x2), raise x3 by 7 and
//      call the horizontal function with x0 pointing at the array,
//   4. pop the frame and branch (no link) to the width-specific
//      hevc_put_hevc_qpel_hvN_8_end_neon routine, which is defined outside
//      this macro.
// Widths 24/48/64 are composed from two calls to a narrower variant.
//
// NOTE(review): w3 is presumably the block height and x5 the vertical filter
// index consumed by the end routines; neither is established inside this
// macro - confirm against the C prototype.
// NOTE(review): the array size is derived from w3 but the row count passed on
// is formed with a 64-bit add on x3; this is only equivalent if the callee
// reads w3 or the upper half of x3 is clean - confirm.
.macro qpel_hv suffix
function ff_hevc_put_hevc_qpel_hv4_8_\suffix, export=1
        add             w10, w3, #8
        mov             x7, #128            // not read in this function; presumably the row pitch used by the hv4 end routine - confirm
        lsl             x10, x10, #7        // x10 = (w3 + 8) * 128 bytes
        mov             x14, sp             // x14 = sp at entry
        sub             sp, sp, x10         // tmp_array
        stp             x5,  x30, [sp, #-48]!   // 48-byte frame below the array
        stp             x0,  x3,  [sp, #16]
        str             x14,      [sp, #32]
        add             x0, sp, #48         // horizontal pass writes into the array
        sub             x1, x1, x2, lsl #1
        add             x3, x3, #7          // seven extra rows for the horizontal pass
        sub             x1, x1, x2          // x1 = src - 3 * stride in total
        bl              X(ff_hevc_put_hevc_qpel_h4_8_\suffix)
        ldr             x14,      [sp, #32]
        ldp             x0,  x3,  [sp, #16]
        ldp             x5,  x30, [sp], #48 // frame popped: sp is back at the array start
        b               hevc_put_hevc_qpel_hv4_8_end_neon
endfunc

// Width 6: same sequence as width 4, with the h6 horizontal function.
function ff_hevc_put_hevc_qpel_hv6_8_\suffix, export=1
        add             w10, w3, #8
        mov             x7, #128            // not read in this function; presumably consumed by the hv6 end routine - confirm
        lsl             x10, x10, #7        // x10 = (w3 + 8) * 128 bytes
        mov             x14, sp             // x14 = sp at entry
        sub             sp, sp, x10         // tmp_array
        stp             x5,  x30, [sp, #-48]!
        stp             x0,  x3,  [sp, #16]
        str             x14,      [sp, #32]
        add             x0, sp, #48
        sub             x1, x1, x2, lsl #1
        add             x3, x3, #7
        sub             x1, x1, x2          // x1 = src - 3 * stride in total
        bl              X(ff_hevc_put_hevc_qpel_h6_8_\suffix)
        ldr             x14,      [sp, #32]
        ldp             x0,  x3,  [sp, #16]
        ldp             x5,  x30, [sp], #48
        b               hevc_put_hevc_qpel_hv6_8_end_neon
endfunc

// Width 8: as above, but x7 is not preset here.
function ff_hevc_put_hevc_qpel_hv8_8_\suffix, export=1
        add             w10, w3, #8
        lsl             x10, x10, #7        // x10 = (w3 + 8) * 128 bytes
        sub             x1, x1, x2, lsl #1
        mov             x14, sp             // x14 = sp at entry
        sub             sp, sp, x10         // tmp_array
        stp             x5,  x30, [sp, #-48]!
        stp             x0,  x3,  [sp, #16]
        str             x14,      [sp, #32]
        add             x0, sp, #48
        add             x3, x3, #7
        sub             x1, x1, x2          // x1 = src - 3 * stride in total
        bl              X(ff_hevc_put_hevc_qpel_h8_8_\suffix)
        ldr             x14,      [sp, #32]
        ldp             x0,  x3,  [sp, #16]
        ldp             x5,  x30, [sp], #48
        b               hevc_put_hevc_qpel_hv8_8_end_neon
endfunc

// Width 12: additionally sets w6 = 12 before the horizontal call.
function ff_hevc_put_hevc_qpel_hv12_8_\suffix, export=1
        add             w10, w3, #8
        lsl             x10, x10, #7        // x10 = (w3 + 8) * 128 bytes
        sub             x1, x1, x2, lsl #1
        mov             x14, sp             // x14 = sp at entry
        sub             sp, sp, x10         // tmp_array
        stp             x5,  x30, [sp, #-48]!
        stp             x0,  x3,  [sp, #16]
        str             x14,      [sp, #32]
        add             x0, sp, #48
        add             x3, x3, #7
        sub             x1, x1, x2          // x1 = src - 3 * stride in total
        mov             w6, #12             // presumably the width argument of the callee - confirm
        bl              X(ff_hevc_put_hevc_qpel_h12_8_\suffix)
        ldr             x14,      [sp, #32]
        ldp             x0,  x3,  [sp, #16]
        ldp             x5,  x30, [sp], #48
        b               hevc_put_hevc_qpel_hv12_8_end_neon
endfunc

// Width 16.
function ff_hevc_put_hevc_qpel_hv16_8_\suffix, export=1
        add             w10, w3, #8
        lsl             x10, x10, #7        // x10 = (w3 + 8) * 128 bytes
        sub             x1, x1, x2, lsl #1
        mov             x14, sp             // x14 = sp at entry
        sub             sp, sp, x10         // tmp_array
        stp             x5,  x30, [sp, #-48]!
        stp             x0,  x3,  [sp, #16]
        str             x14,      [sp, #32]
        add             x3, x3, #7
        add             x0, sp, #48
        sub             x1, x1, x2          // x1 = src - 3 * stride in total
        bl              X(ff_hevc_put_hevc_qpel_h16_8_\suffix)
        ldr             x14,      [sp, #32]
        ldp             x0,  x3,  [sp, #16]
        ldp             x5,  x30, [sp], #48
        b               hevc_put_hevc_qpel_hv16_8_end_neon
endfunc

// Width 24: two width-12 calls. x0-x5 and x30 are parked in a 64-byte frame;
// for the second call x1 is advanced by 12 and x0 by 24.
function ff_hevc_put_hevc_qpel_hv24_8_\suffix, export=1
        stp             x4, x5, [sp, #-64]!
        stp             x2, x3, [sp, #16]
        stp             x0, x1, [sp, #32]
        str             x30, [sp, #48]
        bl              X(ff_hevc_put_hevc_qpel_hv12_8_\suffix)
        ldp             x0, x1, [sp, #32]
        ldp             x2, x3, [sp, #16]
        ldp             x4, x5, [sp], #48   // release all but the 16-byte slot holding x30
        add             x1, x1, #12
        add             x0, x0, #24
        bl              X(ff_hevc_put_hevc_qpel_hv12_8_\suffix)
        ldr             x30, [sp], #16      // reload the return address, release the last slot
        ret
endfunc

// Width 32: sets w6 = 32 before the horizontal call.
function ff_hevc_put_hevc_qpel_hv32_8_\suffix, export=1
        add             w10, w3, #8
        sub             x1, x1, x2, lsl #1
        lsl             x10, x10, #7        // x10 = (w3 + 8) * 128 bytes
        sub             x1, x1, x2          // x1 = src - 3 * stride in total
        mov             x14, sp             // x14 = sp at entry
        sub             sp, sp, x10         // tmp_array
        stp             x5,  x30, [sp, #-48]!
        stp             x0,  x3,  [sp, #16]
        str             x14,      [sp, #32]
        add             x3, x3, #7
        add             x0, sp, #48
        mov             w6, #32             // presumably the width argument of the callee - confirm
        bl              X(ff_hevc_put_hevc_qpel_h32_8_\suffix)
        ldr             x14,      [sp, #32]
        ldp             x0,  x3,  [sp, #16]
        ldp             x5,  x30, [sp], #48
        b               hevc_put_hevc_qpel_hv32_8_end_neon
endfunc

// Width 48: two width-24 calls; second call uses x1 + 24 and x0 + 48.
function ff_hevc_put_hevc_qpel_hv48_8_\suffix, export=1
        stp             x4, x5, [sp, #-64]!
        stp             x2, x3, [sp, #16]
        stp             x0, x1, [sp, #32]
        str             x30, [sp, #48]
        bl              X(ff_hevc_put_hevc_qpel_hv24_8_\suffix)
        ldp             x0, x1, [sp, #32]
        ldp             x2, x3, [sp, #16]
        ldp             x4, x5, [sp], #48   // release all but the 16-byte slot holding x30
        add             x1, x1, #24
        add             x0, x0, #48
        bl              X(ff_hevc_put_hevc_qpel_hv24_8_\suffix)
        ldr             x30, [sp], #16
        ret
endfunc

// Width 64: two width-32 calls; second call uses x1 + 32 and x0 + 64.
// x6 is set to 32 before each call (the width-32 variant also sets w6 itself).
function ff_hevc_put_hevc_qpel_hv64_8_\suffix, export=1
        stp             x4, x5, [sp, #-64]!
        stp             x2, x3, [sp, #16]
        stp             x0, x1, [sp, #32]
        str             x30, [sp, #48]
        mov             x6, #32
        bl              X(ff_hevc_put_hevc_qpel_hv32_8_\suffix)
        ldp             x0, x1, [sp, #32]
        ldp             x2, x3, [sp, #16]
        ldp             x4, x5, [sp], #48   // release all but the 16-byte slot holding x30
        add             x1, x1, #32
        add             x0, x0, #64
        mov             x6, #32
        bl              X(ff_hevc_put_hevc_qpel_hv32_8_\suffix)
        ldr             x30, [sp], #16
        ret
endfunc
.endm

// Emit the put_hevc_qpel_hv entry points on top of the plain NEON horizontal pass.
qpel_hv neon

#if HAVE_I8MM
ENABLE_I8MM

// Emit them a second time on top of the neon_i8mm horizontal pass; the
// vertical end routines they branch to are the same _end_neon symbols.
qpel_hv neon_i8mm

DISABLE_I8MM
#endif

// Common prologue of the exported uni_w_hv entry points.
//
// Inputs as read below: x0 and x1 are parked in x20 and x21; x2/x3 are the
// source pointer and stride handed to the horizontal pass; w4 = height;
// w5 feeds the shift; w6 = wx; w7 = ox; stack arguments [sp] = mx,
// [sp, #8] = my, [sp, #16] = width.
//
// Work done:
//   - saves x19-x27 and x30 in an 80-byte frame and keeps its base in x19,
//   - reserves MAX_PB_SIZE * (MAX_PB_SIZE + 8) * 2 bytes of scratch below it,
//   - runs the horizontal pass over height + 7 rows, starting three rows
//     above the source pointer, with the scratch area as destination,
//   - loads the eight taps selected by my from qpel_filters, sign-extended
//     to 16 bit, into v0.8h.
//
// Left for the code that follows: sp = scratch start, x10 = MAX_PB_SIZE * 2,
// x20/x21 = saved x0/x1, w22 = height, w27 = width, v28 = wx, v29 = ox,
// v30 = -shift (each replicated into four 32-bit lanes).
.macro QPEL_UNI_W_HV_HEADER width, suffix
        ldp             x14, x15, [sp]          // mx, my
        ldr             w13, [sp, #16]          // width
        stp             x19, x30, [sp, #-80]!
        stp             x20, x21, [sp, #16]
        stp             x22, x23, [sp, #32]
        stp             x24, x25, [sp, #48]
        stp             x26, x27, [sp, #64]
        mov             x19, sp                 // frame base, used by QPEL_UNI_W_HV_END
        mov             x11, #(MAX_PB_SIZE*(MAX_PB_SIZE+8)*2)
        sub             sp, sp, x11             // scratch area for the horizontal pass
        mov             x20, x0
        mov             x21, x1
        mov             x0, sp                  // horizontal pass writes to the scratch area
        sub             x1, x2, x3, lsl #1
        sub             x1, x1, x3              // x1 = x2 - 3 * x3
        mov             x2, x3
        add             w3, w4, #7              // rows to filter horizontally
        mov             w22, w4                 // height
        mov             x4, x14                 // mx
        mov             x23, x15                // my
        mov             w24, w6                 // wx
        mov             w25, w7                 // ox
        mov             w26, #-6
        sub             w26, w26, w5            // -shift
        mov             w27, w13                // width
// The plain neon flavour routes widths of 32 and above through the h32
// function with the width placed in w6; every other case calls the
// function named after the width.
.ifc \suffix, neon
.if \width >= 32
        mov             w6,  #\width
        bl              X(ff_hevc_put_hevc_qpel_h32_8_neon)
.else
        bl              X(ff_hevc_put_hevc_qpel_h\width\()_8_\suffix)
.endif
.else
        bl              X(ff_hevc_put_hevc_qpel_h\width\()_8_\suffix)
.endif
        movrel          x9, qpel_filters
        add             x9, x9, x23, lsl #3     // eight bytes per filter, indexed by my
        ld1             {v0.8b}, [x9]
        sxtl            v0.8h, v0.8b
        mov             x10, #(MAX_PB_SIZE * 2)
        dup             v28.4s, w24
        dup             v29.4s, w25
        dup             v30.4s, w26
.endm

// Epilogue paired with QPEL_UNI_W_HV_HEADER: drops the scratch area by
// resetting sp to the frame base held in x19, reloads x20-x27 from the frame
// and finally reloads x19/x30 while releasing the 80 bytes.
.macro QPEL_UNI_W_HV_END
        mov             sp, x19                 // sp = base of the 80-byte save frame
        ldp             x26, x27, [sp, #64]
        ldp             x24, x25, [sp, #48]
        ldp             x22, x23, [sp, #32]
        ldp             x20, x21, [sp, #16]
        ldp             x19, x30, [sp], #80     // must stay last: pops the frame
.endm

// Weights and stores one row of four pixels.
// In:  v26.4s = filter sums, v28/v29/v30 = wx / ox / -shift splats,
//      x20 = store address, x21 = post-increment applied after the store.
// Clobbers v24 and v26.
.macro QPEL_UNI_W_HV_4
        sshr            v26.4s, v26.4s, #6
        mul             v24.4s, v26.4s, v28.4s  // * wx
        sqrshl          v24.4s, v24.4s, v30.4s  // negative count in v30: rounding right shift
        sqadd           v24.4s, v24.4s, v29.4s  // + ox
        sqxtn           v24.4h, v24.4s
        sqxtun          v24.8b, v24.8h          // saturate to unsigned 8 bit
        st1             {v24.s}[0], [x20], x21  // four bytes out, then advance x20 by x21
.endm

// Eight-tap multiply-accumulate over the low four 16-bit lanes of src0..src7
// with the taps held in v0.h[0..7]; the signed 32-bit sums land in dst.4s.
.macro QPEL_FILTER_H    dst, src0, src1, src2, src3, src4, src5, src6, src7
        smull           \dst\().4s, \src0\().4h, v0.h[0]
        smlal           \dst\().4s, \src1\().4h, v0.h[1]
        smlal           \dst\().4s, \src2\().4h, v0.h[2]
        smlal           \dst\().4s, \src3\().4h, v0.h[3]
        smlal           \dst\().4s, \src4\().4h, v0.h[4]
        smlal           \dst\().4s, \src5\().4h, v0.h[5]
        smlal           \dst\().4s, \src6\().4h, v0.h[6]
        smlal           \dst\().4s, \src7\().4h, v0.h[7]
.endm

// Same eight-tap multiply-accumulate as QPEL_FILTER_H, applied to the high
// four 16-bit lanes of src0..src7; the 32-bit sums land in dst.4s.
.macro QPEL_FILTER_H2    dst, src0, src1, src2, src3, src4, src5, src6, src7
        smull2          \dst\().4s, \src0\().8h, v0.h[0]
        smlal2          \dst\().4s, \src1\().8h, v0.h[1]
        smlal2          \dst\().4s, \src2\().8h, v0.h[2]
        smlal2          \dst\().4s, \src3\().8h, v0.h[3]
        smlal2          \dst\().4s, \src4\().8h, v0.h[4]
        smlal2          \dst\().4s, \src5\().8h, v0.h[5]
        smlal2          \dst\().4s, \src6\().8h, v0.h[6]
        smlal2          \dst\().4s, \src7\().8h, v0.h[7]
.endm

// Vertical eight-tap pass plus weighted store for four-pixel-wide blocks.
// Reached by a plain branch from the uni_w_hv4 entry points, so it expects
// the state left by QPEL_UNI_W_HV_HEADER: sp at the first intermediate row,
// x10 = row pitch in bytes, v0.8h = taps, w22 = rows to produce,
// x20/x21 = store address and step, v28/v29/v30 = wx / ox / -shift.
//
// sp itself is the read cursor and climbs through the scratch rows;
// QPEL_UNI_W_HV_END resets it from x19 before returning.
// The loop is unrolled eight times: v16-v23 hold the eight most recent rows
// and each step overwrites the oldest one, so the register order handed to
// QPEL_FILTER_H rotates by one per step instead of moving data.
function hevc_put_hevc_qpel_uni_w_hv4_8_end_neon
        // Prime the window with the first seven rows.
        ldr             d16, [sp]
        ldr             d17, [sp, x10]
        add             sp, sp, x10, lsl #1
        ldr             d18, [sp]
        ldr             d19, [sp, x10]
        add             sp, sp, x10, lsl #1
        ldr             d20, [sp]
        ldr             d21, [sp, x10]
        add             sp, sp, x10, lsl #1
        ldr             d22, [sp]
        add             sp, sp, x10
1:
        ldr             d23, [sp]               // eighth row completes the window
        add             sp, sp, x10
        QPEL_FILTER_H   v26, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_UNI_W_HV_4
        subs            w22, w22, #1
        b.eq            2f                      // all rows written

        ldr             d16, [sp]               // newest row replaces the oldest (v16)
        add             sp, sp, x10
        QPEL_FILTER_H   v26, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_UNI_W_HV_4
        subs            w22, w22, #1
        b.eq            2f

        ldr             d17, [sp]
        add             sp, sp, x10
        QPEL_FILTER_H   v26, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_UNI_W_HV_4
        subs            w22, w22, #1
        b.eq            2f

        ldr             d18, [sp]
        add             sp, sp, x10
        QPEL_FILTER_H   v26, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_UNI_W_HV_4
        subs            w22, w22, #1
        b.eq            2f

        ldr             d19, [sp]
        add             sp, sp, x10
        QPEL_FILTER_H   v26, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_UNI_W_HV_4
        subs            w22, w22, #1
        b.eq            2f

        ldr             d20, [sp]
        add             sp, sp, x10
        QPEL_FILTER_H   v26, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_UNI_W_HV_4
        subs            w22, w22, #1
        b.eq            2f

        ldr             d21, [sp]
        add             sp, sp, x10
        QPEL_FILTER_H   v26, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_UNI_W_HV_4
        subs            w22, w22, #1
        b.eq            2f

        ldr             d22, [sp]
        add             sp, sp, x10
        QPEL_FILTER_H   v26, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_UNI_W_HV_4
        subs            w22, w22, #1
        b.hi            1b                      // window is back in its initial order

2:
        QPEL_UNI_W_HV_END
        ret
endfunc

// Weights and stores one row of eight pixels.
// In:  v26.4s / v27.4s = filter sums for pixels 0-3 / 4-7,
//      v28/v29/v30 = wx / ox / -shift splats,
//      x20 = store address, x21 = post-increment applied after the store.
// Clobbers v24-v27.
.macro QPEL_UNI_W_HV_8
        sshr            v26.4s, v26.4s, #6
        sshr            v27.4s, v27.4s, #6
        mul             v24.4s, v26.4s, v28.4s  // * wx
        mul             v25.4s, v27.4s, v28.4s
        sqrshl          v24.4s, v24.4s, v30.4s  // negative count in v30: rounding right shift
        sqrshl          v25.4s, v25.4s, v30.4s
        sqadd           v24.4s, v24.4s, v29.4s  // + ox
        sqadd           v25.4s, v25.4s, v29.4s
        sqxtn           v24.4h, v24.4s
        sqxtn2          v24.8h, v25.4s
        sqxtun          v24.8b, v24.8h          // saturate to unsigned 8 bit
        st1             {v24.d}[0], [x20], x21  // eight bytes out, then advance x20 by x21
.endm

// Vertical eight-tap pass plus weighted store for eight-pixel-wide blocks.
// Reached by a plain branch from the uni_w_hv8 entry points and relies on
// the state left by QPEL_UNI_W_HV_HEADER (sp at the first intermediate row,
// x10 = row pitch, v0.8h = taps, w22 = rows to produce, x20/x21 = store
// address and step, v28/v29/v30 = wx / ox / -shift).
//
// Each row is one 128-bit load of eight 16-bit values; QPEL_FILTER_H handles
// the low four lanes and QPEL_FILTER_H2 the high four. sp is the read cursor
// and is reset by QPEL_UNI_W_HV_END. v16-v23 form a rotating eight-row
// window, one rotation per unrolled step.
function hevc_put_hevc_qpel_uni_w_hv8_8_end_neon
        // Prime the window with the first seven rows.
        ldr             q16, [sp]
        ldr             q17, [sp, x10]
        add             sp, sp, x10, lsl #1
        ldr             q18, [sp]
        ldr             q19, [sp, x10]
        add             sp, sp, x10, lsl #1
        ldr             q20, [sp]
        ldr             q21, [sp, x10]
        add             sp, sp, x10, lsl #1
        ldr             q22, [sp]
        add             sp, sp, x10
1:
        ldr             q23, [sp]               // eighth row completes the window
        add             sp, sp, x10
        QPEL_FILTER_H   v26, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_FILTER_H2  v27, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_UNI_W_HV_8
        subs            w22, w22, #1
        b.eq            2f                      // all rows written

        ldr             q16, [sp]               // newest row replaces the oldest (v16)
        add             sp, sp, x10
        QPEL_FILTER_H   v26, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_FILTER_H2  v27, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_UNI_W_HV_8
        subs            w22, w22, #1
        b.eq            2f

        ldr             q17, [sp]
        add             sp, sp, x10
        QPEL_FILTER_H   v26, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_FILTER_H2  v27, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_UNI_W_HV_8
        subs            w22, w22, #1
        b.eq            2f

        ldr             q18, [sp]
        add             sp, sp, x10
        QPEL_FILTER_H   v26, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_FILTER_H2  v27, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_UNI_W_HV_8
        subs            w22, w22, #1
        b.eq            2f

        ldr             q19, [sp]
        add             sp, sp, x10
        QPEL_FILTER_H   v26, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_FILTER_H2  v27, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_UNI_W_HV_8
        subs            w22, w22, #1
        b.eq            2f

        ldr             q20, [sp]
        add             sp, sp, x10
        QPEL_FILTER_H   v26, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_FILTER_H2  v27, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_UNI_W_HV_8
        subs            w22, w22, #1
        b.eq            2f

        ldr             q21, [sp]
        add             sp, sp, x10
        QPEL_FILTER_H   v26, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_FILTER_H2  v27, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_UNI_W_HV_8
        subs            w22, w22, #1
        b.eq            2f

        ldr             q22, [sp]
        add             sp, sp, x10
        QPEL_FILTER_H   v26, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_FILTER_H2  v27, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_UNI_W_HV_8
        subs            w22, w22, #1
        b.hi            1b                      // window is back in its initial order

2:
        QPEL_UNI_W_HV_END
        ret
endfunc

// Weights and stores one row of sixteen pixels.
// In:  v24-v27 (.4s each) = filter sums for pixels 0-3, 4-7, 8-11, 12-15,
//      v28/v29/v30 = wx / ox / -shift splats,
//      x20 = store address, x21 = post-increment applied after the store.
// Works in place on v24-v27.
.macro QPEL_UNI_W_HV_16
        sshr            v24.4s, v24.4s, #6
        sshr            v25.4s, v25.4s, #6
        sshr            v26.4s, v26.4s, #6
        sshr            v27.4s, v27.4s, #6
        mul             v24.4s, v24.4s, v28.4s  // * wx
        mul             v25.4s, v25.4s, v28.4s
        mul             v26.4s, v26.4s, v28.4s
        mul             v27.4s, v27.4s, v28.4s
        sqrshl          v24.4s, v24.4s, v30.4s  // negative count in v30: rounding right shift
        sqrshl          v25.4s, v25.4s, v30.4s
        sqrshl          v26.4s, v26.4s, v30.4s
        sqrshl          v27.4s, v27.4s, v30.4s
        sqadd           v24.4s, v24.4s, v29.4s  // + ox
        sqadd           v25.4s, v25.4s, v29.4s
        sqadd           v26.4s, v26.4s, v29.4s
        sqadd           v27.4s, v27.4s, v29.4s
        sqxtn           v24.4h, v24.4s
        sqxtn2          v24.8h, v25.4s
        sqxtn           v26.4h, v26.4s
        sqxtn2          v26.8h, v27.4s
        sqxtun          v24.8b, v24.8h          // saturate to unsigned 8 bit
        sqxtun2         v24.16b, v26.8h

        st1             {v24.16b}, [x20], x21   // sixteen bytes out, then advance x20 by x21
.endm

// Vertical eight-tap pass plus weighted store for blocks whose width is a
// multiple of 16 (the 16, 32 and 64 wide uni_w_hv entry points branch here).
// Relies on the state left by QPEL_UNI_W_HV_HEADER: sp at the first
// intermediate row, x10 = row pitch, v0.8h = taps, w22 = height,
// w27 = width, x20/x21 = store address and step, v28/v29/v30 = wx/ox/-shift.
//
// The block is processed in columns of 16 pixels (outer loop at 3:).
// Per column: x11 = read cursor, x14 = start of the column in the scratch
// area, x13 = start of the column in the output, w12 = saved height.
// Unlike the 4 and 8 wide routines sp is left untouched until the epilogue.
// Two rotating eight-row windows are kept: v16-v23 for pixels 0-7 and
// v1-v7 plus v31 for pixels 8-15.
function hevc_put_hevc_qpel_uni_w_hv16_8_end_neon
        mov             x11, sp
        mov             w12, w22
        mov             x13, x20
        mov             x14, sp
3:
        // Prime both windows with the first seven rows of this column.
        ldp             q16, q1, [x11]
        add             x11, x11, x10
        ldp             q17, q2, [x11]
        add             x11, x11, x10
        ldp             q18, q3, [x11]
        add             x11, x11, x10
        ldp             q19, q4, [x11]
        add             x11, x11, x10
        ldp             q20, q5, [x11]
        add             x11, x11, x10
        ldp             q21, q6, [x11]
        add             x11, x11, x10
        ldp             q22, q7, [x11]
        add             x11, x11, x10
1:
        ldp             q23, q31, [x11]         // eighth row completes both windows
        add             x11, x11, x10
        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
        QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
        QPEL_FILTER_H2  v27,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f                      // column finished

        ldp             q16, q1, [x11]          // newest row replaces the oldest pair
        add             x11, x11, x10
        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
        QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
        QPEL_FILTER_H2  v27,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f

        ldp             q17, q2, [x11]
        add             x11, x11, x10
        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
        QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
        QPEL_FILTER_H2  v27,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f

        ldp             q18, q3, [x11]
        add             x11, x11, x10
        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
        QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
        QPEL_FILTER_H2  v27,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f

        ldp             q19, q4, [x11]
        add             x11, x11, x10
        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
        QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
        QPEL_FILTER_H2  v27,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f

        ldp             q20, q5, [x11]
        add             x11, x11, x10
        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
        QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
        QPEL_FILTER_H2  v27,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f

        ldp             q21, q6, [x11]
        add             x11, x11, x10
        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
        QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
        QPEL_FILTER_H2  v27,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.eq            2f

        ldp             q22, q7, [x11]
        add             x11, x11, x10
        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
        QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
        QPEL_FILTER_H2  v27, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
        QPEL_UNI_W_HV_16
        subs            w22, w22, #1
        b.hi            1b                      // windows are back in their initial order
2:
        subs            w27, w27, #16           // one column of 16 pixels done; flags feed the b.hi below
        add             x11, x14, #32           // next column: 16 intermediates = 32 bytes further
        add             x20, x13, #16           // next column in the output: 16 bytes further
        mov             w22, w12                // restore the row counter
        mov             x14, x11
        mov             x13, x20
        b.hi            3b                      // none of the adds/moves above alter the flags
        QPEL_UNI_W_HV_END
        ret
endfunc

// Emits the exported uni_w_hv entry points for widths 4, 8, 16, 32 and 64 on
// top of the horizontal pass named by suffix. Each one expands the common
// prologue QPEL_UNI_W_HV_HEADER and then branches (no link) to a shared
// vertical routine, which runs the epilogue and returns to the original
// caller. Widths 32 and 64 reuse the 16-wide routine, which iterates over
// columns using the width kept in w27 by the prologue.
.macro qpel_uni_w_hv suffix
function ff_hevc_put_hevc_qpel_uni_w_hv4_8_\suffix, export=1
        QPEL_UNI_W_HV_HEADER 4, \suffix
        b               hevc_put_hevc_qpel_uni_w_hv4_8_end_neon
endfunc

function ff_hevc_put_hevc_qpel_uni_w_hv8_8_\suffix, export=1
        QPEL_UNI_W_HV_HEADER 8, \suffix
        b               hevc_put_hevc_qpel_uni_w_hv8_8_end_neon
endfunc

function ff_hevc_put_hevc_qpel_uni_w_hv16_8_\suffix, export=1
        QPEL_UNI_W_HV_HEADER 16, \suffix
        b               hevc_put_hevc_qpel_uni_w_hv16_8_end_neon
endfunc

function ff_hevc_put_hevc_qpel_uni_w_hv32_8_\suffix, export=1
        QPEL_UNI_W_HV_HEADER 32, \suffix
        b               hevc_put_hevc_qpel_uni_w_hv16_8_end_neon    // column loop handles the extra width
endfunc

function ff_hevc_put_hevc_qpel_uni_w_hv64_8_\suffix, export=1
        QPEL_UNI_W_HV_HEADER 64, \suffix
        b               hevc_put_hevc_qpel_uni_w_hv16_8_end_neon    // column loop handles the extra width
endfunc
.endm

// Emit the uni_w_hv entry points on top of the plain NEON horizontal pass.
qpel_uni_w_hv neon

// Vertical pass of the bi-prediction hv filter for four-pixel-wide blocks.
// Branched to from the bi_hv4 entry points with:
//   sp = first row of 16-bit intermediates (row pitch MAX_PB_SIZE * 2 bytes),
//   x7 = filter selector handed to load_qpel_filterh (x6 is its scratch),
//   x4 = src2 rows (same pitch), w5 = row counter,
//   x0 / x1 = store address and step, x14 = value sp is reset to on exit.
// load_qpel_filterh, calc_qpelh and calc_all are defined outside this part of
// the file; calc_all presumably expands calc once per rotation of the
// v16-v23 row window and leaves through label 2 when the counter set by
// subs reaches zero - confirm against its definition.
function hevc_put_hevc_qpel_bi_hv4_8_end_neon
        mov             x9, #(MAX_PB_SIZE * 2)
        load_qpel_filterh x7, x6
        // Prime the window with the first seven rows; sp is the read cursor.
        ld1             {v16.4h}, [sp], x9
        ld1             {v17.4h}, [sp], x9
        ld1             {v18.4h}, [sp], x9
        ld1             {v19.4h}, [sp], x9
        ld1             {v20.4h}, [sp], x9
        ld1             {v21.4h}, [sp], x9
        ld1             {v22.4h}, [sp], x9
// One output row: load the next intermediate row into tmp, filter, add the
// matching src2 row, round-shift by 7, saturate to 8 bit and store 4 bytes.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().4h}, [sp], x9
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
        ld1             {v5.4h}, [x4], x9 // src2
        saddw           v1.4s, v1.4s, v5.4h
        rshrn           v1.4h, v1.4s, #7
        sqxtun          v1.8b, v1.8h
        subs            w5, w5, #1              // flags survive the store below
        st1             {v1.s}[0], [x0], x1
.endm
1:      calc_all
.purgem calc
2:      mov             sp, x14                 // drop the intermediate array
        ret
endfunc

// Vertical pass of the bi-prediction hv filter for six-pixel-wide blocks.
// Same register contract as the four-wide routine: sp = intermediates,
// x7 = filter selector (x6 scratch), x4 = src2, w5 = row counter,
// x0 / x1 = store address and step, x14 = value sp is reset to on exit.
// Eight lanes are computed per row but only six bytes are stored, as a
// 4-byte store followed by a 2-byte store; x1 is reduced by 4 up front so
// that the two post-increments together advance x0 by the original step.
// load_qpel_filterh, calc_qpelh, calc_qpelh2 and calc_all are defined outside
// this part of the file.
function hevc_put_hevc_qpel_bi_hv6_8_end_neon
        mov             x9, #(MAX_PB_SIZE * 2)
        load_qpel_filterh x7, x6
        sub             x1, x1, #4              // compensates the #4 post-increment of the first store
        ld1             {v16.8h}, [sp], x9
        ld1             {v17.8h}, [sp], x9
        ld1             {v18.8h}, [sp], x9
        ld1             {v19.8h}, [sp], x9
        ld1             {v20.8h}, [sp], x9
        ld1             {v21.8h}, [sp], x9
        ld1             {v22.8h}, [sp], x9
// One output row: filter low and high halves, add the src2 row, round-shift
// by 7, saturate to 8 bit, store bytes 0-3 and then bytes 4-5.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8h}, [sp], x9
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
        calc_qpelh2     v2, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
        ld1             {v5.8h}, [x4], x9 // src2
        saddw           v1.4s, v1.4s, v5.4h
        saddw2          v2.4s, v2.4s, v5.8h
        rshrn           v1.4h, v1.4s, #7
        rshrn2          v1.8h, v2.4s, #7
        sqxtun          v1.8b, v1.8h
        st1             {v1.s}[0], [x0], #4
        subs            w5, w5, #1              // flags survive the store below
        st1             {v1.h}[2], [x0], x1
.endm
1:      calc_all
.purgem calc
2:      mov             sp, x14                 // drop the intermediate array
        ret
endfunc

// Vertical pass of the bi-prediction hv filter for eight-pixel-wide blocks.
// Register contract: sp = first row of 16-bit intermediates (row pitch
// MAX_PB_SIZE * 2 bytes), x7 = filter selector (x6 scratch), x4 = src2,
// w5 = row counter, x0 / x1 = store address and step, x14 = value sp is
// reset to on exit.
// load_qpel_filterh, calc_qpelh, calc_qpelh2 and calc_all are defined outside
// this part of the file.
function hevc_put_hevc_qpel_bi_hv8_8_end_neon
        mov             x9, #(MAX_PB_SIZE * 2)
        load_qpel_filterh x7, x6
        // Prime the window with the first seven rows; sp is the read cursor.
        ld1             {v16.8h}, [sp], x9
        ld1             {v17.8h}, [sp], x9
        ld1             {v18.8h}, [sp], x9
        ld1             {v19.8h}, [sp], x9
        ld1             {v20.8h}, [sp], x9
        ld1             {v21.8h}, [sp], x9
        ld1             {v22.8h}, [sp], x9
// One output row: filter low and high halves, add the src2 row, round-shift
// by 7, saturate to 8 bit and store eight bytes.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
        ld1             {\tmp\().8h}, [sp], x9
        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
        calc_qpelh2     v2, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
        ld1             {v5.8h}, [x4], x9 // src2
        saddw           v1.4s, v1.4s, v5.4h
        saddw2          v2.4s, v2.4s, v5.8h
        rshrn           v1.4h, v1.4s, #7
        rshrn2          v1.8h, v2.4s, #7
        sqxtun          v1.8b, v1.8h
        subs            w5, w5, #1              // flags survive the store below
        st1             {v1.8b}, [x0], x1
.endm
1:      calc_all
.purgem calc
2:      mov             sp, x14                 // drop the intermediate array
        ret
endfunc

// Vertical pass of the bi-prediction hv filter for widths that are a
// multiple of 16 (used by the 16, 32, 48 and 64 wide entry points).
// Register contract: sp = first row of 16-bit intermediates, x7 = filter
// selector (x8 scratch), x6 = width, x4 = src2, w5 = height, x0 / x1 = output
// pointer and stride, x14 = value sp is reset to on exit.
//
// The outer loop at 0: handles one column of 16 pixels per pass. Per column
// x8 / x12 / x7 are the running intermediate, src2 and output pointers and
// x11 the row counter; afterwards x0 moves on by 16 bytes, sp and x4 by
// 32 bytes (16 values of 16 bit) and x10 counts the remaining width down.
// Rows are kept as register pairs, v16/v17 up to v30/v31, with the even
// register holding pixels 0-7 and the odd one pixels 8-15.
// load_qpel_filterh, calc_qpelh, calc_qpelh2 and calc_all2 are defined
// outside this part of the file.
function hevc_put_hevc_qpel_bi_hv16_8_end_neon
        load_qpel_filterh x7, x8
        mov             x9, #(MAX_PB_SIZE * 2)
        mov             x10, x6                 // remaining width
0:      mov             x8, sp          // src
        ld1             {v16.8h, v17.8h}, [x8], x9
        mov             w11, w5         // height
        ld1             {v18.8h, v19.8h}, [x8], x9
        mov             x12, x4         // src2
        ld1             {v20.8h, v21.8h}, [x8], x9
        mov             x7, x0          // dst
        ld1             {v22.8h, v23.8h}, [x8], x9
        ld1             {v24.8h, v25.8h}, [x8], x9
        ld1             {v26.8h, v27.8h}, [x8], x9
        ld1             {v28.8h, v29.8h}, [x8], x9
// One output row of 16 pixels: four filter sums (low/high halves of both
// registers of the pair), plus the src2 row, round-shift by 7, saturate to
// 8 bit and store sixteen bytes.
.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
        ld1             {\tmp0\().8h, \tmp1\().8h}, [x8], x9
        calc_qpelh      v1,     \src0,  \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sshr
        calc_qpelh2     v2, v2, \src0, \src1,  \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sshr
        calc_qpelh      v3,     \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sshr
        calc_qpelh2     v4, v4, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sshr
        ld1             {v5.8h, v6.8h}, [x12], x9 // src2
        saddw           v1.4s, v1.4s, v5.4h
        saddw2          v2.4s, v2.4s, v5.8h
        saddw           v3.4s, v3.4s, v6.4h
        saddw2          v4.4s, v4.4s, v6.8h
        rshrn           v1.4h, v1.4s, #7
        rshrn2          v1.8h, v2.4s, #7
        rshrn           v2.4h, v3.4s, #7
        rshrn2          v2.8h, v4.4s, #7
        sqxtun          v1.8b, v1.8h
        sqxtun2         v1.16b, v2.8h
        subs            x11, x11, #1            // flags survive the store below
        st1             {v1.16b}, [x7], x1
.endm
1:      calc_all2
.purgem calc
2:      add             x0, x0, #16             // next output column
        add             sp, sp, #32             // next column of intermediates
        subs            x10, x10, #16
        add             x4, x4, #32             // next src2 column; add leaves the flags alone
        b.ne            0b
        mov             sp, x14                 // drop the intermediate array
        ret
endfunc

.macro qpel_bi_hv suffix
// Bi-prediction hv entry point, width 4.
// Reserves (w5 + 8) * 128 bytes of stack as the intermediate array, parks
// x7, x30, x4, x5, x0, x1 and the entry sp (x14) in a 64-byte frame below
// it, then calls the horizontal function with
//   x0 = intermediate array, x1 = x2 - 3 * x3, x2 = x3, w3 = w5 + 7, x4 = x6.
// Afterwards the frame is popped, which leaves sp at the start of the array,
// and control branches (no link) to the shared vertical routine.
// NOTE(review): by the way they are forwarded, x2/x3 are presumably the
// source pointer and stride, w5 the height, x6/x7 the horizontal/vertical
// filter indices - confirm against the C prototype.
function ff_hevc_put_hevc_qpel_bi_hv4_8_\suffix, export=1
        add             w10, w5, #8
        lsl             x10, x10, #7            // x10 = (w5 + 8) * 128 bytes
        mov             x14, sp                 // x14 = sp at entry
        sub             sp, sp, x10 // tmp_array
        stp             x7, x30, [sp, #-64]!
        stp             x4, x5, [sp, #16]
        stp             x0, x1, [sp, #32]
        str             x14,    [sp, #48]
        sub             x1, x2, x3, lsl #1
        sub             x1, x1, x3              // x1 = x2 - 3 * x3
        add             x0, sp, #64             // horizontal pass writes into the array
        mov             x2, x3
        add             w3, w5, #7              // seven extra rows for the horizontal pass
        mov             x4, x6
        bl              X(ff_hevc_put_hevc_qpel_h4_8_\suffix)
        ldp             x4, x5, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldr             x14,    [sp, #48]
        ldp             x7, x30, [sp], #64      // frame popped: sp is back at the array start
        b               hevc_put_hevc_qpel_bi_hv4_8_end_neon
endfunc

// Bi-prediction hv entry point, width 6.
// Reserves (w5 + 8) * 128 bytes of stack as the intermediate array, parks
// x7, x30, x4, x5, x0, x1 and the entry sp (x14) in a 64-byte frame below
// it, then calls the horizontal function with
//   x0 = intermediate array, x1 = x2 - 3 * x3, x2 = x3, w3 = w5 + 7, x4 = x6.
// Afterwards the frame is popped, which leaves sp at the start of the array,
// and control branches (no link) to the shared vertical routine.
function ff_hevc_put_hevc_qpel_bi_hv6_8_\suffix, export=1
        add             w10, w5, #8
        lsl             x10, x10, #7            // x10 = (w5 + 8) * 128 bytes
        mov             x14, sp                 // x14 = sp at entry
        sub             sp, sp, x10         // tmp_array
        stp             x7, x30, [sp, #-64]!
        stp             x4, x5, [sp, #16]
        stp             x0, x1, [sp, #32]
        str             x14,    [sp, #48]
        sub             x1, x2, x3, lsl #1
        sub             x1, x1, x3              // x1 = x2 - 3 * x3
        add             x0, sp, #64             // horizontal pass writes into the array
        mov             x2, x3
        // The count arrives as a 32-bit value, so form the row count on the
        // w view like the array size above and like the other widths do;
        // this also clears the upper half of x3, whose content in x5 is not
        // defined by the calling convention.
        add             w3, w5, #7
        mov             x4, x6
        bl              X(ff_hevc_put_hevc_qpel_h6_8_\suffix)
        ldp             x4, x5, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldr             x14,    [sp, #48]
        ldp             x7, x30, [sp], #64      // frame popped: sp is back at the array start
        b               hevc_put_hevc_qpel_bi_hv6_8_end_neon
endfunc

// Bi-prediction hv entry point, width 8.
// Reserves (w5 + 8) * 128 bytes of stack as the intermediate array, parks
// x7, x30, x4, x5, x0, x1 and the entry sp (x14) in a 64-byte frame below
// it, then calls the horizontal function with
//   x0 = intermediate array, x1 = x2 - 3 * x3, x2 = x3, w3 = w5 + 7, x4 = x6.
// Afterwards the frame is popped, which leaves sp at the start of the array,
// and control branches (no link) to the shared vertical routine.
function ff_hevc_put_hevc_qpel_bi_hv8_8_\suffix, export=1
        add             w10, w5, #8
        lsl             x10, x10, #7            // x10 = (w5 + 8) * 128 bytes
        mov             x14, sp                 // x14 = sp at entry
        sub             sp, sp, x10         // tmp_array
        stp             x7, x30, [sp, #-64]!
        stp             x4, x5, [sp, #16]
        stp             x0, x1, [sp, #32]
        str             x14,    [sp, #48]
        sub             x1, x2, x3, lsl #1
        sub             x1, x1, x3              // x1 = x2 - 3 * x3
        add             x0, sp, #64             // horizontal pass writes into the array
        mov             x2, x3
        // The count arrives as a 32-bit value, so form the row count on the
        // w view like the array size above and like the other widths do;
        // this also clears the upper half of x3, whose content in x5 is not
        // defined by the calling convention.
        add             w3, w5, #7
        mov             x4, x6
        bl              X(ff_hevc_put_hevc_qpel_h8_8_\suffix)
        ldp             x4, x5, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldr             x14,    [sp, #48]
        ldp             x7, x30, [sp], #64      // frame popped: sp is back at the array start
        b               hevc_put_hevc_qpel_bi_hv8_8_end_neon
endfunc

// Bi-prediction hv entry point, width 12: an 8-wide call followed by a
// 4-wide call on the columns to its right.
// x0-x7 and x30 are parked in an 80-byte frame across the first call. For the
// second call x0 and x2 move on by 8 bytes and x4 by 16 bytes. The slot
// holding x30 stays allocated until the very end.
function ff_hevc_put_hevc_qpel_bi_hv12_8_\suffix, export=1
        stp             x6, x7, [sp, #-80]!     // opens the frame, so it has to come first
        str             x30, [sp, #64]
        stp             x0, x1, [sp, #48]
        stp             x2, x3, [sp, #32]
        stp             x4, x5, [sp, #16]
        bl              X(ff_hevc_put_hevc_qpel_bi_hv8_8_\suffix)
        ldp             x0, x1, [sp, #48]
        ldp             x2, x3, [sp, #32]
        ldp             x4, x5, [sp, #16]
        ldp             x6, x7, [sp], #64       // releases everything except the x30 slot
        add             x0, x0, #8
        add             x2, x2, #8
        add             x4, x4, #16
        bl              X(ff_hevc_put_hevc_qpel_bi_hv4_8_\suffix)
        ldr             x30, [sp], #16          // reload the return address, release the last slot
        ret
endfunc

// Bi-prediction hv entry point, width 16.
// Reserves (w5 + 8) * 128 bytes of stack as the intermediate array, parks
// x7, x30, x4, x5, x0, x1 and the entry sp (x14) in a 64-byte frame below
// it, then calls the horizontal function with
//   x0 = intermediate array, x1 = x2 - 3 * x3, x2 = x3, w3 = w5 + 7, x4 = x6.
// Afterwards the frame is popped, x6 is loaded with the width and control
// branches (no link) to the shared 16-column vertical routine.
function ff_hevc_put_hevc_qpel_bi_hv16_8_\suffix, export=1
        add             w10, w5, #8
        lsl             x10, x10, #7            // x10 = (w5 + 8) * 128 bytes
        mov             x14, sp                 // x14 = sp at entry
        sub             sp, sp, x10         // tmp_array
        stp             x7, x30, [sp, #-64]!
        stp             x4, x5, [sp, #16]
        stp             x0, x1, [sp, #32]
        str             x14,    [sp, #48]
        add             x0, sp, #64             // horizontal pass writes into the array
        sub             x1, x2, x3, lsl #1
        sub             x1, x1, x3              // x1 = x2 - 3 * x3
        mov             x2, x3
        add             w3, w5, #7              // seven extra rows for the horizontal pass
        mov             x4, x6
        bl              X(ff_hevc_put_hevc_qpel_h16_8_\suffix)
        ldp             x4, x5, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldr             x14,    [sp, #48]
        ldp             x7, x30, [sp], #64      // frame popped: sp is back at the array start
        mov             x6, #16          // width
        b               hevc_put_hevc_qpel_bi_hv16_8_end_neon
endfunc

// Bi-prediction hv entry point, width 24: a 16-wide call followed by an
// 8-wide call on the columns to its right.
// x0-x7 and x30 are parked in an 80-byte frame across the first call. For the
// second call x0 and x2 move on by 16 bytes and x4 by 32 bytes. The slot
// holding x30 stays allocated until the very end.
function ff_hevc_put_hevc_qpel_bi_hv24_8_\suffix, export=1
        stp             x6, x7, [sp, #-80]!     // opens the frame, so it has to come first
        str             x30, [sp, #64]
        stp             x0, x1, [sp, #48]
        stp             x2, x3, [sp, #32]
        stp             x4, x5, [sp, #16]
        bl              X(ff_hevc_put_hevc_qpel_bi_hv16_8_\suffix)
        ldp             x0, x1, [sp, #48]
        ldp             x2, x3, [sp, #32]
        ldp             x4, x5, [sp, #16]
        ldp             x6, x7, [sp], #64       // releases everything except the x30 slot
        add             x0, x0, #16
        add             x2, x2, #16
        add             x4, x4, #32
        bl              X(ff_hevc_put_hevc_qpel_bi_hv8_8_\suffix)
        ldr             x30, [sp], #16          // reload the return address, release the last slot
        ret
endfunc

// Bi-prediction hv entry point, width 32.
// Same frame and argument shuffle as the 16-wide entry point; additionally
// w6 is set to 32 before the horizontal call (after its old value has been
// copied to x4), and x6 = 32 is handed to the shared 16-column vertical
// routine as the width.
function ff_hevc_put_hevc_qpel_bi_hv32_8_\suffix, export=1
        add             w10, w5, #8
        lsl             x10, x10, #7            // x10 = (w5 + 8) * 128 bytes
        mov             x14, sp                 // x14 = sp at entry
        sub             sp, sp, x10         // tmp_array
        stp             x7, x30, [sp, #-64]!
        stp             x4, x5, [sp, #16]
        stp             x0, x1, [sp, #32]
        str             x14,    [sp, #48]
        add             x0, sp, #64             // horizontal pass writes into the array
        sub             x1, x2, x3, lsl #1
        mov             x2, x3
        sub             x1, x1, x3              // x1 = original x2 - 3 * x3
        add             w3, w5, #7              // seven extra rows for the horizontal pass
        mov             x4, x6
        mov             w6, #32                 // presumably the width argument of the callee - confirm
        bl              X(ff_hevc_put_hevc_qpel_h32_8_\suffix)
        ldp             x4, x5, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldr             x14,    [sp, #48]
        ldp             x7, x30, [sp], #64      // frame popped: sp is back at the array start
        mov             x6, #32 // width
        b               hevc_put_hevc_qpel_bi_hv16_8_end_neon
endfunc

// Bi-pred HV qpel, 8-bit, 48 columns.
// Same structure as the 32-column variant: horizontal pass into a temporary
// stack array, then a tail-branch to hevc_put_hevc_qpel_bi_hv16_8_end_neon
// with x6 = 48.
// NOTE(review): the shared tail is outside this chunk; it is expected to read
// the temporary array at sp and to restore sp from x14 -- confirm there.
function ff_hevc_put_hevc_qpel_bi_hv48_8_\suffix, export=1
        add             w10, w5, #8
        lsl             x10, x10, #7        // x10 = (w5 + 8) * 128 bytes (128 = MAX_PB_SIZE * 2, presumably int16_t rows)
        mov             x14, sp             // x14 = sp on entry, kept for later stack restore
        sub             sp, sp, x10 // tmp_array
        stp             x7, x30, [sp, #-64]!    // 64-byte save area directly below tmp_array
        stp             x4, x5, [sp, #16]
        stp             x0, x1, [sp, #32]
        str             x14,    [sp, #48]
        add             x0, sp, #64         // helper arg0 = start of tmp_array (just above the save area)
        sub             x1, x2, x3, lsl #1
        mov             x2, x3              // helper arg2 = x3 (presumably the source stride)
        sub             x1, x1, x3          // helper arg1 = x2 - 3 * x3
        add             w3, w5, #7          // helper arg3 = w5 + 7
        mov             x4, x6              // helper arg4 = incoming x6
// When the macro suffix is exactly "neon", the h32 routine is called with
// w6 = 48 (presumably it iterates over the width given in w6 -- confirm).
// For any other suffix a dedicated h48 routine is called and w6 is not set.
.ifc \suffix, neon
        mov             w6, #48
        bl              X(ff_hevc_put_hevc_qpel_h32_8_\suffix)
.else
        bl              X(ff_hevc_put_hevc_qpel_h48_8_\suffix)
.endif
        ldp             x4, x5, [sp, #16]   // reload the values saved before the call
        ldp             x0, x1, [sp, #32]
        ldr             x14,    [sp, #48]
        ldp             x7, x30, [sp], #64  // drop the save area; sp is back at tmp_array
        mov             x6, #48 // width
        b               hevc_put_hevc_qpel_bi_hv16_8_end_neon   // tail branch: x30 still holds the return address of our caller
endfunc

// Bi-pred HV qpel, 8-bit, 64 columns.
// Same structure as the 32- and 48-column variants: horizontal pass into a
// temporary stack array, then a tail-branch to
// hevc_put_hevc_qpel_bi_hv16_8_end_neon with x6 = 64.
// NOTE(review): the shared tail is outside this chunk; it is expected to read
// the temporary array at sp and to restore sp from x14 -- confirm there.
function ff_hevc_put_hevc_qpel_bi_hv64_8_\suffix, export=1
        add             w10, w5, #8
        lsl             x10, x10, #7        // x10 = (w5 + 8) * 128 bytes (128 = MAX_PB_SIZE * 2, presumably int16_t rows)
        mov             x14, sp             // x14 = sp on entry, kept for later stack restore
        sub             sp, sp, x10 // tmp_array
        stp             x7, x30, [sp, #-64]!    // 64-byte save area directly below tmp_array
        stp             x4, x5, [sp, #16]
        stp             x0, x1, [sp, #32]
        str             x14,    [sp, #48]
        add             x0, sp, #64         // helper arg0 = start of tmp_array (just above the save area)
        sub             x1, x2, x3, lsl #1
        mov             x2, x3              // helper arg2 = x3 (presumably the source stride)
        sub             x1, x1, x3          // helper arg1 = x2 - 3 * x3
        add             w3, w5, #7          // helper arg3 = w5 + 7
        mov             x4, x6              // helper arg4 = incoming x6
// When the macro suffix is exactly "neon", the h32 routine is called with
// w6 = 64 (presumably it iterates over the width given in w6 -- confirm).
// For any other suffix a dedicated h64 routine is called and w6 is not set.
.ifc \suffix, neon
        mov             w6, #64
        bl              X(ff_hevc_put_hevc_qpel_h32_8_\suffix)
.else
        bl              X(ff_hevc_put_hevc_qpel_h64_8_\suffix)
.endif
        ldp             x4, x5, [sp, #16]   // reload the values saved before the call
        ldp             x0, x1, [sp, #32]
        ldr             x14,    [sp, #48]
        ldp             x7, x30, [sp], #64  // drop the save area; sp is back at tmp_array
        mov             x6, #64          // width
        b               hevc_put_hevc_qpel_bi_hv16_8_end_neon   // tail branch: x30 still holds the return address of our caller
endfunc
.endm

// Expand the bi-pred HV macro with suffix "neon".
qpel_bi_hv neon

// The "neon_i8mm" expansions are only assembled when HAVE_I8MM is set, and
// are bracketed by ENABLE_I8MM / DISABLE_I8MM (defined outside this chunk).
#if HAVE_I8MM
ENABLE_I8MM

qpel_uni_w_hv neon_i8mm

qpel_bi_hv neon_i8mm

DISABLE_I8MM
#endif // HAVE_I8MM