aarch64: hevc: Specialize put_hevc_\type\()_h*_8_neon for horizontal looping

For widths of 32 pixels and more, loop first horizontally,
then vertically.

Previously, this function would process a 16 pixel wide slice
of the block, looping vertically. After processing the whole
height, it would backtrack and process the next 16 pixel wide
slice.

When doing 8tap filtering horizontally, the function must load
7 more pixels (in practice, 8) following the actual inputs, and
this was done for each slice.

By iterating first horizontally throughout each line, then
vertically, we access data in a more cache friendly order, and
we don't need to reload data unnecessarily.

Keep the original order in put_hevc_\type\()_h12_8_neon; the
only suboptimal case there is for width=24. But specializing
an optimal variant for that would require more code, which
might not be worth it.

For the h16 case, this implementation would give a slowdown,
as it now loads the first 8 pixels separately from the rest, but
for larger widths, it is a gain. Therefore, keep the h16 case
as it was (but remove the outer loop), and create a new specialized
version for horizontal looping with 16 pixels at a time.

Before:                    Cortex A53      A72      A73  Graviton 3
put_hevc_qpel_h16_8_neon:       710.5    667.7    692.5       211.0
put_hevc_qpel_h32_8_neon:      2791.5   2643.5   2732.0       883.5
put_hevc_qpel_h64_8_neon:     10954.0  10657.0  10874.2      3241.5
After:
put_hevc_qpel_h16_8_neon:       697.5    663.5    705.7       212.5
put_hevc_qpel_h32_8_neon:      2767.2   2684.5   2791.2       920.5
put_hevc_qpel_h64_8_neon:     10559.2  10471.5  10932.2      3051.7

Signed-off-by: Martin Storsjö <martin@martin.st>
10 months ago · 717cc82d28
parent e3a54cabde
commit 717cc82d28
2 changed files with 94 additions and 29 deletions
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -109,6 +109,8 @@ void ff_hevc_put_hevc_qpel_h12_8_neon(int16_t *dst, const uint8_t *_src, ptrdiff
                                      intptr_t mx, intptr_t my, int width);
 void ff_hevc_put_hevc_qpel_h16_8_neon(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height,
                                      intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h32_8_neon(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height,
+                                      intptr_t mx, intptr_t my, int width);
 void ff_hevc_put_hevc_qpel_uni_h4_8_neon(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
                                         ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,
                                         int width);
@@ -124,6 +126,9 @@ void ff_hevc_put_hevc_qpel_uni_h12_8_neon(uint8_t *_dst, ptrdiff_t _dststride, c
 void ff_hevc_put_hevc_qpel_uni_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
                                          ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t
                                          my, int width);
+void ff_hevc_put_hevc_qpel_uni_h32_8_neon(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
+                                          ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t
+                                          my, int width);
 void ff_hevc_put_hevc_qpel_bi_h4_8_neon(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
                                        ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
                                        mx, intptr_t my, int width);
@@ -139,6 +144,9 @@ void ff_hevc_put_hevc_qpel_bi_h12_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
 void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
                                         ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
                                         mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_bi_h32_8_neon(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
+                                         ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
+                                         mx, intptr_t my, int width);
 #define NEON8_FNPROTO(fn, args, ext) \
    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
@@ -335,28 +343,28 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
        c->put_hevc_qpel[3][0][1]      = ff_hevc_put_hevc_qpel_h8_8_neon;
        c->put_hevc_qpel[4][0][1]      =
        c->put_hevc_qpel[6][0][1]      = ff_hevc_put_hevc_qpel_h12_8_neon;
-        c->put_hevc_qpel[5][0][1]      =
+        c->put_hevc_qpel[5][0][1]      = ff_hevc_put_hevc_qpel_h16_8_neon;
        c->put_hevc_qpel[7][0][1]      =
        c->put_hevc_qpel[8][0][1]      =
-        c->put_hevc_qpel[9][0][1]      = ff_hevc_put_hevc_qpel_h16_8_neon;
+        c->put_hevc_qpel[9][0][1]      = ff_hevc_put_hevc_qpel_h32_8_neon;
        c->put_hevc_qpel_uni[1][0][1]  = ff_hevc_put_hevc_qpel_uni_h4_8_neon;
        c->put_hevc_qpel_uni[2][0][1]  = ff_hevc_put_hevc_qpel_uni_h6_8_neon;
        c->put_hevc_qpel_uni[3][0][1]  = ff_hevc_put_hevc_qpel_uni_h8_8_neon;
        c->put_hevc_qpel_uni[4][0][1]  =
        c->put_hevc_qpel_uni[6][0][1]  = ff_hevc_put_hevc_qpel_uni_h12_8_neon;
-        c->put_hevc_qpel_uni[5][0][1]  =
+        c->put_hevc_qpel_uni[5][0][1]  = ff_hevc_put_hevc_qpel_uni_h16_8_neon;
        c->put_hevc_qpel_uni[7][0][1]  =
        c->put_hevc_qpel_uni[8][0][1]  =
-        c->put_hevc_qpel_uni[9][0][1]  = ff_hevc_put_hevc_qpel_uni_h16_8_neon;
+        c->put_hevc_qpel_uni[9][0][1]  = ff_hevc_put_hevc_qpel_uni_h32_8_neon;
        c->put_hevc_qpel_bi[1][0][1]   = ff_hevc_put_hevc_qpel_bi_h4_8_neon;
        c->put_hevc_qpel_bi[2][0][1]   = ff_hevc_put_hevc_qpel_bi_h6_8_neon;
        c->put_hevc_qpel_bi[3][0][1]   = ff_hevc_put_hevc_qpel_bi_h8_8_neon;
        c->put_hevc_qpel_bi[4][0][1]   =
        c->put_hevc_qpel_bi[6][0][1]   = ff_hevc_put_hevc_qpel_bi_h12_8_neon;
-        c->put_hevc_qpel_bi[5][0][1]   =
+        c->put_hevc_qpel_bi[5][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
        c->put_hevc_qpel_bi[7][0][1]   =
        c->put_hevc_qpel_bi[8][0][1]   =
-        c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
+        c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h32_8_neon;
        NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels,);
        NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,);
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -383,11 +383,9 @@ endfunc
 .ifc \type, qpel
 function ff_hevc_put_hevc_h16_8_neon, export=0
-        uxtl            v16.8h,  v16.8b
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b
-        uxtl            v19.8h,  v19.8b
        uxtl            v20.8h,  v20.8b
        uxtl            v21.8h,  v21.8b
@@ -408,7 +406,6 @@ function ff_hevc_put_hevc_h16_8_neon, export=0
        mla             v28.8h,  v24.8h, v0.h[\i]
        mla             v29.8h,  v25.8h, v0.h[\i]
 .endr
-        subs            x9, x9, #2
        ret
 endfunc
 .endif
@@ -439,7 +436,10 @@ function ff_hevc_put_hevc_\type\()_h12_8_neon, export=1
 1:      ld1             {v16.8b-v18.8b}, [src], x13
        ld1             {v19.8b-v21.8b}, [x12], x13
+        uxtl            v16.8h,  v16.8b
+        uxtl            v19.8h,  v19.8b
        bl              ff_hevc_put_hevc_h16_8_neon
+        subs            x9, x9, #2
 .ifc \type, qpel
        st1             {v26.8h}, [dst], #16
@@ -504,7 +504,6 @@ function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
 .ifc \type, qpel_bi
        ldrh            w8, [sp] // width
        mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
        lsl             x17, x5, #7 // src2b reset
        add             x15, x4, #(MAX_PB_SIZE << 1) // src2b
 .endif
        sub             src, src, #3
@@ -519,11 +518,14 @@ function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
 .endif
        add             x10, dst, dststride // dstb
        add             x12, src, srcstride // srcb
-0:      mov             x9, height
+
 1:      ld1             {v16.8b-v18.8b}, [src], x13
        ld1             {v19.8b-v21.8b}, [x12], x13
+        uxtl            v16.8h,  v16.8b
+        uxtl            v19.8h,  v19.8b
        bl              ff_hevc_put_hevc_h16_8_neon
+        subs            height, height, #2
 .ifc \type, qpel
        st1             {v26.8h, v27.8h}, [dst], x14
@@ -550,28 +552,83 @@ function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
        st1             {v28.8b, v29.8b}, [x10], x14
 .endif
        b.gt            1b // double line
-        subs            width, width, #16
+        ret             mx
-        // reset src
+endfunc
-        msub            src, srcstride, height, src
+
-        msub            x12, srcstride, height, x12
+function ff_hevc_put_hevc_\type\()_h32_8_neon, export=1
-        // reset dst
+        load_filter     mx
-        msub            dst, dststride, height, dst
+        sxtw            height, heightw
-        msub            x10, dststride, height, x10
+        mov             mx, x30
 .ifc \type, qpel_bi
-        // reset xsrc
+        ldrh            w8, [sp] // width
-        sub             x4,  x4,  x17
+        mov             x16, #(MAX_PB_SIZE << 2) // src2bstridel
-        sub             x15, x15, x17
+        lsl             x17, x5, #7 // src2b reset
-        add             x4,  x4,  #32
+        add             x15, x4, #(MAX_PB_SIZE << 1) // src2b
-        add             x15, x15, #32
+        sub             x16, x16, width, uxtw #1
 .endif
-        add             src, src, #16
+        sub             src, src, #3
-        add             x12, x12, #16
+        mov             mx, x30
 .ifc \type, qpel
        mov             dststride, #(MAX_PB_SIZE << 1)
        lsl             x13, srcstride, #1 // srcstridel
        mov             x14, #(MAX_PB_SIZE << 2)
        sub             x14, x14, width, uxtw #1
 .else
        lsl             x14, dststride, #1 // dststridel
        lsl             x13, srcstride, #1 // srcstridel
        sub             x14, x14, width, uxtw
 .endif
        sub             x13, x13, width, uxtw
        sub             x13, x13, #8
        add             x10, dst, dststride // dstb
        add             x12, src, srcstride // srcb
 0:      mov             w9, width
        ld1             {v16.8b}, [src], #8
        ld1             {v19.8b}, [x12], #8
        uxtl            v16.8h, v16.8b
        uxtl            v19.8h, v19.8b
 1:
        ld1             {v17.8b-v18.8b}, [src], #16
        ld1             {v20.8b-v21.8b}, [x12], #16
        bl              ff_hevc_put_hevc_h16_8_neon
        subs            w9, w9, #16
        mov             v16.16b, v18.16b
        mov             v19.16b, v21.16b
 .ifc \type, qpel
-        add             dst, dst, #32
+        st1             {v26.8h, v27.8h}, [dst], #32
-        add             x10, x10, #32
+        st1             {v28.8h, v29.8h}, [x10], #32
 .else
 .ifc \type, qpel_bi
        ld1             {v20.8h, v21.8h}, [ x4], #32
        ld1             {v22.8h, v23.8h}, [x15], #32
        sqadd           v26.8h, v26.8h, v20.8h
        sqadd           v27.8h, v27.8h, v21.8h
        sqadd           v28.8h, v28.8h, v22.8h
        sqadd           v29.8h, v29.8h, v23.8h
        sqrshrun        v26.8b, v26.8h, #7
        sqrshrun        v27.8b, v27.8h, #7
        sqrshrun        v28.8b, v28.8h, #7
        sqrshrun        v29.8b, v29.8h, #7
 .else
-        add             dst, dst, #16
+        sqrshrun        v26.8b, v26.8h, #6
-        add             x10, x10, #16
+        sqrshrun        v27.8b, v27.8h, #6
        sqrshrun        v28.8b, v28.8h, #6
        sqrshrun        v29.8b, v29.8h, #6
 .endif
        st1             {v26.8b, v27.8b}, [dst], #16
        st1             {v28.8b, v29.8b}, [x10], #16
 .endif
        b.gt            1b // double line
        subs            height, height, #2
        add             src, src, x13
        add             x12, x12, x13
        add             dst, dst, x14
        add             x10, x10, x14
 .ifc \type, qpel_bi
        add             x4,  x4,  x16
        add             x15, x15, x16
 .endif
        b.gt            0b
        ret             mx