VP8: ARM optimised decode_block_coeffs_internal

Approximately 5% faster on Cortex-A8. Signed-off-by: Mans Rullgard <mans@mansr.com> (cherry picked from commit a7878c9f73)
14 years ago · 4ae3ee4ae9
parent 5da7494dc5
commit 4ae3ee4ae9
6 changed files with 260 additions and 3 deletions
--- a/2
+++ b/2
@ -66,7 +66,7 @@ config.h: .config
 SUBDIR_VARS := OBJS FFLIBS CLEANFILES DIRS TESTPROGS EXAMPLES SKIPHEADERS \
               ALTIVEC-OBJS MMX-OBJS NEON-OBJS X86-OBJS YASM-OBJS-FFT YASM-OBJS \
-               HOSTPROGS BUILT_HEADERS TESTOBJS ARCH_HEADERS
+               HOSTPROGS BUILT_HEADERS TESTOBJS ARCH_HEADERS ARMV6-OBJS
 define RESET
 $(1) :=
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@ -3,6 +3,7 @@ OBJS-$(CONFIG_DCA_DECODER)             += arm/dcadsp_init_arm.o         \
 OBJS-$(CONFIG_VP5_DECODER)             += arm/vp56dsp_init_arm.o
 OBJS-$(CONFIG_VP6_DECODER)             += arm/vp56dsp_init_arm.o
 OBJS-$(CONFIG_VP8_DECODER)             += arm/vp8dsp_init_arm.o
 ARMV6-OBJS-$(CONFIG_VP8_DECODER)       += arm/vp8_armv6.o
 OBJS-$(CONFIG_H264DSP)                 += arm/h264dsp_init_arm.o
 OBJS-$(CONFIG_H264PRED)                += arm/h264pred_init_arm.o
@ -23,6 +24,7 @@ OBJS-$(HAVE_ARMV5TE)                   += arm/dsputil_init_armv5te.o    \
 OBJS-$(HAVE_ARMV6)                     += arm/dsputil_init_armv6.o      \
                                          arm/dsputil_armv6.o           \
                                          arm/simple_idct_armv6.o       \
                                          $(ARMV6-OBJS-yes)
 VFP-OBJS-$(HAVE_ARMV6)                 += arm/fmtconvert_vfp.o          \
--- a/libavcodec/arm/vp8.h
+++ b/libavcodec/arm/vp8.h
@ -0,0 +1,29 @@
 /**
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 /*
  * ARM-specific overrides for the VP8 decoder.
  *
  * NOTE(review): no headers are included here, so the includer must already
  * have declared VP56RangeCoder, DCTELEM, NUM_DCT_TOKENS and the stdint
  * types before including this file -- confirm the include order.
  */
 #ifndef AVCODEC_ARM_VP8_H
 #define AVCODEC_ARM_VP8_H
 #if HAVE_ARMV6
 /*
  * Defining decode_block_coeffs_internal as a macro makes every use of that
  * name resolve to the assembly routine; code guarded by
  * "#ifndef decode_block_coeffs_internal" is compiled out when this is set.
  */
 #define decode_block_coeffs_internal ff_decode_block_coeffs_armv6
 /*
  * Assembly coefficient decoder (implemented in vp8_armv6.S).
  *
  * rc         range coder state, read on entry and written back on exit
  * block      destination for the decoded coefficients
  * probs      token probability table
  * i          index of the first coefficient to decode
  * token_prob probabilities used for the first token
  * qmul       two multipliers; the first is used only when i is 0
  *
  * Returns the value of the coefficient index when decoding stopped.
  */
 int ff_decode_block_coeffs_armv6(VP56RangeCoder *rc, DCTELEM block[16],
                                 uint8_t probs[8][3][NUM_DCT_TOKENS-1],
                                 int i, uint8_t *token_prob, int16_t qmul[2]);
 #endif
 #endif
--- a/libavcodec/arm/vp8_armv6.S
+++ b/libavcodec/arm/vp8_armv6.S
@ -0,0 +1,220 @@
 /**
 * Copyright (C) 2010 Mans Rullgard
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 #include "asm.S"
        .syntax         unified
 @ rac_get_prob: renormalise the range coder and decode one bit with
 @ probability pr.
 @   h   - in: current high value; out: updated high value
 @   bs  - bit counter; when adding the shift carries out, 16 more input
 @         bits are fetched and the counter is reduced by 16
 @   buf - input pointer, post-incremented by 2 on each refill
 @   cw  - code word
 @   pr  - probability for this bit
 @   t0  - in: shift amount (callers load it from ff_vp56_norm_shift
 @         indexed by h); clobbered, holds the shifted high on exit
 @   t1  - scratch, receives the refill halfword
 @ Flags on exit come from the cmp: GE means the decoded bit is 1 and
 @ LT means it is 0, so callers branch or predicate on ge/lt directly.
 .macro rac_get_prob     h, bs, buf, cw, pr, t0, t1
        adds            \bs, \bs, \t0                   @ bits += shift, carry set means a refill is due
        lsl             \cw, \cw, \t0                   @ code_word <<= shift
        lsl             \t0, \h,  \t0                   @ t0 = high << shift
        rsb             \h,  \pr, #256                  @ h = 256 - prob
        ldrhcs          \t1, [\buf], #2                 @ refill: load a halfword and advance buf
        smlabb          \h,  \t0, \pr, \h               @ h = t0 * prob + (256 - prob)
        rev16cs         \t1, \t1                        @ refill: swap the two bytes of the halfword
        orrcs           \cw, \cw, \t1, lsl \bs          @ refill: merge the new bits into code_word
        subcs           \bs, \bs, #16                   @ refill: account for the 16 bits just added
        lsr             \h,  \h,  #8                    @ h = split point, 1 + (((t0 - 1) * prob) >> 8)
        cmp             \cw, \h,  lsl #16               @ compare code_word with the split point << 16
        subge           \cw, \cw, \h,  lsl #16          @ bit 1: remove the lower part from code_word
        subge           \h,  \t0, \h                    @ bit 1: high = t0 - split, bit 0 keeps high = split
 .endm
 @ rac_get_128: same operation as rac_get_prob with the probability fixed
 @ at 128, so the multiply is replaced by a shift and add.
 @   h   - in: current high value; out: updated high value
 @   bs  - bit counter, handled as in rac_get_prob
 @   buf - input pointer, post-incremented by 2 on each refill
 @   cw  - code word
 @   t0  - in: shift amount; clobbered, holds the shifted high on exit
 @   t1  - scratch, receives the refill halfword
 @ Flags on exit come from the cmp: GE means the decoded bit is 1.
 .macro rac_get_128      h, bs, buf, cw, t0, t1
        adds            \bs, \bs, \t0                   @ bits += shift, carry set means a refill is due
        lsl             \cw, \cw, \t0                   @ code_word <<= shift
        lsl             \t0, \h,  \t0                   @ t0 = high << shift
        ldrhcs          \t1, [\buf], #2                 @ refill: load a halfword and advance buf
        mov             \h,  #128
        rev16cs         \t1, \t1                        @ refill: swap the two bytes of the halfword
        add             \h,  \h,  \t0, lsl #7           @ h = 128 + t0 * 128
        orrcs           \cw, \cw, \t1, lsl \bs          @ refill: merge the new bits into code_word
        subcs           \bs, \bs, #16                   @ refill: account for the 16 bits just added
        lsr             \h,  \h,  #8                    @ h = split point for probability 128
        cmp             \cw, \h,  lsl #16               @ compare code_word with the split point << 16
        subge           \cw, \cw, \h,  lsl #16          @ bit 1: remove the lower part from code_word
        subge           \h,  \t0, \h                    @ bit 1: high = t0 - split
 .endm
 @ ff_decode_block_coeffs_armv6: decode the token stream of one block and
 @ store the dequantised coefficients.
 @
 @ Arguments, in the order of the C prototype:
 @   r0          rc          range coder state
 @   r1          block       destination, written with strh at zigzag offsets
 @   r2          probs       probability table, indexed in steps of 33 bytes
 @   r3          i           starting coefficient index
 @   [sp, #44]   token_prob  (stack offset is after the 44-byte push below)
 @   [sp, #48]   qmul
 @ Returns the final value of i in r0.
 @
 @ Register use inside the function:
 @   r4  current token probability pointer (also the cat table pointer)
 @   r5  high, r6 bits, r7 buf, r8 code_word
 @   r9  shift for the next rac_get_prob, loaded from ff_vp56_norm_shift
 @   r10 scratch, r11 packed multipliers, r12 coefficient value
 @   lr  address of ff_vp56_norm_shift
 @
 @ Range coder fields are accessed at rc+0 (high), rc+4 (bits), rc+8 (buf)
 @ and rc+16 (code_word). The word at rc+12 is used as an upper bound for
 @ buf on exit -- presumably the end-of-buffer pointer, TODO confirm
 @ against the VP56RangeCoder definition.
 @
 @ Offsets 33*i + 0/11/22 into probs select one of three rows per index;
 @ this matches probs[i][0..2] if NUM_DCT_TOKENS-1 is 11 -- TODO confirm.
 function ff_decode_block_coeffs_armv6, export=1
        push            {r0,r1,r4-r11,lr}               @ 11 words: rc at [sp], block at [sp, #4]
        movrel          lr,  ff_vp56_norm_shift
        ldrd            r4,  r5,  [sp, #44]             @ token_prob, qmul
        cmp             r3,  #0
        ldr             r11, [r5]                       @ both 16-bit multipliers in one word
        ldm             r0,  {r5-r7}                    @ high, bits, buf
        pkhtbne         r11, r11, r11, asr #16          @ i != 0: copy the second multiplier into the low half
        ldr             r8,  [r0, #16]                  @ code_word
 @ Main loop entry: test whether the next coefficient is non-zero.
 0:
        ldrb            r9,  [lr, r5]                   @ shift for the current high
        add             r3,  r3,  #1                    @ i++ (r3 is one past the coefficient being decoded)
        ldrb            r0,  [r4, #1]
        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
        blt             2f                              @ bit 0: coefficient is zero
        ldrb            r9,  [lr, r5]
        ldrb            r0,  [r4, #2]
        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
        ldrb            r9,  [lr, r5]
        bge             3f                              @ bit 1: magnitude is larger than one
        @ Magnitude one: the sign decode (rac_get_128 written out inline) is
        @ interleaved with the address computations below.
        add             r4,  r3,  r3,  lsl #5           @ r4 = 33 * i
        sxth            r12, r11                        @ value = 1 * multiplier (low half of r11)
        add             r4,  r2,  r4
        adds            r6,  r6,  r9
        add             r4,  r4,  #11                   @ r4 = probs + 33 * i + 11
        lsl             r8,  r8,  r9
        ldrhcs          r10, [r7], #2
        lsl             r9,  r5,  r9
        mov             r5,  #128
        rev16cs         r10, r10
        add             r5,  r5,  r9,  lsl #7
        orrcs           r8,  r8,  r10, lsl r6
        subcs           r6,  r6,  #16
        lsr             r5,  r5,  #8
        cmp             r8,  r5,  lsl #16               @ sign bit: flags stay live until rsbge below
        movrel          r10, zigzag_scan-1
        subge           r8,  r8,  r5,  lsl #16
        subge           r5,  r9,  r5
        ldrb            r10, [r10, r3]                  @ zigzag_scan[i - 1], since i was already incremented
        rsbge           r12, r12, #0                    @ sign bit set: negate
        cmp             r3,  #16
        strh            r12, [r1, r10]                  @ store at block + byte offset
        bge             6f                              @ all 16 positions done
 @ After a non-zero coefficient: decode the bit that decides whether more
 @ coefficients follow.
 5:
        ldrb            r9,  [lr, r5]
        ldrb            r0,  [r4]
        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
        pkhtb           r11, r11, r11, asr #16          @ from now on use the second multiplier
        bge             0b                              @ bit 1: continue with the next coefficient
 @ Exit: write the range coder state back and return i.
 6:
        ldr             r0,  [sp]                       @ saved rc pointer
        ldr             r9,  [r0, #12]
        cmp             r7,  r9
        movhi           r7,  r9                         @ clamp buf to the value stored at rc+12
        stm             r0,  {r5-r7}                    @ high, bits, buf
        str             r8,  [r0, #16]                  @ code_word
        add             sp,  sp,  #8                    @ drop the saved r0 and r1
        mov             r0,  r3
        pop             {r4-r11,pc}
 @ Zero coefficient: nothing is stored; go straight to the next non-zero
 @ test without the end check at label 5.
 2:
        add             r4,  r3,  r3,  lsl #5           @ r4 = 33 * i
        cmp             r3,  #16
        add             r4,  r2,  r4                    @ r4 = probs + 33 * i
        pkhtb           r11, r11, r11, asr #16          @ from now on use the second multiplier
        bne             0b
        b               6b
 @ Magnitude larger than one.
 3:
        ldrb            r0,  [r4, #3]
        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
        ldrb            r9,  [lr, r5]
        bge             1f                              @ bit 1: one of the category tokens
        mov             r12, #2
        ldrb            r0,  [r4, #4]
        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
        addge           r12, #1
        ldrb            r9,  [lr, r5]
        blt             4f                              @ bit 0: magnitude 2
        ldrb            r0,  [r4, #5]
        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
        addge           r12, #1                         @ magnitude 3 or 4
        ldrb            r9,  [lr, r5]
        b               4f
 @ Category tokens.
 1:
        ldrb            r0,  [r4, #6]
        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
        ldrb            r9,  [lr, r5]
        bge             3f                              @ bit 1: categories read through the table
        ldrb            r0,  [r4, #7]
        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
        ldrb            r9,  [lr, r5]
        bge             2f
        mov             r12, #5                         @ base 5 plus one extra bit (fixed probability 159)
        mov             r0,  #159
        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
        addge           r12, r12, #1
        ldrb            r9,  [lr, r5]
        b               4f
 @ Base 7 plus two extra bits with fixed probabilities 165 and 145.
 2:
        mov             r12, #7
        mov             r0,  #165
        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
        addge           r12, r12, #2
        ldrb            r9,  [lr, r5]
        mov             r0,  #145
        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
        addge           r12, r12, #1
        ldrb            r9,  [lr, r5]
        b               4f
 @ Table-driven categories: two bits a and b select cat = 2 * a + b, the
 @ base is 3 + (8 << cat), and the extra bits use the zero-terminated
 @ probability list ff_vp8_dct_cat_prob[cat].
 3:
        ldrb            r0,  [r4, #8]
        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
        addge           r4,  r4,  #1                    @ a = 1: the next load reads offset 10 instead of 9
        ldrb            r9,  [lr, r5]
        movge           r12, #2                         @ r12 = 2 * a
        movlt           r12, #0
        ldrb            r0,  [r4, #9]
        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
        mov             r9,  #8
        addge           r12, r12, #1                    @ r12 = cat
        movrel          r4,  ff_vp8_dct_cat_prob
        lsl             r9,  r9,  r12                   @ 8 << cat
        ldr             r4,  [r4, r12, lsl #2]          @ pointer to the probability list for cat
        add             r12, r9,  #3                    @ base value 3 + (8 << cat)
        mov             r1,  #0                         @ r1 reused as accumulator; block is reloaded below
        ldrb            r0,  [r4], #1                   @ first probability of the list
 @ Read one extra bit per list entry, most significant bit first.
 1:
        ldrb            r9,  [lr, r5]
        lsl             r1,  r1,  #1
        rac_get_prob    r5,  r6,  r7,  r8,  r0,  r9,  r10
        ldrb            r0,  [r4], #1                   @ next probability; zero ends the list
        addge           r1,  r1,  #1                    @ flags still come from rac_get_prob
        cmp             r0,  #0
        bne             1b
        ldrb            r9,  [lr, r5]
        add             r12, r12, r1                    @ value = base + extra bits
        ldr             r1,  [sp, #4]                   @ restore the block pointer
 @ Common tail for magnitudes above one: sign, multiply, store.
 4:
        add             r4,  r3,  r3,  lsl #5           @ r4 = 33 * i
        add             r4,  r2,  r4
        add             r4,  r4,  #22                   @ r4 = probs + 33 * i + 22
        rac_get_128     r5,  r6,  r7,  r8,  r9,  r10
        rsbge           r12, r12, #0                    @ sign bit set: negate
        smulbb          r12, r12, r11                   @ multiply by the low half of r11
        movrel          r9,  zigzag_scan-1
        ldrb            r9,  [r9, r3]                   @ zigzag_scan[i - 1]
        cmp             r3,  #16
        strh            r12, [r1, r9]
        bge             6b
        b               5b
 endfunc
        @ Scan order table used by ff_decode_block_coeffs_armv6. Each entry is
        @ added to the block pointer as a byte offset for a halfword store, so
        @ the values are all even. The function addresses the table as
        @ zigzag_scan-1 because its index has already been incremented.
        .section        .rodata
 zigzag_scan:
        .byte            0,  2,  8, 16
        .byte           10,  4,  6, 12
        .byte           18, 24, 26, 20
        .byte           14, 22, 28, 30
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@ -30,6 +30,10 @@
 #include "h264pred.h"
 #include "rectangle.h"
 #if ARCH_ARM
 #   include "arm/vp8.h"
 #endif
 typedef struct {
    uint8_t filter_level;
    uint8_t inner_limit;
@ -801,6 +805,7 @@ void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_
    }
 }
 #ifndef decode_block_coeffs_internal
 /**
 * @param c arithmetic bitstream reader context
 * @param block destination for block coefficients
@ -854,7 +859,7 @@ skip_eob:
                    int b = vp56_rac_get_prob(c, token_prob[9+a]);
                    int cat = (a<<1) + b;
                    coeff  = 3 + (8<<cat);
-                    coeff += vp8_rac_get_coeff(c, vp8_dct_cat_prob[cat]);
+                    coeff += vp8_rac_get_coeff(c, ff_vp8_dct_cat_prob[cat]);
                }
            }
            token_prob = probs[i+1][2];
@ -864,6 +869,7 @@ skip_eob:
    return i;
 }
 #endif
 static av_always_inline
 int decode_block_coeffs(VP56RangeCoder *c, DCTELEM block[16],
--- a/libavcodec/vp8data.h
+++ b/libavcodec/vp8data.h
@ -313,7 +313,7 @@ static const uint8_t vp8_dct_cat5_prob[] = { 180, 157, 141, 134, 130, 0 };
 static const uint8_t vp8_dct_cat6_prob[] = { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
 // only used for cat3 and above; cat 1 and 2 are referenced directly
-static const uint8_t * const vp8_dct_cat_prob[] =
+const uint8_t * const ff_vp8_dct_cat_prob[] =
 {
    vp8_dct_cat3_prob,
    vp8_dct_cat4_prob,