Implement put_pixels_clamped and add_pixels_clamped in assembler.

This allows better scheduling of the memory accesses, and is portable among all compilers. Originally committed as revision 709 to svn://svn.ffmpeg.org/ffmpeg/trunk
23 years ago · bb7d4939ba
parent e09d12f4f6
commit bb7d4939ba
4 changed files with 287 additions and 43 deletions
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -63,10 +63,11 @@ endif
 # alpha specific stuff
 ifeq ($(TARGET_ARCH_ALPHA),yes)
 OBJS += alpha/dsputil_alpha.o alpha/mpegvideo_alpha.o
+ASM_OBJS += alpha/dsputil_alpha_asm.o
 CFLAGS += -Wa,-mpca56
 endif

-SRCS := $(OBJS:.o=.c) $(ASM_OBJS:.o=.s)
+SRCS := $(OBJS:.o=.c) $(ASM_OBJS:.o=.S)
 OBJS := $(OBJS) $(ASM_OBJS)

 LIB= libavcodec.a
--- a/libavcodec/alpha/dsputil_alpha.c
+++ b/libavcodec/alpha/dsputil_alpha.c
@@ -22,64 +22,86 @@

 void simple_idct_axp(DCTELEM *block);

-static void put_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels, 
-				   int line_size)
+void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
+				int line_size);
+void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, 
+				int line_size);
+
+#if 0
+/* These functions were the base for the optimized assembler routines,
+   and remain here for documentation purposes.  */
+static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels, 
+                                   int line_size)
 {
    int i = 8;
+    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff: upper bound 255 in each 16-bit word */

    ASM_ACCEPT_MVI;

    do {
-	UINT64 shorts;
+        uint64_t shorts0, shorts1;      /* first and second group of 4 values in this row */

-	shorts = ldq(block);
-	shorts = maxsw4(shorts, 0);
-	shorts = minsw4(shorts, WORD_VEC(0x00ff));
-	stl(pkwb(shorts), pixels);
+        shorts0 = ldq(block);                   /* values 0..3 of the row */
+        shorts0 = maxsw4(shorts0, 0);           /* negative -> 0 */
+        shorts0 = minsw4(shorts0, clampmask);   /* above 255 -> 255 */
+        stl(pkwb(shorts0), pixels);             /* pack the 4 words to bytes and store them */

-	shorts = ldq(block + 4);
-	shorts = maxsw4(shorts, 0);
-	shorts = minsw4(shorts, WORD_VEC(0x00ff));
-	stl(pkwb(shorts), pixels + 4);
+        shorts1 = ldq(block + 4);               /* values 4..7 of the row */
+        shorts1 = maxsw4(shorts1, 0);           /* negative -> 0 */
+        shorts1 = minsw4(shorts1, clampmask);   /* above 255 -> 255 */
+        stl(pkwb(shorts1), pixels + 4);         /* pack and store next to the first group */

-	pixels += line_size;
-	block += 8;
+        pixels += line_size;    /* next output row */
+        block += 8;             /* next row of 8 input values */
    } while (--i);
 }

-static void add_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels, 
-				   int line_size)
+void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels, 
+                            int line_size)
 {
-    int i = 8;
+    int h = 8;  /* rows left to process */
+    /* Keep this function a leaf function by generating the constants
+       manually (mainly for the hack value ;-).  */
+    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
+    uint64_t signmask  = zap(-1, 0x33); /* 0xffff0000ffff0000 */
+    signmask ^= signmask >> 1;  /* 0x8000800080008000 */

    ASM_ACCEPT_MVI;

    do {
-	UINT64 shorts; 
-
-	shorts = ldq(block);
-	shorts &= ~WORD_VEC(0x8000); /* clear highest bit to avoid overflow */
-	shorts += unpkbw(ldl(pixels));
-	shorts &= ~WORD_VEC(0x8000); /* hibit would be set for e. g. -2 + 3 */
-	shorts = minuw4(shorts, WORD_VEC(0x4000)); /* set neg. to 0x4000 */
-	shorts &= ~WORD_VEC(0x4000); /* ...and zap them */
-	shorts = minsw4(shorts, WORD_VEC(0x00ff)); /* clamp to 255 */
-	stl(pkwb(shorts), pixels);
-
-	/* next 4 */
-	shorts = ldq(block + 4);
-	shorts &= ~WORD_VEC(0x8000);
-	shorts += unpkbw(ldl(pixels + 4));
-	shorts &= ~WORD_VEC(0x8000);
-	shorts = minuw4(shorts, WORD_VEC(0x4000));
-	shorts &= ~WORD_VEC(0x4000);
-	shorts = minsw4(shorts, WORD_VEC(0x00ff));
-	stl(pkwb(shorts), pixels + 4);
-
-	pixels += line_size;
-	block += 8;
-    } while (--i);
+        uint64_t shorts0, pix0, signs0; /* first group of 4 values in this row */
+        uint64_t shorts1, pix1, signs1; /* second group of 4 values in this row */
+
+        shorts0 = ldq(block);           /* values 0..3 of the row */
+        shorts1 = ldq(block + 4);       /* values 4..7 of the row */
+
+        pix0    = unpkbw(ldl(pixels));  /* current pixels 0..3, widened to 16-bit words */
+        /* Signed subword add (MMX paddw).  */
+        signs0  = shorts0 & signmask;   /* remember bit 15 of every word */
+        shorts0 &= ~signmask;           /* clear it so the add cannot carry into the next word */
+        shorts0 += pix0;
+        shorts0 ^= signs0;              /* fold the saved sign bits back in */
+        /* Clamp. */
+        shorts0 = maxsw4(shorts0, 0);           /* negative -> 0 */
+        shorts0 = minsw4(shorts0, clampmask);   /* above 255 -> 255 */
+
+        /* Next 4.  */
+        pix1    = unpkbw(ldl(pixels + 4));
+        signs1  = shorts1 & signmask;
+        shorts1 &= ~signmask;
+        shorts1 += pix1;
+        shorts1 ^= signs1;
+        shorts1 = maxsw4(shorts1, 0);
+        shorts1 = minsw4(shorts1, clampmask);
+
+        stl(pkwb(shorts0), pixels);     /* pack each group to 4 bytes and write it back */
+        stl(pkwb(shorts1), pixels + 4);
+
+        pixels += line_size;    /* next pixel row */
+        block += 8;             /* next row of 8 input values */
+    } while (--h);
 }
+#endif

 /* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1
   Since the immediate result could be greater than 255, we do the
@@ -222,7 +244,7 @@ void dsputil_init_alpha(void)

    /* amask clears all bits that correspond to present features.  */
    if (amask(AMASK_MVI) == 0) {
-	put_pixels_clamped = put_pixels_clamped_axp;
-	add_pixels_clamped = add_pixels_clamped_axp;
+        put_pixels_clamped = put_pixels_clamped_mvi_asm;
+        add_pixels_clamped = add_pixels_clamped_mvi_asm;
    }
 }
--- a/libavcodec/alpha/dsputil_alpha_asm.S
+++ b/libavcodec/alpha/dsputil_alpha_asm.S
@@ -0,0 +1,176 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * These functions are scheduled for pca56. They should work
+ * reasonably on ev6, though.
+ */
+
+#include "regdef.h"
+
+/* Some nicer register names: ta..td carry on where the t0..t9 temporaries end.  */
+#define ta t10
+#define tb t11
+#define tc t12
+#define td AT
+/* Danger: te/tf/tg reuse argument registers a5/a4/a3 and th reuses the return value register v0, so they are only free when those are not needed */
+#define te a5
+#define tf a4
+#define tg a3
+#define th v0
+                
+        .set noat
+        .set noreorder
+        .arch pca56
+        .text
+
+/************************************************************************
+ * void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, 
+ *                                 int line_size)
+ * Clamps 8 rows of 8 shorts from block to 0..255 and stores them as bytes.  */
+        .align 6
+        .globl put_pixels_clamped_mvi_asm
+        .ent put_pixels_clamped_mvi_asm
+put_pixels_clamped_mvi_asm:
+        .frame sp, 0, ra        # frame size 0: nothing is kept on the stack
+        .prologue 0
+
+        lda     t8, -1          # all ones
+        lda     t9, 8           # loop counter (rows left)
+        zap     t8, 0xaa, t8    # 00ff00ff00ff00ff (upper clamp bound per word)
+
+        .align 4
+1:      ldq     t0,  0(a0)      # row 0, shorts 0..3
+        ldq     t1,  8(a0)      # row 0, shorts 4..7
+        ldq     t2, 16(a0)      # row 1, shorts 0..3
+        ldq     t3, 24(a0)      # row 1, shorts 4..7
+
+        maxsw4  t0, zero, t0    # negative -> 0
+        subq    t9, 2, t9       # two rows are handled per iteration
+        maxsw4  t1, zero, t1
+        lda     a0, 32(a0)      # block += 16
+
+        maxsw4  t2, zero, t2
+        addq    a1, a2, ta      # ta = pixels + line_size (address of row 1)
+        maxsw4  t3, zero, t3
+        minsw4  t0, t8, t0      # above 255 -> 255
+        
+        minsw4  t1, t8, t1
+        minsw4  t2, t8, t2
+        minsw4  t3, t8, t3
+        pkwb    t0, t0          # pack 4 words into 4 bytes
+        
+        pkwb    t1, t1
+        pkwb    t2, t2
+        pkwb    t3, t3
+        stl     t0, 0(a1)       # row 0, pixels 0..3
+        
+        stl     t1, 4(a1)       # row 0, pixels 4..7
+        addq    ta, a2, a1      # pixels += 2 * line_size
+        stl     t2, 0(ta)       # row 1, pixels 0..3
+        stl     t3, 4(ta)       # row 1, pixels 4..7
+
+        bne     t9, 1b          # repeat until all 8 rows are written
+        ret
+        .end put_pixels_clamped_mvi_asm
+
+/************************************************************************
+ * void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, 
+ *                                 int line_size)
+ * Adds 8 rows of 8 shorts from block to the pixels in place, clamped to 0..255.  */
+        .align 6
+        .globl add_pixels_clamped_mvi_asm
+        .ent add_pixels_clamped_mvi_asm
+add_pixels_clamped_mvi_asm:
+        .frame sp, 0, ra        # frame size 0: nothing is kept on the stack
+        .prologue 0
+
+        lda     t1, -1          # all ones
+        lda     th, 8           # loop counter (rows left)
+        zap     t1, 0x33, tg    # 0xffff0000ffff0000
+        nop
+
+        srl     tg, 1, t0       # 0x7fff80007fff8000
+        xor     tg, t0, tg      # 0x8000800080008000 (sign bit of each word)
+        zap     t1, 0xaa, tf    # 0x00ff00ff00ff00ff (upper clamp bound per word)
+
+        .align 4
+1:      ldl     t1, 0(a1)       # pix0 (try to hit cache line soon)
+        ldl     t4, 4(a1)       # pix1
+        addq    a1, a2, te      # pixels += line_size
+        ldq     t0, 0(a0)       # shorts0
+
+        ldl     t7, 0(te)       # pix2 (try to hit cache line soon)
+        ldl     ta, 4(te)       # pix3
+        ldq     t3, 8(a0)       # shorts1
+        ldq     t6, 16(a0)      # shorts2
+
+        ldq     t9, 24(a0)      # shorts3
+        unpkbw  t1, t1          # 0 0 (quarter/op no.): widen pixel bytes to words
+        and     t0, tg, t2      # 0 1: save the sign bits
+        unpkbw  t4, t4          # 1 0
+
+        bic     t0, tg, t0      # 0 2: clear the sign bits
+        unpkbw  t7, t7          # 2 0
+        and     t3, tg, t5      # 1 1
+        addq    t0, t1, t0      # 0 3: add the pixels
+
+        xor     t0, t2, t0      # 0 4: fold the saved sign bits back in
+        unpkbw  ta, ta          # 3 0
+        and     t6, tg, t8      # 2 1
+        maxsw4  t0, zero, t0    # 0 5: negative -> 0
+        
+        bic     t3, tg, t3      # 1 2
+        bic     t6, tg, t6      # 2 2
+        minsw4  t0, tf, t0      # 0 6: above 255 -> 255
+        addq    t3, t4, t3      # 1 3
+        
+        pkwb    t0, t0          # 0 7: pack words to bytes
+        xor     t3, t5, t3      # 1 4
+        maxsw4  t3, zero, t3    # 1 5
+        addq    t6, t7, t6      # 2 3
+
+        xor     t6, t8, t6      # 2 4
+        and     t9, tg, tb      # 3 1
+        minsw4  t3, tf, t3      # 1 6
+        bic     t9, tg, t9      # 3 2
+
+        maxsw4  t6, zero, t6    # 2 5
+        addq    t9, ta, t9      # 3 3
+        stl     t0, 0(a1)       # 0 8: store the 4 result bytes
+        minsw4  t6, tf, t6      # 2 6
+
+        xor     t9, tb, t9      # 3 4
+        maxsw4  t9, zero, t9    # 3 5
+        lda     a0, 32(a0)      # block += 16;
+        pkwb    t3, t3          # 1 7
+        
+        minsw4  t9, tf, t9      # 3 6
+        subq    th, 2, th       # two rows are handled per iteration
+        pkwb    t6, t6          # 2 7
+        pkwb    t9, t9          # 3 7
+
+        stl     t3, 4(a1)       # 1 8
+        addq    te, a2, a1      # pixels += line_size
+        stl     t6, 0(te)       # 2 8
+        stl     t9, 4(te)       # 3 8
+
+        bne     th, 1b          # repeat until all 8 rows are done
+        ret
+        .end add_pixels_clamped_mvi_asm
--- a/libavcodec/alpha/regdef.h
+++ b/libavcodec/alpha/regdef.h
@@ -0,0 +1,45 @@
+/* Symbolic names for the Alpha integer registers.  Some BSDs don't seem to have regdef.h... sigh  */
+#ifndef alpha_regdef_h
+#define alpha_regdef_h
+
+#define v0      $0      /* function return value */
+
+#define t0      $1      /* temporary registers (caller-saved) */
+#define t1      $2
+#define t2      $3
+#define t3      $4
+#define t4      $5
+#define t5      $6
+#define t6      $7
+#define t7      $8
+
+#define s0      $9      /* saved-registers (callee-saved registers) */
+#define s1      $10
+#define s2      $11
+#define s3      $12
+#define s4      $13
+#define s5      $14
+#define s6      $15
+#define fp      s6      /* frame-pointer (s6 in frame-less procedures) */
+
+#define a0      $16     /* argument registers (caller-saved) */
+#define a1      $17
+#define a2      $18
+#define a3      $19
+#define a4      $20
+#define a5      $21
+
+#define t8      $22     /* more temps (caller-saved) */
+#define t9      $23
+#define t10     $24
+#define t11     $25
+#define ra      $26     /* return address register */
+#define t12     $27     /* same register as pv below */
+
+#define pv      t12     /* procedure-variable register */
+#define AT      $at     /* assembler temporary */
+#define gp      $29     /* global pointer */
+#define sp      $30     /* stack pointer */
+#define zero    $31     /* reads as zero, writes are noops */
+
+#endif /* alpha_regdef_h */