From 64233e702a95df9167e3362e58aae4e82ce2ddf8 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <jason@x264.com>
Date: Mon, 31 Jan 2011 19:04:29 -0800
Subject: [PATCH] VP8: merge chroma MC calls

Adds some duplicated code, but avoids duplicate edge checks and similar.
~0.5% faster overall on Parkjoy test sample.
---
 libavcodec/vp8.c | 125 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 79 insertions(+), 46 deletions(-)

diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index d2f55d90f3..effeb4ed2f 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -1193,6 +1193,13 @@ void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
                        s->filter.simple, 0);
 }
 
+static const uint8_t subpel_idx[3][8] = {
+    { 0, 1, 2, 1, 2, 1, 2, 1 }, // nr. of left extra pixels,
+                                // also function pointer index
+    { 0, 3, 5, 3, 5, 3, 5, 3 }, // nr. of extra pixels required
+    { 0, 2, 3, 2, 3, 2, 3, 2 }, // nr. of right extra pixels
+};
+
 /**
  * Generic MC function.
  *
@@ -1211,32 +1218,26 @@ void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
  */
 static av_always_inline
-void vp8_mc(VP8Context *s, int luma,
-            uint8_t *dst, uint8_t *src, const VP56mv *mv,
-            int x_off, int y_off, int block_w, int block_h,
-            int width, int height, int linesize,
-            vp8_mc_func mc_func[3][3])
+void vp8_mc_luma(VP8Context *s, uint8_t *dst, uint8_t *src, const VP56mv *mv,
+                 int x_off, int y_off, int block_w, int block_h,
+                 int width, int height, int linesize,
+                 vp8_mc_func mc_func[3][3])
 {
     if (AV_RN32A(mv)) {
-        static const uint8_t idx[3][8] = {
-            { 0, 1, 2, 1, 2, 1, 2, 1 }, // nr. of left extra pixels,
-                                        // also function pointer index
-            { 0, 3, 5, 3, 5, 3, 5, 3 }, // nr. of extra pixels required
-            { 0, 2, 3, 2, 3, 2, 3, 2 }, // nr. of right extra pixels
-        };
-        int mx = (mv->x << luma)&7, mx_idx = idx[0][mx];
-        int my = (mv->y << luma)&7, my_idx = idx[0][my];
 
-        x_off += mv->x >> (3 - luma);
-        y_off += mv->y >> (3 - luma);
+        int mx = (mv->x << 1)&7, mx_idx = subpel_idx[0][mx];
+        int my = (mv->y << 1)&7, my_idx = subpel_idx[0][my];
+
+        x_off += mv->x >> 2;
+        y_off += mv->y >> 2;
 
         // edge emulation
         src += y_off * linesize + x_off;
-        if (x_off < mx_idx || x_off >= width  - block_w - idx[2][mx] ||
-            y_off < my_idx || y_off >= height - block_h - idx[2][my]) {
+        if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
+            y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
             s->dsp.emulated_edge_mc(s->edge_emu_buffer, src - my_idx * linesize - mx_idx, linesize,
-                                block_w + idx[1][mx], block_h + idx[1][my],
-                                x_off - mx_idx, y_off - my_idx, width, height);
+                                    block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
+                                    x_off - mx_idx, y_off - my_idx, width, height);
             src = s->edge_emu_buffer + mx_idx + linesize * my_idx;
         }
         mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
@@ -1244,6 +1245,45 @@ void vp8_mc(VP8Context *s, int luma,
         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0);
 }
 
+static av_always_inline
+void vp8_mc_chroma(VP8Context *s, uint8_t *dst1, uint8_t *dst2, uint8_t *src1,
+                   uint8_t *src2, const VP56mv *mv, int x_off, int y_off,
+                   int block_w, int block_h, int width, int height, int linesize,
+                   vp8_mc_func mc_func[3][3])
+{
+    if (AV_RN32A(mv)) {
+        int mx = mv->x&7, mx_idx = subpel_idx[0][mx];
+        int my = mv->y&7, my_idx = subpel_idx[0][my];
+
+        x_off += mv->x >> 3;
+        y_off += mv->y >> 3;
+
+        // edge emulation
+        src1 += y_off * linesize + x_off;
+        src2 += y_off * linesize + x_off;
+        if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
+            y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
+            s->dsp.emulated_edge_mc(s->edge_emu_buffer, src1 - my_idx * linesize - mx_idx, linesize,
+                                    block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
+                                    x_off - mx_idx, y_off - my_idx, width, height);
+            src1 = s->edge_emu_buffer + mx_idx + linesize * my_idx;
+            mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
+
+            s->dsp.emulated_edge_mc(s->edge_emu_buffer, src2 - my_idx * linesize - mx_idx, linesize,
+                                    block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
+                                    x_off - mx_idx, y_off - my_idx, width, height);
+            src2 = s->edge_emu_buffer + mx_idx + linesize * my_idx;
+            mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
+        } else {
+            mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
+            mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
+        }
+    } else {
+        mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
+        mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
+    }
+}
+
 static av_always_inline
 void vp8_mc_part(VP8Context *s, uint8_t *dst[3],
                  AVFrame *ref_frame, int x_off, int y_off,
@@ -1254,10 +1294,10 @@ void vp8_mc_part(VP8Context *s, uint8_t *dst[3],
     VP56mv uvmv = *mv;
 
     /* Y */
-    vp8_mc(s, 1, dst[0] + by_off * s->linesize + bx_off,
-           ref_frame->data[0], mv, x_off + bx_off, y_off + by_off,
-           block_w, block_h, width, height, s->linesize,
-           s->put_pixels_tab[block_w == 8]);
+    vp8_mc_luma(s, dst[0] + by_off * s->linesize + bx_off,
+                ref_frame->data[0], mv, x_off + bx_off, y_off + by_off,
+                block_w, block_h, width, height, s->linesize,
+                s->put_pixels_tab[block_w == 8]);
 
     /* U/V */
     if (s->profile == 3) {
@@ -1268,14 +1308,11 @@ void vp8_mc_part(VP8Context *s, uint8_t *dst[3],
     bx_off  >>= 1; by_off  >>= 1;
     width   >>= 1; height  >>= 1;
     block_w >>= 1; block_h >>= 1;
-    vp8_mc(s, 0, dst[1] + by_off * s->uvlinesize + bx_off,
-           ref_frame->data[1], &uvmv, x_off + bx_off, y_off + by_off,
-           block_w, block_h, width, height, s->uvlinesize,
-           s->put_pixels_tab[1 + (block_w == 4)]);
-    vp8_mc(s, 0, dst[2] + by_off * s->uvlinesize + bx_off,
-           ref_frame->data[2], &uvmv, x_off + bx_off, y_off + by_off,
-           block_w, block_h, width, height, s->uvlinesize,
-           s->put_pixels_tab[1 + (block_w == 4)]);
+    vp8_mc_chroma(s, dst[1] + by_off * s->uvlinesize + bx_off,
+                  dst[2] + by_off * s->uvlinesize + bx_off, ref_frame->data[1],
+                  ref_frame->data[2], &uvmv, x_off + bx_off, y_off + by_off,
+                  block_w, block_h, width, height, s->uvlinesize,
+                  s->put_pixels_tab[1 + (block_w == 4)]);
 }
 
 /* Fetch pixels for estimated mv 4 macroblocks ahead.
@@ -1319,11 +1356,11 @@ void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
         /* Y */
         for (y = 0; y < 4; y++) {
             for (x = 0; x < 4; x++) {
-                vp8_mc(s, 1, dst[0] + 4*y*s->linesize + x*4,
-                       ref->data[0], &bmv[4*y + x],
-                       4*x + x_off, 4*y + y_off, 4, 4,
-                       width, height, s->linesize,
-                       s->put_pixels_tab[2]);
+                vp8_mc_luma(s, dst[0] + 4*y*s->linesize + x*4,
+                            ref->data[0], &bmv[4*y + x],
+                            4*x + x_off, 4*y + y_off, 4, 4,
+                            width, height, s->linesize,
+                            s->put_pixels_tab[2]);
             }
         }
 
@@ -1345,16 +1382,12 @@ void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
                     uvmv.x &= ~7;
                     uvmv.y &= ~7;
                 }
-                vp8_mc(s, 0, dst[1] + 4*y*s->uvlinesize + x*4,
-                       ref->data[1], &uvmv,
-                       4*x + x_off, 4*y + y_off, 4, 4,
-                       width, height, s->uvlinesize,
-                       s->put_pixels_tab[2]);
-                vp8_mc(s, 0, dst[2] + 4*y*s->uvlinesize + x*4,
-                       ref->data[2], &uvmv,
-                       4*x + x_off, 4*y + y_off, 4, 4,
-                       width, height, s->uvlinesize,
-                       s->put_pixels_tab[2]);
+                vp8_mc_chroma(s, dst[1] + 4*y*s->uvlinesize + x*4,
+                              dst[2] + 4*y*s->uvlinesize + x*4,
+                              ref->data[1], ref->data[2], &uvmv,
+                              4*x + x_off, 4*y + y_off, 4, 4,
+                              width, height, s->uvlinesize,
+                              s->put_pixels_tab[2]);
             }
         }
         break;