Merge commit '0c15a9aa7e1654a19144eb594f9639a57fd47482'

* commit '0c15a9aa7e1654a19144eb594f9639a57fd47482':
  sh4: Remove dubious aligned dsputil code

Conflicts:
	libavcodec/sh4/dsputil_align.c
	libavcodec/sh4/h264chroma_init.c
	libavcodec/sh4/hpeldsp.c
	libavcodec/sh4/qpel.c

If someone wants to maintain the sh4 code in ffmpeg, wants to
add more optimizations, or volunteers to maintain any of
what is removed here and can confirm that it is faster,
then please contact us!

Merged-by: Michael Niedermayer <michaelni@gmx.at>
pull/15/head
Michael Niedermayer 12 years ago
commit 7a556ebccf
  1. 2
      libavcodec/h264chroma.c
  2. 1
      libavcodec/h264chroma.h
  3. 2
      libavcodec/hpeldsp.c
  4. 1
      libavcodec/hpeldsp.h
  5. 7
      libavcodec/sh4/Makefile
  6. 298
      libavcodec/sh4/dsputil_align.c
  7. 1
      libavcodec/sh4/dsputil_sh4.c
  8. 10
      libavcodec/sh4/dsputil_sh4.h
  9. 132
      libavcodec/sh4/h264chroma_init.c
  10. 351
      libavcodec/sh4/hpeldsp.c
  11. 862
      libavcodec/sh4/qpel.c

@ -47,8 +47,6 @@ void ff_h264chroma_init(H264ChromaContext *c, int bit_depth)
ff_h264chroma_init_arm(c, bit_depth);
if (ARCH_PPC)
ff_h264chroma_init_ppc(c, bit_depth);
if (ARCH_SH4)
ff_h264chroma_init_sh4(c, bit_depth);
if (ARCH_X86)
ff_h264chroma_init_x86(c, bit_depth);
}

@ -32,7 +32,6 @@ void ff_h264chroma_init(H264ChromaContext *c, int bit_depth);
void ff_h264chroma_init_arm(H264ChromaContext *c, int bit_depth);
void ff_h264chroma_init_ppc(H264ChromaContext *c, int bit_depth);
void ff_h264chroma_init_sh4(H264ChromaContext *c, int bit_depth);
void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth);
#endif /* AVCODEC_H264CHROMA_H */

@ -62,8 +62,6 @@ av_cold void ff_hpeldsp_init(HpelDSPContext *c, int flags)
ff_hpeldsp_init_bfin(c, flags);
if (ARCH_PPC)
ff_hpeldsp_init_ppc(c, flags);
if (ARCH_SH4)
ff_hpeldsp_init_sh4(c, flags);
if (HAVE_VIS)
ff_hpeldsp_init_vis(c, flags);
if (ARCH_X86)

@ -98,7 +98,6 @@ void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags);
void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags);
void ff_hpeldsp_init_bfin(HpelDSPContext *c, int flags);
void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags);
void ff_hpeldsp_init_sh4(HpelDSPContext *c, int flags);
void ff_hpeldsp_init_vis(HpelDSPContext *c, int flags);
void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags);

@ -1,7 +1,2 @@
OBJS += sh4/dsputil_align.o \
sh4/dsputil_sh4.o \
OBJS += sh4/dsputil_sh4.o \
sh4/idct_sh4.o \
OBJS-$(CONFIG_H264CHROMA) += sh4/h264chroma_init.o \
OBJS-$(CONFIG_HPELDSP) += sh4/hpeldsp.o

@ -1,298 +0,0 @@
/*
* aligned/packed access motion
*
* Copyright (c) 2001-2003 BERO <bero@geocities.co.jp>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/rnd_avg.h"
#include "dsputil_sh4.h"
/*
 * Word-access helpers: view the 4 bytes at p as one native 32-bit word.
 * LP yields a writable lvalue, LPC a read-only one.
 * NOTE(review): both rely on p being suitably aligned and on the build
 * tolerating type-punned access -- confirm for new call sites.
 */
#define LP(p)  (*(uint32_t*)(p))
#define LPC(p) (*(const uint32_t*)(p))

/*
 * Split two packed words into per-byte partial sums:
 *   ph = (tt0 without its low 2 bits per byte)>>2 + same for tt1
 *   pl = (low 2 bits per byte of tt0) + (low 2 bits per byte of tt1)
 * BYTE_VEC32(x) is defined elsewhere; presumably it replicates the byte x
 * into all four lanes of a 32-bit word -- confirm.
 */
#define UNPACK(ph,pl,tt0,tt1) do { \
    uint32_t t0,t1; t0=(tt0);t1=(tt1); \
    (ph) = ( (t0 & ~BYTE_VEC32(0x03))>>2) + ( (t1 & ~BYTE_VEC32(0x03))>>2); \
    (pl) = (t0 & BYTE_VEC32(0x03)) + (t1 & BYTE_VEC32(0x03)); } while(0)

/*
 * Recombine two UNPACK results (ph/pl and nph/npl) into one packed word.
 * The rnd variant adds 2 per byte before the >>2, the no_rnd variant adds 1.
 * The whole expansion is parenthesized so it can be used inside a larger
 * expression without changing its meaning.
 */
#define rnd_PACK(ph,pl,nph,npl)    ((ph) + (nph) + ((((pl) + (npl) + BYTE_VEC32(0x02))>>2) & BYTE_VEC32(0x03)))
#define no_rnd_PACK(ph,pl,nph,npl) ((ph) + (nph) + ((((pl) + (npl) + BYTE_VEC32(0x01))>>2) & BYTE_VEC32(0x03)))

/* little-endian */
/*
 * Stitch one 32-bit word out of two consecutive words a (lower address) and b:
 *   MERGE1 drops the first ofs     bytes of a and appends bytes from b,
 *   MERGE2 drops the first ofs + 1 bytes of a and appends bytes from b.
 * The ofs==0 / ofs==3 special cases avoid a shift by 32 bits.
 * Arguments and the full expansion are parenthesized; the original bare
 * ternary mis-associated when followed by another operator.
 */
#define MERGE1(a,b,ofs) (((ofs)==0)?(a):( ((a)>>(8*(ofs)))|((b)<<(32-8*(ofs))) ))
#define MERGE2(a,b,ofs) (((ofs)==3)?(b):( ((a)>>(8*((ofs)+1)))|((b)<<(32-8*((ofs)+1))) ))
/* big
#define MERGE1(a,b,ofs) (ofs==0)?a:( ((a)<<(8*ofs))|((b)>>(32-8*ofs)) )
#define MERGE2(a,b,ofs) (ofs==3)?b:( ((a)<<(8+8*ofs))|((b)>>(32-8-8*ofs)) )
*/

/* Store operators selected through OP: plain store, or average with the destination. */
#define put(d,s) ((d) = (s))
#define avg(d,s) ((d) = rnd_avg32((s),(d)))
/*
 * 4-byte-wide row loop for a source that is ofs bytes past a 4-byte boundary.
 * Rewinds ref by ofs, then for each row stitches one output word from the two
 * words at ref and ref+4 and hands it to OP together with the destination word.
 * Uses and modifies the caller's ref, dest and height; stride is read only.
 * The row loop is do/while, so height must be at least 1 on entry.
 * Wrapped in do { } while (0) so the macro is a single statement (the original
 * expanded to two statements, which breaks under an unbraced if/else).
 */
#define OP_C4(ofs) \
do { \
    ref-=(ofs); \
    do { \
        OP(LP(dest),MERGE1(LPC(ref),LPC(ref+4),ofs)); \
        ref+=stride; \
        dest+=stride; \
    } while(--height); \
} while(0)

/*
 * Same as OP_C4 for an already word-aligned source: one word is loaded from
 * ref and passed to OP per row. height must be at least 1 on entry.
 */
#define OP_C40() \
do { \
    OP(LP(dest),LPC(ref)); \
    ref+=stride; \
    dest+=stride; \
} while(--height)
/*
 * Full-pel row loop for a block sz bytes wide (the code distinguishes only
 * sz==16 from everything else, which is handled as 8 bytes).
 * ref is rewound by ofs bytes and then read one 32-bit word at a time; each
 * output word is stitched from two neighbouring loads with MERGE1 and passed
 * to OP together with the destination word.
 * Per row this loads 3 words (sz != 16) or 5 words (sz == 16) starting at the
 * rewound address. avg2 is accepted but not used.
 * Uses and modifies the caller's ref, dest and height; the loop is do/while,
 * so height must be at least 1 on entry.
 * NOTE(review): presumably ofs is meant to equal (address of ref) & 3 so that
 * all loads are word aligned -- confirm against the dispatching code.
 */
#define OP_C(ofs,sz,avg2) \
{ \
ref-=ofs; \
do { \
uint32_t t0,t1; \
t0 = LPC(ref+0); \
t1 = LPC(ref+4); \
OP(LP(dest+0), MERGE1(t0,t1,ofs)); \
t0 = LPC(ref+8); \
OP(LP(dest+4), MERGE1(t1,t0,ofs)); \
if (sz==16) { \
t1 = LPC(ref+12); \
OP(LP(dest+8), MERGE1(t0,t1,ofs)); \
t0 = LPC(ref+16); \
OP(LP(dest+12), MERGE1(t1,t0,ofs)); \
} \
ref+=stride; \
dest+= stride; \
} while(--height); \
}
/*
 * aligned
 * Full-pel row loop without any byte stitching: every source word is loaded
 * directly from ref and passed to OP with the matching destination word
 * (2 words per row, 4 when sz==16). avg2 is accepted but not used.
 * Uses and modifies the caller's ref, dest and height; height must be >= 1.
 */
#define OP_C0(sz,avg2) \
{ \
do { \
OP(LP(dest+0), LPC(ref+0)); \
OP(LP(dest+4), LPC(ref+4)); \
if (sz==16) { \
OP(LP(dest+8), LPC(ref+8)); \
OP(LP(dest+12), LPC(ref+12)); \
} \
ref+=stride; \
dest+= stride; \
} while(--height); \
}
/*
 * Horizontal interpolation row loop.
 * After rewinding ref by ofs bytes, each output word is
 * avg2(MERGE1(..), MERGE2(..)) of the same pair of loaded words, i.e. the
 * word starting ofs bytes in combined with the word starting ofs+1 bytes in.
 * avg2 is the two-operand combiner supplied by the caller.
 * Per row this loads 3 words (5 when sz==16) from the rewound address.
 * Uses and modifies the caller's ref, dest and height; height must be >= 1.
 */
#define OP_X(ofs,sz,avg2) \
{ \
ref-=ofs; \
do { \
uint32_t t0,t1; \
t0 = LPC(ref+0); \
t1 = LPC(ref+4); \
OP(LP(dest+0), avg2(MERGE1(t0,t1,ofs),MERGE2(t0,t1,ofs))); \
t0 = LPC(ref+8); \
OP(LP(dest+4), avg2(MERGE1(t1,t0,ofs),MERGE2(t1,t0,ofs))); \
if (sz==16) { \
t1 = LPC(ref+12); \
OP(LP(dest+8), avg2(MERGE1(t0,t1,ofs),MERGE2(t0,t1,ofs))); \
t0 = LPC(ref+16); \
OP(LP(dest+12), avg2(MERGE1(t1,t0,ofs),MERGE2(t1,t0,ofs))); \
} \
ref+=stride; \
dest+= stride; \
} while(--height); \
}
/*
 * aligned
 * Vertical interpolation row loop without byte stitching.
 * The first source row is loaded into t0..t3 before the loop; each iteration
 * advances ref by stride, loads the next row, emits avg2(previous, current)
 * for every word and keeps the current row as the new "previous".
 * In total height+1 source rows are read. t2/t3 are only touched when sz==16.
 * Uses and modifies the caller's ref, dest and height; height must be >= 1.
 */
#define OP_Y0(sz,avg2) \
{ \
uint32_t t0,t1,t2,t3,t; \
\
t0 = LPC(ref+0); \
t1 = LPC(ref+4); \
if (sz==16) { \
t2 = LPC(ref+8); \
t3 = LPC(ref+12); \
} \
do { \
ref += stride; \
\
t = LPC(ref+0); \
OP(LP(dest+0), avg2(t0,t)); t0 = t; \
t = LPC(ref+4); \
OP(LP(dest+4), avg2(t1,t)); t1 = t; \
if (sz==16) { \
t = LPC(ref+8); \
OP(LP(dest+8), avg2(t2,t)); t2 = t; \
t = LPC(ref+12); \
OP(LP(dest+12), avg2(t3,t)); t3 = t; \
} \
dest+= stride; \
} while(--height); \
}
/*
 * Vertical interpolation row loop for a source rewound by ofs bytes.
 * Works like OP_Y0, except that every row word is first stitched from two
 * neighbouring loads (w0/w1) with MERGE1. The stitched first row is kept in
 * t0..t3; each iteration stitches the next row, emits avg2(previous, current)
 * and rotates current into previous.
 * In total height+1 source rows are read, 3 words per row (5 when sz==16).
 * Uses and modifies the caller's ref, dest and height; height must be >= 1.
 */
#define OP_Y(ofs,sz,avg2) \
{ \
uint32_t t0,t1,t2,t3,t,w0,w1; \
\
ref-=ofs; \
w0 = LPC(ref+0); \
w1 = LPC(ref+4); \
t0 = MERGE1(w0,w1,ofs); \
w0 = LPC(ref+8); \
t1 = MERGE1(w1,w0,ofs); \
if (sz==16) { \
w1 = LPC(ref+12); \
t2 = MERGE1(w0,w1,ofs); \
w0 = LPC(ref+16); \
t3 = MERGE1(w1,w0,ofs); \
} \
do { \
ref += stride; \
\
w0 = LPC(ref+0); \
w1 = LPC(ref+4); \
t = MERGE1(w0,w1,ofs); \
OP(LP(dest+0), avg2(t0,t)); t0 = t; \
w0 = LPC(ref+8); \
t = MERGE1(w1,w0,ofs); \
OP(LP(dest+4), avg2(t1,t)); t1 = t; \
if (sz==16) { \
w1 = LPC(ref+12); \
t = MERGE1(w0,w1,ofs); \
OP(LP(dest+8), avg2(t2,t)); t2 = t; \
w0 = LPC(ref+16); \
t = MERGE1(w1,w0,ofs); \
OP(LP(dest+12), avg2(t3,t)); t3 = t; \
} \
dest+=stride; \
} while(--height); \
}
/* The ofs==0 variants of OP_X and OP_XY simply reuse the general macros. */
#define OP_X0(sz,avg2) OP_X(0,sz,avg2)
#define OP_XY0(sz,PACK) OP_XY(0,sz,PACK)
/*
 * Two-dimensional interpolation row loop.
 * For every row, each output word position is reduced with UNPACK to a pair
 * of partial sums built from MERGE1(..) and MERGE2(..) of the same two loaded
 * words. The pairs of the first row are kept in a0..a7; each iteration
 * computes the pairs of the next row into t2/t3, combines them with the
 * stored pairs through PACK (the caller passes the rounding or non-rounding
 * variant), hands the result to OP and keeps the new pairs for the next row.
 * In total height+1 source rows are read, 3 words per row (5 when sz==16).
 * a4..a7 are only touched when sz==16.
 * Uses and modifies the caller's ref, dest and height; height must be >= 1.
 */
#define OP_XY(ofs,sz,PACK) \
{ \
uint32_t t2,t3,w0,w1; \
uint32_t a0,a1,a2,a3,a4,a5,a6,a7; \
\
ref -= ofs; \
w0 = LPC(ref+0); \
w1 = LPC(ref+4); \
UNPACK(a0,a1,MERGE1(w0,w1,ofs),MERGE2(w0,w1,ofs)); \
w0 = LPC(ref+8); \
UNPACK(a2,a3,MERGE1(w1,w0,ofs),MERGE2(w1,w0,ofs)); \
if (sz==16) { \
w1 = LPC(ref+12); \
UNPACK(a4,a5,MERGE1(w0,w1,ofs),MERGE2(w0,w1,ofs)); \
w0 = LPC(ref+16); \
UNPACK(a6,a7,MERGE1(w1,w0,ofs),MERGE2(w1,w0,ofs)); \
} \
do { \
ref+=stride; \
w0 = LPC(ref+0); \
w1 = LPC(ref+4); \
UNPACK(t2,t3,MERGE1(w0,w1,ofs),MERGE2(w0,w1,ofs)); \
OP(LP(dest+0),PACK(a0,a1,t2,t3)); \
a0 = t2; a1 = t3; \
w0 = LPC(ref+8); \
UNPACK(t2,t3,MERGE1(w1,w0,ofs),MERGE2(w1,w0,ofs)); \
OP(LP(dest+4),PACK(a2,a3,t2,t3)); \
a2 = t2; a3 = t3; \
if (sz==16) { \
w1 = LPC(ref+12); \
UNPACK(t2,t3,MERGE1(w0,w1,ofs),MERGE2(w0,w1,ofs)); \
OP(LP(dest+8),PACK(a4,a5,t2,t3)); \
a4 = t2; a5 = t3; \
w0 = LPC(ref+16); \
UNPACK(t2,t3,MERGE1(w1,w0,ofs),MERGE2(w1,w0,ofs)); \
OP(LP(dest+12),PACK(a6,a7,t2,t3)); \
a6 = t2; a7 = t3; \
} \
dest+=stride; \
} while(--height); \
}
/*
 * Redirect the generic *_pixels*_c names to the ff_*_rnd_pixels*_o functions
 * declared in dsputil_sh4.h. The no_rnd names map to the very same rnd
 * functions. Note there is no avg_no_rnd_pixels8_c alias.
 * NOTE(review): presumably these names are consumed by the qpel.c that is
 * included right below -- confirm.
 */
#define put_pixels8_c ff_put_rnd_pixels8_o
#define put_pixels16_c ff_put_rnd_pixels16_o
#define avg_pixels8_c ff_avg_rnd_pixels8_o
#define avg_pixels16_c ff_avg_rnd_pixels16_o
#define put_no_rnd_pixels8_c ff_put_rnd_pixels8_o
#define put_no_rnd_pixels16_c ff_put_rnd_pixels16_o
#define avg_no_rnd_pixels16_c ff_avg_rnd_pixels16_o
#if CONFIG_HPELDSP
#include "qpel.c"
#endif
/**
 * Install the *_sh4 quarter-pel and mspel functions into a DSPContext.
 *
 * Everything is compiled only when CONFIG_HPELDSP is set; otherwise the
 * function body is empty.
 *
 * @param c     context whose function-pointer tables are overwritten
 * @param avctx not used by this function
 */
av_cold void ff_dsputil_init_align(DSPContext *c, AVCodecContext *avctx)
{
#if CONFIG_HPELDSP
/*
 * Fill all 16 entries of c->PFX_pixels_tab[IDX] with PFX<NUM>_mcXY_sh4.
 * The table index is X + 4*Y for the suffix mcXY (index 1 -> mc10,
 * index 4 -> mc01).
 */
#define dspfunc(PFX, IDX, NUM) \
c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_sh4; \
c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_sh4; \
c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_sh4; \
c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_sh4; \
c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_sh4; \
c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_sh4; \
c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_sh4; \
c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_sh4; \
c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_sh4; \
c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_sh4; \
c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_sh4; \
c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_sh4; \
c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_sh4; \
c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_sh4; \
c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_sh4; \
c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_sh4
/* Row 0 of each table holds the 16-wide functions; avg_no_rnd_qpel is left untouched. */
dspfunc(put_qpel, 0, 16);
dspfunc(put_no_rnd_qpel, 0, 16);
dspfunc(avg_qpel, 0, 16);
/* dspfunc(avg_no_rnd_qpel, 0, 16); */
/* Row 1 of each table holds the 8-wide functions. */
dspfunc(put_qpel, 1, 8);
dspfunc(put_no_rnd_qpel, 1, 8);
dspfunc(avg_qpel, 1, 8);
/* dspfunc(avg_no_rnd_qpel, 1, 8); */
#undef dspfunc
/* mspel table: entries 0-3 are mc00..mc30, entries 4-7 are mc02..mc32 (no mc?1 variants). */
c->put_mspel_pixels_tab[0]= put_mspel8_mc00_sh4;
c->put_mspel_pixels_tab[1]= put_mspel8_mc10_sh4;
c->put_mspel_pixels_tab[2]= put_mspel8_mc20_sh4;
c->put_mspel_pixels_tab[3]= put_mspel8_mc30_sh4;
c->put_mspel_pixels_tab[4]= put_mspel8_mc02_sh4;
c->put_mspel_pixels_tab[5]= put_mspel8_mc12_sh4;
c->put_mspel_pixels_tab[6]= put_mspel8_mc22_sh4;
c->put_mspel_pixels_tab[7]= put_mspel8_mc32_sh4;
c->gmc1 = gmc1_c;
#endif
}

@ -94,7 +94,6 @@ av_cold void ff_dsputil_init_sh4(DSPContext *c, AVCodecContext *avctx)
{
const int idct_algo= avctx->idct_algo;
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
ff_dsputil_init_align(c,avctx);
if (!high_bit_depth)
c->clear_blocks = clear_blocks_sh4;

@ -24,15 +24,5 @@
#include "libavcodec/hpeldsp.h"
void ff_idct_sh4(int16_t *block);
void ff_dsputil_init_align(DSPContext* c, AVCodecContext *avctx);
void ff_put_rnd_pixels8_o(uint8_t *dest, const uint8_t *ref,
const ptrdiff_t stride, int height);
void ff_put_rnd_pixels16_o(uint8_t *dest, const uint8_t *ref,
const ptrdiff_t stride, int height);
void ff_avg_rnd_pixels8_o (uint8_t *dest, const uint8_t *ref,
const ptrdiff_t stride, int height);
void ff_avg_rnd_pixels16_o(uint8_t *dest, const uint8_t *ref,
const ptrdiff_t stride, int height);
#endif /* AVCODEC_SH4_DSPUTIL_SH4_H */

@ -1,132 +0,0 @@
/*
* aligned/packed access motion
*
* Copyright (c) 2001-2003 BERO <bero@geocities.co.jp>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <assert.h>
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavcodec/h264chroma.h"
/*
 * Generate one chroma motion-compensation routine that is W pixels wide.
 *
 * Each output pixel is the weighted sum of a 2x2 source neighbourhood,
 *     A*top_left + B*top_right + C*bottom_left + D*bottom_right,
 * with A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy; the sum is handed to OP,
 * which scales it and stores (or averages) it into dst.
 * Every row reads W+1 bytes from src and W+1 bytes from src+stride.
 * The left-hand samples are carried over from the previous column so that
 * every source byte is loaded exactly once per row, top row before bottom row.
 * The row loop is do/while, so h must be at least 1.
 */
#define H264_CHROMA_MC_W(OPNAME, OP, W)\
static void OPNAME ## h264_chroma_mc ## W ## _sh4(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    do {\
        const uint8_t *upper = src;\
        const uint8_t *lower = src+stride;\
        int left_up = *upper++;\
        int left_lo = *lower++;\
        int col;\
        for (col = 0; col < W; col++) {\
            const int right_up = *upper++;\
            const int right_lo = *lower++;\
            OP(dst[col], (A*left_up + B*right_up + C*left_lo + D*right_lo));\
            left_up = right_up;\
            left_lo = right_lo;\
        }\
        dst += stride;\
        src += stride;\
    } while (--h);\
}

/* Emit the 2-, 4- and 8-pixel wide variants for one store operator. */
#define H264_CHROMA_MC(OPNAME, OP)\
H264_CHROMA_MC_W(OPNAME, OP, 2)\
H264_CHROMA_MC_W(OPNAME, OP, 4)\
H264_CHROMA_MC_W(OPNAME, OP, 8)

/* b carries a weight total of 64: round and shift by 6, then store or average with a. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)
H264_CHROMA_MC(put_ , op_put)
H264_CHROMA_MC(avg_ , op_avg)
#undef op_avg
#undef op_put
/**
 * Install the sh4 chroma MC functions into an H264ChromaContext.
 *
 * Nothing is changed when bit_depth is above 8.
 * Table slots 0, 1 and 2 receive the mc8, mc4 and mc2 functions.
 *
 * @param c         context whose put/avg chroma tables are filled
 * @param bit_depth sample bit depth of the stream
 */
av_cold void ff_h264chroma_init_sh4(H264ChromaContext *c, int bit_depth)
{
    if (bit_depth > 8)
        return;

    c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_sh4;
    c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_sh4;

    c->put_h264_chroma_pixels_tab[1] = put_h264_chroma_mc4_sh4;
    c->avg_h264_chroma_pixels_tab[1] = avg_h264_chroma_mc4_sh4;

    c->put_h264_chroma_pixels_tab[2] = put_h264_chroma_mc2_sh4;
    c->avg_h264_chroma_pixels_tab[2] = avg_h264_chroma_mc2_sh4;
}

@ -1,351 +0,0 @@
/*
* aligned/packed access motion
*
* Copyright (c) 2001-2003 BERO <bero@geocities.co.jp>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/bit_depth_template.c" // for BYTE_VEC32
#include "libavcodec/hpeldsp.h"
#include "libavcodec/rnd_avg.h"
#include "dsputil_sh4.h"
/*
 * Word-access helpers: view the 4 bytes at p as one native 32-bit word.
 * LP yields a writable lvalue, LPC a read-only one.
 * NOTE(review): both rely on p being suitably aligned and on the build
 * tolerating type-punned access -- confirm for new call sites.
 */
#define LP(p)  (*(uint32_t*)(p))
#define LPC(p) (*(const uint32_t*)(p))

/*
 * Split two packed words into per-byte partial sums:
 *   ph = (tt0 without its low 2 bits per byte)>>2 + same for tt1
 *   pl = (low 2 bits per byte of tt0) + (low 2 bits per byte of tt1)
 * BYTE_VEC32(x) comes from bit_depth_template.c; presumably it replicates
 * the byte x into all four lanes of a 32-bit word -- confirm.
 */
#define UNPACK(ph,pl,tt0,tt1) do { \
    uint32_t t0,t1; t0=(tt0);t1=(tt1); \
    (ph) = ( (t0 & ~BYTE_VEC32(0x03))>>2) + ( (t1 & ~BYTE_VEC32(0x03))>>2); \
    (pl) = (t0 & BYTE_VEC32(0x03)) + (t1 & BYTE_VEC32(0x03)); } while(0)

/*
 * Recombine two UNPACK results (ph/pl and nph/npl) into one packed word.
 * The rnd variant adds 2 per byte before the >>2, the no_rnd variant adds 1.
 * The whole expansion is parenthesized so it can be used inside a larger
 * expression without changing its meaning.
 */
#define rnd_PACK(ph,pl,nph,npl)    ((ph) + (nph) + ((((pl) + (npl) + BYTE_VEC32(0x02))>>2) & BYTE_VEC32(0x03)))
#define no_rnd_PACK(ph,pl,nph,npl) ((ph) + (nph) + ((((pl) + (npl) + BYTE_VEC32(0x01))>>2) & BYTE_VEC32(0x03)))

/* little-endian */
/*
 * Stitch one 32-bit word out of two consecutive words a (lower address) and b:
 *   MERGE1 drops the first ofs     bytes of a and appends bytes from b,
 *   MERGE2 drops the first ofs + 1 bytes of a and appends bytes from b.
 * The ofs==0 / ofs==3 special cases avoid a shift by 32 bits.
 * Arguments and the full expansion are parenthesized; the original bare
 * ternary mis-associated when followed by another operator.
 */
#define MERGE1(a,b,ofs) (((ofs)==0)?(a):( ((a)>>(8*(ofs)))|((b)<<(32-8*(ofs))) ))
#define MERGE2(a,b,ofs) (((ofs)==3)?(b):( ((a)>>(8*((ofs)+1)))|((b)<<(32-8*((ofs)+1))) ))
/* big
#define MERGE1(a,b,ofs) (ofs==0)?a:( ((a)<<(8*ofs))|((b)>>(32-8*ofs)) )
#define MERGE2(a,b,ofs) (ofs==3)?b:( ((a)<<(8+8*ofs))|((b)>>(32-8-8*ofs)) )
*/

/* Store operators selected through OP: plain store, or average with the destination. */
#define put(d,s) ((d) = (s))
#define avg(d,s) ((d) = rnd_avg32((s),(d)))
/*
 * 4-byte-wide row loop for a source that is ofs bytes past a 4-byte boundary.
 * Rewinds ref by ofs, then for each row stitches one output word from the two
 * words at ref and ref+4 and hands it to OP together with the destination word.
 * Uses and modifies the caller's ref, dest and height; stride is read only.
 * The row loop is do/while, so height must be at least 1 on entry.
 * Wrapped in do { } while (0) so the macro is a single statement (the original
 * expanded to two statements, which breaks under an unbraced if/else).
 */
#define OP_C4(ofs) \
do { \
    ref-=(ofs); \
    do { \
        OP(LP(dest),MERGE1(LPC(ref),LPC(ref+4),ofs)); \
        ref+=stride; \
        dest+=stride; \
    } while(--height); \
} while(0)

/*
 * Same as OP_C4 for an already word-aligned source: one word is loaded from
 * ref and passed to OP per row. height must be at least 1 on entry.
 */
#define OP_C40() \
do { \
    OP(LP(dest),LPC(ref)); \
    ref+=stride; \
    dest+=stride; \
} while(--height)
#define OP put
/**
 * Copy a 4-pixel-wide column of height rows from ref to dest.
 *
 * The low two address bits of ref select the matching word-stitching loop,
 * so the source may start at any byte offset.
 * The address is inspected through uintptr_t rather than int, so the
 * conversion is well defined on targets where pointers are wider than int.
 *
 * @param dest   destination, written one 32-bit word per row
 * @param ref    source pixels
 * @param stride byte distance between rows, applied to both dest and ref
 * @param height number of rows; must be at least 1 (do/while loops)
 */
static void put_pixels4_c(uint8_t *dest, const uint8_t *ref,
                          const int stride, int height)
{
    switch ((uintptr_t)ref & 3) {
    case 0: OP_C40(); return;
    case 1: OP_C4(1); return;
    case 2: OP_C4(2); return;
    case 3: OP_C4(3); return;
    default: return; /* unreachable: the value is masked to 0..3 */
    }
}
#undef OP
#define OP avg
/**
 * Average a 4-pixel-wide column of height rows from ref into dest.
 *
 * Identical to the put variant except that OP is avg, i.e. each stitched
 * source word is combined with the existing destination word via rnd_avg32.
 * The address is inspected through uintptr_t rather than int, so the
 * conversion is well defined on targets where pointers are wider than int.
 *
 * @param dest   destination, read and written one 32-bit word per row
 * @param ref    source pixels
 * @param stride byte distance between rows, applied to both dest and ref
 * @param height number of rows; must be at least 1 (do/while loops)
 */
static void avg_pixels4_c(uint8_t *dest, const uint8_t *ref,
                          const int stride, int height)
{
    switch ((uintptr_t)ref & 3) {
    case 0: OP_C40(); return;
    case 1: OP_C4(1); return;
    case 2: OP_C4(2); return;
    case 3: OP_C4(3); return;
    default: return; /* unreachable: the value is masked to 0..3 */
    }
}
#undef OP
/*
 * Full-pel row loop for a block sz bytes wide (the code distinguishes only
 * sz==16 from everything else, which is handled as 8 bytes).
 * ref is rewound by ofs bytes and then read one 32-bit word at a time; each
 * output word is stitched from two neighbouring loads with MERGE1 and passed
 * to OP together with the destination word.
 * Per row this loads 3 words (sz != 16) or 5 words (sz == 16) starting at the
 * rewound address. avg2 is accepted but not used.
 * Uses and modifies the caller's ref, dest and height; the loop is do/while,
 * so height must be at least 1 on entry.
 * DEFFUNC in this file invokes it with ofs equal to the low two bits of ref.
 */
#define OP_C(ofs,sz,avg2) \
{ \
ref-=ofs; \
do { \
uint32_t t0,t1; \
t0 = LPC(ref+0); \
t1 = LPC(ref+4); \
OP(LP(dest+0), MERGE1(t0,t1,ofs)); \
t0 = LPC(ref+8); \
OP(LP(dest+4), MERGE1(t1,t0,ofs)); \
if (sz==16) { \
t1 = LPC(ref+12); \
OP(LP(dest+8), MERGE1(t0,t1,ofs)); \
t0 = LPC(ref+16); \
OP(LP(dest+12), MERGE1(t1,t0,ofs)); \
} \
ref+=stride; \
dest+= stride; \
} while(--height); \
}
/*
 * aligned
 * Full-pel row loop without any byte stitching: every source word is loaded
 * directly from ref and passed to OP with the matching destination word
 * (2 words per row, 4 when sz==16). avg2 is accepted but not used.
 * Uses and modifies the caller's ref, dest and height; height must be >= 1.
 */
#define OP_C0(sz,avg2) \
{ \
do { \
OP(LP(dest+0), LPC(ref+0)); \
OP(LP(dest+4), LPC(ref+4)); \
if (sz==16) { \
OP(LP(dest+8), LPC(ref+8)); \
OP(LP(dest+12), LPC(ref+12)); \
} \
ref+=stride; \
dest+= stride; \
} while(--height); \
}
/*
 * Horizontal interpolation row loop.
 * After rewinding ref by ofs bytes, each output word is
 * avg2(MERGE1(..), MERGE2(..)) of the same pair of loaded words, i.e. the
 * word starting ofs bytes in combined with the word starting ofs+1 bytes in.
 * avg2 is the two-operand combiner supplied by the caller.
 * Per row this loads 3 words (5 when sz==16) from the rewound address.
 * Uses and modifies the caller's ref, dest and height; height must be >= 1.
 */
#define OP_X(ofs,sz,avg2) \
{ \
ref-=ofs; \
do { \
uint32_t t0,t1; \
t0 = LPC(ref+0); \
t1 = LPC(ref+4); \
OP(LP(dest+0), avg2(MERGE1(t0,t1,ofs),MERGE2(t0,t1,ofs))); \
t0 = LPC(ref+8); \
OP(LP(dest+4), avg2(MERGE1(t1,t0,ofs),MERGE2(t1,t0,ofs))); \
if (sz==16) { \
t1 = LPC(ref+12); \
OP(LP(dest+8), avg2(MERGE1(t0,t1,ofs),MERGE2(t0,t1,ofs))); \
t0 = LPC(ref+16); \
OP(LP(dest+12), avg2(MERGE1(t1,t0,ofs),MERGE2(t1,t0,ofs))); \
} \
ref+=stride; \
dest+= stride; \
} while(--height); \
}
/*
 * aligned
 * Vertical interpolation row loop without byte stitching.
 * The first source row is loaded into t0..t3 before the loop; each iteration
 * advances ref by stride, loads the next row, emits avg2(previous, current)
 * for every word and keeps the current row as the new "previous".
 * In total height+1 source rows are read. t2/t3 are only touched when sz==16.
 * Uses and modifies the caller's ref, dest and height; height must be >= 1.
 */
#define OP_Y0(sz,avg2) \
{ \
uint32_t t0,t1,t2,t3,t; \
\
t0 = LPC(ref+0); \
t1 = LPC(ref+4); \
if (sz==16) { \
t2 = LPC(ref+8); \
t3 = LPC(ref+12); \
} \
do { \
ref += stride; \
\
t = LPC(ref+0); \
OP(LP(dest+0), avg2(t0,t)); t0 = t; \
t = LPC(ref+4); \
OP(LP(dest+4), avg2(t1,t)); t1 = t; \
if (sz==16) { \
t = LPC(ref+8); \
OP(LP(dest+8), avg2(t2,t)); t2 = t; \
t = LPC(ref+12); \
OP(LP(dest+12), avg2(t3,t)); t3 = t; \
} \
dest+= stride; \
} while(--height); \
}
/*
 * Vertical interpolation row loop for a source rewound by ofs bytes.
 * Works like OP_Y0, except that every row word is first stitched from two
 * neighbouring loads (w0/w1) with MERGE1. The stitched first row is kept in
 * t0..t3; each iteration stitches the next row, emits avg2(previous, current)
 * and rotates current into previous.
 * In total height+1 source rows are read, 3 words per row (5 when sz==16).
 * Uses and modifies the caller's ref, dest and height; height must be >= 1.
 */
#define OP_Y(ofs,sz,avg2) \
{ \
uint32_t t0,t1,t2,t3,t,w0,w1; \
\
ref-=ofs; \
w0 = LPC(ref+0); \
w1 = LPC(ref+4); \
t0 = MERGE1(w0,w1,ofs); \
w0 = LPC(ref+8); \
t1 = MERGE1(w1,w0,ofs); \
if (sz==16) { \
w1 = LPC(ref+12); \
t2 = MERGE1(w0,w1,ofs); \
w0 = LPC(ref+16); \
t3 = MERGE1(w1,w0,ofs); \
} \
do { \
ref += stride; \
\
w0 = LPC(ref+0); \
w1 = LPC(ref+4); \
t = MERGE1(w0,w1,ofs); \
OP(LP(dest+0), avg2(t0,t)); t0 = t; \
w0 = LPC(ref+8); \
t = MERGE1(w1,w0,ofs); \
OP(LP(dest+4), avg2(t1,t)); t1 = t; \
if (sz==16) { \
w1 = LPC(ref+12); \
t = MERGE1(w0,w1,ofs); \
OP(LP(dest+8), avg2(t2,t)); t2 = t; \
w0 = LPC(ref+16); \
t = MERGE1(w1,w0,ofs); \
OP(LP(dest+12), avg2(t3,t)); t3 = t; \
} \
dest+=stride; \
} while(--height); \
}
/* The ofs==0 variants of OP_X and OP_XY simply reuse the general macros. */
#define OP_X0(sz,avg2) OP_X(0,sz,avg2)
#define OP_XY0(sz,PACK) OP_XY(0,sz,PACK)
/*
 * Two-dimensional interpolation row loop.
 * For every row, each output word position is reduced with UNPACK to a pair
 * of partial sums built from MERGE1(..) and MERGE2(..) of the same two loaded
 * words. The pairs of the first row are kept in a0..a7; each iteration
 * computes the pairs of the next row into t2/t3, combines them with the
 * stored pairs through PACK (the caller passes the rounding or non-rounding
 * variant), hands the result to OP and keeps the new pairs for the next row.
 * In total height+1 source rows are read, 3 words per row (5 when sz==16).
 * a4..a7 are only touched when sz==16.
 * Uses and modifies the caller's ref, dest and height; height must be >= 1.
 */
#define OP_XY(ofs,sz,PACK) \
{ \
uint32_t t2,t3,w0,w1; \
uint32_t a0,a1,a2,a3,a4,a5,a6,a7; \
\
ref -= ofs; \
w0 = LPC(ref+0); \
w1 = LPC(ref+4); \
UNPACK(a0,a1,MERGE1(w0,w1,ofs),MERGE2(w0,w1,ofs)); \
w0 = LPC(ref+8); \
UNPACK(a2,a3,MERGE1(w1,w0,ofs),MERGE2(w1,w0,ofs)); \
if (sz==16) { \
w1 = LPC(ref+12); \
UNPACK(a4,a5,MERGE1(w0,w1,ofs),MERGE2(w0,w1,ofs)); \
w0 = LPC(ref+16); \
UNPACK(a6,a7,MERGE1(w1,w0,ofs),MERGE2(w1,w0,ofs)); \
} \
do { \
ref+=stride; \
w0 = LPC(ref+0); \
w1 = LPC(ref+4); \
UNPACK(t2,t3,MERGE1(w0,w1,ofs),MERGE2(w0,w1,ofs)); \
OP(LP(dest+0),PACK(a0,a1,t2,t3)); \
a0 = t2; a1 = t3; \
w0 = LPC(ref+8); \
UNPACK(t2,t3,MERGE1(w1,w0,ofs),MERGE2(w1,w0,ofs)); \
OP(LP(dest+4),PACK(a2,a3,t2,t3)); \
a2 = t2; a3 = t3; \
if (sz==16) { \
w1 = LPC(ref+12); \
UNPACK(t2,t3,MERGE1(w0,w1,ofs),MERGE2(w0,w1,ofs)); \
OP(LP(dest+8),PACK(a4,a5,t2,t3)); \
a4 = t2; a5 = t3; \
w0 = LPC(ref+16); \
UNPACK(t2,t3,MERGE1(w1,w0,ofs),MERGE2(w1,w0,ofs)); \
OP(LP(dest+12),PACK(a6,a7,t2,t3)); \
a6 = t2; a7 = t3; \
} \
dest+=stride; \
} while(--height); \
}
/*
 * Define one pixel function named <op>_<rnd>_pixels<sz>_<xy>.
 *
 *   prefix   storage class of the generated function (empty or static)
 *   op, rnd, sz, xy   pieces pasted into the function name
 *   OP_N     row-loop macro family; OP_N##0 is used for a word-aligned ref,
 *            OP_N(k, ...) for a ref that is k bytes past a word boundary
 *   avgfunc  pasted after rnd##_ to form the combiner handed to OP_N
 *
 * The generated function dispatches on the low two address bits of ref.
 * The address is inspected through uintptr_t rather than int, so the
 * conversion is well defined on targets where pointers are wider than int.
 * The store operator is whatever OP is defined to at the point of use.
 */
#define DEFFUNC(prefix, op, rnd, xy, sz, OP_N, avgfunc) \
prefix void op##_##rnd##_pixels##sz##_##xy(uint8_t *dest, const uint8_t *ref, \
                                           const ptrdiff_t stride, int height) \
{ \
    switch ((uintptr_t)ref & 3) { \
    case 0: OP_N##0(sz,rnd##_##avgfunc); return; \
    case 1: OP_N(1,sz,rnd##_##avgfunc); return; \
    case 2: OP_N(2,sz,rnd##_##avgfunc); return; \
    case 3: OP_N(3,sz,rnd##_##avgfunc); return; \
    } \
}
/*
 * Instantiate the "put" family (OP stores the result).
 * Only the full-pel "_o" functions are non-static (ff_ prefix); every
 * interpolating variant is file-local. The x/y variants combine with
 * rnd_avg32 / no_rnd_avg32, the xy variants with rnd_PACK / no_rnd_PACK.
 */
#define OP put
DEFFUNC( ,ff_put,rnd,o,8,OP_C,avg32)
DEFFUNC(static,put, rnd,x,8,OP_X,avg32)
DEFFUNC(static,put,no_rnd,x,8,OP_X,avg32)
DEFFUNC(static,put, rnd,y,8,OP_Y,avg32)
DEFFUNC(static,put,no_rnd,y,8,OP_Y,avg32)
DEFFUNC(static,put, rnd,xy,8,OP_XY,PACK)
DEFFUNC(static,put,no_rnd,xy,8,OP_XY,PACK)
DEFFUNC( ,ff_put,rnd,o,16,OP_C,avg32)
DEFFUNC(static,put, rnd,x,16,OP_X,avg32)
DEFFUNC(static,put,no_rnd,x,16,OP_X,avg32)
DEFFUNC(static,put, rnd,y,16,OP_Y,avg32)
DEFFUNC(static,put,no_rnd,y,16,OP_Y,avg32)
DEFFUNC(static,put, rnd,xy,16,OP_XY,PACK)
DEFFUNC(static,put,no_rnd,xy,16,OP_XY,PACK)
#undef OP
/*
 * Instantiate the "avg" family (OP averages the result into the destination).
 * No no_rnd variants are generated for the 8-wide functions.
 */
#define OP avg
DEFFUNC( ,ff_avg,rnd,o,8,OP_C,avg32)
DEFFUNC(static,avg, rnd,x,8,OP_X,avg32)
DEFFUNC(static,avg, rnd,y,8,OP_Y,avg32)
DEFFUNC(static,avg, rnd,xy,8,OP_XY,PACK)
DEFFUNC( ,ff_avg,rnd,o,16,OP_C,avg32)
DEFFUNC(static,avg, rnd,x,16,OP_X,avg32)
DEFFUNC(static,avg,no_rnd,x,16,OP_X,avg32)
DEFFUNC(static,avg, rnd,y,16,OP_Y,avg32)
DEFFUNC(static,avg,no_rnd,y,16,OP_Y,avg32)
DEFFUNC(static,avg, rnd,xy,16,OP_XY,PACK)
DEFFUNC(static,avg,no_rnd,xy,16,OP_XY,PACK)
#undef OP
/*
 * The full-pel loops (OP_C / OP_C0) never use their combiner argument, so the
 * no_rnd full-pel names are plain aliases of the rnd functions.
 */
#define ff_put_no_rnd_pixels8_o ff_put_rnd_pixels8_o
#define ff_put_no_rnd_pixels16_o ff_put_rnd_pixels16_o
#define ff_avg_no_rnd_pixels16_o ff_avg_rnd_pixels16_o
/**
 * Install the sh4 half-pel functions into an HpelDSPContext.
 *
 * In the two-dimensional tables the first index 0 receives the 16-wide and
 * index 1 the 8-wide functions; the last index selects the variant
 * (0: _o, 1: _x, 2: _y, 3: _xy). avg_no_rnd_pixels_tab is one-dimensional
 * and only gets the 16-wide functions.
 *
 * @param c     context whose function tables are overwritten
 * @param flags not used by this function
 */
av_cold void ff_hpeldsp_init_sh4(HpelDSPContext *c, int flags)
{
    /* variant 0: _o functions */
    c->put_pixels_tab[0][0]        = ff_put_rnd_pixels16_o;
    c->put_pixels_tab[1][0]        = ff_put_rnd_pixels8_o;
    c->put_no_rnd_pixels_tab[0][0] = ff_put_no_rnd_pixels16_o;
    c->put_no_rnd_pixels_tab[1][0] = ff_put_no_rnd_pixels8_o;
    c->avg_pixels_tab[0][0]        = ff_avg_rnd_pixels16_o;
    c->avg_pixels_tab[1][0]        = ff_avg_rnd_pixels8_o;
    c->avg_no_rnd_pixels_tab[0]    = ff_avg_no_rnd_pixels16_o;

    /* variant 1: _x functions */
    c->put_pixels_tab[0][1]        = put_rnd_pixels16_x;
    c->put_pixels_tab[1][1]        = put_rnd_pixels8_x;
    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x;
    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x;
    c->avg_pixels_tab[0][1]        = avg_rnd_pixels16_x;
    c->avg_pixels_tab[1][1]        = avg_rnd_pixels8_x;
    c->avg_no_rnd_pixels_tab[1]    = avg_no_rnd_pixels16_x;

    /* variant 2: _y functions */
    c->put_pixels_tab[0][2]        = put_rnd_pixels16_y;
    c->put_pixels_tab[1][2]        = put_rnd_pixels8_y;
    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y;
    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y;
    c->avg_pixels_tab[0][2]        = avg_rnd_pixels16_y;
    c->avg_pixels_tab[1][2]        = avg_rnd_pixels8_y;
    c->avg_no_rnd_pixels_tab[2]    = avg_no_rnd_pixels16_y;

    /* variant 3: _xy functions */
    c->put_pixels_tab[0][3]        = put_rnd_pixels16_xy;
    c->put_pixels_tab[1][3]        = put_rnd_pixels8_xy;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy;
    c->avg_pixels_tab[0][3]        = avg_rnd_pixels16_xy;
    c->avg_pixels_tab[1][3]        = avg_rnd_pixels8_xy;
    c->avg_no_rnd_pixels_tab[3]    = avg_no_rnd_pixels16_xy;
}

@ -1,862 +0,0 @@
/*
* This is optimized for sh, which have post increment addressing (*p++).
* Some CPU may be index (p[n]) faster than post increment (*p++).
*
* copyright (c) 2001-2003 BERO <bero@geocities.co.jp>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/common.h"
#include "libavcodec/copy_block.h"
#include "libavcodec/rnd_avg.h"
#define PIXOP2(OPNAME, OP) \
\
static inline void OPNAME ## _pixels4_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{\
do {\
OP(LP(dst ),rnd_avg32(LPC(src1 ),LPC(src2 )) ); \
src1+=src_stride1; \
src2+=src_stride2; \
dst+=dst_stride; \
} while(--h); \
}\
\
static inline void OPNAME ## _pixels4_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{\
do {\
OP(LP(dst ),rnd_avg32(AV_RN32(src1 ),LPC(src2 )) ); \
src1+=src_stride1; \
src2+=src_stride2; \
dst+=dst_stride; \
} while(--h); \
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{\
do {\
OP(LP(dst ),no_rnd_avg32(AV_RN32(src1 ),LPC(src2 )) ); \
OP(LP(dst+4),no_rnd_avg32(AV_RN32(src1+4),LPC(src2+4)) ); \
OP(LP(dst+8),no_rnd_avg32(AV_RN32(src1+8),LPC(src2+8)) ); \
OP(LP(dst+12),no_rnd_avg32(AV_RN32(src1+12),LPC(src2+12)) ); \
src1+=src_stride1; \
src2+=src_stride2; \
dst+=dst_stride; \
} while(--h); \
}\
\
static inline void OPNAME ## _pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{\
do {\
OP(LP(dst ),rnd_avg32(AV_RN32(src1 ),LPC(src2 )) ); \
OP(LP(dst+4),rnd_avg32(AV_RN32(src1+4),LPC(src2+4)) ); \
OP(LP(dst+8),rnd_avg32(AV_RN32(src1+8),LPC(src2+8)) ); \
OP(LP(dst+12),rnd_avg32(AV_RN32(src1+12),LPC(src2+12)) ); \
src1+=src_stride1; \
src2+=src_stride2; \
dst+=dst_stride; \
} while(--h); \
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{\
do { /* onlye src2 aligned */\
OP(LP(dst ),no_rnd_avg32(AV_RN32(src1 ),LPC(src2 )) ); \
OP(LP(dst+4),no_rnd_avg32(AV_RN32(src1+4),LPC(src2+4)) ); \
src1+=src_stride1; \
src2+=src_stride2; \
dst+=dst_stride; \
} while(--h); \
}\
\
static inline void OPNAME ## _pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{\
do {\
OP(LP(dst ),rnd_avg32(AV_RN32(src1 ),LPC(src2 )) ); \
OP(LP(dst+4),rnd_avg32(AV_RN32(src1+4),LPC(src2+4)) ); \
src1+=src_stride1; \
src2+=src_stride2; \
dst+=dst_stride; \
} while(--h); \
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{\
do {\
OP(LP(dst ),no_rnd_avg32(LPC(src1 ),LPC(src2 )) ); \
OP(LP(dst+4),no_rnd_avg32(LPC(src1+4),LPC(src2+4)) ); \
src1+=src_stride1; \
src2+=src_stride2; \
dst+=dst_stride; \
} while(--h); \
}\
\
static inline void OPNAME ## _pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{\
do {\
OP(LP(dst ),rnd_avg32(LPC(src1 ),LPC(src2 )) ); \
OP(LP(dst+4),rnd_avg32(LPC(src1+4),LPC(src2+4)) ); \
src1+=src_stride1; \
src2+=src_stride2; \
dst+=dst_stride; \
} while(--h); \
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{\
do {\
OP(LP(dst ),no_rnd_avg32(LPC(src1 ),LPC(src2 )) ); \
OP(LP(dst+4),no_rnd_avg32(LPC(src1+4),LPC(src2+4)) ); \
OP(LP(dst+8),no_rnd_avg32(LPC(src1+8),LPC(src2+8)) ); \
OP(LP(dst+12),no_rnd_avg32(LPC(src1+12),LPC(src2+12)) ); \
src1+=src_stride1; \
src2+=src_stride2; \
dst+=dst_stride; \
} while(--h); \
}\
\
static inline void OPNAME ## _pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{\
do {\
OP(LP(dst ),rnd_avg32(LPC(src1 ),LPC(src2 )) ); \
OP(LP(dst+4),rnd_avg32(LPC(src1+4),LPC(src2+4)) ); \
OP(LP(dst+8),rnd_avg32(LPC(src1+8),LPC(src2+8)) ); \
OP(LP(dst+12),rnd_avg32(LPC(src1+12),LPC(src2+12)) ); \
src1+=src_stride1; \
src2+=src_stride2; \
dst+=dst_stride; \
} while(--h); \
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{ OPNAME ## _no_rnd_pixels16_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
\
static inline void OPNAME ## _pixels16_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{ OPNAME ## _pixels16_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
\
static inline void OPNAME ## _no_rnd_pixels8_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{ OPNAME ## _no_rnd_pixels8_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
\
static inline void OPNAME ## _pixels8_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{ OPNAME ## _pixels8_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
\
static inline void OPNAME ## _pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
do { \
uint32_t a0,a1,a2,a3; \
UNPACK(a0,a1,LPC(src1),LPC(src2)); \
UNPACK(a2,a3,LPC(src3),LPC(src4)); \
OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
UNPACK(a0,a1,LPC(src1+4),LPC(src2+4)); \
UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \
src1+=src_stride1;\
src2+=src_stride2;\
src3+=src_stride3;\
src4+=src_stride4;\
dst+=dst_stride;\
} while(--h); \
} \
\
static inline void OPNAME ## _no_rnd_pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
do { \
uint32_t a0,a1,a2,a3; \
UNPACK(a0,a1,LPC(src1),LPC(src2)); \
UNPACK(a2,a3,LPC(src3),LPC(src4)); \
OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
UNPACK(a0,a1,LPC(src1+4),LPC(src2+4)); \
UNPACK(a2,a3,LPC(src3+4),LPC(src4+4)); \
OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
src1+=src_stride1;\
src2+=src_stride2;\
src3+=src_stride3;\
src4+=src_stride4;\
dst+=dst_stride;\
} while(--h); \
} \
\
/* Variant of the 8-wide four-source combine in which only src1 may be unaligned: \
 * src1 is fetched with AV_RN32(), the other three sources with LPC(). \
 * Packs with rnd_PACK() and stores via OP(); processes h rows (h >= 1). */ \
static inline void OPNAME ## _pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    do { \
        int off; \
        /* two words per row: byte offsets 0 and 4 */ \
        for (off = 0; off < 8; off += 4) { \
            uint32_t a0, a1, a2, a3; \
            UNPACK(a0, a1, AV_RN32(src1 + off), LPC(src2 + off)); \
            UNPACK(a2, a3, LPC(src3 + off), LPC(src4 + off)); \
            OP(LP(dst + off), rnd_PACK(a0, a1, a2, a3)); \
        } \
        dst  += dst_stride; \
        src1 += src_stride1; \
        src2 += src_stride2; \
        src3 += src_stride3; \
        src4 += src_stride4; \
    } while (--h); \
} \
\
/* no_rnd variant of the 8-wide four-source combine with src1 fetched through AV_RN32() \
 * and the other three sources through LPC(). Packs with no_rnd_PACK(), stores via OP(); \
 * processes h rows (h >= 1). */ \
static inline void OPNAME ## _no_rnd_pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    do { \
        int off; \
        /* two words per row: byte offsets 0 and 4 */ \
        for (off = 0; off < 8; off += 4) { \
            uint32_t a0, a1, a2, a3; \
            UNPACK(a0, a1, AV_RN32(src1 + off), LPC(src2 + off)); \
            UNPACK(a2, a3, LPC(src3 + off), LPC(src4 + off)); \
            OP(LP(dst + off), no_rnd_PACK(a0, a1, a2, a3)); \
        } \
        dst  += dst_stride; \
        src1 += src_stride1; \
        src2 += src_stride2; \
        src3 += src_stride3; \
        src4 += src_stride4; \
    } while (--h); \
} \
\
/* Combines four 16-byte-wide sources into dst, h rows (h >= 1), one 32-bit word at a time. \
 * All four sources are read through LPC(); the result is packed with rnd_PACK() and stored via OP(). \
 * Fix: the word built from source offset 4 used to be stored at dst+8, which left dst[4..7] \
 * unwritten and applied OP twice to dst[8..11]. Each word now goes to the destination offset \
 * matching its source offset, as the no_rnd sibling already does. */ \
static inline void OPNAME ## _pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    do { \
        int off; \
        /* four words per row: byte offsets 0, 4, 8 and 12 */ \
        for (off = 0; off < 16; off += 4) { \
            uint32_t a0, a1, a2, a3; \
            UNPACK(a0, a1, LPC(src1 + off), LPC(src2 + off)); \
            UNPACK(a2, a3, LPC(src3 + off), LPC(src4 + off)); \
            OP(LP(dst + off), rnd_PACK(a0, a1, a2, a3)); \
        } \
        src1 += src_stride1; \
        src2 += src_stride2; \
        src3 += src_stride3; \
        src4 += src_stride4; \
        dst  += dst_stride; \
    } while (--h); \
} \
\
/* Combines four 16-byte-wide sources into dst, h rows (h >= 1), one 32-bit word at a time. \
 * All four sources are read through LPC(); the result is packed with no_rnd_PACK() and stored via OP(). */ \
static inline void OPNAME ## _no_rnd_pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    do { \
        int off; \
        /* four words per row: byte offsets 0, 4, 8 and 12 */ \
        for (off = 0; off < 16; off += 4) { \
            uint32_t a0, a1, a2, a3; \
            UNPACK(a0, a1, LPC(src1 + off), LPC(src2 + off)); \
            UNPACK(a2, a3, LPC(src3 + off), LPC(src4 + off)); \
            OP(LP(dst + off), no_rnd_PACK(a0, a1, a2, a3)); \
        } \
        dst  += dst_stride; \
        src1 += src_stride1; \
        src2 += src_stride2; \
        src3 += src_stride3; \
        src4 += src_stride4; \
    } while (--h); \
} \
\
/* Variant of the 16-wide four-source combine in which src1 is unaligned: \
 * src1 is fetched with AV_RN32(), the other three sources with LPC(). \
 * Packs with rnd_PACK() and stores via OP(); processes h rows (h >= 1). \
 * Fix: the word built from source offset 4 used to be stored at dst+8, which left dst[4..7] \
 * unwritten and applied OP twice to dst[8..11]. Each word now goes to the destination offset \
 * matching its source offset, as the no_rnd sibling already does. */ \
static inline void OPNAME ## _pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    do { \
        int off; \
        /* four words per row: byte offsets 0, 4, 8 and 12 */ \
        for (off = 0; off < 16; off += 4) { \
            uint32_t a0, a1, a2, a3; \
            UNPACK(a0, a1, AV_RN32(src1 + off), LPC(src2 + off)); \
            UNPACK(a2, a3, LPC(src3 + off), LPC(src4 + off)); \
            OP(LP(dst + off), rnd_PACK(a0, a1, a2, a3)); \
        } \
        src1 += src_stride1; \
        src2 += src_stride2; \
        src3 += src_stride3; \
        src4 += src_stride4; \
        dst  += dst_stride; \
    } while (--h); \
} \
\
/* no_rnd variant of the 16-wide four-source combine with src1 fetched through AV_RN32() \
 * and the other three sources through LPC(). Packs with no_rnd_PACK(), stores via OP(); \
 * processes h rows (h >= 1). */ \
static inline void OPNAME ## _no_rnd_pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    do { \
        int off; \
        /* four words per row: byte offsets 0, 4, 8 and 12 */ \
        for (off = 0; off < 16; off += 4) { \
            uint32_t a0, a1, a2, a3; \
            UNPACK(a0, a1, AV_RN32(src1 + off), LPC(src2 + off)); \
            UNPACK(a2, a3, LPC(src3 + off), LPC(src4 + off)); \
            OP(LP(dst + off), no_rnd_PACK(a0, a1, a2, a3)); \
        } \
        dst  += dst_stride; \
        src1 += src_stride1; \
        src2 += src_stride2; \
        src3 += src_stride3; \
        src4 += src_stride4; \
    } while (--h); \
} \
\
/* Instantiate the PIXOP2 helper family twice, once per store operator:
 *   op_avg(a, b) stores rnd_avg32(a, b) back into a,
 *   op_put(a, b) overwrites a with b.
 * Both operators are file-local to this instantiation and are undefined again
 * right after use. */
#define op_avg(a, b) a = rnd_avg32(a,b)
#define op_put(a, b) a = b
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
/* Rounded integer averages of two / four values.
 * Every argument is parenthesized so that an expression argument containing a
 * lower-precedence operator (e.g. `x | y` or `c ? p : q`) is averaged as a
 * whole instead of being split by the surrounding `+`.
 * Arguments are still evaluated exactly once each. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/*
 * Bilinear interpolation of an 8-pixel-wide block.
 *
 * dst     - output, 8 bytes written per row
 * src     - input; each output row reads 9 bytes from the current source row
 *           and 9 bytes from the row below it
 * stride  - row pitch in bytes, shared by dst and src
 * h       - number of rows; the loop is do/while, so h must be at least 1
 * x16,y16 - fractional offsets used to build the four weights below
 * rounder - value added to the weighted sum before the final >> 8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* The four weights always add up to 16*16 = 256, hence the shift by 8. */
    const int w_top_left  = (16 - x16) * (16 - y16);
    const int w_top_right = x16 * (16 - y16);
    const int w_bot_left  = (16 - x16) * y16;
    const int w_bot_right = x16 * y16;

    do {
        const uint8_t *top = src;
        const uint8_t *bot = src + stride;
        /* left-hand samples are carried over from the previous column */
        int top_left = top[0];
        int bot_left = bot[0];
        int col;

        for (col = 0; col < 8; col++) {
            const int top_right = top[col + 1];
            const int bot_right = bot[col + 1];

            dst[col] = (w_top_left  * top_left  + w_top_right * top_right +
                        w_bot_left  * bot_left  + w_bot_right * bot_right +
                        rounder) >> 8;

            top_left = top_right;
            bot_left = bot_right;
        }

        dst += stride;
        src += stride;
    } while (--h);
}
/*
 * QPEL_MC(r, OPNAME, RND, OP) - generates one family of mpeg4_qpel low-pass
 * filters plus the qpel8/qpel16 mcXY_sh4 entry points built on top of them.
 *
 * Macro parameters:
 *   r      - not referenced anywhere in the macro body.
 *   OPNAME - prefix pasted onto every generated function name and onto the
 *            final pixels*_l2_aligned* / pixels*_c call of each wrapper.
 *   RND    - infix pasted after "put" for the intermediate passes
 *            (put ## RND ## mpeg4_qpel..., put ## RND ## pixels..._l2_...).
 *   OP     - store operator, used as OP(destination, filter_sum). It receives
 *            the raw, unnormalised sum; scaling and clipping are left to OP.
 *
 * Generated low-pass filters:
 *   OPNAME mpeg4_qpel{8,16}_h_lowpass(dst, src, dstStride, srcStride, h)
 *       Filters h rows. Each row reads 9 (17) consecutive source bytes and
 *       writes 8 (16) outputs.
 *   OPNAME mpeg4_qpel{8,16}_v_lowpass(dst, src, dstStride, srcStride)
 *       Filters 8 (16) columns. Each column reads 9 (17) source rows and
 *       writes 8 (16) output rows.
 *   Each output n is  20*(s[n]+s[n+1]) - 6*(s[n-1]+s[n+2])
 *                    + 3*(s[n-2]+s[n+3]) -   (s[n-3]+s[n+4]);
 *   near both ends the out-of-range taps are replaced by reflected in-range
 *   samples, which is why the first and last few expressions repeat indices.
 *   `cm` is not used by the bodies directly; it is declared for the benefit
 *   of OP (the op_* macros defined after this macro index cm[]).
 *   All loops are do { } while (--count), so row/column counts must be >= 1.
 *
 * Generated entry points, OPNAME qpel{8,16}_mcXY_sh4(dst, src, stride):
 *   They combine the filters above with the pixels*_l2_aligned* helpers using
 *   stack scratch buffers:
 *     full   - copy of the source made by copy_block9/copy_block17 with a
 *              row pitch of 16 (24) bytes,
 *     halfH  - horizontally filtered rows, 9 (17) rows of 8 (16) bytes,
 *     half / halfHV - 8x8 (16x16) filtered block.
 *
 * NOTE(review): the scratch buffers are plain uint8_t arrays without any
 * alignment qualifier, yet they are handed to *_aligned helpers - confirm
 * that those helpers' alignment expectations are actually met.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
do {\
uint8_t *s = src; \
int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
src0= *s++;\
src1= *s++;\
src2= *s++;\
src3= *s++;\
src4= *s++;\
OP(dst[0], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
src5= *s++;\
OP(dst[1], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
src6= *s++;\
OP(dst[2], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
src7= *s++;\
OP(dst[3], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
src8= *s++;\
OP(dst[4], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
OP(dst[5], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
OP(dst[6], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
OP(dst[7], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
dst+=dstStride;\
src+=srcStride;\
}while(--h);\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
int w=8;\
do{\
uint8_t *s = src, *d=dst;\
int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
src0 = *s; s+=srcStride; \
src1 = *s; s+=srcStride; \
src2 = *s; s+=srcStride; \
src3 = *s; s+=srcStride; \
src4 = *s; s+=srcStride; \
OP(*d, (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));d+=dstStride;\
src5 = *s; s+=srcStride; \
OP(*d, (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));d+=dstStride;\
src6 = *s; s+=srcStride; \
OP(*d, (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));d+=dstStride;\
src7 = *s; s+=srcStride; \
OP(*d, (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));d+=dstStride;\
src8 = *s; \
OP(*d, (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));d+=dstStride;\
OP(*d, (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));d+=dstStride;\
OP(*d, (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));d+=dstStride;\
OP(*d, (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
dst++;\
src++;\
}while(--w);\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
do {\
uint8_t *s = src;\
int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
int src9,src10,src11,src12,src13,src14,src15,src16;\
src0= *s++;\
src1= *s++;\
src2= *s++;\
src3= *s++;\
src4= *s++;\
OP(dst[ 0], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
src5= *s++;\
OP(dst[ 1], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
src6= *s++;\
OP(dst[ 2], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
src7= *s++;\
OP(dst[ 3], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
src8= *s++;\
OP(dst[ 4], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
src9= *s++;\
OP(dst[ 5], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
src10= *s++;\
OP(dst[ 6], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
src11= *s++;\
OP(dst[ 7], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
src12= *s++;\
OP(dst[ 8], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
src13= *s++;\
OP(dst[ 9], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
src14= *s++;\
OP(dst[10], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
src15= *s++;\
OP(dst[11], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
src16= *s++;\
OP(dst[12], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
OP(dst[13], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
OP(dst[14], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
OP(dst[15], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
dst+=dstStride;\
src+=srcStride;\
}while(--h);\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
int w=16;\
do {\
uint8_t *s = src, *d=dst;\
int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
int src9,src10,src11,src12,src13,src14,src15,src16;\
src0 = *s; s+=srcStride; \
src1 = *s; s+=srcStride; \
src2 = *s; s+=srcStride; \
src3 = *s; s+=srcStride; \
src4 = *s; s+=srcStride; \
OP(*d, (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));d+=dstStride;\
src5 = *s; s+=srcStride; \
OP(*d, (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));d+=dstStride;\
src6 = *s; s+=srcStride; \
OP(*d, (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));d+=dstStride;\
src7 = *s; s+=srcStride; \
OP(*d, (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));d+=dstStride;\
src8 = *s; s+=srcStride; \
OP(*d, (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));d+=dstStride;\
src9 = *s; s+=srcStride; \
OP(*d, (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));d+=dstStride;\
src10 = *s; s+=srcStride; \
OP(*d, (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));d+=dstStride;\
src11 = *s; s+=srcStride; \
OP(*d, (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));d+=dstStride;\
src12 = *s; s+=srcStride; \
OP(*d, (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));d+=dstStride;\
src13 = *s; s+=srcStride; \
OP(*d, (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));d+=dstStride;\
src14 = *s; s+=srcStride; \
OP(*d, (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));d+=dstStride;\
src15 = *s; s+=srcStride; \
OP(*d, (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));d+=dstStride;\
src16 = *s; \
OP(*d, (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));d+=dstStride;\
OP(*d, (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));d+=dstStride;\
OP(*d, (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));d+=dstStride;\
OP(*d, (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
dst++;\
src++;\
}while(--w);\
}\
\
static void OPNAME ## qpel8_mc00_sh4 (uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t half[64];\
put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
OPNAME ## pixels8_l2_aligned2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_sh4(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t half[64];\
put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
OPNAME ## pixels8_l2_aligned2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t half[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
OPNAME ## pixels8_l2_aligned(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
copy_block9(full, src, 16, stride, 9);\
OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t half[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
OPNAME ## pixels8_l2_aligned(dst, full+16, half, stride, 16, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfH[72];\
uint8_t halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfH[72];\
uint8_t halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## pixels8_l2_aligned1(halfH, halfH, full+1, 8, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfH[72];\
uint8_t halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfH[72];\
uint8_t halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## pixels8_l2_aligned1(halfH, halfH, full+1, 8, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t halfH[72];\
uint8_t halfHV[64];\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t halfH[72];\
uint8_t halfHV[64];\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfH[72];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9);\
OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc32_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfH[72];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## pixels8_l2_aligned1(halfH, halfH, full+1, 8, 8, 16, 9);\
OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t halfH[72];\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_sh4 (uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t half[256];\
put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
OPNAME ## pixels16_l2_aligned2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_sh4(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t half[256];\
put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
OPNAME ## pixels16_l2_aligned2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t half[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
OPNAME ## pixels16_l2_aligned(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
copy_block17(full, src, 24, stride, 17);\
OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t half[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
OPNAME ## pixels16_l2_aligned(dst, full+24, half, stride, 24, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## pixels16_l2_aligned1(halfH, halfH, full+1, 16, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## pixels16_l2_aligned1(halfH, halfH, full+1, 16, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t halfH[272];\
uint8_t halfHV[256];\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t halfH[272];\
uint8_t halfHV[256];\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17);\
OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc32_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## pixels16_l2_aligned1(halfH, halfH, full+1, 16, 16, 24, 17);\
OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_sh4(uint8_t *dst, uint8_t *src, int stride){\
uint8_t halfH[272];\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
/* Store operators passed to QPEL_MC as OP(a, b); `b` is the raw filter sum.
 * The filter weights (20+20-6-6+3+3-1-1) total 32, so the sum is biased and
 * shifted right by 5 before being looked up in cm[], a local that the
 * expanding function must have in scope:
 *   op_put         a = cm[(b + 16) >> 5]
 *   op_put_no_rnd  a = cm[(b + 15) >> 5]
 *   op_avg         as op_put, then averaged with the old a using a +1 bias
 *   op_avg_no_rnd  as op_put_no_rnd, then averaged with the old a, no bias
 * Three families are instantiated (put_, put_no_rnd_, avg_); the avg_no_rnd
 * instantiation is commented out. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
QPEL_MC(0, put_ , _ , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_ , _ , op_avg)
//QPEL_MC(1, avg_no_rnd , _ , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
/*
 * Horizontal 4-tap filter over an 8-pixel-wide block.
 *
 * Output x of each row is cm[(9*(s[x] + s[x+1]) - (s[x-1] + s[x+2]) + 8) >> 4],
 * so every row reads the 11 source bytes s[-1] .. s[9] and writes 8 bytes.
 * h rows are processed; the loop is do/while, so h must be at least 1.
 * Each source byte is loaded exactly once, and before the output that first
 * needs it is stored.
 */
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    do {
        /* three-sample history; the fourth tap is fetched inside the loop */
        int before = src[-1];
        int left   = src[0];
        int right  = src[1];
        int x;

        for (x = 0; x < 8; x++) {
            const int after = src[x + 2];

            dst[x] = cm[(9*(left + right) - (before + after) + 8)>>4];

            before = left;
            left   = right;
            right  = after;
        }

        dst += dstStride;
        src += srcStride;
    } while (--h);
}
/*
 * Vertical 4-tap filter producing 8 output rows for each of w columns.
 *
 * Output row y of a column is
 *   cm[(9*(s[y] + s[y+1]) - (s[y-1] + s[y+2]) + 8) >> 4]
 * with s[k] meaning the sample k rows below src, so every column reads the
 * 11 source rows -1 .. 9. The loop is do/while, so w must be at least 1.
 * Each source sample is loaded exactly once, and before the output that first
 * needs it is stored.
 */
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w)
{
    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    do {
        /* three-sample history; the fourth tap is fetched inside the loop */
        int above = src[-srcStride];
        int upper = src[0];
        int lower = src[srcStride];
        int y;

        for (y = 0; y < 8; y++) {
            const int below = src[(y + 2) * srcStride];

            dst[y * dstStride] = cm[(9*(upper + lower) - (above + below) + 8)>>4];

            above = upper;
            upper = lower;
            lower = below;
        }

        src++;
        dst++;
    } while (--w);
}
/* Forwards to put_pixels8_c() with a height of 8; no filtering is applied here. */
static void put_mspel8_mc00_sh4(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}
/* Filters 8 rows horizontally into a packed 8x8 scratch block, then combines
 * src with that block through put_pixels8_l2_aligned2() (defined earlier in
 * this file; presumably a rounded two-source average - confirm there). */
static void put_mspel8_mc10_sh4(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_aligned2(dst, src, filtered, stride, stride, 8, 8);
}
/* Writes the horizontally filtered 8x8 block straight into dst. */
static void put_mspel8_mc20_sh4(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
/* Filters 8 rows horizontally into a packed 8x8 scratch block, then combines
 * the source shifted one pixel to the right (src + 1) with that block through
 * put_pixels8_l2_aligned2() (defined earlier in this file; presumably a
 * rounded two-source average - confirm there). */
static void put_mspel8_mc30_sh4(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_aligned2(dst, src + 1, filtered, stride, stride, 8, 8);
}
/* Writes the vertically filtered block (8 columns) straight into dst. */
static void put_mspel8_mc02_sh4(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
/* Builds two 8x8 intermediates - a vertically filtered copy of src and a
 * horizontally-then-vertically filtered copy - and combines them into dst
 * through put_pixels8_l2_aligned() (defined earlier in this file).
 * NOTE(review): the scratch arrays carry no alignment qualifier although they
 * are passed to an *_aligned helper - confirm its requirements are met. */
static void put_mspel8_mc12_sh4(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows_h[8 * 11];  /* 11 horizontally filtered rows, first one taken from the row above src */
    uint8_t block_v[8 * 8];
    uint8_t block_hv[8 * 8];

    wmv2_mspel8_h_lowpass(rows_h, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(block_v, src, 8, stride, 8);
    /* start at scratch row 1: the vertical filter also reads one row above its base */
    wmv2_mspel8_v_lowpass(block_hv, rows_h + 8, 8, 8, 8);
    put_pixels8_l2_aligned(dst, block_v, block_hv, stride, 8, 8, 8);
}
/* Builds two 8x8 intermediates - a vertically filtered copy of the source
 * shifted one pixel to the right (src + 1) and a horizontally-then-vertically
 * filtered copy of src - and combines them into dst through
 * put_pixels8_l2_aligned() (defined earlier in this file).
 * NOTE(review): the scratch arrays carry no alignment qualifier although they
 * are passed to an *_aligned helper - confirm its requirements are met. */
static void put_mspel8_mc32_sh4(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows_h[8 * 11];  /* 11 horizontally filtered rows, first one taken from the row above src */
    uint8_t block_v[8 * 8];
    uint8_t block_hv[8 * 8];

    wmv2_mspel8_h_lowpass(rows_h, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(block_v, src + 1, 8, stride, 8);
    /* start at scratch row 1: the vertical filter also reads one row above its base */
    wmv2_mspel8_v_lowpass(block_hv, rows_h + 8, 8, 8, 8);
    put_pixels8_l2_aligned(dst, block_v, block_hv, stride, 8, 8, 8);
}
/* Horizontal pass into an 11-row scratch buffer (starting one row above src),
 * followed by a vertical pass over that buffer written directly to dst. */
static void put_mspel8_mc22_sh4(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows_h[8 * 11];

    wmv2_mspel8_h_lowpass(rows_h, src - stride, 8, stride, 11);
    /* start at scratch row 1: the vertical filter also reads one row above its base */
    wmv2_mspel8_v_lowpass(dst, rows_h + 8, stride, 8, 8);
}
Loading…
Cancel
Save