Merge commit '880e2aa23645ed9871c66ee1cbd00f93c72d2d73'

* commit '880e2aa23645ed9871c66ee1cbd00f93c72d2d73':
  Remove all Blackfin architecture optimizations

Conflicts:
	libavcodec/bfin/dsputil.S
	libavcodec/bfin/dsputil_init.c
	libavcodec/bfin/fdct_bfin.S
	libavcodec/bfin/hpel_pixels_no_rnd.S
	libavcodec/bfin/hpeldsp_init.c
	libavcodec/bfin/idct_bfin.S
	libavcodec/bfin/mathops.h
	libavcodec/bfin/pixels.S
	libavcodec/bfin/pixels.h
	libavcodec/bfin/vp3dsp.S
	libavcodec/bfin/vp3dsp_init.c
	libavutil/bfin/asm.h
	libavutil/bfin/attributes.h
	libswscale/bfin/internal_bfin.S
	libswscale/bfin/swscale_bfin.c
	libswscale/bfin/yuv2rgb_bfin.c
	libswscale/swscale_internal.h
	libswscale/version.h

If someone wants to maintain blackfin support in FFmpeg, please contact
ffmpeg-devel@ffmpeg.org

Merged-by: Michael Niedermayer <michaelni@gmx.at>
pull/76/merge
Michael Niedermayer 11 years ago
commit b55d3bbeed
  1. 10
      libavcodec/bfin/Makefile
  2. 382
      libavcodec/bfin/dsputil.S
  3. 195
      libavcodec/bfin/dsputil_init.c
  4. 327
      libavcodec/bfin/fdct_bfin.S
  5. 81
      libavcodec/bfin/hpel_pixels_no_rnd.S
  6. 146
      libavcodec/bfin/hpeldsp_init.c
  7. 297
      libavcodec/bfin/idct_bfin.S
  8. 44
      libavcodec/bfin/mathops.h
  9. 207
      libavcodec/bfin/pixels.S
  10. 42
      libavcodec/bfin/pixels.h
  11. 273
      libavcodec/bfin/vp3dsp.S
  12. 66
      libavcodec/bfin/vp3dsp_init.c
  13. 12
      libavcodec/dct-test.c
  14. 2
      libavcodec/dsputil.c
  15. 2
      libavcodec/dsputil.h
  16. 2
      libavcodec/hpeldsp.c
  17. 1
      libavcodec/hpeldsp.h
  18. 2
      libavcodec/vp3dsp.c
  19. 1
      libavcodec/vp3dsp.h
  20. 54
      libavutil/bfin/asm.h
  21. 34
      libavutil/bfin/attributes.h
  22. 3
      libswscale/bfin/Makefile
  23. 599
      libswscale/bfin/internal_bfin.S
  24. 84
      libswscale/bfin/swscale_bfin.c
  25. 198
      libswscale/bfin/yuv2rgb_bfin.c
  26. 2
      libswscale/swscale.h
  27. 16
      libswscale/swscale_internal.h
  28. 2
      libswscale/swscale_unscaled.c
  29. 3
      libswscale/version.h
  30. 2
      libswscale/yuv2rgb.c

@ -1,10 +0,0 @@
OBJS-$(CONFIG_DSPUTIL) += bfin/dsputil_init.o \
bfin/dsputil.o \
bfin/fdct_bfin.o \
bfin/idct_bfin.o \
bfin/pixels.o
OBJS-$(CONFIG_HPELDSP) += bfin/hpeldsp_init.o \
bfin/hpel_pixels_no_rnd.o \
bfin/pixels.o
OBJS-$(CONFIG_VP3DSP) += bfin/vp3dsp_init.o \
bfin/vp3dsp.o

@ -1,382 +0,0 @@
/*
* Blackfin Pixel Operations
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/bfin/asm.h"
DEFUN(put_pixels_clamped,mL1,
(int16_t *block, uint8_t *dest, int line_size)):
[--SP] = (R7:4);
R4 = 0;
R5.l = 0x00ff;
R5.h = 0x00ff;
I0 = R0; // block
I1 = R1; // dest
R2 += -4; // line_size
M1 = R2;
P0 = 8;
R0 = [I0++];
R1 = [I0++];
R2 = MAX(R0, R4) (V);
LSETUP (ppc$0,ppc$1) LC0=P0;
ppc$0: R2 = MIN(R2, R5) (V);
R3 = MAX(R1, R4) (V);
R3 = MIN(R3, R5) (V) || R0 = [I0++];
R6 = BYTEPACK (R2,R3) || R1 = [I0++];
R2 = MAX(R0, R4) (V) || [I1++] = R6;
R2 = MIN(R2, R5) (V);
R3 = MAX(R1, R4) (V);
R3 = MIN(R3, R5) (V) || R0 = [I0++];
R6 = BYTEPACK (R2,R3) || R1 = [I0++];
ppc$1: R2 = Max(R0, R4) (V) || [I1++M1] = R6;
(R7:4) = [SP++];
RTS;
DEFUN_END(put_pixels_clamped)
DEFUN(add_pixels_clamped,mL1,
(int16_t *block, uint8_t *dest, int line_size)):
[-- SP] = (R7:4);
R4 = 0;
I0 = 0;
R2 += -4; // line_size
M0 = R2;
I1 = R1; // dest
I3 = R0; // block
I2 = R1; // dest
P0 = 8;
M3 = 2;
R0 = [I3++] || R2 = [I1];
R2 = R2 << 8 || R0.H = W[I3--] || R3 = [I1++];
R3 = R3 >> 8 || R1.L = W[I3] || I3 += 4;
R6 = BYTEOP3P(R1:0, R3:2) (LO) || R1.H = W[I3++] || R2 = [I1];
LSETUP(apc$2,apc$3) LC1 = P0;
apc$2: R7 = BYTEOP3P(R1:0, R3:2) (HI, R) || R0 = [I3++] || R3 = [I1++M0];
R2 = R2 << 8 || R0.H = W[I3--];
R3 = R3 >> 8 || R1.L = W[I3] || I3 += 4;
R6 = R6 + R7 (S) || R1.H = W[I3];
R6 = BYTEOP3P(R1:0, R3:2) (LO) || I3+=M3 || [I2++]=R6;
R7 = BYTEOP3P(R1:0, R3:2) (HI, R) || R0 = [I3++] || R2 = [I1];
R2 = R2 << 8 || R0.H = W[I3--] || R3 = [I1++];
R3 = R3 >> 8 || R1.L = W[I3] || I3 += 4;
R6 = R6 + R7 (S) || R1.H = W[I3++];
apc$3: R6 = BYTEOP3P(R1:0, R3:2) (LO) || [I2++M0] = R6 || R2 = [I1];
(R7:4) = [SP++];
RTS;
DEFUN_END(add_pixels_clamped)
DEFUN(diff_pixels,mL1,
(int16_t *block, uint8_t *s1, uint8_t *s2, int stride)):
link 0;
[--sp] = (r7:4);
p0=8;
i3=r0; // block
i0=r1; // s1
i1=r2; // s2
r2=[fp+20]; // stride
r2+=-8;
m0=r2;
LSETUP(.LS0,.LE0) LC0=P0;
DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
.LS0: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];
(R5,R4) = BYTEOP16M (R1:0,R3:2) || R0 = [I0++M0] || R2 = [I1++M0];
(R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++] || [I3++] = R4;
DISALGNEXCPT || R2 = [I1++] || [I3++] = R5;
[i3++]=r6;
.LE0: [i3++]=r7;
(r7:4) = [sp++];
unlink;
rts;
DEFUN_END(diff_pixels)
/*
for (i = 0; i < 16; i++) {
for (j = 0; j < 16; j++) {
sum += pix[j];
}
pix += line_size;
}
*/
DEFUN(pix_sum,mL1,
(uint8_t *p, int stride)):
link 0;
[--sp] = (r7:4);
p0=8;
i0=r0; // s1
i1=r0;
m1=r1;
r1=r1+r1;
r1+=-16; // stride
m0=r1;
i1+=m1;
r6=0;
LSETUP(LS$PS,LE$PS) LC0=P0;
DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
LS$PS: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];
(R5,R4) = BYTEOP16P (R3:2,R1:0) || R0 = [I0++] || R2 = [I1++];
r6=r6+|+r5;
r6=r6+|+r4;
(R5,R4) = BYTEOP16P (R3:2,R1:0) (R)|| R1 = [I0++] || R3 = [I1++];
r6=r6+|+r5;
r6=r6+|+r4;
(R5,R4) = BYTEOP16P (R3:2,R1:0) || R0 = [I0++m0] || R2 = [I1++m0];
r6=r6+|+r5;
r6=r6+|+r4;
(R5,R4) = BYTEOP16P (R3:2,R1:0) (R)|| R0 = [I0++] || R2 = [I1++];
r6=r6+|+r5;
LE$PS: r6=r6+|+r4;
r0.l=r6.l+r6.h;
r0.h=0;
(r7:4) = [sp++];
unlink;
rts;
DEFUN_END(pix_sum)
DEFUN(get_pixels,mL1,
(int16_t *av_restrict block, const uint8_t *pixels, int line_size)):
[--sp] = (r7:4);
i3=r0; // dest
i0=r1; // src0
p0=8;
r2+=-8;
m0=r2;
LSETUP(gp8$0,gp8$1) LC0=P0;
DISALGNEXCPT || R0 = [I0++];
DISALGNEXCPT || R1 = [I0++];
gp8$0: (R7,R6) = byteunpack R1:0 || R0 = [I0++M0];
(R5,R4) = byteunpack R1:0 (R) || R0 = [I0++] || [I3++]=R6;
DISALGNEXCPT || R1 = [I0++] || [I3++]=R7;
[I3++]=R4;
gp8$1: [I3++]=R5
(r7:4) = [sp++];
RTS;
DEFUN_END(get_pixels)
/* sad = sad16x16 (ubyte *mb, ubyte *refwin, srcwidth, refwinwidth, h) */
/* 91 cycles */
DEFUN(z_sad16x16,mL1,
(uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h)):
link 0;
I0 = R0;
I1 = R1;
A1 = A0 = 0;
R0 = [sp+20]; // rwidth
P2 = [sp+24]; // height
R3 = 16;
R0 = R0 - R3;
R3 = R2 - R3;
M1 = R0;
M0 = R3;
DISALGNEXCPT || R0 = [I0++] || R2 = [I1++];
LSETUP (s$16, e$16) LC0=P2;
s$16: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];
SAA (R1:0,R3:2) || R0 = [I0++] || R2 = [I1++];
SAA (R1:0,R3:2) (R) || R1 = [I0++] || R3 = [I1++];
SAA (R1:0,R3:2) || R0 = [I0++M0] || R2 = [I1++M1];
e$16: SAA (R1:0,R3:2) (R) || R0 = [I0++] || R2 = [I1++];
R3=A1.L+A1.H, R2=A0.L+A0.H ;
R0 = R2 + R3 ;
unlink;
RTS;
DEFUN_END(z_sad16x16)
/* sad = sad8x8 (ubyte *mb, ubyte *refwin, int srcwidth, int refwinwidth, int h) */
/* 36 cycles */
DEFUN(z_sad8x8,mL1,
(uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h)):
I0 = R0;
I1 = R1;
A1 = A0 = 0;
r0 = [sp+12]; // rwidth
P2 = [sp+16]; //height
R3 = 8;
R0 = R0 - R3;
R3 = R2 - R3;
M0 = R3;
M1 = R0;
LSETUP (s$8, e$8) LC0=P2;
DISALGNEXCPT || R0 = [I0++] || R2 = [I1++];
DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];
s$8: SAA (R1:0,R3:2) || R0 = [I0++M0] || R2 = [I1++M1];
SAA (R1:0,R3:2) (R) || R0 = [I0++] || R2 = [I1++];
e$8: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];
R3=A1.L+A1.H, R2=A0.L+A0.H ;
R0 = R2 + R3 ;
RTS;
DEFUN_END(z_sad8x8)
DEFUN(pix_norm1,mL1,
(uint8_t * pix, int line_size)):
[--SP]=(R7:4,P5:3);
// Fetch the input arguments.
P1 = R0; // pix
P0 = R1; // line_size
P5 = 16; // loop ctr.
P0 -= P5;
M0 = P0; // M0 = line_size-16;
// Now for the real work.
A1 = A0 = 0;
lsetup(_pix_norm1_blkfn_loopStart, _pix_norm1_blkfn_loopEnd) LC1 = P5;
I0 = P1;
DISALGNEXCPT || r0 = [i0++];
_pix_norm1_blkfn_loopStart:
// following unpacks pix1[0..15] pix1+line_size[0..15]
DISALGNEXCPT || r1 = [i0++];
(r5, r4) = byteunpack r1:0 || r0 = [i0++];
a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
(r5, r4) = byteunpack r1:0(r) || r1 = [i0++];
a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
(r5, r4) = byteunpack r1:0 || r0 = [i0++M0];
a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
(r5, r4) = byteunpack r1:0(r) || r0 = [i0++];
a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
_pix_norm1_blkfn_loopEnd:
a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
// Clean up at the end:
R2 = A0, R3 = A1;
R0 = R2 + R3 (S);
(R7:4,P5:3)=[SP++];
RTS;
DEFUN_END(pix_norm1)
DEFUN(sse4,mL1,
(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):
link 0;
[--sp] = (r7:6);
p0=[fp+24]; // h
i0=r1; // pix1
i1=r2; // pix2
r2=[fp+20]; // line_size
r2+=-4;
m0=r2;
a0=a1=0;
LSETUP(.S40,.E40) LC0=P0;
DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
.S40: DISALGNEXCPT || R1 = [I0++M0] || R3 = [I1++M0];
(R7,R6) = BYTEOP16M (R1:0,R3:2);
a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
.E40: a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
a0 += a1;
r0 = a0;
(r7:6) = [sp++];
unlink;
rts;
DEFUN_END(sse4)
DEFUN(sse8,mL1,
(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):
link 0;
[--sp] = (r7:6);
p0=[fp+24]; // h
i0=r1; // pix1
i1=r2; // pix2
r2=[fp+20]; // line_size
r2+=-8;
m0=r2;
a0=a1=0;
LSETUP(.S80,.E80) LC0=P0;
DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
.S80: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];
(R7,R6) = BYTEOP16M (R1:0,R3:2) || R0 = [I0++M0] || R2 = [I1++M0];
a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
(R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++] || R2 = [I1++];
a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
.E80: a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
a0 += a1;
r0 = a0;
(r7:6) = [sp++];
unlink;
rts;
DEFUN_END(sse8)
DEFUN(sse16,mL1,
(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):
link 0;
[--sp] = (r7:6);
p0=[fp+24]; // h
i0=r1; // pix1
i1=r2; // pix2
r2=[fp+20]; // line_size
r2+=-16;
m0=r2;
a0=a1=0;
DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
LSETUP(.S160,.E160) LC0=P0;
.S160: DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];
(R7,R6) = BYTEOP16M (R1:0,R3:2) || R0 = [I0++] || R2 = [I1++];
a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
(R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R1 = [I0++] || R3 = [I1++];
a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
(R7,R6) = BYTEOP16M (R1:0,R3:2) || R0 = [I0++M0] || R2 = [I1++M0];
a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
(R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++] || R2 = [I1++];
a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
.E160: a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
a0 += a1;
r0 = a0;
(r7:6) = [sp++];
unlink;
rts;
DEFUN_END(sse16)

@ -1,195 +0,0 @@
/*
* BlackFin DSPUTILS
*
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
* Copyright (c) 2006 Michael Benjamin <michael.benjamin@analog.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/bfin/attributes.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
void ff_bfin_idct(int16_t *block) attribute_l1_text;
void ff_bfin_fdct(int16_t *block) attribute_l1_text;
void ff_bfin_add_pixels_clamped(const int16_t *block, uint8_t *dest,
int line_size) attribute_l1_text;
void ff_bfin_put_pixels_clamped(const int16_t *block, uint8_t *dest,
int line_size) attribute_l1_text;
void ff_bfin_diff_pixels(int16_t *block, const uint8_t *s1, const uint8_t *s2,
int stride) attribute_l1_text;
void ff_bfin_get_pixels(int16_t *restrict block, const uint8_t *pixels,
int line_size) attribute_l1_text;
int ff_bfin_pix_norm1(uint8_t *pix, int line_size) attribute_l1_text;
int ff_bfin_pix_sum(uint8_t *p, int stride) attribute_l1_text;
int ff_bfin_z_sad8x8(uint8_t *blk1, uint8_t *blk2, int dsz,
int line_size, int h) attribute_l1_text;
int ff_bfin_z_sad16x16(uint8_t *blk1, uint8_t *blk2, int dsz,
int line_size, int h) attribute_l1_text;
int ff_bfin_sse4(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
int line_size, int h) attribute_l1_text;
int ff_bfin_sse8(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
int line_size, int h) attribute_l1_text;
int ff_bfin_sse16(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
int line_size, int h) attribute_l1_text;
static void bfin_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
ff_bfin_idct(block);
ff_bfin_add_pixels_clamped(block, dest, line_size);
}
static void bfin_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
ff_bfin_idct(block);
ff_bfin_put_pixels_clamped(block, dest, line_size);
}
static void bfin_clear_blocks(int16_t *blocks)
{
// This is just a simple memset.
__asm__ ("P0=192; "
"I0=%0; "
"R0=0; "
"LSETUP(clear_blocks_blkfn_lab,clear_blocks_blkfn_lab)LC0=P0;"
"clear_blocks_blkfn_lab:"
"[I0++]=R0;"
::"a" (blocks) : "P0", "I0", "R0");
}
static int bfin_pix_abs16(MpegEncContext *c, uint8_t *blk1, uint8_t *blk2,
int line_size, int h)
{
return ff_bfin_z_sad16x16(blk1, blk2, line_size, line_size, h);
}
static uint8_t vtmp_blk[256] attribute_l1_data_b;
static int bfin_pix_abs16_x2(MpegEncContext *c, uint8_t *blk1, uint8_t *blk2,
int line_size, int h)
{
ff_bfin_put_pixels16uc(vtmp_blk, blk2, blk2 + 1, 16, line_size, h);
return ff_bfin_z_sad16x16(blk1, vtmp_blk, line_size, 16, h);
}
static int bfin_pix_abs16_y2(MpegEncContext *c, uint8_t *blk1, uint8_t *blk2,
int line_size, int h)
{
ff_bfin_put_pixels16uc(vtmp_blk, blk2, blk2 + line_size, 16, line_size, h);
return ff_bfin_z_sad16x16(blk1, vtmp_blk, line_size, 16, h);
}
static int bfin_pix_abs16_xy2(MpegEncContext *c, uint8_t *blk1, uint8_t *blk2,
int line_size, int h)
{
ff_bfin_z_put_pixels16_xy2(vtmp_blk, blk2, 16, line_size, h);
return ff_bfin_z_sad16x16(blk1, vtmp_blk, line_size, 16, h);
}
static int bfin_pix_abs8(MpegEncContext *c, uint8_t *blk1, uint8_t *blk2,
int line_size, int h)
{
return ff_bfin_z_sad8x8(blk1, blk2, line_size, line_size, h);
}
static int bfin_pix_abs8_x2(MpegEncContext *c, uint8_t *blk1, uint8_t *blk2,
int line_size, int h)
{
ff_bfin_put_pixels8uc(vtmp_blk, blk2, blk2 + 1, 8, line_size, h);
return ff_bfin_z_sad8x8(blk1, vtmp_blk, line_size, 8, h);
}
static int bfin_pix_abs8_y2(MpegEncContext *c, uint8_t *blk1, uint8_t *blk2,
int line_size, int h)
{
ff_bfin_put_pixels8uc(vtmp_blk, blk2, blk2 + line_size, 8, line_size, h);
return ff_bfin_z_sad8x8(blk1, vtmp_blk, line_size, 8, h);
}
static int bfin_pix_abs8_xy2(MpegEncContext *c, uint8_t *blk1, uint8_t *blk2,
int line_size, int h)
{
ff_bfin_z_put_pixels8_xy2(vtmp_blk, blk2, 8, line_size, h);
return ff_bfin_z_sad8x8(blk1, vtmp_blk, line_size, 8, h);
}
/*
* decoder optimization
* start on 2/11 100 frames of 352x240@25 compiled with no optimization -g debugging
* 9.824s ~ 2.44x off
* 6.360s ~ 1.58x off with -O2
* 5.740s ~ 1.43x off with idcts
*
* 2.64s 2/20 same sman.mp4 decode only
*/
av_cold void ff_dsputil_init_bfin(DSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth)
{
c->diff_pixels = ff_bfin_diff_pixels;
c->put_pixels_clamped = ff_bfin_put_pixels_clamped;
c->add_pixels_clamped = ff_bfin_add_pixels_clamped;
c->clear_blocks = bfin_clear_blocks;
c->pix_sum = ff_bfin_pix_sum;
c->pix_norm1 = ff_bfin_pix_norm1;
c->sad[0] = bfin_pix_abs16;
c->sad[1] = bfin_pix_abs8;
/* TODO [0] 16 [1] 8 */
c->pix_abs[0][0] = bfin_pix_abs16;
c->pix_abs[0][1] = bfin_pix_abs16_x2;
c->pix_abs[0][2] = bfin_pix_abs16_y2;
c->pix_abs[0][3] = bfin_pix_abs16_xy2;
c->pix_abs[1][0] = bfin_pix_abs8;
c->pix_abs[1][1] = bfin_pix_abs8_x2;
c->pix_abs[1][2] = bfin_pix_abs8_y2;
c->pix_abs[1][3] = bfin_pix_abs8_xy2;
c->sse[0] = ff_bfin_sse16;
c->sse[1] = ff_bfin_sse8;
c->sse[2] = ff_bfin_sse4;
if (!high_bit_depth) {
c->get_pixels = ff_bfin_get_pixels;
if (avctx->dct_algo == FF_DCT_AUTO)
c->fdct = ff_bfin_fdct;
if (avctx->idct_algo == FF_IDCT_AUTO) {
c->idct_permutation_type = FF_NO_IDCT_PERM;
c->idct = ff_bfin_idct;
c->idct_add = bfin_idct_add;
c->idct_put = bfin_idct_put;
}
}
}

@ -1,327 +0,0 @@
/*
* fdct BlackFin
*
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
void ff_bfin_fdct (int16_t *buf);
This implementation works only for 8x8 input. The range of input
must be -256 to 255 i.e. 8bit input represented in a 16bit data
word. The original data must be sign extended into the 16bit data
words.
Chen factorization of
8
X(m) = sum (x(n) * cos ((2n+1)*m*pi/16))
n=0
C4
0 --*-------------*0+7---*-----*0+3-------*-*-------------------> 0
\ / \ / X S4,S4
1 --*-\---------/-*1+6---*-\-/-*1+2-------*-*-------------------> 4
\ / \ -C4 C3
2 --*---\-----/---*2+5---*-/-\-*1-2---------------*-*-----------> 2
\ / / \ X S3,-S3
3 --*-----\-/-----*3+4---*-----*0-3---------------*-*-----------> 6
/ C7 C3
4 --*-----/-\-----*3-4------------*-*4+5--*-----*---------------> 1
/ \ -C4 X \ /S7 C3
5 --*---/-----\---*2-5---*-*------*=*4-5----\-/------*-*--------> 5
/ \ X S4,S4 / X S3,-S3
6 --*-/---------\-*1-6---*-*------*=*7-6----/-\------*-*--------> 3
/ \ C4 X / \-S7 C3
--*-------------*0-7------------*-*7+6--*-----*---------------> 7
C7
Notation
Cn = cos(n*pi/8) used throughout the code.
Registers used:
R0, R1, R2, R3, R4, R5, R6,R7, P0, P1, P2, P3, P4, P5, A0, A1.
Other registers used:
I0, I1, I2, I3, B0, B2, B3, M0, M1, L3 registers and LC0.
Input - r0 - pointer to start of int16_t *block
Output - The DCT output coefficients in the int16_t *block
Register constraint:
This code is called from jpeg_encode.
R6, R5, R4 if modified should be stored and restored.
Performance: (Timer version 0.6.33)
Code Size : 240 Bytes.
Memory Required :
Input Matrix : 8 * 8 * 2 Bytes.
Coefficients : 16 Bytes
Temporary matrix: 8 * 8 * 2 Bytes.
Cycle Count :26+{18+8*(14+2S)}*2 where S -> Stalls
(7.45 c/pel)
-----------------------------------------
| Size | Forward DCT | Inverse DCT |
-----------------------------------------
| 8x8 | 284 Cycles | 311 Cycles |
-----------------------------------------
Ck = int16(cos(k/16*pi)*32767+.5)/2
#define C4 23170
#define C3 13623
#define C6 6270
#define C7 3196
Sk = int16(sin(k/16*pi)*32767+.5)/2
#define S4 11585
#define S3 9102
#define S6 15137
#define S7 16069
the coefficients are ordered as follows:
short dct_coef[]
C4,S4,
C6,S6,
C7,S7,
S3,C3,
-----------------------------------------------------------
FFMPEG conformance testing results
-----------------------------------------------------------
dct-test: modified with the following
dct_error("BFINfdct", 0, ff_bfin_fdct, fdct, test);
produces the following output:
root:/u/ffmpeg/bhead/libavcodec> ./dct-test
ffmpeg DCT/IDCT test
2 -131 -6 -48 -36 33 -83 24
34 52 -24 -15 5 92 57 143
-67 -43 -1 74 -16 5 -71 32
-78 106 92 -34 -38 81 20 -18
7 -62 40 2 -15 90 -62 -83
-83 1 -104 -13 43 -19 7 11
-63 31 12 -29 83 72 21 10
-17 -63 -15 73 50 -91 159 -14
DCT BFINfdct: err_inf=2 err2=0.16425938 syserr=0.00795000 maxout=2098 blockSumErr=27
DCT BFINfdct: 92.1 kdct/s
root:/u/ffmpeg/bhead/libavcodec>
*/
#include "libavutil/bfin/asm.h"
SECTION_L1_DATA_B
.align 4;
dct_coeff:
.short 0x5a82, 0x2d41, 0x187e, 0x3b21, 0x0c7c, 0x3ec5, 0x238e, 0x3537;
SECTION_L1_DATA_A
.align 4
vtmp: .space 128
.text
DEFUN(fdct,mL1,
(int16_t *block)):
[--SP] = (R7:4, P5:3); // Push the registers onto the stack.
b0 = r0;
RELOC(r0, P3, dct_coeff);
b3 = r0;
RELOC(r0, P3, vtmp);
b2 = r0;
L3 = 16; // L3 is set to 16 to make the coefficient
// array Circular.
//----------------------------------------------------------------------------
/*
* I0, I1, and I2 registers are used to read the input data. I3 register is used
* to read the coefficients. P0 and P1 registers are used for writing the output
* data.
*/
M0 = 12 (X); // All these initializations are used in the
M1 = 16 (X); // modification of address offsets.
M2 = 128 (X);
P2 = 16;
P3 = 32 (X);
P4 = -110 (X);
P5 = -62 (X);
P0 = 2(X);
// Prescale the input to get the correct precision.
i0=b0;
i1=b0;
lsetup (.0, .1) LC0 = P3;
r0=[i0++];
.0: r1=r0<<3 (v) || r0=[i0++] ;
.1: [i1++]=r1;
/*
* B0 points to the "in" buffer.
* B2 points to "temp" buffer in the first iteration.
*/
lsetup (.2, .3) LC0 = P0;
.2:
I0 = B0; // I0 points to Input Element (0, 0).
I1 = B0; // Element 1 and 0 is read in R0.
I1 += M0 || R0 = [I0++]; // I1 points to Input Element (0, 6).
I2 = I1; // Element 6 is read into R3.H.
I2 -= 4 || R3.H = W[I1++]; // I2 points to Input Element (0, 4).
I3 = B3; // I3 points to Coefficients.
P0 = B2; // P0 points to temporary array Element
// (0, 0).
P1 = B2; // P1 points to temporary array.
R7 = [P1++P2] || R2 = [I2++]; // P1 points to temporary array
// Element (1, 0).
// R7 is a dummy read. X4,X5
// are read into R2.
R3.L = W[I1--]; // X7 is read into R3.L.
R1.H = W[I0++]; // X2 is read into R1.H.
/*
* X0 = (X0 + X7) / 2.
* X1 = (X1 + X6) / 2.
* X6 = (X1 - X6) / 2.
* X7 = (X0 - X7) / 2.
* It reads the data 3 in R1.L.
*/
R0 = R0 +|+ R3, R3 = R0 -|- R3 || R1.L = W[I0++] || NOP;
/*
* X2 = (X2 + X5) / 2.
* X3 = (X3 + X4) / 2.
* X4 = (X3 - X4) / 2.
* X5 = (X2 - X5) / 2.
* R7 = C4 = cos(4*pi/16)
*/
R1 = R1 +|+ R2, R2 = R1 -|- R2 (CO) || NOP || R7 = [I3++];
/*
* At the end of stage 1 R0 has (1,0), R1 has (2,3), R2 has (4, 5) and
* R3 has (6,7).
* Where the notation (x, y) represents uper/lower half pairs.
*/
/*
* X0 = X0 + X3.
* X1 = X1 + X2.
* X2 = X1 - X2.
* X3 = X0 - X3.
*/
R0 = R0 +|+ R1, R1 = R0 -|- R1;
lsetup (.row0, .row1) LC1 = P2 >> 1; // 1d dct, loops 8x
.row0:
/*
* This is part 2 computation continued.....
* A1 = X6 * cos(pi/4)
* A0 = X6 * cos(pi/4)
* A1 = A1 - X5 * cos(pi/4)
* A0 = A0 + X5 * cos(pi/4).
* The instruction W[I0] = R3.L is used for packing it to R2.L.
*/
A1=R3.H*R7.l, A0=R3.H*R7.l || I1+=M1 || W[I0] = R3.L;
R4.H=(A1-=R2.L*R7.l), R4.L=(A0+=R2.L*R7.l) || I2+=M0 || NOP;
/* R0 = (X1,X0) R1 = (X2,X3) R4 = (X5, X6). */
/*
* A1 = X0 * cos(pi/4)
* A0 = X0 * cos(pi/4)
* A1 = A1 - X1 * cos(pi/4)
* A0 = A0 + X1 * cos(pi/4)
* R7 = (C2,C6)
*/
A1=R0.L*R7.h, A0=R0.L*R7.h || NOP || R3.H=W[I1++];
R5.H=(A1-=R0.H*R7.h),R5.L=(A0+=R0.H*R7.h) || R7=[I3++] || NOP;
/*
* A1 = X2 * cos(3pi/8)
* A0 = X3 * cos(3pi/8)
* A1 = A1 + X3 * cos(pi/8)
* A0 = A0 - X2 * cos(pi/8)
* R3 = cos(pi/4)
* R7 = (cos(7pi/8),cos(pi/8))
* X4 = X4 + X5.
* X5 = X4 - X5.
* X6 = X7 - X6.
* X7 = X7 + X6.
*/
A1=R1.H*R7.L, A0=R1.L*R7.L || W[P0++P3]=R5.L || R2.L=W[I0];
R2=R2+|+R4, R4=R2-|-R4 || I0+=4 || R3.L=W[I1--];
R6.H=(A1+=R1.L*R7.H),R6.L=(A0 -= R1.H * R7.H) || I0+=4 || R7=[I3++];
/* R2 = (X4, X7) R4 = (X5,X6) R5 = (X1, X0) R6 = (X2,X3). */
/*
* A1 = X4 * cos(7pi/16)
* A0 = X7 * cos(7pi/16)
* A1 = A1 + X7 * cos(pi/16)
* A0 = A0 - X4 * cos(pi/16)
*/
A1=R2.H*R7.L, A0=R2.L*R7.L || W[P0++P3]=R6.H || R0=[I0++];
R2.H=(A1+=R2.L*R7.H),R2.L=(A0-=R2.H*R7.H) || W[P0++P3]=R5.H || R7=[I3++];
/*
* A1 = X5 * cos(3pi/16)
* A0 = X6 * cos(3pi/16)
* A1 = A1 + X6 * cos(5pi/16)
* A0 = A0 - X5 * cos(5pi/16)
* The output values are written.
*/
A1=R4.H*R7.H, A0=R4.L*R7.H || W[P0++P2]=R6.L || R1.H=W[I0++];
R4.H=(A1+=R4.L*R7.L),R4.L=(A0-=R4.H*R7.L) || W[P0++P4]=R2.L || R1.L=W[I0++];
/* Beginning of next stage, **pipelined** + drain and store the
rest of the column store. */
R0=R0+|+R3,R3=R0-|-R3 || W[P1++P3]=R2.H || R2=[I2++];
R1=R1+|+R2,R2=R1-|-R2 (CO) || W[P1++P3]=R4.L || R7=[I3++];
.row1: R0=R0+|+R1,R1=R0-|-R1 || W[P1++P5]=R4.H || NOP;
// Exchange input with output.
B1 = B0;
B0 = B2;
.3: B2 = B1;
L3=0;
(r7:4,p5:3) = [sp++];
RTS;
DEFUN_END(fdct)

@ -1,81 +0,0 @@
/*
* Blackfin Pixel Operations
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/bfin/asm.h"
DEFUN(put_pixels8uc_no_rnd,mL1,
(uint8_t *block, const uint8_t *s0, const uint8_t *s1,
int line_size, int h)):
i3=r0; // dest
i0=r1; // src0
i1=r2; // src1
r2=[sp+12]; // line_size
p0=[sp+16]; // h
[--sp] = (r7:6);
r2+=-4;
m3=r2;
r2+=-4;
m0=r2;
LSETUP(pp8$2,pp8$3) LC0=P0;
DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
pp8$2: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
R6 = BYTEOP1P(R1:0,R3:2)(T) || R0 = [I0++M0]|| R2 =[I1++M0];
R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R0 = [I0++] || [I3++] = R6 ;
pp8$3: DISALGNEXCPT || R2 = [I1++] || [I3++M3] = R7;
(r7:6) = [sp++];
RTS;
DEFUN_END(put_pixels8uc_no_rnd)
DEFUN(put_pixels16uc_no_rnd,mL1,
(uint8_t *block, const uint8_t *s0, const uint8_t *s1,
int line_size, int h)):
i3=r0; // dest
i0=r1; // src0
i1=r2; // src1
r2=[sp+12]; // line_size
p0=[sp+16]; // h
[--sp] = (r7:6);
r2+=-12;
m3=r2; // line_size
r2+=-4;
m0=r2;
LSETUP(pp16$2,pp16$3) LC0=P0;
DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
pp16$2:
DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
R6 = BYTEOP1P(R1:0,R3:2)(T) || R0 = [I0++] || R2 =[I1++];
R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R1 = [I0++] || R3 =[I1++];
[I3++] = R6;
R6 = BYTEOP1P(R1:0,R3:2)(T) || R0 = [I0++M0] || R2 =[I1++M0];
R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R0 = [I0++] || [I3++] = R7 ;
[I3++] = R6;
pp16$3: DISALGNEXCPT || R2 = [I1++] || [I3++M3] = R7;
(r7:6) = [sp++];
RTS;
DEFUN_END(put_pixels16uc_no_rnd)

@ -1,146 +0,0 @@
/*
* BlackFin halfpel functions
*
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
* Copyright (c) 2006 Michael Benjamin <michael.benjamin@analog.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/bfin/attributes.h"
#include "libavcodec/hpeldsp.h"
#include "pixels.h"
void ff_bfin_put_pixels8uc_no_rnd(uint8_t *block, const uint8_t *s0,
const uint8_t *s1, int line_size,
int h) attribute_l1_text;
void ff_bfin_put_pixels16uc_no_rnd(uint8_t *block, const uint8_t *s0,
const uint8_t *s1, int line_size,
int h) attribute_l1_text;
static void bfin_put_pixels8(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
ff_bfin_put_pixels8uc(block, pixels, pixels, line_size, line_size, h);
}
static void bfin_put_pixels8_x2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
ff_bfin_put_pixels8uc(block, pixels, pixels + 1, line_size, line_size, h);
}
static void bfin_put_pixels8_y2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
ff_bfin_put_pixels8uc(block, pixels, pixels + line_size,
line_size, line_size, h);
}
static void bfin_put_pixels8_xy2(uint8_t *block, const uint8_t *s0,
ptrdiff_t line_size, int h)
{
ff_bfin_z_put_pixels8_xy2(block, s0, line_size, line_size, h);
}
static void bfin_put_pixels16(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
ff_bfin_put_pixels16uc(block, pixels, pixels, line_size, line_size, h);
}
static void bfin_put_pixels16_x2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
ff_bfin_put_pixels16uc(block, pixels, pixels + 1, line_size, line_size, h);
}
static void bfin_put_pixels16_y2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
ff_bfin_put_pixels16uc(block, pixels, pixels + line_size,
line_size, line_size, h);
}
static void bfin_put_pixels16_xy2(uint8_t *block, const uint8_t *s0,
ptrdiff_t line_size, int h)
{
ff_bfin_z_put_pixels16_xy2(block, s0, line_size, line_size, h);
}
static void bfin_put_pixels8_no_rnd(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
ff_bfin_put_pixels8uc_no_rnd(block, pixels, pixels, line_size, h);
}
static void bfin_put_pixels8_x2_no_rnd(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
ff_bfin_put_pixels8uc_no_rnd(block, pixels, pixels + 1, line_size, h);
}
static void bfin_put_pixels8_y2_no_rnd(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
ff_bfin_put_pixels8uc_no_rnd(block, pixels, pixels + line_size,
line_size, h);
}
static void bfin_put_pixels16_no_rnd(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
ff_bfin_put_pixels16uc_no_rnd(block, pixels, pixels, line_size, h);
}
static void bfin_put_pixels16_x2_no_rnd(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
ff_bfin_put_pixels16uc_no_rnd(block, pixels, pixels + 1, line_size, h);
}
static void bfin_put_pixels16_y2_no_rnd(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
ff_bfin_put_pixels16uc_no_rnd(block, pixels, pixels + line_size,
line_size, h);
}
av_cold void ff_hpeldsp_init_bfin(HpelDSPContext *c, int flags)
{
c->put_pixels_tab[0][0] = bfin_put_pixels16;
c->put_pixels_tab[0][1] = bfin_put_pixels16_x2;
c->put_pixels_tab[0][2] = bfin_put_pixels16_y2;
c->put_pixels_tab[0][3] = bfin_put_pixels16_xy2;
c->put_pixels_tab[1][0] = bfin_put_pixels8;
c->put_pixels_tab[1][1] = bfin_put_pixels8_x2;
c->put_pixels_tab[1][2] = bfin_put_pixels8_y2;
c->put_pixels_tab[1][3] = bfin_put_pixels8_xy2;
c->put_no_rnd_pixels_tab[1][0] = bfin_put_pixels8_no_rnd;
c->put_no_rnd_pixels_tab[1][1] = bfin_put_pixels8_x2_no_rnd;
c->put_no_rnd_pixels_tab[1][2] = bfin_put_pixels8_y2_no_rnd;
c->put_no_rnd_pixels_tab[0][0] = bfin_put_pixels16_no_rnd;
c->put_no_rnd_pixels_tab[0][1] = bfin_put_pixels16_x2_no_rnd;
c->put_no_rnd_pixels_tab[0][2] = bfin_put_pixels16_y2_no_rnd;
}

@ -1,297 +0,0 @@
/*
* idct BlackFin
*
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
This blackfin DSP code implements an 8x8 inverse type II DCT.
Prototype : void ff_bfin_idct(int16_t *in)
Registers Used : A0, A1, R0-R7, I0-I3, B0, B2, B3, M0-M2, L0-L3, P0-P5, LC0.
Performance :
Code Size : 498 Bytes.
Cycle Count : 417 Cycles
-----------------------------------------------------------
FFMPEG conformance testing results
-----------------------------------------------------------
dct-test: modified with the following
dct_error("BFINidct", 1, ff_bfin_idct, idct, test);
produces the following output
root:/u/ffmpeg/bhead/libavcodec> ./dct-test -i
ffmpeg DCT/IDCT test
8 15 -2 21 24 17 0 10
2 -10 -5 -5 -3 7 -14 -3
2 -13 -10 -19 18 -6 6 -2
9 4 16 -3 9 12 10 15
15 -9 -2 10 1 16 0 -15
-15 5 7 3 13 0 13 20
-6 -15 24 9 -18 1 9 -22
-8 25 23 2 -7 0 30 13
IDCT BFINidct: err_inf=1 err2=0.01002344 syserr=0.00150000 maxout=266 blockSumErr=64
IDCT BFINidct: 88.3 kdct/s
*/
#include "libavutil/bfin/asm.h"
SECTION_L1_DATA_B
.align 4;
coefs:
.short 0x5a82; // C4
.short 0x5a82; // C4
.short 0x30FC; //cos(3pi/8) C6
.short 0x7642; //cos(pi/8) C2
.short 0x18F9; //cos(7pi/16)
.short 0x7D8A; //cos(pi/16)
.short 0x471D; //cos(5pi/16)
.short 0x6A6E; //cos(3pi/16)
.short 0x18F9; //cos(7pi/16)
.short 0x7D8A; //cos(pi/16)
SECTION_L1_DATA_A
vtmp: .space 256
#define TMP0 FP-8
#define TMP1 FP-12
#define TMP2 FP-16
.text
DEFUN(idct,mL1,
(int16_t *block)):
/********************** Function Prologue *********************************/
link 16;
[--SP] = (R7:4, P5:3); // Push the registers onto the stack.
B0 = R0; // Pointer to Input matrix
RELOC(R1, P3, coefs); // Pointer to Coefficients
RELOC(R2, P3, vtmp); // Pointer to Temporary matrix
B3 = R1;
B2 = R2;
L3 = 20; // L3 is used for making the coefficient array
// circular.
// MUST BE RESTORED TO ZERO at function exit.
M1 = 16 (X); // All these registers are initialized for
M3 = 8(X); // modifying address offsets.
I0 = B0; // I0 points to Input Element (0, 0).
I2 = B0; // I2 points to Input Element (0, 0).
I2 += M3 || R0.H = W[I0];
// Element 0 is read into R0.H
I1 = I2; // I1 points to input Element (0, 6).
I1 += 4 || R0.L = W[I2++];
// I2 points to input Element (0, 4).
// Element 4 is read into R0.L.
P2 = 8 (X);
P3 = 32 (X);
P4 = -32 (X);
P5 = 98 (X);
R7 = 0x8000(Z);
I3 = B3; // I3 points to Coefficients
P0 = B2; // P0 points to array Element (0, 0) of temp
P1 = B2;
R7 = [I3++] || [TMP2]=R7; // Coefficient C4 is read into R7.H and R7.L.
MNOP;
NOP;
/*
* A1 = Y0 * cos(pi/4)
* A0 = Y0 * cos(pi/4)
* A1 = A1 + Y4 * cos(pi/4)
* A0 = A0 - Y4 * cos(pi/4)
* load:
* R1=(Y2,Y6)
* R7=(C2,C6)
* res:
* R3=Y0, R2=Y4
*/
A1=R7.H*R0.H, A0=R7.H*R0.H (IS) || I0+= 4 || R1.L=W[I1++];
R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || R1.H=W[I0--] || R7=[I3++];
LSETUP (.0, .1) LC0 = P2; // perform 8 1d idcts
P2 = 112 (X);
P1 = P1 + P2; // P1 points to element (7, 0) of temp buffer.
P2 = -94(X);
.0:
/*
* A1 = Y2 * cos(3pi/8)
* A0 = Y2 * cos(pi/8)
* A1 = A1 - Y6 * cos(pi/8)
* A0 = A0 + Y6 * cos(3pi/8)
* R5 = (Y1,Y7)
* R7 = (C1,C7)
* res:
* R1=Y2, R0=Y6
*/
A1=R7.L*R1.H, A0=R7.H*R1.H (IS) || I0+=4 || R5.H=W[I0];
R1=(A1-=R7.H*R1.L), R0=(A0+=R7.L*R1.L) (IS) || R5.L=W[I1--] || R7=[I3++];
/*
* Y0 = Y0 + Y6.
* Y4 = Y4 + Y2.
* Y2 = Y4 - Y2.
* Y6 = Y0 - Y6.
* R3 is saved
* R6.l=Y3
* note: R3: Y0, R2: Y4, R1: Y2, R0: Y6
*/
R3=R3+R0, R0=R3-R0;
R2=R2+R1, R1=R2-R1 || [TMP0]=R3 || R6.L=W[I0--];
/*
* Compute the odd portion (1,3,5,7) even is done.
*
* Y1 = C7 * Y1 - C1 * Y7 + C3 * Y5 - C5 * Y3.
* Y7 = C1 * Y1 + C7 * Y7 + C5 * Y5 + C3 * Y3.
* Y5 = C5 * Y1 + C3 * Y7 + C7 * Y5 - C1 * Y3.
* Y3 = C3 * Y1 - C5 * Y7 - C1 * Y5 - C7 * Y3.
*/
// R5=(Y1,Y7) R6=(Y5,Y3) // R7=(C1,C7)
A1 =R7.L*R5.H, A0 =R7.H*R5.H (IS) || [TMP1]=R2 || R6.H=W[I2--];
A1-=R7.H*R5.L, A0+=R7.L*R5.L (IS) || I0-=4 || R7=[I3++];
A1+=R7.H*R6.H, A0+=R7.L*R6.H (IS) || I0+=M1; // R7=(C3,C5)
R3 =(A1-=R7.L*R6.L), R2 =(A0+=R7.H*R6.L) (IS);
A1 =R7.L*R5.H, A0 =R7.H*R5.H (IS) || R4=[TMP0];
A1+=R7.H*R5.L, A0-=R7.L*R5.L (IS) || I1+=M1 || R7=[I3++]; // R7=(C1,C7)
A1+=R7.L*R6.H, A0-=R7.H*R6.H (IS);
R7 =(A1-=R7.H*R6.L), R6 =(A0-=R7.L*R6.L) (IS) || I2+=M1;
// R3=Y1, R2=Y7, R7=Y5, R6=Y3
/* Transpose write column. */
R5.H=R4+R2 (RND12); // Y0=Y0+Y7
R5.L=R4-R2 (RND12) || R4 = [TMP1]; // Y7=Y7-Y0
R2.H=R1+R7 (RND12) || W[P0++P3]=R5.H; // Y2=Y2+Y5 st Y0
R2.L=R1-R7 (RND12) || W[P1++P4]=R5.L || R7=[I3++]; // Y5=Y2-Y5 st Y7
R5.H=R0-R3 (RND12) || W[P0++P3]=R2.H || R1.L=W[I1++]; // Y1=Y6-Y1 st Y2
R5.L=R0+R3 (RND12) || W[P1++P4]=R2.L || R0.H=W[I0++]; // Y6=Y6+Y1 st Y5
R3.H=R4-R6 (RND12) || W[P0++P3]=R5.H || R0.L=W[I2++]; // Y3=Y3-Y4 st Y1
R3.L=R4+R6 (RND12) || W[P1++P4]=R5.L || R1.H=W[I0++]; // Y4=Y3+Y4 st Y6
/* pipeline loop start, + drain Y3, Y4 */
A1=R7.H*R0.H, A0=R7.H*R0.H (IS) || W[P0++P2]= R3.H || R1.H = W[I0--];
.1: R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || W[P1++P5]= R3.L || R7 = [I3++];
I0 = B2; // I0 points to Input Element (0, 0)
I2 = B2; // I2 points to Input Element (0, 0)
I2 += M3 || R0.H = W[I0];
// Y0 is read in R0.H
I1 = I2; // I1 points to input Element (0, 6)
I1 += 4 || R0.L = W[I2++];
// I2 points to input Element (0, 4)
// Y4 is read in R0.L
P2 = 8 (X);
I3 = B3; // I3 points to Coefficients
P0 = B0; // P0 points to array Element (0, 0) for writing
// output
P1 = B0;
R7 = [I3++]; // R7.H = C4 and R7.L = C4
NOP;
/*
* A1 = Y0 * cos(pi/4)
* A0 = Y0 * cos(pi/4)
* A1 = A1 + Y4 * cos(pi/4)
* A0 = A0 - Y4 * cos(pi/4)
* load:
* R1=(Y2,Y6)
* R7=(C2,C6)
* res:
* R3=Y0, R2=Y4
*/
A1=R7.H*R0.H, A0=R7.H*R0.H (IS) || I0+=4 || R1.L=W[I1++];
R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || R1.H=W[I0--] || R7=[I3++];
LSETUP (.2, .3) LC0 = P2; // peform 8 1d idcts
P2 = 112 (X);
P1 = P1 + P2;
P2 = -94(X);
.2:
/*
* A1 = Y2 * cos(3pi/8)
* A0 = Y2 * cos(pi/8)
* A1 = A1 - Y6 * cos(pi/8)
* A0 = A0 + Y6 * cos(3pi/8)
* R5 = (Y1,Y7)
* R7 = (C1,C7)
* res:
* R1=Y2, R0=Y6
*/
A1=R7.L*R1.H, A0=R7.H*R1.H (IS) || I0+=4 || R5.H=W[I0];
R1=(A1-=R7.H*R1.L), R0=(A0+=R7.L*R1.L) (IS) || R5.L=W[I1--] || R7=[I3++];
/*
* Y0 = Y0 + Y6.
* Y4 = Y4 + Y2.
* Y2 = Y4 - Y2.
* Y6 = Y0 - Y6.
* R3 is saved
* R6.l=Y3
* note: R3: Y0, R2: Y4, R1: Y2, R0: Y6
*/
R3=R3+R0, R0=R3-R0;
R2=R2+R1, R1=R2-R1 || [TMP0]=R3 || R6.L=W[I0--];
/*
* Compute the odd portion (1,3,5,7) even is done.
*
* Y1 = C7 * Y1 - C1 * Y7 + C3 * Y5 - C5 * Y3.
* Y7 = C1 * Y1 + C7 * Y7 + C5 * Y5 + C3 * Y3.
* Y5 = C5 * Y1 + C3 * Y7 + C7 * Y5 - C1 * Y3.
* Y3 = C3 * Y1 - C5 * Y7 - C1 * Y5 - C7 * Y3.
*/
// R5=(Y1,Y7) R6=(Y5,Y3) // R7=(C1,C7)
A1 =R7.L*R5.H, A0 =R7.H*R5.H (IS) || [TMP1]=R2 || R6.H=W[I2--];
A1-=R7.H*R5.L, A0+=R7.L*R5.L (IS) || I0-=4 || R7=[I3++];
A1+=R7.H*R6.H, A0+=R7.L*R6.H (IS) || I0+=M1; // R7=(C3,C5)
R3 =(A1-=R7.L*R6.L), R2 =(A0+=R7.H*R6.L) (IS);
A1 =R7.L*R5.H, A0 =R7.H*R5.H (IS) || R4=[TMP0];
A1+=R7.H*R5.L, A0-=R7.L*R5.L (IS) || I1+=M1 || R7=[I3++]; // R7=(C1,C7)
A1+=R7.L*R6.H, A0-=R7.H*R6.H (IS);
R7 =(A1-=R7.H*R6.L), R6 =(A0-=R7.L*R6.L) (IS) || I2+=M1;
// R3=Y1, R2=Y7, R7=Y5, R6=Y3
/* Transpose write column. */
R5.H=R4+R2 (RND20); // Y0=Y0+Y7
R5.L=R4-R2 (RND20) || R4 = [TMP1]; // Y7=Y7-Y0
R2.H=R1+R7 (RND20) || W[P0++P3]=R5.H; // Y2=Y2+Y5 st Y0
R2.L=R1-R7 (RND20) || W[P1++P4]=R5.L || R7=[I3++]; // Y5=Y2-Y5 st Y7
R5.H=R0-R3 (RND20) || W[P0++P3]=R2.H || R1.L=W[I1++]; // Y1=Y6-Y1 st Y2
R5.L=R0+R3 (RND20) || W[P1++P4]=R2.L || R0.H=W[I0++]; // Y6=Y6+Y1 st Y5
R3.H=R4-R6 (RND20) || W[P0++P3]=R5.H || R0.L=W[I2++]; // Y3=Y3-Y4 st Y1
R3.L=R4+R6 (RND20) || W[P1++P4]=R5.L || R1.H=W[I0++]; // Y4=Y3+Y4 st Y6
/* pipeline loop start, + drain Y3, Y4 */
A1=R7.H*R0.H, A0=R7.H*R0.H (IS) || W[P0++P2]= R3.H || R1.H = W[I0--];
.3: R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || W[P1++P5]= R3.L || R7 = [I3++];
L3 = 0;
(R7:4,P5:3)=[SP++];
unlink;
RTS;
DEFUN_END(idct)

@ -1,44 +0,0 @@
/*
* simple math operations
*
* Copyright (C) 2007 Marc Hoffman <mmhoffm@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_BFIN_MATHOPS_H
#define AVCODEC_BFIN_MATHOPS_H
#include "config.h"
#define MULH(X,Y) ({ int xxo; \
__asm__ ( \
"a1 = %2.L * %1.L (FU);\n\t" \
"a1 = a1 >> 16;\n\t" \
"a1 += %2.H * %1.L (IS,M);\n\t" \
"a0 = %1.H * %2.H, a1+= %1.H * %2.L (IS,M);\n\t"\
"a1 = a1 >>> 16;\n\t" \
"%0 = (a0 += a1);\n\t" \
: "=d" (xxo) : "d" (X), "d" (Y) : "A0","A1"); xxo; })
/* signed 16x16 -> 32 multiply */
#define MUL16(a, b) ({ int xxo; \
__asm__ ( \
"%0 = %1.l*%2.l (is);\n\t" \
: "=W" (xxo) : "d" (a), "d" (b) : "A1"); \
xxo; })
#endif /* AVCODEC_BFIN_MATHOPS_H */

@ -1,207 +0,0 @@
/*
* Blackfin Pixel Operations
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/bfin/asm.h"
/*
motion compensation
primitives
* Halfpel motion compensation with rounding (a+b+1)>>1.
* This is an array[4][4] of motion compensation funcions for 4
* horizontal blocksizes (8,16) and the 4 halfpel positions<br>
* *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
* @param block destination where the result is stored
* @param pixels source
* @param line_size number of bytes in a horizontal line of block
* @param h height
*/
DEFUN(put_pixels8uc,mL1,
(uint8_t *block, const uint8_t *s0, const uint8_t *s1,
int dest_size, int line_size, int h)):
i3=r0; // dest
i0=r1; // src0
i1=r2; // src1
r0=[sp+12]; // dest_size
r2=[sp+16]; // line_size
p0=[sp+20]; // h
[--sp] = (r7:6);
r0+=-4;
m3=r0;
r2+=-8;
m0=r2;
LSETUP(pp8$0,pp8$1) LC0=P0;
DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
pp8$0: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
R6 = BYTEOP1P(R1:0,R3:2) || R0 = [I0++M0]|| R2 =[I1++M0];
R7 = BYTEOP1P(R1:0,R3:2)(R) || R0 = [I0++] || [I3++] = R6 ;
pp8$1: DISALGNEXCPT || R2 = [I1++] || [I3++M3] = R7;
(r7:6) = [sp++];
RTS;
DEFUN_END(put_pixels8uc)
DEFUN(put_pixels16uc,mL1,
(uint8_t *block, const uint8_t *s0, const uint8_t *s1,
int dest_size, int line_size, int h)):
link 0;
[--sp] = (r7:6);
i3=r0; // dest
i0=r1; // src0
i1=r2; // src1
r0=[fp+20]; // dest_size
r2=[fp+24]; // line_size
p0=[fp+28]; // h
r0+=-12;
m3=r0; // line_size
r2+=-16;
m0=r2;
LSETUP(pp16$0,pp16$1) LC0=P0;
DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
pp16$0: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
R6 = BYTEOP1P(R1:0,R3:2) || R0 = [I0++] || R2 =[I1++];
R7 = BYTEOP1P(R1:0,R3:2)(R) || R1 = [I0++] || R3 =[I1++];
[I3++] = R6;
R6 = BYTEOP1P(R1:0,R3:2) || R0 = [I0++M0] || R2 =[I1++M0];
R7 = BYTEOP1P(R1:0,R3:2)(R) || R0 = [I0++] || [I3++] = R7 ;
[I3++] = R6;
pp16$1: DISALGNEXCPT || R2 = [I1++] || [I3++M3] = R7;
(r7:6) = [sp++];
unlink;
RTS;
DEFUN_END(put_pixels16uc)
DEFUN(z_put_pixels16_xy2,mL1,
(uint8_t *block, const uint8_t *s0,
int dest_size, int line_size, int h)):
link 0;
[--sp] = (r7:4);
i3=r0; // dest
i0=r1; // src0--> pixels
i1=r1; // src1--> pixels + line_size
r2+=-12;
m2=r2; // m2=dest_width-4
r2=[fp+20];
m3=r2; // line_size
p0=[fp+24]; // h
r2+=-16;
i1+=m3; /* src1 + line_size */
m0=r2; /* line-size - 20 */
B0 = I0;
B1 = I1;
B3 = I3;
DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
LSETUP(LS$16E,LE$16E) LC0=P0;
LS$16E: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
R4 = BYTEOP2P (R3:2,R1:0) (RNDL) || R0 = [I0++] || R2 =[I1++];
R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R1 = [I0++] || [I3++] = R4 ;
DISALGNEXCPT || R3 = [I1++] || [I3++] = R5;
R4 = BYTEOP2P (R3:2,R1:0) (RNDL) || R0 = [I0++M0]|| R2 = [I1++M0];
R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R0 = [I0++] || [I3++] = R4 ;
LE$16E: DISALGNEXCPT || R2 = [I1++] || [I3++M2] = R5;
M1 = 1;
I3 = B3;
I1 = B1;
I0 = B0;
I0 += M1;
I1 += M1;
DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
LSETUP(LS$16O,LE$16O) LC0=P0;
LS$16O: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
R4 = BYTEOP2P (R3:2,R1:0) (RNDH) || R0 = [I0++] || R2 =[I1++];
R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R1 = [I0++] || R6 =[I3++];
R4 = R4 +|+ R6 || R7 = [I3--];
R5 = R5 +|+ R7 || [I3++] = R4;
DISALGNEXCPT || R3 =[I1++] || [I3++] = R5;
R4 = BYTEOP2P (R3:2,R1:0) (RNDH) || R0 = [I0++M0]|| R2 = [I1++M0];
R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R0 = [I0++] || R6 = [I3++];
R4 = R4 +|+ R6 || R7 = [I3--];
R5 = R5 +|+ R7 || [I3++] = R4;
LE$16O: DISALGNEXCPT || R2 = [I1++] || [I3++M2] = R5;
(r7:4) = [sp++];
unlink;
rts;
DEFUN_END(z_put_pixels16_xy2)
DEFUN(z_put_pixels8_xy2,mL1,
(uint8_t *block, const uint8_t *s0,
int dest_size, int line_size, int h)):
link 0;
[--sp] = (r7:4);
i3=r0; // dest
i0=r1; // src0--> pixels
i1=r1; // src1--> pixels + line_size
r2+=-4;
m2=r2; // m2=dest_width-4
r2=[fp+20];
m3=r2; // line_size
p0=[fp+24]; // h
r2+=-8;
i1+=m3; /* src1 + line_size */
m0=r2; /* line-size - 20 */
b0 = I0;
b1 = I1;
b3 = I3;
LSETUP(LS$8E,LE$8E) LC0=P0;
DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
LS$8E: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
R4 = BYTEOP2P (R3:2,R1:0) (RNDL) || R0 = [I0++M0] || R2 =[I1++M0];
R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R0 = [I0++] || [I3++] = R4 ;
LE$8E: DISALGNEXCPT || R2 = [I1++] || [I3++M2] = R5;
M1 = 1;
I3 = b3;
I1 = b1;
I0 = b0;
I0 += M1;
I1 += M1;
LSETUP(LS$8O,LE$8O) LC0=P0;
DISALGNEXCPT || R0 = [I0++] || R2 =[I1++];
LS$8O: DISALGNEXCPT || R1 = [I0++] || R3 =[I1++];
R4 = BYTEOP2P (R3:2,R1:0) (RNDH) || R0 = [I0++M0] || R2 =[I1++M0];
R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R0 = [I0++] || R6 =[I3++];
R4 = R4 +|+ R6 || R7 = [I3--];
R5 = R5 +|+ R7 || [I3++] = R4;
LE$8O: DISALGNEXCPT || R2 =[I1++] || [I3++M2] = R5;
(r7:4) = [sp++];
unlink;
rts;
DEFUN_END(z_put_pixels8_xy2)

@ -1,42 +0,0 @@
/*
* BlackFin DSPUTILS COMMON OPTIMIZATIONS HEADER
*
* Copyright (C) 2007 Marc Hoffman <mmh@pleasantst.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_BFIN_PIXELS_H
#define AVCODEC_BFIN_PIXELS_H
#include <stdint.h>
#include "libavutil/bfin/attributes.h"
void ff_bfin_z_put_pixels16_xy2(uint8_t *block, const uint8_t *s0,
int dest_size, int line_size, int h) attribute_l1_text;
void ff_bfin_z_put_pixels8_xy2(uint8_t *block, const uint8_t *s0,
int dest_size, int line_size, int h) attribute_l1_text;
void ff_bfin_put_pixels8uc(uint8_t *block, const uint8_t *s0,
const uint8_t *s1, int dest_size, int line_size,
int h) attribute_l1_text;
void ff_bfin_put_pixels16uc(uint8_t *block, const uint8_t *s0,
const uint8_t *s1, int dest_size, int line_size,
int h) attribute_l1_text;
#endif /* AVCODEC_BFIN_PIXELS_H */

@ -1,273 +0,0 @@
/*
* vp3_idct BlackFin
*
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
This blackfin DSP code implements an 8x8 inverse type II DCT.
Prototype : void ff_bfin_vp3_idct(int16_t *in)
Registers Used : A0, A1, R0-R7, I0-I3, B0, B2, B3, M0-M2, L0-L3, P0-P5, LC0.
*/
#include "libavutil/bfin/asm.h"
SECTION_L1_DATA_B
.align 4;
coefs:
.short 0x5a82; // C4
.short 0x5a82; // C4
.short 0x30FC; //cos(3pi/8) C6
.short 0x7642; //cos(pi/8) C2
.short 0x18F9; //cos(7pi/16)
.short 0x7D8A; //cos(pi/16)
.short 0x471D; //cos(5pi/16)
.short 0x6A6E; //cos(3pi/16)
.short 0x18F9; //cos(7pi/16)
.short 0x7D8A; //cos(pi/16)
SECTION_L1_DATA_A
vtmp: .space 256
#define TMP0 FP-8
#define TMP1 FP-12
#define TMP2 FP-16
.text
DEFUN(vp3_idct,mL1,
(int16_t *block)):
/********************** Function Prologue *********************************/
link 16;
[--SP] = (R7:4, P5:3); // Push the registers onto the stack.
B0 = R0; // Pointer to Input matrix
RELOC(R1, P3, coefs); // Pointer to Coefficients
RELOC(R2, P3, vtmp); // Pointer to Temporary matrix
B3 = R1;
B2 = R2;
L3 = 20; // L3 is used for making the coefficient array
// circular.
// MUST BE RESTORED TO ZERO at function exit.
M1 = 16 (X); // All these registers are initialized for
M3 = 8(X); // modifying address offsets.
I0 = B0; // I0 points to Input Element (0, 0).
I2 = B0; // I2 points to Input Element (0, 0).
I2 += M3 || R0.H = W[I0];
// Element 0 is read into R0.H
I1 = I2; // I1 points to input Element (0, 6).
I1 += 4 || R0.L = W[I2++];
// I2 points to input Element (0, 4).
// Element 4 is read into R0.L.
P2 = 8 (X);
P3 = 32 (X);
P4 = -32 (X);
P5 = 98 (X);
R7 = 0x8000(Z);
I3 = B3; // I3 points to Coefficients
P0 = B2; // P0 points to array Element (0, 0) of temp
P1 = B2;
R7 = [I3++] || [TMP2]=R7; // Coefficient C4 is read into R7.H and R7.L.
MNOP;
NOP;
/*
* A1 = Y0 * cos(pi/4)
* A0 = Y0 * cos(pi/4)
* A1 = A1 + Y4 * cos(pi/4)
* A0 = A0 - Y4 * cos(pi/4)
* load:
* R1=(Y2,Y6)
* R7=(C2,C6)
* res:
* R3=Y0, R2=Y4
*/
A1=R7.H*R0.H, A0=R7.H*R0.H (IS) || I0+= 4 || R1.L=W[I1++];
R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || R1.H=W[I0--] || R7=[I3++];
LSETUP (.0, .1) LC0 = P2; // perform 8 1d idcts
P2 = 112 (X);
P1 = P1 + P2; // P1 points to element (7, 0) of temp buffer.
P2 = -94(X);
.0:
/*
* A1 = Y2 * cos(3pi/8)
* A0 = Y2 * cos(pi/8)
* A1 = A1 - Y6 * cos(pi/8)
* A0 = A0 + Y6 * cos(3pi/8)
* R5 = (Y1,Y7)
* R7 = (C1,C7)
* res:
* R1=Y2, R0=Y6
*/
A1=R7.L*R1.H, A0=R7.H*R1.H (IS) || I0+=4 || R5.H=W[I0];
R1=(A1-=R7.H*R1.L), R0=(A0+=R7.L*R1.L) (IS) || R5.L=W[I1--] || R7=[I3++];
/*
* Y0 = Y0 + Y6.
* Y4 = Y4 + Y2.
* Y2 = Y4 - Y2.
* Y6 = Y0 - Y6.
* R3 is saved
* R6.l=Y3
* note: R3: Y0, R2: Y4, R1: Y2, R0: Y6
*/
R3=R3+R0, R0=R3-R0;
R2=R2+R1, R1=R2-R1 || [TMP0]=R3 || R6.L=W[I0--];
/*
* Compute the odd portion (1,3,5,7) even is done.
*
* Y1 = C7 * Y1 - C1 * Y7 + C3 * Y5 - C5 * Y3.
* Y7 = C1 * Y1 + C7 * Y7 + C5 * Y5 + C3 * Y3.
* Y5 = C5 * Y1 + C3 * Y7 + C7 * Y5 - C1 * Y3.
* Y3 = C3 * Y1 - C5 * Y7 - C1 * Y5 - C7 * Y3.
*/
// R5=(Y1,Y7) R6=(Y5,Y3) // R7=(C1,C7)
A1 =R7.L*R5.H, A0 =R7.H*R5.H (IS) || [TMP1]=R2 || R6.H=W[I2--];
A1-=R7.H*R5.L, A0+=R7.L*R5.L (IS) || I0-=4 || R7=[I3++];
A1+=R7.H*R6.H, A0+=R7.L*R6.H (IS) || I0+=M1; // R7=(C3,C5)
R3 =(A1-=R7.L*R6.L), R2 =(A0+=R7.H*R6.L) (IS);
A1 =R7.L*R5.H, A0 =R7.H*R5.H (IS) || R4=[TMP0];
A1+=R7.H*R5.L, A0-=R7.L*R5.L (IS) || I1+=M1 || R7=[I3++]; // R7=(C1,C7)
A1+=R7.L*R6.H, A0-=R7.H*R6.H (IS);
R7 =(A1-=R7.H*R6.L), R6 =(A0-=R7.L*R6.L) (IS) || I2+=M1;
// R3=Y1, R2=Y7, R7=Y5, R6=Y3
/* Transpose write column. */
R5.H=R4+R2 (RND12); // Y0=Y0+Y7
R5.L=R4-R2 (RND12) || R4 = [TMP1]; // Y7=Y7-Y0
R2.H=R1+R7 (RND12) || W[P0++P3]=R5.H; // Y2=Y2+Y5 st Y0
R2.L=R1-R7 (RND12) || W[P1++P4]=R5.L || R7=[I3++]; // Y5=Y2-Y5 st Y7
R5.H=R0-R3 (RND12) || W[P0++P3]=R2.H || R1.L=W[I1++]; // Y1=Y6-Y1 st Y2
R5.L=R0+R3 (RND12) || W[P1++P4]=R2.L || R0.H=W[I0++]; // Y6=Y6+Y1 st Y5
R3.H=R4-R6 (RND12) || W[P0++P3]=R5.H || R0.L=W[I2++]; // Y3=Y3-Y4 st Y1
R3.L=R4+R6 (RND12) || W[P1++P4]=R5.L || R1.H=W[I0++]; // Y4=Y3+Y4 st Y6
/* pipeline loop start, + drain Y3, Y4 */
A1=R7.H*R0.H, A0=R7.H*R0.H (IS) || W[P0++P2]= R3.H || R1.H = W[I0--];
.1: R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || W[P1++P5]= R3.L || R7 = [I3++];
I0 = B2; // I0 points to Input Element (0, 0)
I2 = B2; // I2 points to Input Element (0, 0)
I2 += M3 || R0.H = W[I0];
// Y0 is read in R0.H
I1 = I2; // I1 points to input Element (0, 6)
I1 += 4 || R0.L = W[I2++];
// I2 points to input Element (0, 4)
// Y4 is read in R0.L
P2 = 8 (X);
I3 = B3; // I3 points to Coefficients
P0 = B0; // P0 points to array Element (0, 0) for writing
// output
P1 = B0;
R7 = [I3++]; // R7.H = C4 and R7.L = C4
NOP;
/*
* A1 = Y0 * cos(pi/4)
* A0 = Y0 * cos(pi/4)
* A1 = A1 + Y4 * cos(pi/4)
* A0 = A0 - Y4 * cos(pi/4)
* load:
* R1=(Y2,Y6)
* R7=(C2,C6)
* res:
* R3=Y0, R2=Y4
*/
A1=R7.H*R0.H, A0=R7.H*R0.H (IS) || I0+=4 || R1.L=W[I1++];
R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || R1.H=W[I0--] || R7=[I3++];
LSETUP (.2, .3) LC0 = P2; // peform 8 1d idcts
P2 = 112 (X);
P1 = P1 + P2;
P2 = -94(X);
.2:
/*
* A1 = Y2 * cos(3pi/8)
* A0 = Y2 * cos(pi/8)
* A1 = A1 - Y6 * cos(pi/8)
* A0 = A0 + Y6 * cos(3pi/8)
* R5 = (Y1,Y7)
* R7 = (C1,C7)
* res:
* R1=Y2, R0=Y6
*/
A1=R7.L*R1.H, A0=R7.H*R1.H (IS) || I0+=4 || R5.H=W[I0];
R1=(A1-=R7.H*R1.L), R0=(A0+=R7.L*R1.L) (IS) || R5.L=W[I1--] || R7=[I3++];
/*
* Y0 = Y0 + Y6.
* Y4 = Y4 + Y2.
* Y2 = Y4 - Y2.
* Y6 = Y0 - Y6.
* R3 is saved
* R6.l=Y3
* note: R3: Y0, R2: Y4, R1: Y2, R0: Y6
*/
R3=R3+R0, R0=R3-R0;
R2=R2+R1, R1=R2-R1 || [TMP0]=R3 || R6.L=W[I0--];
/*
* Compute the odd portion (1,3,5,7) even is done.
*
* Y1 = C7 * Y1 - C1 * Y7 + C3 * Y5 - C5 * Y3.
* Y7 = C1 * Y1 + C7 * Y7 + C5 * Y5 + C3 * Y3.
* Y5 = C5 * Y1 + C3 * Y7 + C7 * Y5 - C1 * Y3.
* Y3 = C3 * Y1 - C5 * Y7 - C1 * Y5 - C7 * Y3.
*/
// R5=(Y1,Y7) R6=(Y5,Y3) // R7=(C1,C7)
A1 =R7.L*R5.H, A0 =R7.H*R5.H (IS) || [TMP1]=R2 || R6.H=W[I2--];
A1-=R7.H*R5.L, A0+=R7.L*R5.L (IS) || I0-=4 || R7=[I3++];
A1+=R7.H*R6.H, A0+=R7.L*R6.H (IS) || I0+=M1; // R7=(C3,C5)
R3 =(A1-=R7.L*R6.L), R2 =(A0+=R7.H*R6.L) (IS);
A1 =R7.L*R5.H, A0 =R7.H*R5.H (IS) || R4=[TMP0];
A1+=R7.H*R5.L, A0-=R7.L*R5.L (IS) || I1+=M1 || R7=[I3++]; // R7=(C1,C7)
A1+=R7.L*R6.H, A0-=R7.H*R6.H (IS);
R7 =(A1-=R7.H*R6.L), R6 =(A0-=R7.L*R6.L) (IS) || I2+=M1;
// R3=Y1, R2=Y7, R7=Y5, R6=Y3
/* Transpose write column. */
R5.H=R4+R2 (RND20); // Y0=Y0+Y7
R5.L=R4-R2 (RND20) || R4 = [TMP1]; // Y7=Y7-Y0
R5=R5>>>2(v);
R2.H=R1+R7 (RND20) || W[P0++P3]=R5.H; // Y2=Y2+Y5 st Y0
R2.L=R1-R7 (RND20) || W[P1++P4]=R5.L || R7=[I3++]; // Y5=Y2-Y5 st Y7
R2=R2>>>2(v);
R5.H=R0-R3 (RND20) || W[P0++P3]=R2.H || R1.L=W[I1++]; // Y1=Y6-Y1 st Y2
R5.L=R0+R3 (RND20) || W[P1++P4]=R2.L || R0.H=W[I0++]; // Y6=Y6+Y1 st Y5
R5=R5>>>2(v);
R3.H=R4-R6 (RND20) || W[P0++P3]=R5.H || R0.L=W[I2++]; // Y3=Y3-Y4 st Y1
R3.L=R4+R6 (RND20) || W[P1++P4]=R5.L || R1.H=W[I0++]; // Y4=Y3+Y4 st Y6
R3=R3>>>2(v);
/* pipeline loop start, + drain Y3, Y4 */
A1=R7.H*R0.H, A0=R7.H*R0.H (IS) || W[P0++P2]= R3.H || R1.H = W[I0--];
.3: R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || W[P1++P5]= R3.L || R7 = [I3++];
L3 = 0;
(R7:4,P5:3)=[SP++];
unlink;
RTS;
DEFUN_END(vp3_idct)

@ -1,66 +0,0 @@
/*
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include <string.h>
#include "libavutil/attributes.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/mathops.h"
#include "libavcodec/vp3dsp.h"
void ff_bfin_vp3_idct(int16_t *block);
/* Intra iDCT offset 128 */
static void bfin_vp3_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP + 128;
int i,j;
ff_bfin_vp3_idct (block);
for (i=0;i<8;i++)
for (j=0;j<8;j++)
dest[line_size*i + j] = cm[block[j*8 + i]];
memset(block, 0, 128);
}
/* Inter iDCT */
static void bfin_vp3_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
int i, j;
ff_bfin_vp3_idct (block);
for (i = 0; i < 8; i++)
for (j = 0; j < 8; j++)
dest[line_size*i + j] = cm[dest[line_size*i + j] + block[j*8 + i]];
memset(block, 0, 128);
}
av_cold void ff_vp3dsp_init_bfin(VP3DSPContext *c, int flags)
{
if (!(flags & CODEC_FLAG_BITEXACT)) {
c->idct_add = bfin_vp3_idct_add;
c->idct_put = bfin_vp3_idct_put;
}
}

@ -47,10 +47,6 @@
#include "x86/idct_xvid.h"
#include "dctref.h"
// BFIN
void ff_bfin_idct(int16_t *block);
void ff_bfin_fdct(int16_t *block);
// ALTIVEC
void ff_fdct_altivec(int16_t *block);
@ -92,10 +88,6 @@ static const struct algo fdct_tab[] = {
{ "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
#endif
#if ARCH_BFIN
{ "BFINfdct", ff_bfin_fdct, NO_PERM },
#endif
{ 0 }
};
@ -153,10 +145,6 @@ static const struct algo idct_tab[] = {
#endif
#endif
#if ARCH_BFIN
{ "BFINidct", ff_bfin_idct, NO_PERM },
#endif
#if ARCH_ARM
{ "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
{ "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },

@ -1727,8 +1727,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
ff_dsputil_init_alpha(c, avctx);
if (ARCH_ARM)
ff_dsputil_init_arm(c, avctx, high_bit_depth);
if (ARCH_BFIN)
ff_dsputil_init_bfin(c, avctx, high_bit_depth);
if (ARCH_PPC)
ff_dsputil_init_ppc(c, avctx, high_bit_depth);
if (ARCH_X86)

@ -252,8 +252,6 @@ void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type);
void ff_dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx);
void ff_dsputil_init_arm(DSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth);
void ff_dsputil_init_bfin(DSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth);
void ff_dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth);
void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx,

@ -361,8 +361,6 @@ av_cold void ff_hpeldsp_init(HpelDSPContext *c, int flags)
ff_hpeldsp_init_alpha(c, flags);
if (ARCH_ARM)
ff_hpeldsp_init_arm(c, flags);
if (ARCH_BFIN)
ff_hpeldsp_init_bfin(c, flags);
if (ARCH_PPC)
ff_hpeldsp_init_ppc(c, flags);
if (ARCH_X86)

@ -97,7 +97,6 @@ void ff_hpeldsp_init(HpelDSPContext *c, int flags);
void ff_hpeldsp_init_aarch64(HpelDSPContext *c, int flags);
void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags);
void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags);
void ff_hpeldsp_init_bfin(HpelDSPContext *c, int flags);
void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags);
void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags);

@ -289,8 +289,6 @@ av_cold void ff_vp3dsp_init(VP3DSPContext *c, int flags)
if (ARCH_ARM)
ff_vp3dsp_init_arm(c, flags);
if (ARCH_BFIN)
ff_vp3dsp_init_bfin(c, flags);
if (ARCH_PPC)
ff_vp3dsp_init_ppc(c, flags);
if (ARCH_X86)

@ -47,7 +47,6 @@ typedef struct VP3DSPContext {
void ff_vp3dsp_init(VP3DSPContext *c, int flags);
void ff_vp3dsp_init_arm(VP3DSPContext *c, int flags);
void ff_vp3dsp_init_bfin(VP3DSPContext *c, int flags);
void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags);
void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags);

@ -1,54 +0,0 @@
/*
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_BFIN_ASM_H
#define AVUTIL_BFIN_ASM_H
#include "config.h"
#define mL3 .text
#if defined(__FDPIC__) && CONFIG_SRAM
#define mL1 .l1.text
#define SECTION_L1_DATA_A .section .l1.data.A,"aw",@progbits
#define SECTION_L1_DATA_B .section .l1.data.B,"aw",@progbits
#else
#define mL1 mL3
#define SECTION_L1_DATA_A
#define SECTION_L1_DATA_B .data
#endif
#define DEFUN(fname, where, interface) \
.section where; \
.global _ff_bfin_ ## fname; \
.type _ff_bfin_ ## fname, STT_FUNC; \
.align 8; \
_ff_bfin_ ## fname
#define DEFUN_END(fname) \
.size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname
#ifdef __FDPIC__
#define RELOC(reg, got, obj) reg = [got + obj@GOT17M4]
#else
#define RELOC(reg, got, obj) reg.L = obj; reg.H = obj
#endif
#endif /* AVUTIL_BFIN_ASM_H */

@ -1,34 +0,0 @@
/*
* Copyright (C) 2007 Marc Hoffman <mmh@pleasantst.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_BFIN_ATTRIBUTES_H
#define AVUTIL_BFIN_ATTRIBUTES_H
#include "config.h"
#if defined(__FDPIC__) && CONFIG_SRAM
#define attribute_l1_text __attribute__((l1_text))
#define attribute_l1_data_b __attribute__((l1_data_B))
#else
#define attribute_l1_text
#define attribute_l1_data_b
#endif
#endif /* AVUTIL_BFIN_ATTRIBUTES_H */

@ -1,3 +0,0 @@
OBJS += bfin/internal_bfin.o \
bfin/swscale_bfin.o \
bfin/yuv2rgb_bfin.o \

@ -1,599 +0,0 @@
/*
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
* April 20, 2007
*
* Blackfin video color space converter operations
* convert I420 YV12 to RGB in various formats
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts.
The following calculation is used for the conversion:
r = clipz((y - oy) * cy + crv * (v - 128))
g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128))
b = clipz((y - oy) * cy + cbu * (u - 128))
y, u, v are prescaled by a factor of 4 i.e. left-shifted to gain precision.
New factorization to eliminate the truncation error which was
occurring due to the byteop3p.
1) Use the bytop16m to subtract quad bytes we use this in U8 this
then so the offsets need to be renormalized to 8bits.
2) Scale operands up by a factor of 4 not 8 because Blackfin
multiplies include a shift.
3) Compute into the accumulators cy * yx0, cy * yx1.
4) Compute each of the linear equations:
r = clipz((y - oy) * cy + crv * (v - 128))
g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128))
b = clipz((y - oy) * cy + cbu * (u - 128))
Reuse of the accumulators requires that we actually multiply
twice once with addition and the second time with a subtraction.
Because of this we need to compute the equations in the order R B
then G saving the writes for B in the case of 24/32 bit color
formats.
API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
int dW, uint32_t *coeffs);
A B
--- ---
i2 = cb i3 = cr
i1 = coeff i0 = y
Where coeffs have the following layout in memory.
uint32_t oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv;
coeffs is a pointer to oy.
The {rgb} masks are only utilized by the 565 packing algorithm. Note the data
replication is used to simplify the internal algorithms for the dual Mac
architecture of BlackFin.
All routines are exported with _ff_bfin_ as a symbol prefix.
Rough performance gain compared against -O3:
2779809/1484290 187.28%
which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
c/pel for the optimized implementations. Not sure why there is such a
huge variation on the reference codes on Blackfin I guess it must have
to do with the memory system.
*/
#include "libavutil/bfin/asm.h"
#define MEM mL1
.text
#define COEFF_LEN 11*4
#define COEFF_REL_CY_OFF 4*4
#define ARG_OUT 20
#define ARG_W 24
#define ARG_COEFF 28
DEFUN(yuv2rgb565_line,MEM,
(uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
link 0;
[--sp] = (r7:4);
p1 = [fp+ARG_OUT];
r3 = [fp+ARG_W];
i0 = r0;
i2 = r1;
i3 = r2;
r0 = [fp+ARG_COEFF];
i1 = r0;
b1 = i1;
l1 = COEFF_LEN;
m0 = COEFF_REL_CY_OFF;
p0 = r3;
r0 = [i0++]; // 2Y
r1.l = w[i2++]; // 2u
r1.h = w[i3++]; // 2v
p0 = p0>>2;
lsetup (.L0565, .L1565) lc0 = p0;
/*
uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv
r0 -- used to load 4ys
r1 -- used to load 2us,2vs
r4 -- y3,y2
r5 -- y1,y0
r6 -- u1,u0
r7 -- v1,v0
*/
r2=[i1++]; // oy
.L0565:
/*
rrrrrrrr gggggggg bbbbbbbb
5432109876543210
bbbbb >>3
gggggggg <<3
rrrrrrrr <<8
rrrrrggggggbbbbb
*/
(r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
(r7,r6) = byteop16m (r1:0, r3:2) (r);
r5 = r5 << 2 (v); // y1,y0
r4 = r4 << 2 (v); // y3,y2
r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
/* Y' = y*cy */
a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
/* R = Y+ crv*(Cr-128) */
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
r2 = r2 >> 3 (v);
r3 = r2 & r5;
/* B = Y+ cbu*(Cb-128) */
r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
r2 = r2 << 8 (v);
r2 = r2 & r5;
r3 = r3 | r2;
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask
r2 = r2 << 3 (v);
r2 = r2 & r5;
r3 = r3 | r2;
[p1++]=r3 || r1=[i1++]; // cy
/* Y' = y*cy */
a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
/* R = Y+ crv*(Cr-128) */
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
r2 = r2 >> 3 (v);
r3 = r2 & r5;
/* B = Y+ cbu*(Cb-128) */
r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
r2 = r2 << 8 (v);
r2 = r2 & r5;
r3 = r3 | r2;
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
r2 = byteop3p(r3:2, r1:0)(LO) || r0 = [i0++]; // 2Y
r2 = r2 << 3 (v) || r1.l = w[i2++]; // 2u
r2 = r2 & r5;
r3 = r3 | r2;
[p1++]=r3 || r1.h = w[i3++]; // 2v
.L1565: r2=[i1++]; // oy
l1 = 0;
(r7:4) = [sp++];
unlink;
rts;
DEFUN_END(yuv2rgb565_line)
DEFUN(yuv2rgb555_line,MEM,
(uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
link 0;
[--sp] = (r7:4);
p1 = [fp+ARG_OUT];
r3 = [fp+ARG_W];
i0 = r0;
i2 = r1;
i3 = r2;
r0 = [fp+ARG_COEFF];
i1 = r0;
b1 = i1;
l1 = COEFF_LEN;
m0 = COEFF_REL_CY_OFF;
p0 = r3;
r0 = [i0++]; // 2Y
r1.l = w[i2++]; // 2u
r1.h = w[i3++]; // 2v
p0 = p0>>2;
lsetup (.L0555, .L1555) lc0 = p0;
/*
uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv
r0 -- used to load 4ys
r1 -- used to load 2us,2vs
r4 -- y3,y2
r5 -- y1,y0
r6 -- u1,u0
r7 -- v1,v0
*/
r2=[i1++]; // oy
.L0555:
/*
rrrrrrrr gggggggg bbbbbbbb
5432109876543210
bbbbb >>3
gggggggg <<2
rrrrrrrr <<7
xrrrrrgggggbbbbb
*/
(r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
(r7,r6) = byteop16m (r1:0, r3:2) (r);
r5 = r5 << 2 (v); // y1,y0
r4 = r4 << 2 (v); // y3,y2
r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
/* Y' = y*cy */
a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
/* R = Y+ crv*(Cr-128) */
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
r2 = r2 >> 3 (v);
r3 = r2 & r5;
/* B = Y+ cbu*(Cb-128) */
r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
r2 = r2 << 7 (v);
r2 = r2 & r5;
r3 = r3 | r2;
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask
r2 = r2 << 2 (v);
r2 = r2 & r5;
r3 = r3 | r2;
[p1++]=r3 || r1=[i1++]; // cy
/* Y' = y*cy */
a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
/* R = Y+ crv*(Cr-128) */
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
r2 = r2 >> 3 (v);
r3 = r2 & r5;
/* B = Y+ cbu*(Cb-128) */
r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
r2 = r2 << 7 (v);
r2 = r2 & r5;
r3 = r3 | r2;
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
r2 = byteop3p(r3:2, r1:0)(LO) || r0=[i0++]; // 4Y
r2 = r2 << 2 (v) || r1.l=w[i2++]; // 2u
r2 = r2 & r5;
r3 = r3 | r2;
[p1++]=r3 || r1.h=w[i3++]; // 2v
.L1555: r2=[i1++]; // oy
l1 = 0;
(r7:4) = [sp++];
unlink;
rts;
DEFUN_END(yuv2rgb555_line)
DEFUN(yuv2rgb24_line,MEM,
(uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
link 0;
[--sp] = (r7:4);
p1 = [fp+ARG_OUT];
r3 = [fp+ARG_W];
p2 = p1;
p2 += 3;
i0 = r0;
i2 = r1;
i3 = r2;
r0 = [fp+ARG_COEFF]; // coeff buffer
i1 = r0;
b1 = i1;
l1 = COEFF_LEN;
m0 = COEFF_REL_CY_OFF;
p0 = r3;
r0 = [i0++]; // 2Y
r1.l = w[i2++]; // 2u
r1.h = w[i3++]; // 2v
p0 = p0>>2;
lsetup (.L0888, .L1888) lc0 = p0;
/*
uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv
r0 -- used to load 4ys
r1 -- used to load 2us,2vs
r4 -- y3,y2
r5 -- y1,y0
r6 -- u1,u0
r7 -- v1,v0
*/
r2=[i1++]; // oy
.L0888:
(r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
(r7,r6) = byteop16m (r1:0, r3:2) (r);
r5 = r5 << 2 (v); // y1,y0
r4 = r4 << 2 (v); // y3,y2
r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
/* Y' = y*cy */
a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
/* R = Y+ crv*(Cr-128) */
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
r2=r2>>16 || B[p1++]=r2;
B[p2++]=r2;
/* B = Y+ cbu*(Cb-128) */
r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask, oy,cy,zero
r2=r2>>16 || B[p1++]=r2;
B[p2++]=r2;
r3=r3>>16 || B[p1++]=r3;
B[p2++]=r3 || r1=[i1++]; // cy
p1+=3;
p2+=3;
/* Y' = y*cy */
a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
/* R = Y+ crv*(Cr-128) */
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
r2=r2>>16 || B[p1++]=r2;
B[p2++]=r2;
/* B = Y+ cbu*(Cb-128) */
r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++]; // gmask
r2=r2>>16 || B[p1++]=r2 || r0 = [i0++]; // 4y
B[p2++]=r2 || r1.l = w[i2++]; // 2u
r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
B[p2++]=r3 || r2=[i1++]; // oy
p1+=3;
.L1888: p2+=3;
l1 = 0;
(r7:4) = [sp++];
unlink;
rts;
DEFUN_END(yuv2rgb24_line)
#define ARG_vdst 20
#define ARG_width 24
#define ARG_height 28
#define ARG_lumStride 32
#define ARG_chromStride 36
#define ARG_srcStride 40
DEFUN(uyvytoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride)):
link 0;
[--sp] = (r7:4,p5:4);
p0 = r1; // Y top even
i2 = r2; // *u
r2 = [fp + ARG_vdst];
i3 = r2; // *v
r1 = [fp + ARG_srcStride];
r2 = r0 + r1;
i0 = r0; // uyvy_T even
i1 = r2; // uyvy_B odd
p2 = [fp + ARG_lumStride];
p1 = p0 + p2; // Y bot odd
p5 = [fp + ARG_width];
p4 = [fp + ARG_height];
r0 = p5;
p4 = p4 >> 1;
p5 = p5 >> 2;
r2 = r0 << 1;
r1 = r1 << 1;
r1 = r1 - r2; // srcStride + (srcStride - 2*width)
r1 += -8; // i0,i1 is pre read need to correct
m0 = r1;
r2 = [fp + ARG_chromStride];
r0 = r0 >> 1;
r2 = r2 - r0;
m1 = r2;
/* I0,I1 - src input line pointers
* p0,p1 - luma output line pointers
* I2 - dstU
* I3 - dstV
*/
lsetup (0f, 1f) lc1 = p4; // H/2
0: r0 = [i0++] || r2 = [i1++];
r1 = [i0++] || r3 = [i1++];
r4 = byteop1p(r1:0, r3:2);
r5 = byteop1p(r1:0, r3:2) (r);
lsetup (2f, 3f) lc0 = p5; // W/4
2: r0 = r0 >> 8(v);
r1 = r1 >> 8(v);
r2 = r2 >> 8(v);
r3 = r3 >> 8(v);
r0 = bytepack(r0, r1);
r2 = bytepack(r2, r3) || [p0++] = r0; // yyyy
r6 = pack(r5.l, r4.l) || [p1++] = r2; // yyyy
r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++];
r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++];
r4 = byteop1p(r1:0, r3:2) || w[i2++] = r6.l; // uu
3: r5 = byteop1p(r1:0, r3:2) (r) || w[i3++] = r6.h; // vv
i0 += m0;
i1 += m0;
i2 += m1;
i3 += m1;
p0 = p0 + p2;
1: p1 = p1 + p2;
(r7:4,p5:4) = [sp++];
unlink;
rts;
DEFUN_END(uyvytoyv12)
DEFUN(yuyvtoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride)):
link 0;
[--sp] = (r7:4,p5:4);
p0 = r1; // Y top even
i2 = r2; // *u
r2 = [fp + ARG_vdst];
i3 = r2; // *v
r1 = [fp + ARG_srcStride];
r2 = r0 + r1;
i0 = r0; // uyvy_T even
i1 = r2; // uyvy_B odd
p2 = [fp + ARG_lumStride];
p1 = p0 + p2; // Y bot odd
p5 = [fp + ARG_width];
p4 = [fp + ARG_height];
r0 = p5;
p4 = p4 >> 1;
p5 = p5 >> 2;
r2 = r0 << 1;
r1 = r1 << 1;
r1 = r1 - r2; // srcStride + (srcStride - 2*width)
r1 += -8; // i0,i1 is pre read need to correct
m0 = r1;
r2 = [fp + ARG_chromStride];
r0 = r0 >> 1;
r2 = r2 - r0;
m1 = r2;
/* I0,I1 - src input line pointers
* p0,p1 - luma output line pointers
* I2 - dstU
* I3 - dstV
*/
lsetup (0f, 1f) lc1 = p4; // H/2
0: r0 = [i0++] || r2 = [i1++];
r1 = [i0++] || r3 = [i1++];
r4 = bytepack(r0, r1);
r5 = bytepack(r2, r3);
lsetup (2f, 3f) lc0 = p5; // W/4
2: r0 = r0 >> 8(v) || [p0++] = r4; // yyyy-even
r1 = r1 >> 8(v) || [p1++] = r5; // yyyy-odd
r2 = r2 >> 8(v);
r3 = r3 >> 8(v);
r4 = byteop1p(r1:0, r3:2);
r5 = byteop1p(r1:0, r3:2) (r);
r6 = pack(r5.l, r4.l);
r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++];
r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++];
r4 = bytepack(r0, r1) || w[i2++] = r6.l; // uu
3: r5 = bytepack(r2, r3) || w[i3++] = r6.h; // vv
i0 += m0;
i1 += m0;
i2 += m1;
i3 += m1;
p0 = p0 + p2;
1: p1 = p1 + p2;
(r7:4,p5:4) = [sp++];
unlink;
rts;
DEFUN_END(yuyvtoyv12)

@ -1,84 +0,0 @@
/*
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
*
* Blackfin software video scaler operations
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/bfin/attributes.h"
#include "libswscale/swscale_internal.h"
int ff_bfin_uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
uint8_t *vdst, int width, int height,
int lumStride, int chromStride,
int srcStride) attribute_l1_text;
int ff_bfin_yuyvtoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
uint8_t *vdst, int width, int height,
int lumStride, int chromStride,
int srcStride) attribute_l1_text;
static int uyvytoyv12_unscaled(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
uint8_t *dsty = dst[0] + dstStride[0] * srcSliceY;
uint8_t *dstu = dst[1] + dstStride[1] * srcSliceY / 2;
uint8_t *dstv = dst[2] + dstStride[2] * srcSliceY / 2;
const uint8_t *ip = src[0] + srcStride[0] * srcSliceY;
int w = dstStride[0];
ff_bfin_uyvytoyv12(ip, dsty, dstu, dstv, w, srcSliceH,
dstStride[0], dstStride[1], srcStride[0]);
return srcSliceH;
}
static int yuyvtoyv12_unscaled(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
uint8_t *dsty = dst[0] + dstStride[0] * srcSliceY;
uint8_t *dstu = dst[1] + dstStride[1] * srcSliceY / 2;
uint8_t *dstv = dst[2] + dstStride[2] * srcSliceY / 2;
const uint8_t *ip = src[0] + srcStride[0] * srcSliceY;
int w = dstStride[0];
ff_bfin_yuyvtoyv12(ip, dsty, dstu, dstv, w, srcSliceH,
dstStride[0], dstStride[1], srcStride[0]);
return srcSliceH;
}
av_cold void ff_get_unscaled_swscale_bfin(SwsContext *c)
{
if (c->dstFormat == AV_PIX_FMT_YUV420P && c->srcFormat == AV_PIX_FMT_UYVY422) {
av_log(NULL, AV_LOG_VERBOSE,
"selecting Blackfin optimized uyvytoyv12_unscaled\n");
c->swscale = uyvytoyv12_unscaled;
}
if (c->dstFormat == AV_PIX_FMT_YUV420P && c->srcFormat == AV_PIX_FMT_YUYV422) {
av_log(NULL, AV_LOG_VERBOSE,
"selecting Blackfin optimized yuyvtoyv12_unscaled\n");
c->swscale = yuyvtoyv12_unscaled;
}
}

@ -1,198 +0,0 @@
/*
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
*
* Blackfin video color space converter operations
* convert I420 YV12 to RGB in various formats
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/pixdesc.h"
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/bfin/attributes.h"
#include "libswscale/swscale_internal.h"
void ff_bfin_yuv2rgb555_line(const uint8_t *Y, const uint8_t *U,
const uint8_t *V, uint8_t *out,
int w, uint32_t *coeffs) attribute_l1_text;
void ff_bfin_yuv2rgb565_line(const uint8_t *Y, const uint8_t *U,
const uint8_t *V, uint8_t *out,
int w, uint32_t *coeffs) attribute_l1_text;
void ff_bfin_yuv2rgb24_line(const uint8_t *Y, const uint8_t *U,
const uint8_t *V, uint8_t *out,
int w, uint32_t *coeffs) attribute_l1_text;
typedef void (*ltransform)(const uint8_t *Y, const uint8_t *U, const uint8_t *V,
uint8_t *out, int w, uint32_t *coeffs);
static void bfin_prepare_coefficients(SwsContext *c, int rgb, int masks)
{
int oy;
oy = c->yOffset & 0xffff;
oy = oy >> 3; // keep everything U8.0 for offset calculation
c->oc = 128 * 0x01010101U;
c->oy = oy * 0x01010101U;
/* copy 64bit vector coeffs down to 32bit vector coeffs */
c->cy = c->yCoeff;
c->zero = 0;
if (rgb) {
c->crv = c->vrCoeff;
c->cbu = c->ubCoeff;
c->cgu = c->ugCoeff;
c->cgv = c->vgCoeff;
} else {
c->crv = c->ubCoeff;
c->cbu = c->vrCoeff;
c->cgu = c->vgCoeff;
c->cgv = c->ugCoeff;
}
if (masks == 555) {
c->rmask = 0x001f * 0x00010001U;
c->gmask = 0x03e0 * 0x00010001U;
c->bmask = 0x7c00 * 0x00010001U;
} else if (masks == 565) {
c->rmask = 0x001f * 0x00010001U;
c->gmask = 0x07e0 * 0x00010001U;
c->bmask = 0xf800 * 0x00010001U;
}
}
static int core_yuv420_rgb(SwsContext *c, const uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH, uint8_t **oplanes,
int *outstrides, ltransform lcscf,
int rgb, int masks)
{
const uint8_t *py, *pu, *pv;
uint8_t *op;
int w = instrides[0];
int h2 = srcSliceH >> 1;
int i;
bfin_prepare_coefficients(c, rgb, masks);
py = in[0];
pu = in[1 + (1 ^ rgb)];
pv = in[1 + (0 ^ rgb)];
op = oplanes[0] + srcSliceY * outstrides[0];
for (i = 0; i < h2; i++) {
lcscf(py, pu, pv, op, w, &c->oy);
py += instrides[0];
op += outstrides[0];
lcscf(py, pu, pv, op, w, &c->oy);
py += instrides[0];
pu += instrides[1];
pv += instrides[2];
op += outstrides[0];
}
return srcSliceH;
}
static int bfin_yuv420_rgb555(SwsContext *c, const uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH,
uint8_t **oplanes, int *outstrides)
{
return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
outstrides, ff_bfin_yuv2rgb555_line, 1, 555);
}
static int bfin_yuv420_bgr555(SwsContext *c, const uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH,
uint8_t **oplanes, int *outstrides)
{
return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
outstrides, ff_bfin_yuv2rgb555_line, 0, 555);
}
static int bfin_yuv420_rgb24(SwsContext *c, const uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH,
uint8_t **oplanes, int *outstrides)
{
return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
outstrides, ff_bfin_yuv2rgb24_line, 1, 888);
}
static int bfin_yuv420_bgr24(SwsContext *c, const uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH,
uint8_t **oplanes, int *outstrides)
{
return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
outstrides, ff_bfin_yuv2rgb24_line, 0, 888);
}
static int bfin_yuv420_rgb565(SwsContext *c, const uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH,
uint8_t **oplanes, int *outstrides)
{
return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
outstrides, ff_bfin_yuv2rgb565_line, 1, 565);
}
static int bfin_yuv420_bgr565(SwsContext *c, const uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH,
uint8_t **oplanes, int *outstrides)
{
return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
outstrides, ff_bfin_yuv2rgb565_line, 0, 565);
}
av_cold SwsFunc ff_yuv2rgb_init_bfin(SwsContext *c)
{
SwsFunc f;
switch (c->dstFormat) {
case AV_PIX_FMT_RGB555:
f = bfin_yuv420_rgb555;
break;
case AV_PIX_FMT_BGR555:
f = bfin_yuv420_bgr555;
break;
case AV_PIX_FMT_RGB565:
f = bfin_yuv420_rgb565;
break;
case AV_PIX_FMT_BGR565:
f = bfin_yuv420_bgr565;
break;
case AV_PIX_FMT_RGB24:
f = bfin_yuv420_rgb24;
break;
case AV_PIX_FMT_BGR24:
f = bfin_yuv420_bgr24;
break;
default:
return 0;
}
av_log(c, AV_LOG_INFO, "BlackFin accelerated color space converter %s\n",
av_get_pix_fmt_name(c->dstFormat));
return f;
}

@ -92,7 +92,9 @@ const char *swscale_license(void);
#define SWS_CPU_CAPS_MMX2 0x20000000
#define SWS_CPU_CAPS_3DNOW 0x40000000
#define SWS_CPU_CAPS_ALTIVEC 0x10000000
#if FF_API_ARCH_BFIN
#define SWS_CPU_CAPS_BFIN 0x01000000
#endif
#define SWS_CPU_CAPS_SSE2 0x02000000
#endif

@ -473,20 +473,6 @@ typedef struct SwsContext {
vector signed short *vYCoeffsBank, *vCCoeffsBank;
#endif
#if ARCH_BFIN
DECLARE_ALIGNED(4, uint32_t, oy);
DECLARE_ALIGNED(4, uint32_t, oc);
DECLARE_ALIGNED(4, uint32_t, zero);
DECLARE_ALIGNED(4, uint32_t, cy);
DECLARE_ALIGNED(4, uint32_t, crv);
DECLARE_ALIGNED(4, uint32_t, rmask);
DECLARE_ALIGNED(4, uint32_t, cbu);
DECLARE_ALIGNED(4, uint32_t, bmask);
DECLARE_ALIGNED(4, uint32_t, cgu);
DECLARE_ALIGNED(4, uint32_t, cgv);
DECLARE_ALIGNED(4, uint32_t, gmask);
#endif
int use_mmx_vfilter;
/* pre defined color-spaces gamma */
@ -622,7 +608,6 @@ av_cold void ff_sws_init_range_convert(SwsContext *c);
SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
SwsFunc ff_yuv2rgb_init_bfin(SwsContext *c);
#if FF_API_SWS_FORMAT_NAME
/**
@ -851,7 +836,6 @@ extern const AVClass sws_context_class;
* source and destination formats, bit depths, flags, etc.
*/
void ff_get_unscaled_swscale(SwsContext *c);
void ff_get_unscaled_swscale_bfin(SwsContext *c);
void ff_get_unscaled_swscale_ppc(SwsContext *c);
void ff_get_unscaled_swscale_arm(SwsContext *c);

@ -1749,8 +1749,6 @@ void ff_get_unscaled_swscale(SwsContext *c)
c->swscale = planarCopyWrapper;
}
if (ARCH_BFIN)
ff_get_unscaled_swscale_bfin(c);
if (ARCH_PPC)
ff_get_unscaled_swscale_ppc(c);
// if (ARCH_ARM)

@ -55,5 +55,8 @@
#ifndef FF_API_SWS_FORMAT_NAME
#define FF_API_SWS_FORMAT_NAME (LIBSWSCALE_VERSION_MAJOR < 3)
#endif
#ifndef FF_API_ARCH_BFIN
#define FF_API_ARCH_BFIN (LIBSWSCALE_VERSION_MAJOR < 3)
#endif
#endif /* SWSCALE_VERSION_H */

@ -601,8 +601,6 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
{
SwsFunc t = NULL;
if (ARCH_BFIN)
t = ff_yuv2rgb_init_bfin(c);
if (ARCH_PPC)
t = ff_yuv2rgb_init_ppc(c);
if (ARCH_X86)

Loading…
Cancel
Save