spelling/grammar/wording overhaul

Originally committed as revision 27190 to svn://svn.mplayerhq.hu/mplayer/trunk/libswscale
pull/126/head
Diego Biurrun 17 years ago
parent 4bdc44c7fe
commit 8a3227968c
  1. 38
      libswscale/internal_bfin.S
  2. 32
      libswscale/rgb2rgb.c
  3. 44
      libswscale/rgb2rgb.h
  4. 58
      libswscale/rgb2rgb_template.c
  5. 16
      libswscale/swscale_altivec_template.c
  6. 2
      libswscale/swscale_bfin.c
  7. 6
      libswscale/swscale_internal.h
  8. 41
      libswscale/swscale_template.c
  9. 8
      libswscale/yuv2rgb.c
  10. 67
      libswscale/yuv2rgb_altivec.c
  11. 7
      libswscale/yuv2rgb_bfin.c
  12. 3
      libswscale/yuv2rgb_mlib.c
  13. 32
      libswscale/yuv2rgb_template.c

@ -2,8 +2,8 @@
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
* April 20, 2007
*
* Blackfin Video Color Space Converters Operations
* convert I420 YV12 to RGB in various formats,
* Blackfin video color space converter operations
* convert I420 YV12 to RGB in various formats
*
* This file is part of FFmpeg.
*
@ -24,8 +24,8 @@
/*
YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts
YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits, packed into shorts.
The following calculation is used for the conversion:
@ -34,36 +34,36 @@ The following calculation is used for the conversion:
g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128))
b = clipz((y-oy)*cy + cbu*(u-128))
y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision.
y,u,v are prescaled by a factor of 4 i.e. left-shifted to gain precision.
New factorization to eliminate the truncation error which was
occuring due to the byteop3p.
occurring due to the byteop3p.
1) use the bytop16m to subtract quad bytes we use this in U8 this
1) Use the byteop16m to subtract quad bytes we use this in U8 this
then so the offsets need to be renormalized to 8bits.
2) scale operands up by a factor of 4 not 8 because Blackfin
2) Scale operands up by a factor of 4 not 8 because Blackfin
multiplies include a shift.
3) compute into the accumulators cy*yx0, cy*yx1
3) Compute into the accumulators cy*yx0, cy*yx1.
4) compute each of the linear equations
4) Compute each of the linear equations:
r = clipz((y - oy) * cy + crv * (v - 128))
g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128))
b = clipz((y - oy) * cy + cbu * (u - 128))
reuse of the accumulators requires that we actually multiply
twice once with addition and the second time with a subtaction.
Reuse of the accumulators requires that we actually multiply
twice once with addition and the second time with a subtraction.
because of this we need to compute the equations in the order R B
Because of this we need to compute the equations in the order R B
then G saving the writes for B in the case of 24/32 bit color
formats.
api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
int dW, uint32_t *coeffs);
A B
@ -77,13 +77,13 @@ uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv;
coeffs is a pointer to oy.
the {rgb} masks are only utilized by the 565 packing algorithm. Note the data
replication is used to simplify the internal algorithms for the dual mac architecture
of BlackFin.
The {rgb} masks are only utilized by the 565 packing algorithm. Note the data
replication is used to simplify the internal algorithms for the dual MAC
architecture of BlackFin.
All routines are exported with _ff_bfin_ as a symbol prefix
All routines are exported with _ff_bfin_ as a symbol prefix.
rough performance gain compared against -O3:
Rough performance gain compared against -O3:
2779809/1484290 187.28%

@ -1,10 +1,10 @@
/*
* rgb2rgb.c, Software RGB to RGB convertor
* pluralize by Software PAL8 to RGB convertor
* Software YUV to YUV convertor
* Software YUV to RGB convertor
* Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
* software RGB to RGB converter
* pluralize by software PAL8 to RGB converter
* software YUV to YUV converter
* software YUV to RGB converter
* Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
@ -22,8 +22,8 @@
* along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* the C code (not assembly, mmx, ...) of this file can be used
* under the LGPL license too
* The C code (not assembly, MMX, ...) of this file can be used
* under the LGPL license.
*/
#include <inttypes.h>
#include "config.h"
@ -33,7 +33,7 @@
#include "swscale.h"
#include "swscale_internal.h"
#define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit
#define FAST_BGR2YV12 // use 7-bit instead of 15-bit coefficients
void (*rgb24to32)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size);
@ -149,8 +149,8 @@ static uint64_t __attribute__((aligned(8))) dither8[2]={
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
//Note: we have C, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one
//Plain C versions
//Note: We have C, MMX, MMX2, 3DNOW versions, there is no 3DNOW + MMX2 one.
//plain C versions
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
@ -190,10 +190,10 @@ static uint64_t __attribute__((aligned(8))) dither8[2]={
#endif //ARCH_X86 || ARCH_X86_64
/*
rgb15->rgb16 Original by Strepto/Astral
RGB15->RGB16 original by Strepto/Astral
ported to gcc & bugfixed : A'rpi
MMX2, 3DNOW optimization by Nick Kurshev
32bit c version, and and&add trick by Michael Niedermayer
32-bit C version, and and&add trick by Michael Niedermayer
*/
void sws_rgb2rgb_init(int flags){
@ -266,7 +266,7 @@ void palette8torgb24(const uint8_t *src, uint8_t *dst, long num_pixels, const ui
{
long i;
/*
writes 1 byte o much and might cause alignment issues on some architectures?
Writes 1 byte too much and might cause alignment issues on some architectures?
for (i=0; i<num_pixels; i++)
((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[src[i]];
*/
@ -284,7 +284,7 @@ void palette8tobgr24(const uint8_t *src, uint8_t *dst, long num_pixels, const ui
{
long i;
/*
writes 1 byte o much and might cause alignment issues on some architectures?
Writes 1 byte too much and might cause alignment issues on some architectures?
for (i=0; i<num_pixels; i++)
((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[src[i]];
*/
@ -299,7 +299,7 @@ void palette8tobgr24(const uint8_t *src, uint8_t *dst, long num_pixels, const ui
}
/**
* Palette is assumed to contain bgr16, see rgb32to16 to convert the palette
* Palette is assumed to contain BGR16, see rgb32to16 to convert the palette.
*/
void palette8torgb16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette)
{

@ -1,8 +1,8 @@
/*
* rgb2rgb.h, Software RGB to RGB convertor
* pluralize by Software PAL8 to RGB convertor
* Software YUV to YUV convertor
* Software YUV to RGB convertor
* software RGB to RGB converter
* pluralize by software PAL8 to RGB converter
* software YUV to YUV converter
* software YUV to RGB converter
* Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
*
@ -28,7 +28,7 @@
#include <inttypes.h>
/* A full collection of rgb to rgb(bgr) convertors */
/* A full collection of RGB to RGB(BGR) converters */
extern void (*rgb24to32) (const uint8_t *src, uint8_t *dst, long src_size);
extern void (*rgb24to16) (const uint8_t *src, uint8_t *dst, long src_size);
extern void (*rgb24to15) (const uint8_t *src, uint8_t *dst, long src_size);
@ -71,53 +71,49 @@ extern void palette8torgb15(const uint8_t *src, uint8_t *dst, long num_pixels, c
extern void palette8tobgr15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
/**
*
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
* problem for anyone then tell me, and ill fix it)
* chrominance data is only taken from every secound line others are ignored FIXME write HQ version
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
* Chrominance data is only taken from every second line, others are ignored.
* FIXME: Write HQ version.
*/
//void uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
/**
*
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
* problem for anyone then tell me, and ill fix it)
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
extern void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
long width, long height,
long lumStride, long chromStride, long dstStride);
/**
*
* width should be a multiple of 16
* Width should be a multiple of 16.
*/
extern void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
long width, long height,
long lumStride, long chromStride, long dstStride);
/**
*
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
* problem for anyone then tell me, and ill fix it)
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
extern void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
long width, long height,
long lumStride, long chromStride, long srcStride);
/**
*
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
* problem for anyone then tell me, and ill fix it)
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
extern void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
long width, long height,
long lumStride, long chromStride, long dstStride);
/**
*
* height should be a multiple of 2 and width should be a multiple of 2 (if this is a
* problem for anyone then tell me, and ill fix it)
* chrominance data is only taken from every secound line others are ignored FIXME write HQ version
* Height should be a multiple of 2 and width should be a multiple of 2.
* (If this is a problem for anyone then tell me, and I will fix it.)
* Chrominance data is only taken from every second line, others are ignored.
* FIXME: Write HQ version.
*/
extern void (*rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
long width, long height,

@ -1,11 +1,11 @@
/*
* rgb2rgb.c, Software RGB to RGB convertor
* pluralize by Software PAL8 to RGB convertor
* Software YUV to YUV convertor
* Software YUV to RGB convertor
* Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
* lot of big-endian byteorder fixes by Alex Beregszaszi
* software RGB to RGB converter
* pluralize by software PAL8 to RGB converter
* software YUV to YUV converter
* software YUV to RGB converter
* Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
* lot of big-endian byte order fixes by Alex Beregszaszi
*
* This file is part of FFmpeg.
*
@ -23,7 +23,7 @@
* along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* The C code (not assembly, mmx, ...) of this file can be used
* The C code (not assembly, MMX, ...) of this file can be used
* under the LGPL license.
*/
@ -229,10 +229,10 @@ static inline void RENAME(rgb32to24)(const uint8_t *src, uint8_t *dst, long src_
}
/*
Original by Strepto/Astral
ported to gcc & bugfixed : A'rpi
original by Strepto/Astral
ported to gcc & bugfixed: A'rpi
MMX2, 3DNOW optimization by Nick Kurshev
32 bit C version, and and&add trick by Michael Niedermayer
32-bit C version, and and&add trick by Michael Niedermayer
*/
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
@ -926,9 +926,9 @@ static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long s
----------------
1 1 0 1 1 1 1 0
|=======| |===|
| Leftmost Bits Repeated to Fill Open Bits
| leftmost bits repeated to fill open bits
|
Original Bits
original bits
*/
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
{
@ -1006,7 +1006,7 @@ static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_
:"=m"(*d)
:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
:"memory");
/* Borrowed 32 to 24 */
/* borrowed 32 to 24 */
asm volatile(
"movq %%mm0, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t"
@ -1147,7 +1147,7 @@ static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_
:"=m"(*d)
:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
:"memory");
/* Borrowed 32 to 24 */
/* borrowed 32 to 24 */
asm volatile(
"movq %%mm0, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t"
@ -1479,7 +1479,7 @@ static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long s
asm volatile(SFENCE:::"memory");
asm volatile(EMMS:::"memory");
if (mmx_size==23) return; //finihsed, was multiple of 8
if (mmx_size==23) return; //finished, was multiple of 8
src+= src_size;
dst+= src_size;
@ -1638,8 +1638,8 @@ asm( EMMS" \n\t"
}
/**
* Height should be a multiple of 2 and width should be a multiple of 16 (if
* this is a problem for anyone then tell me, and I will fix it).
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
long width, long height,
@ -1720,7 +1720,7 @@ static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *u
(vc[0] << 8) + (yc[1] << 0);
#else
*idst++ = uc[0] + (yc[0] << 8) +
(vc[0] << 16) + (yc[1] << 24);
(vc[0] << 16) + (yc[1] << 24);
#endif
yc += 2;
uc++;
@ -1744,8 +1744,8 @@ asm( EMMS" \n\t"
}
/**
* Height should be a multiple of 2 and width should be a multiple of 16 (if
* this is a problem for anyone then tell me, and I will fix it).
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
long width, long height,
@ -1766,8 +1766,8 @@ static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usr
}
/**
* Height should be a multiple of 2 and width should be a multiple of 16 (if
* this is a problem for anyone then tell me, and I will fix it).
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
long width, long height,
@ -2002,9 +2002,9 @@ asm volatile( EMMS" \n\t"
}
/**
* Height should be a multiple of 2 and width should be a multiple of 16 (if
* this is a problem for anyone then tell me, and I will fix it).
* Chrominance data is only taken from every secound line, others are ignored.
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
* Chrominance data is only taken from every second line, others are ignored.
* FIXME: Write HQ version.
*/
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
@ -2128,9 +2128,9 @@ asm volatile( EMMS" \n\t"
}
/**
* Height should be a multiple of 2 and width should be a multiple of 2 (if
* this is a problem for anyone then tell me, and I will fix it).
* Chrominance data is only taken from every secound line,
* Height should be a multiple of 2 and width should be a multiple of 2.
* (If this is a problem for anyone then tell me, and I will fix it.)
* Chrominance data is only taken from every second line,
* others are ignored in the C version.
* FIXME: Write HQ version.
*/

@ -245,12 +245,12 @@ static inline void hScale_altivec_real(int16_t *dst, int dstW, uint8_t *src, int
src_v = vec_mergeh(src_v, (vector signed short)vzero);
filter_v = vec_ld(i << 3, filter);
// the 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2)
// The 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2).
// the neat trick : we only care for half the elements,
// The neat trick: We only care for half the elements,
// high or low depending on (i<<3)%16 (it's 0 or 8 here),
// and we're going to use vec_mule, so we chose
// carefully how to "unpack" the elements into the even slots
// and we're going to use vec_mule, so we choose
// carefully how to "unpack" the elements into the even slots.
if ((i << 3) % 16)
filter_v = vec_mergel(filter_v, (vector signed short)vzero);
else
@ -405,12 +405,12 @@ static inline int yv12toyuy2_unscaled_altivec(SwsContext *c, uint8_t* src[], int
return srcSliceH;
}
/* this code assume:
/* This code assumes:
1) dst is 16 bytes-aligned
2) dstStride is a multiple of 16
3) width is a multiple of 16
4) lum&chrom stride are multiple of 8
4) lum & chrom stride are multiples of 8
*/
for (y=0; y<height; y++) {
@ -482,12 +482,12 @@ static inline int yv12touyvy_unscaled_altivec(SwsContext *c, uint8_t* src[], int
return srcSliceH;
}
/* this code assume:
/* This code assumes:
1) dst is 16 bytes-aligned
2) dstStride is a multiple of 16
3) width is a multiple of 16
4) lum&chrom stride are multiple of 8
4) lum & chrom stride are multiples of 8
*/
for (y=0; y<height; y++) {

@ -1,7 +1,7 @@
/*
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
*
* Blackfin Software Video SCALER Operations
* Blackfin software video scaler operations
*
* This file is part of FFmpeg.
*

@ -37,7 +37,7 @@
typedef int (*SwsFunc)(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[]);
/* this struct should be aligned on at least 32-byte boundary */
/* This struct should be aligned on at least a 32-byte boundary. */
typedef struct SwsContext{
/**
* info on struct for av_log
@ -73,7 +73,7 @@ typedef struct SwsContext{
int16_t *vChrFilter;
int16_t *vChrFilterPos;
uint8_t formatConvBuffer[VOF]; //FIXME dynamic alloc, but we have to change a lot of code for this to be useful
uint8_t formatConvBuffer[VOF]; //FIXME dynamic allocation, but we have to change a lot of code for this to be useful
int hLumFilterSize;
int hChrFilterSize;
@ -122,7 +122,7 @@ typedef struct SwsContext{
#define V_OFFSET "10*8"
#define LUM_MMX_FILTER_OFFSET "11*8"
#define CHR_MMX_FILTER_OFFSET "11*8+4*4*256"
#define DSTW_OFFSET "11*8+4*4*256*2" //do not change, it is hardcoded in the asm
#define DSTW_OFFSET "11*8+4*4*256*2" //do not change, it is hardcoded in the ASM
#define ESP_OFFSET "11*8+4*4*256*2+8"
#define VROUNDER_OFFSET "11*8+4*4*256*2+16"
#define U_TEMP "11*8+4*4*256*2+24"

@ -17,8 +17,8 @@
* along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* the C code (not assembly, mmx, ...) of this file can be used
* under the LGPL license too
* The C code (not assembly, MMX, ...) of this file can be used
* under the LGPL license.
*/
#undef REAL_MOVNTQ
@ -30,7 +30,7 @@
#undef SFENCE
#ifdef HAVE_3DNOW
/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
@ -1503,7 +1503,7 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *
const int yalpha1=0;
int i;
uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
const int yalpha= 4096; //FIXME ...
if (flags&SWS_FULL_CHR_H_INT)
@ -1700,7 +1700,7 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *
}
}
//FIXME yuy2* can read upto 7 samples to much
//FIXME yuy2* can read up to 7 samples too many
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
{
@ -2297,7 +2297,7 @@ static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1,
}
}
// Bilinear / Bicubic scaling
// bilinear / bicubic scaling
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
int16_t *filter, int16_t *filterPos, long filterSize)
{
@ -2544,7 +2544,7 @@ static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, i
}
#ifdef HAVE_MMX
// use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one)
// Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
if (!(flags&SWS_FAST_BILINEAR))
@ -2552,7 +2552,7 @@ static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, i
{
RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
}
else // Fast Bilinear upscale / crap downscale
else // fast bilinear upscale / crap downscale
{
#if defined(ARCH_X86)
#ifdef HAVE_MMX2
@ -2761,7 +2761,7 @@ inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1,
}
#ifdef HAVE_MMX
// use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one)
// Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
if (!(flags&SWS_FAST_BILINEAR))
@ -2770,7 +2770,7 @@ inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1,
RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
}
else // Fast Bilinear upscale / crap downscale
else // fast bilinear upscale / crap downscale
{
#if defined(ARCH_X86)
#ifdef HAVE_MMX2
@ -2890,8 +2890,8 @@ FUNNY_UV_CODE
"cmp %2, %%"REG_a" \n\t"
" jb 1b \n\t"
/* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
which is needed to support GCC-4.0 */
/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
which is needed to support GCC 4.0. */
#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
:: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#else
@ -2963,7 +2963,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
int lastDstY;
uint8_t *pal=NULL;
/* vars whch will change and which we need to storw back in the context */
/* vars which will change and which we need to store back in the context */
int dstY= c->dstY;
int lumBufIndex= c->lumBufIndex;
int chrBufIndex= c->chrBufIndex;
@ -3004,13 +3004,14 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
if (flags & SWS_PRINT_INFO && firstTime)
{
av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
" ->cannot do aligned memory acesses anymore\n");
" ->cannot do aligned memory accesses anymore\n");
firstTime=0;
}
}
/* Note the user might start scaling the picture in the middle so this will not get executed
this is not really intended but works currently, so ppl might do it */
/* Note the user might start scaling the picture in the middle so this
will not get executed. This is not really intended but works
currently, so people might do it. */
if (srcSliceY ==0){
lumBufIndex=0;
chrBufIndex=0;
@ -3182,7 +3183,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
{
const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
if (vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
{
int16_t *lumBuf = lumPixBuf[0];
int16_t *chrBuf= chrPixBuf[0];
@ -3200,13 +3201,13 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
{
ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
if (vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
{
int chrAlpha= vChrFilter[2*dstY+1];
RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
dest, dstW, chrAlpha, dstFormat, flags, dstY);
}
else if (vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
{
int lumAlpha= vLumFilter[2*dstY+1];
int chrAlpha= vChrFilter[2*dstY+1];
@ -3217,7 +3218,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
dest, dstW, lumAlpha, chrAlpha, dstY);
}
else //General RGB
else //general RGB
{
RENAME(yuv2packedX)(c,
vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,

@ -39,7 +39,7 @@
#include "swscale.h"
#include "swscale_internal.h"
#define DITHER1XBPP // only for mmx
#define DITHER1XBPP // only for MMX
const uint8_t __attribute__((aligned(8))) dither_2x2_4[2][8]={
{ 1, 3, 1, 3, 1, 3, 1, 3, },
@ -155,8 +155,8 @@ DECLARE_ASM_CONST(8, uint64_t, mmx_00ffw) = 0x00ff00ff00ff00ffULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_redmask) = 0xf8f8f8f8f8f8f8f8ULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_grnmask) = 0xfcfcfcfcfcfcfcfcULL;
// the volatile is required because gcc otherwise optimizes some writes away not knowing that these
// are read in the asm block
// The volatile is required because gcc otherwise optimizes some writes away
// not knowing that these are read in the ASM block.
static volatile uint64_t attribute_used __attribute__((aligned(8))) b5Dither;
static volatile uint64_t attribute_used __attribute__((aligned(8))) g5Dither;
static volatile uint64_t attribute_used __attribute__((aligned(8))) g6Dither;
@ -641,7 +641,7 @@ SwsFunc yuv2rgb_get_func_ptr (SwsContext *c)
}
#endif
av_log(c, AV_LOG_WARNING, "No accelerated colorspace conversion found\n");
av_log(c, AV_LOG_WARNING, "No accelerated colorspace conversion found.\n");
switch(c->dstFormat){
case PIX_FMT_BGR32:

@ -21,63 +21,68 @@
*/
/*
convert I420 YV12 to RGB in various formats,
it rejects images that are not in 420 formats
it rejects images that don't have widths of multiples of 16
it rejects images that don't have heights of multiples of 2
reject defers to C simulation codes.
Convert I420 YV12 to RGB in various formats,
it rejects images that are not in 420 formats,
it rejects images that don't have widths of multiples of 16,
it rejects images that don't have heights of multiples of 2.
Reject defers to C simulation code.
lots of optimizations to be done here
Lots of optimizations to be done here.
1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
so we currently use max min to clip
1. Need to fix saturation code. I just couldn't get it to fly with packs
and adds, so we currently use max/min to clip.
2. the inefficient use of chroma loading needs a bit of brushing up
2. The inefficient use of chroma loading needs a bit of brushing up.
3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
3. Analysis of pipeline stalls needs to be done. Use shark to identify
pipeline stalls.
MODIFIED to calculate coeffs from currently selected color space.
MODIFIED core to be a macro which you spec the output format.
ADDED UYVY conversion which is never called due to some thing in SWSCALE.
MODIFIED core to be a macro where you specify the output format.
ADDED UYVY conversion which is never called due to something in swscale.
CORRECTED algorithm selection to be strict on input formats.
ADDED runtime detection of altivec.
ADDED runtime detection of AltiVec.
ADDED altivec_yuv2packedX vertical scl + RGB converter
March 27,2004
PERFORMANCE ANALYSIS
The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test
The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
used as test.
The AltiVec version uses 10% of the processor or ~100Mips for D1 video
same sequence.
720*480*30 ~10MPS
720 * 480 * 30 ~10MPS
so we have roughly 10clocks per pixel this is too high something has to be wrong.
so we have roughly 10 clocks per pixel. This is too high, something has
to be wrong.
OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
need for vec_min.
OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
guaranteed to have the input video frame it was just decompressed so
it probably resides in L1 caches. However we are creating the
output video stream this needs to use the DSTST instruction to
optimize for the cache. We couple this with the fact that we are
not going to be visiting the input buffer again so we mark it Least
Recently Used. This shaves 25% of the processor cycles off.
OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
the input video frame, it was just decompressed so it probably resides in L1
caches. However, we are creating the output video stream. This needs to use the
DSTST instruction to optimize for the cache. We couple this with the fact that
we are not going to be visiting the input buffer again so we mark it Least
Recently Used. This shaves 25% of the processor cycles off.
Now MEMCPY is the largest mips consumer in the system, probably due
Now memcpy is the largest mips consumer in the system, probably due
to the inefficient X11 stuff.
GL libraries seem to be very slow on this machine 1.33 GHz PB running
Jaguar, this is not the case for my 1 GHz PB. I thought it might be
a versioning issues, however I have libGL.1.2.dylib for both
machines. ((We need to figure this out now))
a versioning issue, however I have libGL.1.2.dylib for both
machines. (We need to figure this out now.)
GL2 libraries work now with patch for RGB32
GL2 libraries work now with patch for RGB32.
NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
Integrated luma prescaling adjustment for saturation/contrast/brightness
adjustment.
*/
#include <stdio.h>

@ -1,9 +1,8 @@
/*
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
* April 20, 2007
*
* Blackfin Video Color Space Converters Operations
* convert I420 YV12 to RGB in various formats,
* Blackfin video color space converter operations
* convert I420 YV12 to RGB in various formats
*
* This file is part of FFmpeg.
*
@ -200,7 +199,7 @@ SwsFunc ff_bfin_yuv2rgb_get_func_ptr (SwsContext *c)
return 0;
}
av_log(c, AV_LOG_INFO, "BlackFin Accelerated Color Space Converter %s\n",
av_log(c, AV_LOG_INFO, "BlackFin accelerated color space converter %s\n",
sws_format_name (c->dstFormat));
return f;

@ -1,5 +1,6 @@
/*
* yuv2rgb_mlib.c, Software YUV to RGB converter using mediaLib
* software YUV to RGB converter using mediaLib
*
* Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.

@ -1,5 +1,5 @@
/*
* yuv2rgb_mmx.c, Software YUV to RGB converter with Intel MMX "technology"
* yuv2rgb_mmx.c, software YUV to RGB converter with Intel MMX "technology"
*
* Copyright (C) 2000, Silicon Integrated System Corp.
*
@ -31,7 +31,7 @@
#undef SFENCE
#ifdef HAVE_3DNOW
/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
@ -147,8 +147,8 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, uint8_t* src[], int srcStr
g6Dither= ff_dither4[y&1];
g5Dither= ff_dither8[y&1];
r5Dither= ff_dither8[(y+1)&1];
/* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
pixels in each iteration */
/* This MMX assembly code deals with a SINGLE scan line at a time,
* it converts 8 pixels in each iteration. */
asm volatile (
/* load data for start of next scan line */
"movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
@ -156,8 +156,8 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, uint8_t* src[], int srcStr
"movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
//".balign 16 \n\t"
"1: \n\t"
/* no speed diference on my p3@500 with prefetch,
* if it is faster for anyone with -benchmark then tell me
/* No speed difference on my p3@500 with prefetch,
* if it is faster for anyone with -benchmark then tell me.
PREFETCH" 64(%0) \n\t"
PREFETCH" 64(%1) \n\t"
PREFETCH" 64(%2) \n\t"
@ -180,7 +180,7 @@ YUV2RGB
"movq %%mm0, %%mm5;" /* Copy B7-B0 */
"movq %%mm2, %%mm7;" /* Copy G7-G0 */
/* convert rgb24 plane to rgb16 pack for pixel 0-3 */
/* convert RGB24 plane to RGB16 pack for pixel 0-3 */
"punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
"punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
@ -190,7 +190,7 @@ YUV2RGB
"movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */
/* convert rgb24 plane to rgb16 pack for pixel 0-3 */
/* convert RGB24 plane to RGB16 pack for pixel 4-7 */
"punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
"punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
@ -242,8 +242,8 @@ static inline int RENAME(yuv420_rgb15)(SwsContext *c, uint8_t* src[], int srcStr
g6Dither= ff_dither4[y&1];
g5Dither= ff_dither8[y&1];
r5Dither= ff_dither8[(y+1)&1];
/* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
pixels in each iteration */
/* This MMX assembly code deals with a SINGLE scan line at a time,
* it converts 8 pixels in each iteration. */
asm volatile (
/* load data for start of next scan line */
"movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
@ -271,7 +271,7 @@ YUV2RGB
"movq %%mm0, %%mm5;" /* Copy B7-B0 */
"movq %%mm2, %%mm7;" /* Copy G7-G0 */
/* convert rgb24 plane to rgb16 pack for pixel 0-3 */
/* convert RGB24 plane to RGB15 pack for pixel 0-3 */
"punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */
"punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
@ -281,7 +281,7 @@ YUV2RGB
"movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */
/* convert rgb24 plane to rgb16 pack for pixel 0-3 */
/* convert RGB24 plane to RGB15 pack for pixel 4-7 */
"punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 0_g7g6g5 g4g3_0_0 */
"punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
@ -326,8 +326,8 @@ static inline int RENAME(yuv420_rgb24)(SwsContext *c, uint8_t* src[], int srcStr
uint8_t *pv = src[2] + (y>>1)*srcStride[2];
long index= -h_size/2;
/* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
pixels in each iteration */
/* This MMX assembly code deals with a SINGLE scan line at a time,
* it converts 8 pixels in each iteration. */
asm volatile (
/* load data for start of next scan line */
"movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
@ -472,8 +472,8 @@ static inline int RENAME(yuv420_rgb32)(SwsContext *c, uint8_t* src[], int srcStr
uint8_t *pv = src[2] + (y>>1)*srcStride[2];
long index= -h_size/2;
/* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
pixels in each iteration */
/* This MMX assembly code deals with a SINGLE scan line at a time,
* it converts 8 pixels in each iteration. */
asm volatile (
/* load data for start of next scan line */
"movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */

Loading…
Cancel
Save