FFmpeg/libavcodec/ppc/gmc_altivec.c

/*
 * GMC (Global Motion Compensation)
 * AltiVec-enabled
 * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/dsputil.h"
#include "dsputil_ppc.h"
#include "util_altivec.h"
#include "types_altivec.h"
#include "dsputil_altivec.h"

/*
  AltiVec-enhanced gmc1. ATM this code assumes stride is a multiple of 8,
  to preserve proper dst alignment.
*/
#define GMC1_PERF_COND (h==8)
/*
 * One-vector-per-row GMC: for each of the h rows, blend the four neighbours
 * src[x], src[x+1], src[x+stride], src[x+stride+1] of 8 consecutive pixels
 * with the weights A, B, C, D derived from x16/y16, add 'rounder', shift the
 * sum right by 8 and write the 8 resulting bytes to dst.
 *
 * The weights always sum to 16*16 = 256, which is why the final shift is 8.
 *
 * dst is accessed with aligned 16-byte load/store and only one 8-byte half
 * is replaced, so dst must be 8-byte aligned (see the parameter comment).
 * src may have any alignment; each row is fetched as two aligned 16-byte
 * vectors that are then permuted.
 */
void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder)
{
POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND);
    // aligned scalar copy of the rounding term; loaded with vec_lde() below
    const DECLARE_ALIGNED(16, unsigned short, rounder_a) = rounder;
    // the four interpolation weights, laid out so one vec_ld() fetches them all
    const DECLARE_ALIGNED(16, unsigned short, ABCD)[8] =
        {
            (16-x16)*(16-y16), /* A: weight of src[x]            */
            (   x16)*(16-y16), /* B: weight of src[x+1]          */
            (16-x16)*(   y16), /* C: weight of src[x+stride]     */
            (   x16)*(   y16), /* D: weight of src[x+stride+1]   */
            0, 0, 0, 0         /* padding */
        };
    register const vector unsigned char zero_bytes = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short shift8 = (const vector unsigned short)vec_splat_u16(8);
    register vector unsigned char dst_old, dst_new, line_lo, line_hi;
    register vector unsigned char top_l, top_r, bot_l, bot_r;
    register vector unsigned short wA, wB, wC, wD, round_v, acc;
    uint8_t *next_row;
    unsigned long dst_off;
    unsigned long src_off = (unsigned long)src & 0x0000000F;
    int row;

POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND);

    // broadcast each weight, and the rounder, over all eight 16-bit lanes
    acc = vec_ld(0, (unsigned short*)ABCD);
    wA  = vec_splat(acc, 0);
    wB  = vec_splat(acc, 1);
    wC  = vec_splat(acc, 2);
    wD  = vec_splat(acc, 3);

    round_v = vec_splat((vec_u16)vec_lde(0, &rounder_a), 0);

    // Prime the pipeline with the first source row: the 9 bytes we need
    // (8 pixels plus the right neighbour of the last one) lie somewhere in
    // the 32 bytes covered by these two aligned loads. Inside the loop only
    // the row below is loaded, and it is recycled as the upper row of the
    // following iteration.
    line_lo = vec_ld(0, src);
    line_hi = vec_ld(16, src);
    top_l   = vec_perm(line_lo, line_hi, vec_lvsl(0, src));

    if (src_off == 0x0000000F) {
        // src+1 sits exactly at the start of the second vector,
        // so no permute is needed (or usable) for the right neighbours
        top_r = line_hi;
    } else {
        top_r = vec_perm(line_lo, line_hi, vec_lvsl(1, src));
    }
    // interleave with zero bytes: widens the first 8 pixels to 16-bit lanes
    top_l = vec_mergeh(zero_bytes, top_l);
    top_r = vec_mergeh(zero_bytes, top_r);

    for (row = 0; row < h; row++) {
        next_row = src + stride;
        dst_off  = (unsigned long)dst & 0x0000000F;
        src_off  = (unsigned long)next_row & 0x0000000F;

        // current contents of the 16-byte line holding dst; half of it is kept
        dst_old = vec_ld(0, dst);

        // fetch the row below, same scheme as for the first row above
        line_lo = vec_ld(0, next_row);
        line_hi = vec_ld(16, next_row);
        bot_l   = vec_perm(line_lo, line_hi, vec_lvsl(0, next_row));

        if (src_off == 0x0000000F) {
            // next_row+1 is aligned on the second vector
            bot_r = line_hi;
        } else {
            bot_r = vec_perm(line_lo, line_hi, vec_lvsl(1, next_row));
        }

        bot_l = vec_mergeh(zero_bytes, bot_l);
        bot_r = vec_mergeh(zero_bytes, bot_r);

        // rounder + A*top_l + B*top_r + C*bot_l + D*bot_r, eight pixels at once
        acc = vec_mladd((vector unsigned short)top_l, wA, round_v);
        acc = vec_mladd((vector unsigned short)top_r, wB, acc);
        acc = vec_mladd((vector unsigned short)bot_l, wC, acc);
        acc = vec_mladd((vector unsigned short)bot_r, wD, acc);
        acc = vec_sr(acc, shift8);

        // the lower row of this iteration is the upper row of the next one
        top_l = bot_l;
        top_r = bot_r;

        // narrow back to bytes: results land in the first 8 bytes of dst_new
        dst_new = vec_pack(acc, (vector unsigned short)zero_bytes);

        // Splice the 8 result bytes into whichever half of the aligned
        // 16-byte line dst points at, keeping the other half from dst_old.
        if (!dst_off) {
            dst_new = vec_perm(dst_old, dst_new, vcprm(s0,s1,2,3));
        } else {
            dst_new = vec_perm(dst_old, dst_new, vcprm(0,1,s0,s1));
        }

        vec_st(dst_new, 0, dst);

        dst += stride;
        src  = next_row;
    }

POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
}
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`/*`
dct_unquantize_h263_altivec by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1455 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`* GMC (Global Motion Compensation)`
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`* AltiVec-enabled`
			`* Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>`
			`*`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 18 years ago			`* This file is part of FFmpeg.`
			`*`
			`* FFmpeg is free software; you can redistribute it and/or`
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 18 years ago			`* version 2.1 of the License, or (at your option) any later version.`
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`*`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 18 years ago			`* FFmpeg is distributed in the hope that it will be useful,`
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 18 years ago			`* License along with FFmpeg; if not, write to the Free Software`
Update licensing information: The FSF changed postal address. Originally committed as revision 4842 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`*/`

Use full path for #includes from another directory. Originally committed as revision 13098 to svn://svn.ffmpeg.org/ffmpeg/trunk 17 years ago			`#include "libavcodec/dsputil.h"`
Sanitize altivec code so it can be built with runtime check properly Originally committed as revision 10640 to svn://svn.ffmpeg.org/ffmpeg/trunk 17 years ago			`#include "dsputil_ppc.h"`
			`#include "util_altivec.h"`
PPC: simplify loading some values into altivec registers Instead of filling a local array with the desired value and loading it, load a single element and vec_splat() it to fill the vector. Originally committed as revision 19691 to svn://svn.ffmpeg.org/ffmpeg/trunk 15 years ago			`#include "types_altivec.h"`
PPC: move prototypes to headers and make some functions static Originally committed as revision 22267 to svn://svn.ffmpeg.org/ffmpeg/trunk 15 years ago			`#include "dsputil_altivec.h"`
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago
			`/*`
			`altivec-enhanced gmc1. ATM this code assume stride is a multiple of 8,`
spelling Originally committed as revision 11122 to svn://svn.ffmpeg.org/ffmpeg/trunk 17 years ago			`to preserve proper dst alignment.`
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`*/`
PPC fixes & clean-up patch by (Romain Dolbeau <dolbeau at irisa dot fr>) Originally committed as revision 2008 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`#define GMC1_PERF_COND (h==8)`
* UINTX -> uintx_t INTX -> intx_t Originally committed as revision 1578 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`void gmc1_altivec(uint8_t dst / align 8 /, uint8_t src /* align1 */, int stride, int h, int x16, int y16, int rounder)`
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`{`
1) remove TBL support in PPC performance. It's much more useful to use the PMCs, and with Apple's CHUD it's fairly easy too. No reason to keep useless code around 2) make the PPC perf stuff a configure option 3) make put_pixels16_altivec a bit faster by unrolling the loop by 4 patch by (Romain Dolbeau <dolbeau at irisa dot fr>) Originally committed as revision 2022 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND);`
Remove DECLARE_ALIGNED_{8,16} macros These macros are redundant. All uses are replaced with the generic DECLARE_ALIGNED macro instead. Originally committed as revision 22233 to svn://svn.ffmpeg.org/ffmpeg/trunk 15 years ago			`const DECLARE_ALIGNED(16, unsigned short, rounder_a) = rounder;`
			`const DECLARE_ALIGNED(16, unsigned short, ABCD)[8] =`
cosmetics: Reformat PPC code in libavcodec according to style guidelines. This includes indentation changes, comment reformatting, consistent brace placement and some prettyprinting. Originally committed as revision 14316 to svn://svn.ffmpeg.org/ffmpeg/trunk 17 years ago			`{`
			`(16-x16)(16-y16), / A */`
			`( x16)(16-y16), / B */`
			`(16-x16)( y16), / C */`
			`( x16)( y16), / D */`
			`0, 0, 0, 0 /* padding */`
			`};`
Remove const vector macro indirection that is useless and obfuscating now that the Metrowerks workarounds are gone. Originally committed as revision 10633 to svn://svn.ffmpeg.org/ffmpeg/trunk 17 years ago			`register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);`
			`register const vector unsigned short vcsr8 = (const vector unsigned short)vec_splat_u16(8);`
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD;`
			`register vector unsigned short Av, Bv, Cv, Dv, rounderV, tempA, tempB, tempC, tempD;`
			`int i;`
			`unsigned long dst_odd = (unsigned long)dst & 0x0000000F;`
			`unsigned long src_really_odd = (unsigned long)src & 0x0000000F;`

AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>) Originally committed as revision 1464 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago
1) remove TBL support in PPC performance. It's much more useful to use the PMCs, and with Apple's CHUD it's fairly easy too. No reason to keep useless code around 2) make the PPC perf stuff a configure option 3) make put_pixels16_altivec a bit faster by unrolling the loop by 4 patch by (Romain Dolbeau <dolbeau at irisa dot fr>) Originally committed as revision 2022 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND);`
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>) Originally committed as revision 1464 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`tempA = vec_ld(0, (unsigned short*)ABCD);`
			`Av = vec_splat(tempA, 0);`
			`Bv = vec_splat(tempA, 1);`
			`Cv = vec_splat(tempA, 2);`
			`Dv = vec_splat(tempA, 3);`

PPC: simplify loading some values into altivec registers Instead of filling a local array with the desired value and loading it, load a single element and vec_splat() it to fill the vector. Originally committed as revision 19691 to svn://svn.ffmpeg.org/ffmpeg/trunk 15 years ago			`rounderV = vec_splat((vec_u16)vec_lde(0, &rounder_a), 0);`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`// we'll be able to pick-up our 9 char elements`
			`// at src from those 32 bytes`
			`// we load the first batch here, as inside the loop`
			`// we can re-use 'src+stride' from one iteration`
			`// as the 'src' of the next.`
			`src_0 = vec_ld(0, src);`
			`src_1 = vec_ld(16, src);`
			`srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src));`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago
cosmetics: Reformat PPC code in libavcodec according to style guidelines. This includes indentation changes, comment reformatting, consistent brace placement and some prettyprinting. Originally committed as revision 14316 to svn://svn.ffmpeg.org/ffmpeg/trunk 17 years ago			`if (src_really_odd != 0x0000000F) {`
			`// if src & 0xF == 0xF, then (src+1) is properly aligned`
			`// on the second vector.`
			`srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));`
			`} else {`
			`srcvB = src_1;`
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`}`
			`srcvA = vec_mergeh(vczero, srcvA);`
			`srcvB = vec_mergeh(vczero, srcvB);`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago
cosmetics: Reformat PPC code in libavcodec according to style guidelines. This includes indentation changes, comment reformatting, consistent brace placement and some prettyprinting. Originally committed as revision 14316 to svn://svn.ffmpeg.org/ffmpeg/trunk 17 years ago			`for(i=0; i<h; i++) {`
			`dst_odd = (unsigned long)dst & 0x0000000F;`
			`src_really_odd = (((unsigned long)src) + stride) & 0x0000000F;`

			`dstv = vec_ld(0, dst);`

			`// we we'll be able to pick-up our 9 char elements`
			`// at src + stride from those 32 bytes`
			`// then reuse the resulting 2 vectors srvcC and srcvD`
			`// as the next srcvA and srcvB`
			`src_0 = vec_ld(stride + 0, src);`
			`src_1 = vec_ld(stride + 16, src);`
			`srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));`

			`if (src_really_odd != 0x0000000F) {`
			`// if src & 0xF == 0xF, then (src+1) is properly aligned`
			`// on the second vector.`
			`srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));`
			`} else {`
			`srcvD = src_1;`
			`}`

			`srcvC = vec_mergeh(vczero, srcvC);`
			`srcvD = vec_mergeh(vczero, srcvD);`


			`// OK, now we (finally) do the math :-)`
			`// those four instructions replaces 32 int muls & 32 int adds.`
			`// isn't AltiVec nice ?`
			`tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV);`
			`tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA);`
			`tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB);`
			`tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC);`

			`srcvA = srcvC;`
			`srcvB = srcvD;`

			`tempD = vec_sr(tempD, vcsr8);`

			`dstv2 = vec_pack(tempD, (vector unsigned short)vczero);`

			`if (dst_odd) {`
			`dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1));`
			`} else {`
			`dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3));`
			`}`

			`vec_st(dstv2, 0, dst);`

			`dst += stride;`
			`src += stride;`
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`}`
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>) Originally committed as revision 1464 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago
1) remove TBL support in PPC performance. It's much more useful to use the PMCs, and with Apple's CHUD it's fairly easy too. No reason to keep useless code around 2) make the PPC perf stuff a configure option 3) make put_pixels16_altivec a bit faster by unrolling the loop by 4 patch by (Romain Dolbeau <dolbeau at irisa dot fr>) Originally committed as revision 2022 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND);`
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`}`