/* FFmpeg/libavcodec/ppc/fft_altivec.c */

/*
 * FFT/IFFT transforms
 * AltiVec-enabled
 * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
 * Based on code Copyright (c) 2002 Fabrice Bellard.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_altivec.h"

/*
  These three macros come from libavcodec/fft.c
  and are required by the reference C code.
*/
/*
 * Butterfly operation.
 * Reads the two complex inputs (pre1, pim1) and (qre1, qim1) and stores
 * their sum in (pre, pim) and their difference in (qre, qim).
 * All four inputs are copied into FFTSample temporaries before any output
 * is assigned, so an output may name the same lvalue as an input.
 * The expansion is a bare { } block that declares ax, ay, bx and by;
 * arguments using those names would be captured by the temporaries.
 */
#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \
{\
  FFTSample ax, ay, bx, by;\
  bx=pre1;\
  by=pim1;\
  ax=qre1;\
  ay=qim1;\
  pre = (bx + ax);\
  pim = (by + ay);\
  qre = (bx - ax);\
  qim = (by - ay);\
}
/* Plain product of a and b; both arguments are parenthesized in the expansion. */
#define MUL16(a,b) ((a) * (b))
/*
 * Complex multiply: (pre + i*pim) = (are + i*aim) * (bre + i*bim).
 * Each of are, aim, bre and bim appears twice in the expansion, so an
 * argument with side effects is evaluated twice.
 * pre is assigned before pim is computed, so pre must not be the same
 * lvalue as any of the four inputs.
 * The expansion is a bare { } block.
 */
#define CMUL(pre, pim, are, aim, bre, bim) \
{\
   pre = (MUL16(are, bre) - MUL16(aim, bim));\
   pim = (MUL16(are, bim) + MUL16(bre, aim));\
}


/**
 * Do a complex FFT with the parameters defined in ff_fft_init(). The
 * input data must be permuted before with s->revtab table. No
 * 1.0/sqrt(n) normalization is done.
 * AltiVec-enabled
 *
 * The transform is done in place: every result is stored back into 'z'.
 *
 * This code assumes that the 'z' pointer is 16 bytes-aligned
 * It also assumes all FFTComplex are 8 bytes-aligned pair of float
 * (s->exptab1 is read with vec_ld as well, so it presumably needs the
 * same 16-byte alignment -- confirm against ff_fft_init()).
 *
 * s->nbits must be at least 2: the first stage always processes four
 * complex values per iteration and runs at least once.
 *
 * The code is exactly the same as the SSE version, except
 * that successive MUL + ADD/SUB have been merged into
 * fused multiply-add ('vec_madd' in altivec)
 *
 * @param s FFT context; nbits, inverse and exptab1 are read
 * @param z 1 << s->nbits complex values, overwritten with the result
 */
void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z)
{
POWERPC_PERF_DECLARE(altivec_fft_num, s->nbits >= 6);
    /* all-zero vector, used as the addend when vec_madd is only a multiply */
    register const vector float vczero = (const vector float)vec_splat_u32(0);

    int ln = s->nbits;
    int j, np;
    int nblocks, nloops;
    register FFTComplex *p, *q;
    FFTComplex *cptr, *cptr1;
    int k;

POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6);

    np = 1 << ln;

    /* pass 0 and pass 1, merged: four complex values per iteration */
    {
        vector float *r, a, b, a1, c1, c2;

        r = (vector float *)&z[0];

        /* sign patterns built by vcii(); presumably p is +1.0 and n is -1.0
           -- TODO confirm against the macro definition */
        c1 = vcii(p,p,n,n);

        /* forward and inverse transforms differ only in the last two signs */
        if (s->inverse) {
            c2 = vcii(p,p,n,p);
        } else {
            c2 = vcii(p,p,p,n);
        }

        j = (np >> 2);
        do {
            a = vec_ld(0, r);
            a1 = vec_ld(sizeof(vector float), r);

            /* do the pass 0 butterfly on the first vector */
            b = vec_perm(a,a,vcprmle(1,0,3,2));
            a = vec_madd(a,c1,b);

            /* do the pass 0 butterfly on the second vector */
            b = vec_perm(a1,a1,vcprmle(1,0,3,2));
            b = vec_madd(a1,c1,b);

            /* multiply third by -i */
            b = vec_perm(b,b,vcprmle(2,3,1,0));

            /* do the pass 1 butterfly: a + b*c2 and a - b*c2 */
            vec_st(vec_madd(b,c2,a), 0, r);
            vec_st(vec_nmsub(b,c2,a), sizeof(vector float), r);

            r += 2;
        } while (--j != 0);
    }

    /* pass 2 .. ln-1 */

    nblocks = np >> 3;
    nloops = 1 << 2;

    cptr1 = s->exptab1;
    /*
     * Tested before the first iteration: when nbits == 2 there are no
     * further passes (nblocks == 0), and entering the body would start the
     * inner loop with j == 0 and let it run far outside 'z'.
     */
    while (nblocks != 0) {
        p = z;
        q = z + nloops;
        j = nblocks;
        do {
            /* every block of this pass restarts at the same twiddle factors */
            cptr = cptr1;
            k = nloops >> 1;
            do {
                vector float a,b,c,t1;

                a = vec_ld(0, (float*)p);
                b = vec_ld(0, (float*)q);

                /* complex mul */
                c = vec_ld(0, (float*)cptr);
                /*  cre*re cim*re */
                t1 = vec_madd(c, vec_perm(b,b,vcprmle(2,2,0,0)),vczero);
                c = vec_ld(sizeof(vector float), (float*)cptr);
                /*  -cim*im cre*im */
                b = vec_madd(c, vec_perm(b,b,vcprmle(3,3,1,1)),t1);

                /* butterfly */
                vec_st(vec_add(a,b), 0, (float*)p);
                vec_st(vec_sub(a,b), 0, (float*)q);

                /* two complex values of data, two vectors of exptab1 */
                p += 2;
                q += 2;
                cptr += 4;
            } while (--k);

            /* skip the half that was just handled through the other pointer */
            p += nloops;
            q += nloops;
        } while (--j);
        cptr1 += nloops * 2;
        nblocks = nblocks >> 1;
        nloops = nloops << 1;
    }

POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6);
}
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`/*`
			`* FFT/IFFT transforms`
			`* AltiVec-enabled`
altivec patches by Romain Dolbeau Originally committed as revision 1423 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`* Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`* Based on code Copyright (c) 2002 Fabrice Bellard.`
			`*`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago			`* This file is part of FFmpeg.`
			`*`
			`* FFmpeg is free software; you can redistribute it and/or`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago			`* version 2.1 of the License, or (at your option) any later version.`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`*`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago			`* FFmpeg is distributed in the hope that it will be useful,`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago			`* License along with FFmpeg; if not, write to the Free Software`
Update licensing information: The FSF changed postal address. Originally committed as revision 4842 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`*/`
Add libavcodec to compiler include flags in order to simplify header include paths in the source files. mostly from a patch by Ronald S. Bultje, rbultje ronald.bitfreak net Originally committed as revision 9034 to svn://svn.ffmpeg.org/ffmpeg/trunk 18 years ago			`#include "dsputil.h"`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago
altivec gcc-3 fixes by (Magnus Damm <damm at opensource dot se>) Originally committed as revision 1896 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`#include "gcc_fixes.h"`

fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`#include "dsputil_altivec.h"`

AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>) Originally committed as revision 1464 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`/*`
			`those three macros are from libavcodec/fft.c`
			`and are required for the reference C code`
			`*/`
			`/* butter fly op */`
			`#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \`
			`{\`
			`FFTSample ax, ay, bx, by;\`
			`bx=pre1;\`
			`by=pim1;\`
			`ax=qre1;\`
			`ay=qim1;\`
			`pre = (bx + ax);\`
			`pim = (by + ay);\`
			`qre = (bx - ax);\`
			`qim = (by - ay);\`
			`}`
			`#define MUL16(a,b) ((a) * (b))`
			`#define CMUL(pre, pim, are, aim, bre, bim) \`
			`{\`
			`pre = (MUL16(are, bre) - MUL16(aim, bim));\`
			`pim = (MUL16(are, bim) + MUL16(bre, aim));\`
			`}`


fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`/**`
fft_() renamed into ff_fft_() patch by (Gildas Bazin <gbazin at altern dot org>) Originally committed as revision 2882 to svn://svn.ffmpeg.org/ffmpeg/trunk 21 years ago			`* Do a complex FFT with the parameters defined in ff_fft_init(). The`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`* input data must be permuted before with s->revtab table. No`
			`* 1.0/sqrt(n) normalization is done.`
			`* AltiVec-enabled`
			`* This code assumes that the 'z' pointer is 16 bytes-aligned`
			`* It also assumes all FFTComplex are 8 bytes-aligned pair of float`
			`* The code is exactly the same as the SSE version, except`
altivec patches by Romain Dolbeau Originally committed as revision 1423 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`* that successive MUL + ADD/SUB have been merged into`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`* fused multiply-add ('vec_madd' in altivec)`
			`*/`
fft_() renamed into ff_fft_() patch by (Gildas Bazin <gbazin at altern dot org>) Originally committed as revision 2882 to svn://svn.ffmpeg.org/ffmpeg/trunk 21 years ago			`void ff_fft_calc_altivec(FFTContext s, FFTComplex z)`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`{`
1) remove TBL support in PPC performance. It's much more useful to use the PMCs, and with Apple's CHUD it's fairly easy too. No reason to keep useless code around 2) make the PPC perf stuff a configure option 3) make put_pixels16_altivec a bit faster by unrolling the loop by 4 patch by (Romain Dolbeau <dolbeau at irisa dot fr>) Originally committed as revision 2022 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`POWERPC_PERF_DECLARE(altivec_fft_num, s->nbits >= 6);`
Simplify Originally committed as revision 6932 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago			`register const vector float vczero = (const vector float)vec_splat_u32(0.);`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`int ln = s->nbits;`
COSMETICS: tabs --> spaces, some prettyprinting Originally committed as revision 4764 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago			`int j, np, np2;`
			`int nblocks, nloops;`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`register FFTComplex p, q;`
			`FFTComplex cptr, cptr1;`
			`int k;`

1) remove TBL support in PPC performance. It's much more useful to use the PMCs, and with Apple's CHUD it's fairly easy too. No reason to keep useless code around 2) make the PPC perf stuff a configure option 3) make put_pixels16_altivec a bit faster by unrolling the loop by 4 patch by (Romain Dolbeau <dolbeau at irisa dot fr>) Originally committed as revision 2022 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6);`
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>) Originally committed as revision 1464 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`np = 1 << ln;`

			`{`
			`vector float *r, a, b, a1, c1, c2;`

			`r = (vector float *)&z[0];`

			`c1 = vcii(p,p,n,n);`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`if (s->inverse)`
			`{`
			`c2 = vcii(p,p,n,p);`
			`}`
			`else`
			`{`
			`c2 = vcii(p,p,p,n);`
			`}`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`j = (np >> 2);`
			`do {`
			`a = vec_ld(0, r);`
			`a1 = vec_ld(sizeof(vector float), r);`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`b = vec_perm(a,a,vcprmle(1,0,3,2));`
			`a = vec_madd(a,c1,b);`
			`/* do the pass 0 butterfly */`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`b = vec_perm(a1,a1,vcprmle(1,0,3,2));`
			`b = vec_madd(a1,c1,b);`
			`/* do the pass 0 butterfly */`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`/* multiply third by -i */`
			`b = vec_perm(b,b,vcprmle(2,3,1,0));`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`/* do the pass 1 butterfly */`
			`vec_st(vec_madd(b,c2,a), 0, r);`
			`vec_st(vec_nmsub(b,c2,a), sizeof(vector float), r);`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`r += 2;`
			`} while (--j != 0);`
			`}`
			`/* pass 2 .. ln-1 */`

			`nblocks = np >> 3;`
			`nloops = 1 << 2;`
			`np2 = np >> 1;`

			`cptr1 = s->exptab1;`
			`do {`
			`p = z;`
			`q = z + nloops;`
			`j = nblocks;`
			`do {`
			`cptr = cptr1;`
			`k = nloops >> 1;`
			`do {`
			`vector float a,b,c,t1;`

			`a = vec_ld(0, (float*)p);`
			`b = vec_ld(0, (float*)q);`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`/* complex mul */`
			`c = vec_ld(0, (float*)cptr);`
			`/* crere cimre */`
			`t1 = vec_madd(c, vec_perm(b,b,vcprmle(2,2,0,0)),vczero);`
			`c = vec_ld(sizeof(vector float), (float*)cptr);`
			`/* -cimim creim */`
			`b = vec_madd(c, vec_perm(b,b,vcprmle(3,3,1,1)),t1);`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`/* butterfly */`
			`vec_st(vec_add(a,b), 0, (float*)p);`
			`vec_st(vec_sub(a,b), 0, (float*)q);`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`p += 2;`
			`q += 2;`
			`cptr += 4;`
			`} while (--k);`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`p += nloops;`
			`q += nloops;`
			`} while (--j);`
			`cptr1 += nloops * 2;`
			`nblocks = nblocks >> 1;`
			`nloops = nloops << 1;`
			`} while (nblocks != 0);`

1) remove TBL support in PPC performance. It's much more useful to use the PMCs, and with Apple's CHUD it's fairly easy too. No reason to keep useless code around 2) make the PPC perf stuff a configure option 3) make put_pixels16_altivec a bit faster by unrolling the loop by 4 patch by (Romain Dolbeau <dolbeau at irisa dot fr>) Originally committed as revision 2022 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6);`
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>) Originally committed as revision 1464 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`}`