FFmpeg/libavcodec/ppc/fft_altivec.c

/*
 * FFT/IFFT transforms
 * AltiVec-enabled
 * Copyright (c) 2009 Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/fft.h"

/**
 * Do a complex FFT with the parameters defined in ff_fft_init(). The
 * input data must be permuted before with s->revtab table. No
 * 1.0/sqrt(n) normalization is done.
 * AltiVec-enabled
 * This code assumes that the 'z' pointer is 16 bytes-aligned
 * It also assumes all FFTComplex are 8 bytes-aligned pair of float
 */

/* FFT entry point defined outside this file; called in place on the output
 * buffer by imdct_half_altivec() between its pre- and post-rotation passes. */
void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
/* FFT entry point defined outside this file; installed as s->fft_calc by
 * ff_fft_init_ppc(). */
void ff_fft_calc_interleave_altivec(FFTContext *s, FFTComplex *z);

#if HAVE_GNU_AS && HAVE_ALTIVEC
/**
 * AltiVec implementation of the half-size inverse MDCT.
 *
 * The work is done in three stages: a pre-rotation that multiplies the input
 * by the tcos/tsin twiddles and scatters the products into 'output' at the
 * positions given by s->revtab, an in-place call to ff_fft_calc_altivec() on
 * 'output', and a post-rotation that multiplies by the twiddles again and
 * reorders the result in place.
 *
 * Both loops are do/while loops driven by n32 = n >> 5, so they run at least
 * once; ff_fft_init_ppc() only installs this function when mdct_bits >= 5.
 *
 * @param s      transform context; mdct_bits, revtab, tcos and tsin are read
 * @param output destination buffer, also used as the FFT working buffer
 * @param input  source coefficients (read only)
 *
 * NOTE(review): all buffers are accessed through vec_f pointers, so they are
 * presumably required to be 16-byte aligned -- confirm against the callers.
 * NOTE(review): CMULA, STORE2, STORE8 and CMULB are #defined inside the body
 * and never #undef'd, so they stay visible for the rest of the file.
 */
static void imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    int j, k;
    int n = 1 << s->mdct_bits;
    int n4 = n >> 2;
    int n8 = n >> 3;
    int n32 = n >> 5;
    /* revtabj walks forward from the start of the table, revtabk walks
     * backward from entry n4 (see the += 4 / -= 4 at the end of the loop). */
    const uint16_t *revtabj = s->revtab;
    const uint16_t *revtabk = s->revtab+n4;
    /* The twiddle and data pointers are offset into the middle of their
     * arrays so that they can be indexed with both positive and negative
     * vector indices below. */
    const vec_f *tcos = (const vec_f*)(s->tcos+n8);
    const vec_f *tsin = (const vec_f*)(s->tsin+n8);
    const vec_f *pin = (const vec_f*)(input+n4);
    vec_f *pout = (vec_f*)(output+n4);

    /* pre rotation */
    k = n32-1;
    do {
        vec_f cos,sin,cos0,sin0,cos1,sin1,re,im,r0,i0,r1,i1,a,b,c,d;
        /* CMULA(p, o0..o3): load one vector ahead of and one vector behind
         * the centre of the input, split them into 're' and 'im' lanes, pick
         * the matching twiddles out of cos0/cos1 and sin0/sin1 using lane
         * selectors o0..o3, and leave the rotated products in r<p> / i<p>. */
#define CMULA(p,o0,o1,o2,o3)\
        a = pin[ k*2+p];                       /* { z[k].re,    z[k].im,    z[k+1].re,  z[k+1].im  } */\
        b = pin[-k*2-p-1];                     /* { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } */\
        re = vec_perm(a, b, vcprm(0,2,s0,s2)); /* { z[k].re,    z[k+1].re,  z[-k-2].re, z[-k-1].re } */\
        im = vec_perm(a, b, vcprm(s3,s1,3,1)); /* { z[-k-1].im, z[-k-2].im, z[k+1].im,  z[k].im    } */\
        cos = vec_perm(cos0, cos1, vcprm(o0,o1,s##o2,s##o3)); /* { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } */\
        sin = vec_perm(sin0, sin1, vcprm(o0,o1,s##o2,s##o3));\
        r##p = im*cos - re*sin;\
        i##p = re*cos + im*sin;
        /* STORE2(v, dst): write two consecutive floats of v to output[2*dst]
         * and output[2*dst+1] (byte offsets 0 and 4). vec_ste stores a single
         * element per call, which is why STORE8 first replicates each
         * (r, i) pair across the whole vector. Overwrites j. */
#define STORE2(v,dst)\
        j = dst;\
        vec_ste(v, 0, output+j*2);\
        vec_ste(v, 4, output+j*2);
        /* STORE8(p): scatter the four (r<p>[n], i<p>[n]) pairs to the output
         * slots named by the revtab entries around revtabk / revtabj. */
#define STORE8(p)\
        a = vec_perm(r##p, i##p, vcprm(0,s0,0,s0));\
        b = vec_perm(r##p, i##p, vcprm(1,s1,1,s1));\
        c = vec_perm(r##p, i##p, vcprm(2,s2,2,s2));\
        d = vec_perm(r##p, i##p, vcprm(3,s3,3,s3));\
        STORE2(a, revtabk[ p*2-4]);\
        STORE2(b, revtabk[ p*2-3]);\
        STORE2(c, revtabj[-p*2+2]);\
        STORE2(d, revtabj[-p*2+3]);

        /* One twiddle vector from each side of the centre serves both
         * CMULA invocations; the selector arguments choose which half. */
        cos0 = tcos[k];
        sin0 = tsin[k];
        cos1 = tcos[-k-1];
        sin1 = tsin[-k-1];
        CMULA(0, 0,1,2,3);
        CMULA(1, 2,3,0,1);
        STORE8(0);
        STORE8(1);
        revtabj += 4;
        revtabk -= 4;
        k--;
    } while(k >= 0);

    /* In-place FFT over the scattered data. */
    ff_fft_calc_altivec(s, (FFTComplex*)output);

    /* post rotation + reordering */
    /* j climbs from -n32 while k descends from n32-1, so each iteration
     * handles one vector pair on either side of the centre of pout. */
    j = -n32;
    k = n32-1;
    do {
        vec_f cos,sin,re,im,a,b,c,d;
        /* CMULB(d0, d1, o): load the two vectors at pout[2*o], pout[2*o+1]
         * as 're' and 'im', rotate them by twiddle vector o, and leave the
         * products in d0 / d1. */
#define CMULB(d0,d1,o)\
        re = pout[o*2];\
        im = pout[o*2+1];\
        cos = tcos[o];\
        sin = tsin[o];\
        d0 = im*sin - re*cos;\
        d1 = re*sin + im*cos;

        CMULB(a,b,j);
        CMULB(c,d,k);
        /* Interleave each first product with the lane-reversed second
         * product from the opposite side before storing back. */
        pout[2*j]   = vec_perm(a, d, vcprm(0,s3,1,s2));
        pout[2*j+1] = vec_perm(a, d, vcprm(2,s1,3,s0));
        pout[2*k]   = vec_perm(c, b, vcprm(0,s3,1,s2));
        pout[2*k+1] = vec_perm(c, b, vcprm(2,s1,3,s0));
        j++;
        k--;
    } while(k >= 0);
}

/**
 * AltiVec implementation of the full inverse MDCT.
 *
 * imdct_half_altivec() fills the two middle quarters of 'output'. The outer
 * quarters are then derived from them one 4-element vector at a time:
 * the first quarter is the second quarter mirrored with bit 31 of every
 * 32-bit lane flipped, and the last quarter is the third quarter mirrored.
 *
 * @param s      transform context (mdct_bits is read here)
 * @param output destination buffer of 1 << mdct_bits samples
 * @param input  source coefficients, passed through to imdct_half_altivec()
 */
static void imdct_calc_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    const int n       = 1 << s->mdct_bits;
    const int quarter = n >> 2;
    const int nvec    = n >> 4;     /* vectors per quarter */
    vec_u32 signbit   = {1U<<31,1U<<31,1U<<31,1U<<31};
    vec_u32 *lo = (vec_u32*)(output + quarter);     /* start of 2nd quarter */
    vec_u32 *hi = (vec_u32*)(output + quarter*3);   /* start of 4th quarter */
    int i;

    imdct_half_altivec(s, output + quarter, input);

    for (i = 0; i < nvec; i++) {
        /* Read both sources before writing either destination. */
        vec_u32 flipped = lo[i] ^ signbit;
        vec_u32 tail    = hi[-i-1];

        /* vcprm(3,2,1,0) reverses the lane order within each vector. */
        lo[-i-1] = vec_perm(flipped, flipped, vcprm(3,2,1,0));
        hi[i]    = vec_perm(tail, tail, vcprm(3,2,1,0));
    }
}
#endif /* HAVE_GNU_AS && HAVE_ALTIVEC */

/**
 * Install the AltiVec FFT/IMDCT routines into an FFT context.
 *
 * Nothing is changed unless the build has HAVE_GNU_AS and HAVE_ALTIVEC and
 * the CPU flags report AltiVec at run time. The IMDCT routines are only
 * installed when mdct_bits >= 5, because their loops are driven by n >> 5
 * and always run at least once.
 *
 * @param s context whose fft_calc / imdct_calc / imdct_half pointers may be
 *          overridden
 */
av_cold void ff_fft_init_ppc(FFTContext *s)
{
#if HAVE_GNU_AS && HAVE_ALTIVEC
    if (av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) {
        s->fft_calc = ff_fft_calc_interleave_altivec;

        /* Smaller transforms keep whatever IMDCT routines are already set. */
        if (s->mdct_bits < 5)
            return;

        s->imdct_calc = imdct_calc_altivec;
        s->imdct_half = imdct_half_altivec;
    }
#endif /* HAVE_GNU_AS && HAVE_ALTIVEC */
}
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`/*`
			`* FFT/IFFT transforms`
			`* AltiVec-enabled`
PPC: Altivec split-radix FFT 1.8x faster than altivec radix-2 on a G4 8% faster vorbis decoding Patch (mostly) by Loren Merritt Originally committed as revision 23956 to svn://svn.ffmpeg.org/ffmpeg/trunk 15 years ago			`* Copyright (c) 2009 Loren Merritt`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`*`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago			`* This file is part of FFmpeg.`
			`*`
			`* FFmpeg is free software; you can redistribute it and/or`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago			`* version 2.1 of the License, or (at your option) any later version.`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`*`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago			`* FFmpeg is distributed in the hope that it will be useful,`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago			`* License along with FFmpeg; if not, write to the Free Software`
Update licensing information: The FSF changed postal address. Originally committed as revision 4842 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`*/`
PPC: Move types_altivec.h and util_altivec.h from libavcodec to libavutil This will allow for easier implementation of Altivec functions in libraries other than libavcodec. 13 years ago
ppc: More consistent arch initialization 12 years ago			`#include "config.h"`
ppc: Add missing AltiVec cpuflag detection invocations 12 years ago			`#include "libavutil/cpu.h"`
PPC: Move types_altivec.h and util_altivec.h from libavcodec to libavutil This will allow for easier implementation of Altivec functions in libraries other than libavcodec. 13 years ago			`#include "libavutil/ppc/types_altivec.h"`
			`#include "libavutil/ppc/util_altivec.h"`
Move FFT parts from dsputil.h to fft.h Originally committed as revision 22235 to svn://svn.ffmpeg.org/ffmpeg/trunk 15 years ago			`#include "libavcodec/fft.h"`
PPC: move prototypes to headers and make some functions static Originally committed as revision 22267 to svn://svn.ffmpeg.org/ffmpeg/trunk 15 years ago
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`/**`
fft_() renamed into ff_fft_() patch by (Gildas Bazin <gbazin at altern dot org>) Originally committed as revision 2882 to svn://svn.ffmpeg.org/ffmpeg/trunk 21 years ago			`* Do a complex FFT with the parameters defined in ff_fft_init(). The`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 22 years ago			`* input data must be permuted before with s->revtab table. No`
			`* 1.0/sqrt(n) normalization is done.`
			`* AltiVec-enabled`
			`* This code assumes that the 'z' pointer is 16 bytes-aligned`
			`* It also assumes all FFTComplex are 8 bytes-aligned pair of float`
			`*/`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago
PPC: convert Altivec FFT to pure assembler On PPC a leaf function has a 288-byte red zone below the stack pointer, sparing these functions the chore of setting up a full stack frame. When a function call is disguised within an inline asm block, the compiler might not adjust the stack pointer as required before a function call, resulting in the red zone being clobbered. Moving the entire function to pure asm avoids this problem and also results in somewhat better code. Originally committed as revision 24044 to svn://svn.ffmpeg.org/ffmpeg/trunk 15 years ago			`void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);`
			`void ff_fft_calc_interleave_altivec(FFTContext *s, FFTComplex *z);`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 19 years ago
ppc: More consistent arch initialization 12 years ago			`#if HAVE_GNU_AS && HAVE_ALTIVEC`
ppc: Drop unnecessary ff_ name prefixes from static functions 12 years ago			`static void imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)`
PPC: Altivec IMDCT Patch by Loren Merritt Originally committed as revision 23959 to svn://svn.ffmpeg.org/ffmpeg/trunk 15 years ago			`{`
			`int j, k;`
			`int n = 1 << s->mdct_bits;`
			`int n4 = n >> 2;`
			`int n8 = n >> 3;`
			`int n32 = n >> 5;`
			`const uint16_t *revtabj = s->revtab;`
			`const uint16_t *revtabk = s->revtab+n4;`
			`const vec_f *tcos = (const vec_f*)(s->tcos+n8);`
			`const vec_f *tsin = (const vec_f*)(s->tsin+n8);`
			`const vec_f *pin = (const vec_f*)(input+n4);`
			`vec_f *pout = (vec_f*)(output+n4);`

			`/* pre rotation */`
			`k = n32-1;`
			`do {`
			`vec_f cos,sin,cos0,sin0,cos1,sin1,re,im,r0,i0,r1,i1,a,b,c,d;`
			`#define CMULA(p,o0,o1,o2,o3)\`
			`a = pin[ k*2+p]; /* { z[k].re, z[k].im, z[k+1].re, z[k+1].im } */\`
			`b = pin[-k*2-p-1]; /* { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } */\`
			`re = vec_perm(a, b, vcprm(0,2,s0,s2)); /* { z[k].re, z[k+1].re, z[-k-2].re, z[-k-1].re } */\`
			`im = vec_perm(a, b, vcprm(s3,s1,3,1)); /* { z[-k-1].im, z[-k-2].im, z[k+1].im, z[k].im } */\`
			`cos = vec_perm(cos0, cos1, vcprm(o0,o1,s##o2,s##o3)); /* { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } */\`
			`sin = vec_perm(sin0, sin1, vcprm(o0,o1,s##o2,s##o3));\`
			`r##p = im*cos - re*sin;\`
			`i##p = re*cos + im*sin;`
			`#define STORE2(v,dst)\`
			`j = dst;\`
			`vec_ste(v, 0, output+j*2);\`
			`vec_ste(v, 4, output+j*2);`
			`#define STORE8(p)\`
			`a = vec_perm(r##p, i##p, vcprm(0,s0,0,s0));\`
			`b = vec_perm(r##p, i##p, vcprm(1,s1,1,s1));\`
			`c = vec_perm(r##p, i##p, vcprm(2,s2,2,s2));\`
			`d = vec_perm(r##p, i##p, vcprm(3,s3,3,s3));\`
			`STORE2(a, revtabk[ p*2-4]);\`
			`STORE2(b, revtabk[ p*2-3]);\`
			`STORE2(c, revtabj[-p*2+2]);\`
			`STORE2(d, revtabj[-p*2+3]);`

			`cos0 = tcos[k];`
			`sin0 = tsin[k];`
			`cos1 = tcos[-k-1];`
			`sin1 = tsin[-k-1];`
			`CMULA(0, 0,1,2,3);`
			`CMULA(1, 2,3,0,1);`
			`STORE8(0);`
			`STORE8(1);`
			`revtabj += 4;`
			`revtabk -= 4;`
			`k--;`
			`} while(k >= 0);`

PPC: convert Altivec FFT to pure assembler On PPC a leaf function has a 288-byte red zone below the stack pointer, sparing these functions the chore of setting up a full stack frame. When a function call is disguised within an inline asm block, the compiler might not adjust the stack pointer as required before a function call, resulting in the red zone being clobbered. Moving the entire function to pure asm avoids this problem and also results in somewhat better code. Originally committed as revision 24044 to svn://svn.ffmpeg.org/ffmpeg/trunk 15 years ago			`ff_fft_calc_altivec(s, (FFTComplex*)output);`
PPC: Altivec IMDCT Patch by Loren Merritt Originally committed as revision 23959 to svn://svn.ffmpeg.org/ffmpeg/trunk 15 years ago
			`/* post rotation + reordering */`
			`j = -n32;`
			`k = n32-1;`
			`do {`
			`vec_f cos,sin,re,im,a,b,c,d;`
			`#define CMULB(d0,d1,o)\`
			`re = pout[o*2];\`
			`im = pout[o*2+1];\`
			`cos = tcos[o];\`
			`sin = tsin[o];\`
			`d0 = im*sin - re*cos;\`
			`d1 = re*sin + im*cos;`

			`CMULB(a,b,j);`
			`CMULB(c,d,k);`
			`pout[2*j] = vec_perm(a, d, vcprm(0,s3,1,s2));`
			`pout[2*j+1] = vec_perm(a, d, vcprm(2,s1,3,s0));`
			`pout[2*k] = vec_perm(c, b, vcprm(0,s3,1,s2));`
			`pout[2*k+1] = vec_perm(c, b, vcprm(2,s1,3,s0));`
			`j++;`
			`k--;`
			`} while(k >= 0);`
			`}`

ppc: Drop unnecessary ff_ name prefixes from static functions 12 years ago			`static void imdct_calc_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)`
PPC: Altivec IMDCT Patch by Loren Merritt Originally committed as revision 23959 to svn://svn.ffmpeg.org/ffmpeg/trunk 15 years ago			`{`
			`int k;`
			`int n = 1 << s->mdct_bits;`
			`int n4 = n >> 2;`
			`int n16 = n >> 4;`
Convert some undefined 1<<31 shifts into 1U<<31. According to ISO 9899:1999 S 6.5.7/4: The result of E1 << E2 is E1 left-shifted E2 bit positions; vacated bits are filled with zeros. If E1 has an unsigned type, the value of the result is E1× 2^E2, reduced modulo one more than the maximum value representable in the result type. If E1 has a signed type and nonnegative value, and E1× 2^E2 is representable in the result type, then that is the resulting value; otherwise, the behavior is undefined. 14 years ago			`vec_u32 sign = {1U<<31,1U<<31,1U<<31,1U<<31};`
PPC: Altivec IMDCT Patch by Loren Merritt Originally committed as revision 23959 to svn://svn.ffmpeg.org/ffmpeg/trunk 15 years ago			`vec_u32 *p0 = (vec_u32*)(output+n4);`
			`vec_u32 *p1 = (vec_u32*)(output+n4*3);`

ppc: Drop unnecessary ff_ name prefixes from static functions 12 years ago			`imdct_half_altivec(s, output + n4, input);`
PPC: Altivec IMDCT Patch by Loren Merritt Originally committed as revision 23959 to svn://svn.ffmpeg.org/ffmpeg/trunk 15 years ago
			`for (k = 0; k < n16; k++) {`
			`vec_u32 a = p0[k] ^ sign;`
			`vec_u32 b = p1[-k-1];`
			`p0[-k-1] = vec_perm(a, a, vcprm(3,2,1,0));`
			`p1[k] = vec_perm(b, b, vcprm(3,2,1,0));`
			`}`
			`}`
ppc: More consistent arch initialization 12 years ago			`#endif /* HAVE_GNU_AS && HAVE_ALTIVEC */`
PPC: Altivec IMDCT Patch by Loren Merritt Originally committed as revision 23959 to svn://svn.ffmpeg.org/ffmpeg/trunk 15 years ago
ppc: More consistent arch initialization 12 years ago			`av_cold void ff_fft_init_ppc(FFTContext *s)`
Move per-arch fft init bits into the corresponding subdirs Originally committed as revision 19864 to svn://svn.ffmpeg.org/ffmpeg/trunk 16 years ago			`{`
ppc: More consistent arch initialization 12 years ago			`#if HAVE_GNU_AS && HAVE_ALTIVEC`
ppc: Add missing AltiVec cpuflag detection invocations 12 years ago			`if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))`
			`return;`

PPC: convert Altivec FFT to pure assembler On PPC a leaf function has a 288-byte red zone below the stack pointer, sparing these functions the chore of setting up a full stack frame. When a function call is disguised within an inline asm block, the compiler might not adjust the stack pointer as required before a function call, resulting in the red zone being clobbered. Moving the entire function to pure asm avoids this problem and also results in somewhat better code. Originally committed as revision 24044 to svn://svn.ffmpeg.org/ffmpeg/trunk 15 years ago			`s->fft_calc = ff_fft_calc_interleave_altivec;`
PPC: use Altivec IMDCT only for supported sizes The Altivec IMDCT works with size 32 and higher only. Signed-off-by: Mans Rullgard <mans@mansr.com> 14 years ago			`if (s->mdct_bits >= 5) {`
ppc: Drop unnecessary ff_ name prefixes from static functions 12 years ago			`s->imdct_calc = imdct_calc_altivec;`
			`s->imdct_half = imdct_half_altivec;`
PPC: use Altivec IMDCT only for supported sizes The Altivec IMDCT works with size 32 and higher only. Signed-off-by: Mans Rullgard <mans@mansr.com> 14 years ago			`}`
ppc: More consistent arch initialization 12 years ago			`#endif /* HAVE_GNU_AS && HAVE_ALTIVEC */`
Move per-arch fft init bits into the corresponding subdirs Originally committed as revision 19864 to svn://svn.ffmpeg.org/ffmpeg/trunk 16 years ago			`}`