PPC: add _interleave versions of fft{4,8,16}_altivec

This removes the need for a post-swizzle with the small FFTs.

Originally committed as revision 24025 to svn://svn.ffmpeg.org/ffmpeg/trunk
oldabi
Måns Rullgård 15 years ago
parent f054aaf731
commit a075902f3d
  1. 15
      libavcodec/ppc/fft_altivec.c
  2. 60
      libavcodec/ppc/fft_altivec_s.S

@ -38,19 +38,6 @@
extern void *ff_fft_dispatch_altivec[2][15]; extern void *ff_fft_dispatch_altivec[2][15];
#if HAVE_GNU_AS #if HAVE_GNU_AS
// Convert from simd order to C order.
//
// Rewrites the buffer z in place, one pair of adjacent vectors at a time:
// (z[i], z[i+1]) is replaced by (vec_mergeh(pair), vec_mergel(pair)).
//
// z: buffer of vectors to reorder; modified in place.
// n: element count; it is halved before being used as the bound on the
//    vector index. The visible caller passes 1 << s->nbits.
//
// NOTE(review): the local names suggest the first vector of each pair holds
// real parts and the second holds imaginary parts, so the merges would
// produce interleaved re/im values -- confirm against the layout written by
// the assembly FFT routines.
static void swizzle(vec_f *z, int n)
{
int i;
// Convert the count passed by the caller into the vector-index bound.
n >>= 1;
// Step by two: each iteration consumes and rewrites one pair of vectors.
for (i = 0; i < n; i += 2) {
// Read both inputs before storing, since the stores overwrite them.
vec_f re = z[i];
vec_f im = z[i+1];
z[i] = vec_mergeh(re, im);
z[i+1] = vec_mergel(re, im);
}
}
static av_always_inline void fft_dispatch(FFTContext *s, FFTComplex *z, int do_swizzle) static av_always_inline void fft_dispatch(FFTContext *s, FFTComplex *z, int do_swizzle)
{ {
register vec_f v14 __asm__("v14") = {0,0,0,0}; register vec_f v14 __asm__("v14") = {0,0,0,0};
@ -84,8 +71,6 @@ static av_always_inline void fft_dispatch(FFTContext *s, FFTComplex *z, int do_s
: "lr","ctr","r0","r4","r5","r6","r7","r8","r9","r10","r11", : "lr","ctr","r0","r4","r5","r6","r7","r8","r9","r10","r11",
"v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13" "v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13"
); );
if (do_swizzle && s->nbits <= 4)
swizzle((vec_f*)z, 1<<s->nbits);
} }
static void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z) static void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z)

@ -143,28 +143,53 @@
vaddfp \d0,\s0,\s1 vaddfp \d0,\s0,\s1
.endm .endm
fft4_altivec: .macro zip d0,d1,s0,s1
vmrghw \d0,\s0,\s1
vmrglw \d1,\s0,\s1
.endm
.macro def_fft4 interleave
fft4\interleave\()_altivec:
lvx v0, 0,r3 lvx v0, 0,r3
lvx v1,r9,r3 lvx v1,r9,r3
FFT4 v0,v1,v2,v3 FFT4 v0,v1,v2,v3
.ifnb \interleave
zip v0,v1,v2,v3
stvx v0, 0,r3
stvx v1,r9,r3
.else
stvx v2, 0,r3 stvx v2, 0,r3
stvx v3,r9,r3 stvx v3,r9,r3
.endif
blr blr
.endm
fft8_altivec: .macro def_fft8 interleave
fft8\interleave\()_altivec:
addi r4,r3,32 addi r4,r3,32
lvx v0, 0,r3 lvx v0, 0,r3
lvx v1,r9,r3 lvx v1,r9,r3
lvx v2, 0,r4 lvx v2, 0,r4
lvx v3,r9,r4 lvx v3,r9,r4
FFT8 v0,v1,v2,v3,v4,v5,v6,v7,v8 FFT8 v0,v1,v2,v3,v4,v5,v6,v7,v8
.ifnb \interleave
zip v4,v5,v0,v1
zip v6,v7,v2,v3
stvx v4, 0,r3
stvx v5,r9,r3
stvx v6, 0,r4
stvx v7,r9,r4
.else
stvx v0, 0,r3 stvx v0, 0,r3
stvx v1,r9,r3 stvx v1,r9,r3
stvx v2, 0,r4 stvx v2, 0,r4
stvx v3,r9,r4 stvx v3,r9,r4
.endif
blr blr
.endm
fft16_altivec: .macro def_fft16 interleave
fft16\interleave\()_altivec:
addi r5,r3,64 addi r5,r3,64
addi r6,r3,96 addi r6,r3,96
addi r4,r3,32 addi r4,r3,32
@ -190,17 +215,33 @@ fft16_altivec:
BF v11,v13,v9,v11 BF v11,v13,v9,v11
BF v0,v4,v0,v10 BF v0,v4,v0,v10
BF v3,v7,v3,v12 BF v3,v7,v3,v12
BF v1,v5,v1,v11
BF v2,v6,v2,v13
.ifnb \interleave
zip v8, v9,v0,v1
zip v10,v11,v2,v3
zip v12,v13,v4,v5
zip v14,v15,v6,v7
stvx v8, 0,r3
stvx v9,r9,r3
stvx v10, 0,r4
stvx v11,r9,r4
stvx v12, 0,r5
stvx v13,r9,r5
stvx v14, 0,r6
stvx v15,r9,r6
.else
stvx v0, 0,r3 stvx v0, 0,r3
stvx v4, 0,r5 stvx v4, 0,r5
stvx v3,r9,r4 stvx v3,r9,r4
stvx v7,r9,r6 stvx v7,r9,r6
BF v1,v5,v1,v11
BF v2,v6,v2,v13
stvx v1,r9,r3 stvx v1,r9,r3
stvx v5,r9,r5 stvx v5,r9,r5
stvx v2, 0,r4 stvx v2, 0,r4
stvx v6, 0,r6 stvx v6, 0,r6
.endif
blr blr
.endm
// void pass(float *z, float *wre, int n) // void pass(float *z, float *wre, int n)
.macro PASS interleave, suffix .macro PASS interleave, suffix
@ -297,6 +338,9 @@ fft\n\suffix\()_altivec:
.macro DECL_FFTS interleave, suffix .macro DECL_FFTS interleave, suffix
.text .text
def_fft4 \suffix
def_fft8 \suffix
def_fft16 \suffix
PASS \interleave, \suffix PASS \interleave, \suffix
DECL_FFT \suffix, 5, 32, 16, 8 DECL_FFT \suffix, 5, 32, 16, 8
DECL_FFT \suffix, 6, 64, 32, 16 DECL_FFT \suffix, 6, 64, 32, 16
@ -314,9 +358,9 @@ fft\n\suffix\()_altivec:
.rodata .rodata
.global EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec .global EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec
EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec: EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec:
PTR fft4_altivec PTR fft4\suffix\()_altivec
PTR fft8_altivec PTR fft8\suffix\()_altivec
PTR fft16_altivec PTR fft16\suffix\()_altivec
PTR fft32\suffix\()_altivec PTR fft32\suffix\()_altivec
PTR fft64\suffix\()_altivec PTR fft64\suffix\()_altivec
PTR fft128\suffix\()_altivec PTR fft128\suffix\()_altivec

Loading…
Cancel
Save