x86/tx_float: add 15xN PFA FFT AVX SIMD

~4x faster than the C version.
The shuffles in the 15pt dim1 are seriously expensive. Not happy with it,
but I'm content.

Can be easily converted to pure AVX by removing all vpermpd/vpermps
instructions.
pull/388/head
Lynne 2 years ago
parent 3241e9225c
commit ace42cf581
  1. doc/transforms.md (323 lines changed)
  2. libavutil/tx_template.c (59 lines changed)
  3. libavutil/x86/tx_float.asm (280 lines changed)
  4. libavutil/x86/tx_float_init.c (68 lines changed)
  5. tests/checkasm/av_tx.c (4 lines changed)

@@ -704,3 +704,326 @@ Also note that this function requires a special iteration way, due to coefficients
beginning to overlap, particularly `[o1]` with `[0]` after the second iteration.
To iterate further, set `z = &z[16]` via `z += 8` for the second iteration. After
the 4th iteration, the layout resets, so repeat the same.
# 15-point AVX FFT transform
The 15-point transform is based on the following unrolling. The input
must be permuted via the following loop:
``` C
for (int k = 0; k < s->sub[0].len; k++) {
    int cnt = 0;
    int tmp[15];

    memcpy(tmp, &s->map[k*15], 15*sizeof(*tmp));

    for (int i = 1; i < 15; i += 3) {
        s->map[k*15 + cnt] = tmp[i];
        cnt++;
    }
    for (int i = 2; i < 15; i += 3) {
        s->map[k*15 + cnt] = tmp[i];
        cnt++;
    }
    for (int i = 0; i < 15; i += 3) {
        s->map[k*15 + cnt] = tmp[i];
        cnt++;
    }

    memmove(&s->map[k*15 + 7], &s->map[k*15 + 6], 4*sizeof(int));
    memmove(&s->map[k*15 + 3], &s->map[k*15 + 1], 4*sizeof(int));
    s->map[k*15 + 1] = tmp[2];
    s->map[k*15 + 2] = tmp[0];
}
```
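For a single 15-point block with an identity map this loop yields the order below; a worked example for illustration (the `permuted_map` name is not part of the library):
``` C
/* Result of the permutation loop for map[] = { 0, 1, ..., 14 }:
 * the first three entries feed the 3-point "DC" transform, the
 * remaining twelve are the "AC" inputs, grouped by residue mod 3. */
static const int permuted_map[15] = {
    1, 2, 0,         /* in[0..2]:   DC */
    4, 7, 10, 13,    /* in[3..6]       */
    5, 8, 11, 14,    /* in[7..10]      */
    3, 6, 9, 12,     /* in[11..14]     */
};
```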
This separates the ACs from the DCs and flips the SIMD direction, so that
five 3-point transforms are performed at once, followed by three 5-point transforms.
``` C
static av_always_inline void fft15(TXComplex *out, TXComplex *in,
ptrdiff_t stride)
{
const TXSample *tab = TX_TAB(ff_tx_tab_53);
TXComplex q[20];
TXComplex dc[3], pc[32];
TXComplex y[32], k[32];
TXComplex t[32];
TXComplex r[32];
TXComplex z0[32];
/* DC */
pc[0].re = in[ 1].im - in[ 0].im;
pc[0].im = in[ 1].re - in[ 0].re;
pc[1].re = in[ 1].re + in[ 0].re;
pc[1].im = in[ 1].im + in[ 0].im;
dc[0].re = in[2].re + pc[1].re;
dc[0].im = in[2].im + pc[1].im;
pc[0].re = tab[ 8] * pc[0].re;
pc[0].im = tab[ 9] * pc[0].im;
pc[1].re = tab[10] * pc[1].re;
pc[1].im = tab[11] * pc[1].im;
dc[1].re = pc[0].re + pc[1].re;
dc[1].im = pc[0].im + pc[1].im;
dc[2].re = pc[1].re - pc[0].re;
dc[2].im = pc[1].im - pc[0].im;
dc[1].re = in[2].re - dc[1].re;
dc[1].im = in[2].im + dc[1].im;
dc[2].re = in[2].re - dc[2].re;
dc[2].im = in[2].im + dc[2].im;
/* ACs */
q[0].im = in[ 3].re - in[ 7].re; // NOTE THE ORDER
q[0].re = in[ 3].im - in[ 7].im;
q[1].im = in[ 4].re - in[ 8].re;
q[1].re = in[ 4].im - in[ 8].im;
q[2].im = in[ 5].re - in[ 9].re;
q[2].re = in[ 5].im - in[ 9].im;
q[3].re = in[ 6].im - in[10].im;
q[3].im = in[ 6].re - in[10].re;
q[4].re = in[ 3].re + in[ 7].re;
q[4].im = in[ 3].im + in[ 7].im;
q[5].re = in[ 4].re + in[ 8].re;
q[5].im = in[ 4].im + in[ 8].im;
q[6].re = in[ 5].re + in[ 9].re;
q[6].im = in[ 5].im + in[ 9].im;
q[7].re = in[ 6].re + in[10].re;
q[7].im = in[ 6].im + in[10].im;
y[0].re = in[11].re + q[4].re;
y[0].im = in[11].im + q[4].im;
y[1].re = in[12].re + q[5].re;
y[1].im = in[12].im + q[5].im;
y[2].re = in[13].re + q[6].re;
y[2].im = in[13].im + q[6].im;
y[3].re = in[14].re + q[7].re;
y[3].im = in[14].im + q[7].im;
q[0].re = tab[ 8] * q[0].re;
q[0].im = tab[ 9] * q[0].im;
q[1].re = tab[ 8] * q[1].re;
q[1].im = tab[ 9] * q[1].im;
q[2].re = tab[ 8] * q[2].re;
q[2].im = tab[ 9] * q[2].im;
q[3].re = tab[ 8] * q[3].re;
q[3].im = tab[ 9] * q[3].im;
q[4].re = tab[10] * q[4].re;
q[4].im = tab[11] * q[4].im;
q[5].re = tab[10] * q[5].re;
q[5].im = tab[11] * q[5].im;
q[6].re = tab[10] * q[6].re;
q[6].im = tab[11] * q[6].im;
q[7].re = tab[10] * q[7].re;
q[7].im = tab[11] * q[7].im;
k[0].re = q[4].re - q[0].re;
k[0].im = q[4].im - q[0].im;
k[1].re = q[5].re - q[1].re;
k[1].im = q[5].im - q[1].im;
k[2].re = q[6].re - q[2].re;
k[2].im = q[6].im - q[2].im;
k[3].re = q[7].re - q[3].re;
k[3].im = q[7].im - q[3].im;
k[4].re = q[4].re + q[0].re;
k[4].im = q[4].im + q[0].im;
k[5].re = q[5].re + q[1].re;
k[5].im = q[5].im + q[1].im;
k[6].re = q[6].re + q[2].re;
k[6].im = q[6].im + q[2].im;
k[7].re = q[7].re + q[3].re;
k[7].im = q[7].im + q[3].im;
k[0].re = in[11].re - k[0].re;
k[0].im = in[11].im + k[0].im;
k[1].re = in[12].re - k[1].re;
k[1].im = in[12].im + k[1].im;
k[2].re = in[13].re - k[2].re;
k[2].im = in[13].im + k[2].im;
k[3].re = in[14].re - k[3].re;
k[3].im = in[14].im + k[3].im;
k[4].re = in[11].re - k[4].re;
k[4].im = in[11].im + k[4].im;
k[5].re = in[12].re - k[5].re;
k[5].im = in[12].im + k[5].im;
k[6].re = in[13].re - k[6].re;
k[6].im = in[13].im + k[6].im;
k[7].re = in[14].re - k[7].re;
k[7].im = in[14].im + k[7].im;
/* 15pt start here */
t[0].re = y[3].re + y[0].re;
t[0].im = y[3].im + y[0].im;
t[1].re = y[2].re + y[1].re;
t[1].im = y[2].im + y[1].im;
t[2].re = y[1].re - y[2].re;
t[2].im = y[1].im - y[2].im;
t[3].re = y[0].re - y[3].re;
t[3].im = y[0].im - y[3].im;
t[4].re = k[3].re + k[0].re;
t[4].im = k[3].im + k[0].im;
t[5].re = k[2].re + k[1].re;
t[5].im = k[2].im + k[1].im;
t[6].re = k[1].re - k[2].re;
t[6].im = k[1].im - k[2].im;
t[7].re = k[0].re - k[3].re;
t[7].im = k[0].im - k[3].im;
t[ 8].re = k[7].re + k[4].re;
t[ 8].im = k[7].im + k[4].im;
t[ 9].re = k[6].re + k[5].re;
t[ 9].im = k[6].im + k[5].im;
t[10].re = k[5].re - k[6].re;
t[10].im = k[5].im - k[6].im;
t[11].re = k[4].re - k[7].re;
t[11].im = k[4].im - k[7].im;
out[ 0*stride].re = dc[0].re + t[0].re + t[ 1].re;
out[ 0*stride].im = dc[0].im + t[0].im + t[ 1].im;
out[10*stride].re = dc[1].re + t[4].re + t[ 5].re;
out[10*stride].im = dc[1].im + t[4].im + t[ 5].im;
out[ 5*stride].re = dc[2].re + t[8].re + t[ 9].re;
out[ 5*stride].im = dc[2].im + t[8].im + t[ 9].im;
r[0].re = t[0].re * tab[0];
r[0].im = t[0].im * tab[1];
r[1].re = t[1].re * tab[0];
r[1].im = t[1].im * tab[1];
r[2].re = t[2].re * tab[4];
r[2].im = t[2].im * tab[5];
r[3].re = t[3].re * tab[4];
r[3].im = t[3].im * tab[5];
r[4].re = t[4].re * tab[0];
r[4].im = t[4].im * tab[1];
r[5].re = t[5].re * tab[0];
r[5].im = t[5].im * tab[1];
r[6].re = t[6].re * tab[4];
r[6].im = t[6].im * tab[5];
r[7].re = t[7].re * tab[4];
r[7].im = t[7].im * tab[5];
r[ 8].re = t[ 8].re * tab[0];
r[ 8].im = t[ 8].im * tab[1];
r[ 9].re = t[ 9].re * tab[0];
r[ 9].im = t[ 9].im * tab[1];
r[10].re = t[10].re * tab[4];
r[10].im = t[10].im * tab[5];
r[11].re = t[11].re * tab[4];
r[11].im = t[11].im * tab[5];
t[0].re = t[0].re * tab[2];
t[0].im = t[0].im * tab[3];
t[1].re = t[1].re * tab[2];
t[1].im = t[1].im * tab[3];
t[2].re = t[2].re * tab[6];
t[2].im = t[2].im * tab[7];
t[3].re = t[3].re * tab[6];
t[3].im = t[3].im * tab[7];
t[4].re = t[4].re * tab[2];
t[4].im = t[4].im * tab[3];
t[5].re = t[5].re * tab[2];
t[5].im = t[5].im * tab[3];
t[6].re = t[6].re * tab[6];
t[6].im = t[6].im * tab[7];
t[7].re = t[7].re * tab[6];
t[7].im = t[7].im * tab[7];
t[ 8].re = t[ 8].re * tab[2];
t[ 8].im = t[ 8].im * tab[3];
t[ 9].re = t[ 9].re * tab[2];
t[ 9].im = t[ 9].im * tab[3];
t[10].re = t[10].re * tab[6];
t[10].im = t[10].im * tab[7];
t[11].re = t[11].re * tab[6];
t[11].im = t[11].im * tab[7];
r[0].re = r[0].re - t[1].re;
r[0].im = r[0].im - t[1].im;
r[1].re = r[1].re - t[0].re;
r[1].im = r[1].im - t[0].im;
r[2].re = r[2].re - t[3].re;
r[2].im = r[2].im - t[3].im;
r[3].re = r[3].re + t[2].re;
r[3].im = r[3].im + t[2].im;
r[4].re = r[4].re - t[5].re;
r[4].im = r[4].im - t[5].im;
r[5].re = r[5].re - t[4].re;
r[5].im = r[5].im - t[4].im;
r[6].re = r[6].re - t[7].re;
r[6].im = r[6].im - t[7].im;
r[7].re = r[7].re + t[6].re;
r[7].im = r[7].im + t[6].im;
r[ 8].re = r[ 8].re - t[ 9].re;
r[ 8].im = r[ 8].im - t[ 9].im;
r[ 9].re = r[ 9].re - t[ 8].re;
r[ 9].im = r[ 9].im - t[ 8].im;
r[10].re = r[10].re - t[11].re;
r[10].im = r[10].im - t[11].im;
r[11].re = r[11].re + t[10].re;
r[11].im = r[11].im + t[10].im;
z0[ 0].re = r[ 3].im + r[ 0].re;
z0[ 0].im = r[ 3].re + r[ 0].im;
z0[ 1].re = r[ 2].im + r[ 1].re;
z0[ 1].im = r[ 2].re + r[ 1].im;
z0[ 2].re = r[ 1].im - r[ 2].re;
z0[ 2].im = r[ 1].re - r[ 2].im;
z0[ 3].re = r[ 0].im - r[ 3].re;
z0[ 3].im = r[ 0].re - r[ 3].im;
z0[ 4].re = r[ 7].im + r[ 4].re;
z0[ 4].im = r[ 7].re + r[ 4].im;
z0[ 5].re = r[ 6].im + r[ 5].re;
z0[ 5].im = r[ 6].re + r[ 5].im;
z0[ 6].re = r[ 5].im - r[ 6].re;
z0[ 6].im = r[ 5].re - r[ 6].im;
z0[ 7].re = r[ 4].im - r[ 7].re;
z0[ 7].im = r[ 4].re - r[ 7].im;
z0[ 8].re = r[11].im + r[ 8].re;
z0[ 8].im = r[11].re + r[ 8].im;
z0[ 9].re = r[10].im + r[ 9].re;
z0[ 9].im = r[10].re + r[ 9].im;
z0[10].re = r[ 9].im - r[10].re;
z0[10].im = r[ 9].re - r[10].im;
z0[11].re = r[ 8].im - r[11].re;
z0[11].im = r[ 8].re - r[11].im;
out[ 6*stride].re = dc[0].re + z0[0].re;
out[ 6*stride].im = dc[0].im + z0[3].re;
out[12*stride].re = dc[0].re + z0[2].im;
out[12*stride].im = dc[0].im + z0[1].im;
out[ 3*stride].re = dc[0].re + z0[1].re;
out[ 3*stride].im = dc[0].im + z0[2].re;
out[ 9*stride].re = dc[0].re + z0[3].im;
out[ 9*stride].im = dc[0].im + z0[0].im;
out[ 1*stride].re = dc[1].re + z0[4].re;
out[ 1*stride].im = dc[1].im + z0[7].re;
out[ 7*stride].re = dc[1].re + z0[6].im;
out[ 7*stride].im = dc[1].im + z0[5].im;
out[13*stride].re = dc[1].re + z0[5].re;
out[13*stride].im = dc[1].im + z0[6].re;
out[ 4*stride].re = dc[1].re + z0[7].im;
out[ 4*stride].im = dc[1].im + z0[4].im;
out[11*stride].re = dc[2].re + z0[8].re;
out[11*stride].im = dc[2].im + z0[11].re;
out[ 2*stride].re = dc[2].re + z0[10].im;
out[ 2*stride].im = dc[2].im + z0[9].im;
out[ 8*stride].re = dc[2].re + z0[9].re;
out[ 8*stride].im = dc[2].im + z0[10].re;
out[14*stride].re = dc[2].re + z0[11].im;
out[14*stride].im = dc[2].im + z0[8].im;
}
```
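The `tab[]` indices used above refer to the reworked `ff_tx_tab_53` layout introduced
by this commit (see the `tx_template.c` hunk below): the 5-point factors come first,
each stored twice to avoid AVX lane shuffles, followed by the 3-point factors. A
float-only sketch of that initialization, for reference (the `init_tab_53` helper
name is illustrative):
``` C
#include <math.h>

/* Mirrors the float case of TX_TAB(ff_tx_init_tab_53) below. */
static void init_tab_53(float tab[12])
{
    /* 5pt, doubled to eliminate AVX lane shuffles */
    tab[0] = tab[1] = cosf(2 * M_PI / 5);
    tab[2] = tab[3] = cosf(2 * M_PI / 10);
    tab[4] = tab[5] = sinf(2 * M_PI / 5);
    tab[6] = tab[7] = sinf(2 * M_PI / 10);
    /* 3pt */
    tab[8] = tab[9] = cosf(2 * M_PI / 12);
    tab[10]         = cosf(2 * M_PI / 6);
    tab[11]         = cosf(8 * M_PI / 6);
}
```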

@@ -48,9 +48,9 @@ SR_TABLE(65536);
SR_TABLE(131072);
/* Other factors' tables */
TABLE_DEF(53, 8);
TABLE_DEF( 7, 6);
TABLE_DEF( 9, 8);
TABLE_DEF(53, 12);
TABLE_DEF( 7, 6);
TABLE_DEF( 9, 8);
typedef struct FFSRTabsInitOnce {
void (*func)(void);
@@ -104,19 +104,26 @@ static FFSRTabsInitOnce sr_tabs_init_once[] = {
{ TX_TAB(ff_tx_init_tab_131072), AV_ONCE_INIT },
};
static void TX_TAB(ff_tx_init_tab_53)(void)
static av_cold void TX_TAB(ff_tx_init_tab_53)(void)
{
TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI / 12));
TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI / 12));
TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 6));
TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(8 * M_PI / 6));
TX_TAB(ff_tx_tab_53)[4] = RESCALE(cos(2 * M_PI / 5));
TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(8 * M_PI / 5));
TX_TAB(ff_tx_tab_53)[6] = RESCALE(cos(2 * M_PI / 10));
TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(6 * M_PI / 5));
/* 5pt, doubled to eliminate AVX lane shuffles */
TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI / 5));
TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI / 5));
TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 10));
TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(2 * M_PI / 10));
TX_TAB(ff_tx_tab_53)[4] = RESCALE(sin(2 * M_PI / 5));
TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(2 * M_PI / 5));
TX_TAB(ff_tx_tab_53)[6] = RESCALE(sin(2 * M_PI / 10));
TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(2 * M_PI / 10));
/* 3pt */
TX_TAB(ff_tx_tab_53)[ 8] = RESCALE(cos(2 * M_PI / 12));
TX_TAB(ff_tx_tab_53)[ 9] = RESCALE(cos(2 * M_PI / 12));
TX_TAB(ff_tx_tab_53)[10] = RESCALE(cos(2 * M_PI / 6));
TX_TAB(ff_tx_tab_53)[11] = RESCALE(cos(8 * M_PI / 6));
}
static void TX_TAB(ff_tx_init_tab_7)(void)
static av_cold void TX_TAB(ff_tx_init_tab_7)(void)
{
TX_TAB(ff_tx_tab_7)[0] = RESCALE(cos(2 * M_PI / 7));
TX_TAB(ff_tx_tab_7)[1] = RESCALE(sin(2 * M_PI / 7));
@@ -126,7 +133,7 @@ static void TX_TAB(ff_tx_init_tab_7)(void)
TX_TAB(ff_tx_tab_7)[5] = RESCALE(sin(2 * M_PI / 14));
}
static void TX_TAB(ff_tx_init_tab_9)(void)
static av_cold void TX_TAB(ff_tx_init_tab_9)(void)
{
TX_TAB(ff_tx_tab_9)[0] = RESCALE(cos(2 * M_PI / 3));
TX_TAB(ff_tx_tab_9)[1] = RESCALE(sin(2 * M_PI / 3));
@@ -189,19 +196,19 @@ static av_always_inline void fft3(TXComplex *out, TXComplex *in,
out[0*stride].im = in[0].im + tmp[1].im;
#ifdef TX_INT32
mtmp[0] = (int64_t)tab[0] * tmp[0].re;
mtmp[1] = (int64_t)tab[1] * tmp[0].im;
mtmp[2] = (int64_t)tab[2] * tmp[1].re;
mtmp[3] = (int64_t)tab[2] * tmp[1].im;
mtmp[0] = (int64_t)tab[ 8] * tmp[0].re;
mtmp[1] = (int64_t)tab[ 9] * tmp[0].im;
mtmp[2] = (int64_t)tab[10] * tmp[1].re;
mtmp[3] = (int64_t)tab[10] * tmp[1].im;
out[1*stride].re = in[0].re - (mtmp[2] + mtmp[0] + 0x40000000 >> 31);
out[1*stride].im = in[0].im - (mtmp[3] - mtmp[1] + 0x40000000 >> 31);
out[2*stride].re = in[0].re - (mtmp[2] - mtmp[0] + 0x40000000 >> 31);
out[2*stride].im = in[0].im - (mtmp[3] + mtmp[1] + 0x40000000 >> 31);
#else
tmp[0].re = tab[0] * tmp[0].re;
tmp[0].im = tab[1] * tmp[0].im;
tmp[1].re = tab[2] * tmp[1].re;
tmp[1].im = tab[2] * tmp[1].im;
tmp[0].re = tab[ 8] * tmp[0].re;
tmp[0].im = tab[ 9] * tmp[0].im;
tmp[1].re = tab[10] * tmp[1].re;
tmp[1].im = tab[10] * tmp[1].im;
out[1*stride].re = in[0].re - tmp[1].re + tmp[0].re;
out[1*stride].im = in[0].im - tmp[1].im - tmp[0].im;
out[2*stride].re = in[0].re - tmp[1].re - tmp[0].re;
@@ -224,10 +231,10 @@ static av_always_inline void NAME(TXComplex *out, TXComplex *in, \
out[D0*stride].re = in[0].re + t[0].re + t[2].re; \
out[D0*stride].im = in[0].im + t[0].im + t[2].im; \
\
SMUL(t[4].re, t[0].re, tab[4], tab[6], t[2].re, t[0].re); \
SMUL(t[4].im, t[0].im, tab[4], tab[6], t[2].im, t[0].im); \
CMUL(t[5].re, t[1].re, -tab[5], -tab[7], t[3].re, t[1].re); \
CMUL(t[5].im, t[1].im, -tab[5], -tab[7], t[3].im, t[1].im); \
SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re); \
SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im); \
CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re); \
CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im); \
\
BF(z0[0].re, z0[3].re, t[0].re, t[1].re); \
BF(z0[0].im, z0[3].im, t[0].im, t[1].im); \

@@ -51,6 +51,8 @@ cextern tab_ %+ i %+ _float ; ff_tab_i_float...
%assign i (i << 1)
%endrep
cextern tab_53_float
struc AVTXContext
.len: resd 1 ; Length
.inv resd 1 ; Inverse flag
@@ -87,6 +89,9 @@ s16_mult_odd1: dd COS16_1, COS16_1, COS16_3, COS16_3, COS16_1, -COS16_1, CO
s16_mult_odd2: dd COS16_3, -COS16_3, COS16_1, -COS16_1, -COS16_3, -COS16_3, -COS16_1, -COS16_1
s16_perm: dd 0, 1, 2, 3, 1, 0, 3, 2
s15_perm: dd 0, 6, 5, 3, 2, 4, 7, 1
mask_mmmmmmpp: dd NEG, NEG, NEG, NEG, NEG, NEG, POS, POS
mask_mmmmpppm: dd NEG, NEG, NEG, NEG, POS, POS, POS, NEG
mask_ppmpmmpm: dd POS, POS, NEG, POS, NEG, NEG, POS, NEG
mask_mppmmpmp: dd NEG, POS, POS, NEG, NEG, POS, NEG, POS
@@ -1565,3 +1570,278 @@ cglobal mdct_sr_inv_float, 4, 13, 16, 272, ctx, out, in, stride, len, lut, exp,
%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
IMDCT_FN avx2
%endif
%macro PFA_15_FN 2
INIT_YMM %1
%if %2
cglobal fft_pfa_15xM_asm_float, 0, 0, 0, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \
tgt5, stride3, stride5, btmp
%else
cglobal fft_pfa_15xM_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \
tgt5, stride3, stride5, btmp
%endif
%if %2
PUSH inq
PUSH tgt5q
PUSH stride3q
PUSH stride5q
PUSH btmpq
%endif
mov btmpq, outq
mov outq, [ctxq + AVTXContext.tmp]
%if !%2
movsxd lenq, dword [ctxq + AVTXContext.len]
mov lutq, [ctxq + AVTXContext.map]
%endif
; Load stride (second transform's length) and second transform's LUT
mov tmpq, [ctxq + AVTXContext.sub]
movsxd strideq, dword [tmpq + AVTXContext.len]
mov mapq, [tmpq + AVTXContext.map]
shl strideq, 3
imul stride3q, strideq, 3
imul stride5q, strideq, 5
movaps m13, [mask_mmmmmmpp] ; mmmmmmpp
vpermpd m12, m13, q0033 ; ppppmmmm
vextractf128 xm11, m13, 1 ; mmpp
movaps m10, [ff_tx_tab_53_float] ; tab5
movaps xm9, [ff_tx_tab_53_float + 32] ; tab3
movaps m8, [s15_perm]
.dim1:
mov tmpd, [mapq]
lea tgtq, [outq + tmpq*8]
%if %2
movups xm0, [inq]
%else
LOAD64_LUT xm0, inq, lutq, 0, tmpq, m14, xm15 ; in[0,1].reim
%endif
shufps xm1, xm0, xm0, q3223 ; in[1].imrereim
shufps xm0, xm0, xm0, q1001 ; in[0].imrereim
xorps xm1, xm11
addps xm1, xm0 ; pc[0,1].imre
%if %2
movddup xm14, [inq + 16] ; in[2].reimreim
%else
mov tmpd, [lutq + 8]
movddup xm14, [inq + tmpq*8] ; in[2].reimreim
%endif
shufps xm0, xm1, xm1, q3232 ; pc[1].reimreim
addps xm0, xm14 ; dc[0].reimreim
mulps xm1, xm9 ; tab[0123]*pc[01]
shufpd xm5, xm1, xm1, 01b ; pc[1,0].reim
xorps xm1, xm11
addps xm1, xm1, xm5
addsubps xm1, xm14, xm1 ; dc[1,2].reim
%if %2
movups m2, [inq + mmsize*0 + 24]
movups m3, [inq + mmsize*1 + 24]
%else
LOAD64_LUT m2, inq, lutq, (mmsize/2)*0 + 12, tmpq, m14, m15
LOAD64_LUT m3, inq, lutq, (mmsize/2)*1 + 12, tmpq, m14, m15
%endif
subps m7, m2, m3 ; q[0-3].imre
addps m6, m2, m3 ; q[4-7]
shufps m7, m7, m7, q2301 ; q[0-3].reim
%if %2
movups m4, [inq + mmsize*2 + 24]
%else
LOAD64_LUT m4, inq, lutq, (mmsize/2)*2 + 12, tmpq, m14, m15
%endif
addps m5, m4, m6 ; y[0-3]
vpermpd m14, m9, q1111 ; tab[23232323]
vbroadcastsd m15, xm9 ; tab[01010101]
mulps m6, m14
mulps m7, m15
subps m2, m6, m7 ; k[0-3]
addps m3, m6, m7 ; k[4-7]
addsubps m6, m4, m2 ; k[0-3]
addsubps m7, m4, m3 ; k[4-7]
; 15pt from here on
vpermpd m2, m5, q0123 ; y[3-0]
vpermpd m3, m6, q0123 ; k[3-0]
vpermpd m4, m7, q0123 ; k[7-4]
xorps m5, m12
xorps m6, m12
xorps m7, m12
addps m2, m5 ; t[0-3]
addps m3, m6 ; t[4-7]
addps m4, m7 ; t[8-11]
movlhps xm14, xm2 ; out[0]
unpcklpd xm7, xm3, xm4 ; out[10,5]
unpckhpd xm5, xm3, xm4 ; out[10,5]
addps xm14, xm2 ; out[0]
addps xm7, xm5 ; out[10,5]
addps xm14, xm0 ; out[0]
addps xm7, xm1 ; out[10,5]
movhps [tgtq], xm14 ; out[0]
movhps [tgtq + stride5q*1], xm7 ; out[5]
movlps [tgtq + stride5q*2], xm7 ; out[10]
shufps m14, m10, m10, q3232 ; tab5 4 5 4 5 8 9 8 9
shufps m15, m10, m10, q1010 ; tab5 6 7 6 7 10 11 10 11
mulps m5, m2, m14 ; t[0-3]
mulps m6, m3, m14 ; t[4-7]
mulps m7, m4, m14 ; t[8-11]
mulps m2, m15 ; r[0-3]
mulps m3, m15 ; r[4-7]
mulps m4, m15 ; r[8-11]
shufps m5, m5, m5, q1032 ; t[1,0,3,2].reim
shufps m6, m6, m6, q1032 ; t[5,4,7,6].reim
shufps m7, m7, m7, q1032 ; t[9,8,11,10].reim
lea tgt5q, [tgtq + stride5q]
lea tmpq, [tgtq + stride5q*2]
xorps m5, m13
xorps m6, m13
xorps m7, m13
addps m2, m5 ; r[0,1,2,3]
addps m3, m6 ; r[4,5,6,7]
addps m4, m7 ; r[8,9,10,11]
shufps m5, m2, m2, q2301
shufps m6, m3, m3, q2301
shufps m7, m4, m4, q2301
xorps m2, m12
xorps m3, m12
xorps m4, m12
vpermpd m5, m5, q0123
vpermpd m6, m6, q0123
vpermpd m7, m7, q0123
addps m5, m2
addps m6, m3
addps m7, m4
vpermps m5, m8, m5
vpermps m6, m8, m6
vpermps m7, m8, m7
vbroadcastsd m0, xm0 ; dc[0]
vpermpd m2, m1, q1111 ; dc[2]
vbroadcastsd m1, xm1 ; dc[1]
addps m0, m5
addps m1, m6
addps m2, m7
vextractf128 xm3, m0, 1
vextractf128 xm4, m1, 1
vextractf128 xm5, m2, 1
movlps [tgtq + strideq*1], xm1
movhps [tgtq + strideq*2], xm2
movlps [tgtq + stride3q*1], xm3
movhps [tgtq + strideq*4], xm4
movlps [tgtq + stride3q*2], xm0
movlps [tgtq + strideq*8], xm5
movhps [tgtq + stride3q*4], xm0
movhps [tgt5q + strideq*2], xm1
movhps [tgt5q + strideq*4], xm3
movlps [tmpq + strideq*1], xm2
movlps [tmpq + stride3q*1], xm4
movhps [tmpq + strideq*4], xm5
%if %2
add inq, mmsize*3 + 24
%else
add lutq, (mmsize/2)*3 + 12
%endif
add mapq, 4
sub lenq, 15
jg .dim1
; Second transform setup
mov stride5q, ctxq ; backup original context
movsxd stride3q, dword [ctxq + AVTXContext.len] ; full length
mov tgt5q, [ctxq + AVTXContext.fn] ; subtransform's jump point
mov inq, outq ; in-place transform
mov ctxq, [ctxq + AVTXContext.sub] ; load subtransform's context
mov lutq, [ctxq + AVTXContext.map] ; load subtransform's map
movsxd lenq, dword [ctxq + AVTXContext.len] ; load subtransform's length
.dim2:
call tgt5q ; call the FFT
lea inq, [inq + lenq*8]
lea outq, [outq + lenq*8]
sub stride3q, lenq
jg .dim2
mov ctxq, stride5q ; restore original context
mov lutq, [ctxq + AVTXContext.map]
mov inq, [ctxq + AVTXContext.tmp]
movsxd lenq, dword [ctxq + AVTXContext.len] ; full length
lea stride3q, [lutq + lenq*4] ; second part of the LUT
mov stride5q, lenq
mov tgt5q, btmpq
.post:
LOAD64_LUT m0, inq, stride3q, 0, tmpq, m8, m9
movups [tgt5q], m0
add tgt5q, mmsize
add stride3q, mmsize/2
sub stride5q, mmsize/8
jg .post
%if %2
mov outq, btmpq
POP btmpq
POP stride5q
POP stride3q
POP tgt5q
POP inq
ret
%else
RET
%endif
%if %2
cglobal fft_pfa_15xM_ns_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \
tgt5, stride3, stride5, btmp
movsxd lenq, dword [ctxq + AVTXContext.len]
mov lutq, [ctxq + AVTXContext.map]
call mangle(fft_pfa_15xM_asm_float)
RET
%endif
%endmacro
%if ARCH_X86_64
PFA_15_FN avx2, 0
PFA_15_FN avx2, 1
%endif

@@ -43,6 +43,9 @@ TX_DECL_FN(fft_sr_ns, fma3)
TX_DECL_FN(fft_sr, avx2)
TX_DECL_FN(fft_sr_ns, avx2)
TX_DECL_FN(fft_pfa_15xM, avx2)
TX_DECL_FN(fft_pfa_15xM_ns, avx2)
TX_DECL_FN(mdct_sr_inv, avx2)
TX_DECL_FN(fft2_asm, sse3)
@@ -57,6 +60,8 @@ TX_DECL_FN(fft32_asm, fma3)
TX_DECL_FN(fft_sr_asm, fma3)
TX_DECL_FN(fft_sr_asm, avx2)
TX_DECL_FN(fft_pfa_15xM_asm, avx2)
#define DECL_INIT_FN(basis, interleave) \
static av_cold int b ##basis## _i ##interleave(AVTXContext *s, \
const FFTXCodelet *cd, \
@@ -102,6 +107,61 @@ static av_cold int m_inv_init(AVTXContext *s, const FFTXCodelet *cd,
return 0;
}
static av_cold int fft_pfa_init(AVTXContext *s,
                                const FFTXCodelet *cd,
                                uint64_t flags,
                                FFTXCodeletOptions *opts,
                                int len, int inv,
                                const void *scale)
{
    int ret;
    int sub_len = len / cd->factors[0];
    FFTXCodeletOptions sub_opts = { .invert_lookup = 0 };

    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
    flags |=  AV_TX_INPLACE;      /* in-place */
    flags |=  FF_TX_PRESHUFFLE;   /* This function handles the permute step */
    flags |=  FF_TX_ASM_CALL;     /* We want an assembly function, not C */

    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
                                sub_len, inv, scale)))
        return ret;

    if ((ret = ff_tx_gen_compound_mapping(s, cd->factors[0], sub_len)))
        return ret;

    if (cd->factors[0] == 15) {
        for (int k = 0; k < s->sub[0].len; k++) {
            int cnt = 0;
            int tmp[15];
            memcpy(tmp, &s->map[k*15], 15*sizeof(*tmp));
            for (int i = 1; i < 15; i += 3) {
                s->map[k*15 + cnt] = tmp[i];
                cnt++;
            }
            for (int i = 2; i < 15; i += 3) {
                s->map[k*15 + cnt] = tmp[i];
                cnt++;
            }
            for (int i = 0; i < 15; i += 3) {
                s->map[k*15 + cnt] = tmp[i];
                cnt++;
            }
            memmove(&s->map[k*15 + 7], &s->map[k*15 + 6], 4*sizeof(int));
            memmove(&s->map[k*15 + 3], &s->map[k*15 + 1], 4*sizeof(int));
            s->map[k*15 + 1] = tmp[2];
            s->map[k*15 + 2] = tmp[0];
        }
    }

    if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
        return AVERROR(ENOMEM);

    TX_TAB(ff_tx_init_tabs)(len / sub_len);

    return 0;
}
const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
TX_DEF(fft2, FFT, 2, 2, 2, 0, 128, NULL, sse3, SSE3, AV_TX_INPLACE, 0),
TX_DEF(fft2_asm, FFT, 2, 2, 2, 0, 192, b8_i0, sse3, SSE3,
@@ -150,6 +210,7 @@ const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
AV_CPU_FLAG_AVXSLOW),
#if HAVE_AVX2_EXTERNAL
TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 320, b8_i2, avx2, AVX2, 0,
AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
@@ -158,6 +219,13 @@ const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 384, b8_i2, avx2, AVX2, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
TX_DEF(fft_pfa_15xM, FFT, 60, TX_LEN_UNLIMITED, 15, TX_FACTOR_ANY, 320, fft_pfa_init, avx2, AVX2,
AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
TX_DEF(fft_pfa_15xM_asm, FFT, 60, TX_LEN_UNLIMITED, 15, TX_FACTOR_ANY, 384, fft_pfa_init, avx2, AVX2,
AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
TX_DEF(fft_pfa_15xM_ns, FFT, 60, TX_LEN_UNLIMITED, 15, TX_FACTOR_ANY, 384, fft_pfa_init, avx2, AVX2,
AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
TX_DEF(mdct_sr_inv, MDCT, 16, TX_LEN_UNLIMITED, 2, TX_FACTOR_ANY, 384, m_inv_init, avx2, AVX2,
FF_TX_INVERSE_ONLY, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
#endif

@@ -24,7 +24,7 @@
#include <stdlib.h>
#define EPS 0.00005
#define EPS 0.0005
#define SCALE_NOOP(x) (x)
#define SCALE_INT20(x) (av_clip64(lrintf((x) * 2147483648.0), INT32_MIN, INT32_MAX) >> 12)
@@ -40,7 +40,7 @@
} while (0)
static const int check_lens[] = {
2, 4, 8, 16, 32, 64, 1024, 16384,
2, 4, 8, 16, 32, 64, 120, 960, 1024, 1920, 16384,
};
static AVTXContext *tx_refs[AV_TX_NB][2 /* Direction */][FF_ARRAY_ELEMS(check_lens)] = { 0 };
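With 120, 960 and 1920 added to `check_lens`, checkasm now exercises the 15xN PFA path.
The same path can be reached from user code through the public `av_tx` API; a minimal
sketch (whether the AVX2 codelet is actually selected still depends on CPU flags, and
the `fft120` wrapper name is only illustrative):
``` C
#include <libavutil/tx.h>

/* Run a 120-point (15 * 8) forward float FFT through the public API. */
static int fft120(AVComplexFloat *out, AVComplexFloat *in)
{
    AVTXContext *ctx = NULL;
    av_tx_fn fn;
    float scale = 1.0f;
    int ret = av_tx_init(&ctx, &fn, AV_TX_FLOAT_FFT, 0, 120, &scale, 0);
    if (ret < 0)
        return ret;
    fn(ctx, out, in, sizeof(AVComplexFloat)); /* stride is in bytes */
    av_tx_uninit(&ctx);
    return 0;
}
```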
