diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index 8108a61e60..1f64cd2ace 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -244,6 +244,7 @@ typedef signed char schar; /* fundamental constants */ #define CV_PI 3.1415926535897932384626433832795 +#define CV_2PI 6.283185307179586476925286766559 #define CV_LOG2 0.69314718055994530941723212145818 /****************************************************************************************\ diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp index d3327940db..bbe0f74001 100644 --- a/modules/core/src/dxt.cpp +++ b/modules/core/src/dxt.cpp @@ -43,6 +43,7 @@ #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp" #include "opencv2/core/opencl/runtime/opencl_core.hpp" #include "opencl_kernels.hpp" +#include namespace cv { @@ -1781,6 +1782,375 @@ static bool ippi_DFT_R_32F(const Mat& src, Mat& dst, bool inv, int norm_flag) #endif } +#ifdef HAVE_OPENCL + +namespace cv +{ + +enum FftType +{ + R2R = 0, // real to CCS in case forward transform, CCS to real otherwise + C2R = 1, // complex to real in case inverse transform + R2C = 2, // real to complex in case forward transform + C2C = 3 // complex to complex +}; + +struct OCL_FftPlan +{ +private: + UMat twiddles; + String buildOptions; + int thread_count; + bool status; + int dft_size; + +public: + OCL_FftPlan(int _size): dft_size(_size), status(true) + { + int min_radix; + std::vector radixes, blocks; + ocl_getRadixes(dft_size, radixes, blocks, min_radix); + thread_count = dft_size / min_radix; + + if (thread_count > (int) ocl::Device::getDefault().maxWorkGroupSize()) + { + status = false; + return; + } + + // generate string with radix calls + String radix_processing; + int n = 1, twiddle_size = 0; + for (size_t i=0; i 1) + radix_processing += format("fft_radix%d_B%d(smem,twiddles+%d,ind,%d,%d);", radix, block, twiddle_size, n, dft_size/radix); + else + radix_processing += format("fft_radix%d(smem,twiddles+%d,ind,%d,%d);", radix, twiddle_size, n, dft_size/radix); + twiddle_size += (radix-1)*n; + n *= radix; + } + + Mat tw(1, twiddle_size, CV_32FC2); + float* ptr = tw.ptr(); + int ptr_index = 0; + + n = 1; + for (size_t i=0; i& radixes, std::vector& blocks, int& min_radix) + { + int factors[34]; + int nf = DFTFactorize(cols, factors); + + int n = 1; + int factor_index = 0; + min_radix = INT_MAX; + + // 2^n transforms + if ((factors[factor_index] & 1) == 0) + { + for( ; n < factors[factor_index];) + { + int radix = 2, block = 1; + if (8*n <= factors[0]) + radix = 8; + else if (4*n <= factors[0]) + { + radix = 4; + if (cols % 12 == 0) + block = 3; + else if (cols % 8 == 0) + block = 2; + } + else + { + if (cols % 10 == 0) + block = 5; + else if (cols % 8 == 0) + block = 4; + else if (cols % 6 == 0) + block = 3; + else if (cols % 4 == 0) + block = 2; + } + + radixes.push_back(radix); + blocks.push_back(block); + min_radix = min(min_radix, block*radix); + n *= radix; + } + factor_index++; + } + + // all the other transforms + for( ; factor_index < nf; factor_index++) + { + int radix = factors[factor_index], block = 1; + if (radix == 3) + { + if (cols % 12 == 0) + block = 4; + else if (cols % 9 == 0) + block = 3; + else if (cols % 6 == 0) + block = 2; + } + else if (radix == 5) + { + if (cols % 10 == 0) + block = 2; + } + radixes.push_back(radix); + blocks.push_back(block); + min_radix = min(min_radix, block*radix); + } + } +}; + +class OCL_FftPlanCache +{ +public: + static OCL_FftPlanCache & getInstance() + { + static OCL_FftPlanCache planCache; + return planCache; + } + + Ptr getFftPlan(int dft_size) + { + std::map >::iterator f = planStorage.find(dft_size); + if (f != planStorage.end()) + { + return f->second; + } + else + { + Ptr newPlan = Ptr(new OCL_FftPlan(dft_size)); + planStorage[dft_size] = newPlan; + return newPlan; + } + } + + ~OCL_FftPlanCache() + { + planStorage.clear(); + } + +protected: + OCL_FftPlanCache() : + planStorage() + { + } + std::map > planStorage; +}; + +static bool ocl_dft_rows(InputArray _src, OutputArray _dst, int nonzero_rows, int flags, int fftType) +{ + Ptr plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.cols()); + return plan->enqueueTransform(_src, _dst, nonzero_rows, flags, fftType, true); +} + +static bool ocl_dft_cols(InputArray _src, OutputArray _dst, int nonzero_cols, int flags, int fftType) +{ + Ptr plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.rows()); + return plan->enqueueTransform(_src, _dst, nonzero_cols, flags, fftType, false); +} + +static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_rows) +{ + int type = _src.type(), cn = CV_MAT_CN(type); + Size ssize = _src.size(); + if ( !(type == CV_32FC1 || type == CV_32FC2) ) + return false; + + // if is not a multiplication of prime numbers { 2, 3, 5 } + if (ssize.area() != getOptimalDFTSize(ssize.area())) + return false; + + UMat src = _src.getUMat(); + int complex_input = cn == 2 ? 1 : 0; + int complex_output = (flags & DFT_COMPLEX_OUTPUT) != 0; + int real_input = cn == 1 ? 1 : 0; + int real_output = (flags & DFT_REAL_OUTPUT) != 0; + bool inv = (flags & DFT_INVERSE) != 0 ? 1 : 0; + + if( nonzero_rows <= 0 || nonzero_rows > _src.rows() ) + nonzero_rows = _src.rows(); + bool is1d = (flags & DFT_ROWS) != 0 || nonzero_rows == 1; + + // if output format is not specified + if (complex_output + real_output == 0) + { + if (real_input) + real_output = 1; + else + complex_output = 1; + } + + FftType fftType = (FftType)(complex_input << 0 | complex_output << 1); + + // Forward Complex to CCS not supported + if (fftType == C2R && !inv) + fftType = C2C; + + // Inverse CCS to Complex not supported + if (fftType == R2C && inv) + fftType = R2R; + + UMat output; + if (fftType == C2C || fftType == R2C) + { + // complex output + _dst.create(src.size(), CV_32FC2); + output = _dst.getUMat(); + } + else + { + // real output + if (is1d) + { + _dst.create(src.size(), CV_32FC1); + output = _dst.getUMat(); + } + else + { + _dst.create(src.size(), CV_32FC1); + output.create(src.size(), CV_32FC2); + } + } + + if (!inv) + { + if (!ocl_dft_rows(src, output, nonzero_rows, flags, fftType)) + return false; + + if (!is1d) + { + int nonzero_cols = fftType == R2R ? output.cols/2 + 1 : output.cols; + if (!ocl_dft_cols(output, _dst, nonzero_cols, flags, fftType)) + return false; + } + } + else + { + if (fftType == C2C) + { + // complex output + if (!ocl_dft_rows(src, output, nonzero_rows, flags, fftType)) + return false; + + if (!is1d) + { + if (!ocl_dft_cols(output, output, output.cols, flags, fftType)) + return false; + } + } + else + { + if (is1d) + { + if (!ocl_dft_rows(src, output, nonzero_rows, flags, fftType)) + return false; + } + else + { + int nonzero_cols = src.cols/2 + 1; + if (!ocl_dft_cols(src, output, nonzero_cols, flags, fftType)) + return false; + + if (!ocl_dft_rows(output, _dst, nonzero_rows, flags, fftType)) + return false; + } + } + } + return true; +} + +} // namespace cv; + +#endif + #ifdef HAVE_CLAMDFFT namespace cv { @@ -1791,14 +2161,6 @@ namespace cv { CV_Assert(s == CLFFT_SUCCESS); \ } -enum FftType -{ - R2R = 0, // real to real - C2R = 1, // opencl HERMITIAN_INTERLEAVED to real - R2C = 2, // real to opencl HERMITIAN_INTERLEAVED - C2C = 3 // complex to complex -}; - class PlanCache { struct FftPlan @@ -1923,7 +2285,7 @@ public: } // no baked plan is found, so let's create a new one - FftPlan * newPlan = new FftPlan(dft_size, src_step, dst_step, doubleFP, inplace, flags, fftType); + Ptr newPlan = Ptr(new FftPlan(dft_size, src_step, dst_step, doubleFP, inplace, flags, fftType)); planStorage.push_back(newPlan); return newPlan->plHandle; @@ -1931,8 +2293,6 @@ public: ~PlanCache() { - for (std::vector::iterator i = planStorage.begin(), end = planStorage.end(); i != end; ++i) - delete (*i); planStorage.clear(); } @@ -1942,7 +2302,7 @@ protected: { } - std::vector planStorage; + std::vector > planStorage; }; extern "C" { @@ -1960,7 +2320,7 @@ static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p) } -static bool ocl_dft(InputArray _src, OutputArray _dst, int flags) +static bool ocl_dft_amdfft(InputArray _src, OutputArray _dst, int flags) { int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); Size ssize = _src.size(); @@ -2019,7 +2379,6 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags) tmpBuffer.addref(); clSetEventCallback(e, CL_COMPLETE, oclCleanupCallback, tmpBuffer.u); - return true; } @@ -2034,7 +2393,12 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows ) #ifdef HAVE_CLAMDFFT CV_OCL_RUN(ocl::haveAmdFft() && ocl::Device::getDefault().type() != ocl::Device::TYPE_CPU && _dst.isUMat() && _src0.dims() <= 2 && nonzero_rows == 0, - ocl_dft(_src0, _dst, flags)) + ocl_dft_amdfft(_src0, _dst, flags)) +#endif + +#ifdef HAVE_OPENCL + CV_OCL_RUN(_dst.isUMat() && _src0.dims() <= 2, + ocl_dft(_src0, _dst, flags, nonzero_rows)) #endif static DFTFunc dft_tbl[6] = @@ -2046,10 +2410,8 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows ) (DFTFunc)RealDFT_64f, (DFTFunc)CCSIDFT_64f }; - AutoBuffer buf; void *spec = 0; - Mat src0 = _src0.getMat(), src = src0; int prev_len = 0, stage = 0; bool inv = (flags & DFT_INVERSE) != 0; diff --git a/modules/core/src/opencl/fft.cl b/modules/core/src/opencl/fft.cl new file mode 100644 index 0000000000..1268c4d6e4 --- /dev/null +++ b/modules/core/src/opencl/fft.cl @@ -0,0 +1,864 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2014, Itseez, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. + +#define SQRT_2 0.707106781188f +#define sin_120 0.866025403784f +#define fft5_2 0.559016994374f +#define fft5_3 -0.951056516295f +#define fft5_4 -1.538841768587f +#define fft5_5 0.363271264002f + +__attribute__((always_inline)) +float2 mul_float2(float2 a, float2 b) { + return (float2)(fma(a.x, b.x, -a.y * b.y), fma(a.x, b.y, a.y * b.x)); +} + +__attribute__((always_inline)) +float2 twiddle(float2 a) { + return (float2)(a.y, -a.x); +} + +__attribute__((always_inline)) +void butterfly2(float2 a0, float2 a1, __local float2* smem, __global const float2* twiddles, + const int x, const int block_size) +{ + const int k = x & (block_size - 1); + a1 = mul_float2(twiddles[k], a1); + const int dst_ind = (x << 1) - k; + + smem[dst_ind] = a0 + a1; + smem[dst_ind+block_size] = a0 - a1; +} + +__attribute__((always_inline)) +void butterfly4(float2 a0, float2 a1, float2 a2, float2 a3, __local float2* smem, __global const float2* twiddles, + const int x, const int block_size) +{ + const int k = x & (block_size - 1); + a1 = mul_float2(twiddles[k], a1); + a2 = mul_float2(twiddles[k + block_size], a2); + a3 = mul_float2(twiddles[k + 2*block_size], a3); + + const int dst_ind = ((x - k) << 2) + k; + + float2 b0 = a0 + a2; + a2 = a0 - a2; + float2 b1 = a1 + a3; + a3 = twiddle(a1 - a3); + + smem[dst_ind] = b0 + b1; + smem[dst_ind + block_size] = a2 + a3; + smem[dst_ind + 2*block_size] = b0 - b1; + smem[dst_ind + 3*block_size] = a2 - a3; +} + +__attribute__((always_inline)) +void butterfly3(float2 a0, float2 a1, float2 a2, __local float2* smem, __global const float2* twiddles, + const int x, const int block_size) +{ + const int k = x % block_size; + a1 = mul_float2(twiddles[k], a1); + a2 = mul_float2(twiddles[k+block_size], a2); + const int dst_ind = ((x - k) * 3) + k; + + float2 b1 = a1 + a2; + a2 = twiddle(sin_120*(a1 - a2)); + float2 b0 = a0 - (float2)(0.5f)*b1; + + smem[dst_ind] = a0 + b1; + smem[dst_ind + block_size] = b0 + a2; + smem[dst_ind + 2*block_size] = b0 - a2; +} + +__attribute__((always_inline)) +void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local float2* smem, __global const float2* twiddles, + const int x, const int block_size) +{ + const int k = x % block_size; + a1 = mul_float2(twiddles[k], a1); + a2 = mul_float2(twiddles[k + block_size], a2); + a3 = mul_float2(twiddles[k+2*block_size], a3); + a4 = mul_float2(twiddles[k+3*block_size], a4); + + const int dst_ind = ((x - k) * 5) + k; + __local float2* dst = smem + dst_ind; + + float2 b0, b1, b5; + + b1 = a1 + a4; + a1 -= a4; + + a4 = a3 + a2; + a3 -= a2; + + a2 = b1 + a4; + b0 = a0 - (float2)0.25f * a2; + + b1 = fft5_2 * (b1 - a4); + a4 = fft5_3 * (float2)(-a1.y - a3.y, a1.x + a3.x); + b5 = (float2)(a4.x - fft5_5 * a1.y, a4.y + fft5_5 * a1.x); + + a4.x += fft5_4 * a3.y; + a4.y -= fft5_4 * a3.x; + + a1 = b0 + b1; + b0 -= b1; + + dst[0] = a0 + a2; + dst[block_size] = a1 + a4; + dst[2 * block_size] = b0 + b5; + dst[3 * block_size] = b0 - b5; + dst[4 * block_size] = a1 - a4; +} + +__attribute__((always_inline)) +void fft_radix2(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) +{ + float2 a0, a1; + + if (x < t) + { + a0 = smem[x]; + a1 = smem[x+t]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x < t) + butterfly2(a0, a1, smem, twiddles, x, block_size); + + barrier(CLK_LOCAL_MEM_FENCE); +} + +__attribute__((always_inline)) +void fft_radix2_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) +{ + const int x2 = x1 + t/2; + float2 a0, a1, a2, a3; + + if (x1 < t/2) + { + a0 = smem[x1]; a1 = smem[x1+t]; + a2 = smem[x2]; a3 = smem[x2+t]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x1 < t/2) + { + butterfly2(a0, a1, smem, twiddles, x1, block_size); + butterfly2(a2, a3, smem, twiddles, x2, block_size); + } + + barrier(CLK_LOCAL_MEM_FENCE); +} + +__attribute__((always_inline)) +void fft_radix2_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) +{ + const int x2 = x1 + t/3; + const int x3 = x1 + 2*t/3; + float2 a0, a1, a2, a3, a4, a5; + + if (x1 < t/3) + { + a0 = smem[x1]; a1 = smem[x1+t]; + a2 = smem[x2]; a3 = smem[x2+t]; + a4 = smem[x3]; a5 = smem[x3+t]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x1 < t/3) + { + butterfly2(a0, a1, smem, twiddles, x1, block_size); + butterfly2(a2, a3, smem, twiddles, x2, block_size); + butterfly2(a4, a5, smem, twiddles, x3, block_size); + } + + barrier(CLK_LOCAL_MEM_FENCE); +} + +__attribute__((always_inline)) +void fft_radix2_B4(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) +{ + const int thread_block = t/4; + const int x2 = x1 + thread_block; + const int x3 = x1 + 2*thread_block; + const int x4 = x1 + 3*thread_block; + float2 a0, a1, a2, a3, a4, a5, a6, a7; + + if (x1 < t/4) + { + a0 = smem[x1]; a1 = smem[x1+t]; + a2 = smem[x2]; a3 = smem[x2+t]; + a4 = smem[x3]; a5 = smem[x3+t]; + a6 = smem[x4]; a7 = smem[x4+t]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x1 < t/4) + { + butterfly2(a0, a1, smem, twiddles, x1, block_size); + butterfly2(a2, a3, smem, twiddles, x2, block_size); + butterfly2(a4, a5, smem, twiddles, x3, block_size); + butterfly2(a6, a7, smem, twiddles, x4, block_size); + } + + barrier(CLK_LOCAL_MEM_FENCE); +} + +__attribute__((always_inline)) +void fft_radix2_B5(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) +{ + const int thread_block = t/5; + const int x2 = x1 + thread_block; + const int x3 = x1 + 2*thread_block; + const int x4 = x1 + 3*thread_block; + const int x5 = x1 + 4*thread_block; + float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9; + + if (x1 < t/5) + { + a0 = smem[x1]; a1 = smem[x1+t]; + a2 = smem[x2]; a3 = smem[x2+t]; + a4 = smem[x3]; a5 = smem[x3+t]; + a6 = smem[x4]; a7 = smem[x4+t]; + a8 = smem[x5]; a9 = smem[x5+t]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x1 < t/5) + { + butterfly2(a0, a1, smem, twiddles, x1, block_size); + butterfly2(a2, a3, smem, twiddles, x2, block_size); + butterfly2(a4, a5, smem, twiddles, x3, block_size); + butterfly2(a6, a7, smem, twiddles, x4, block_size); + butterfly2(a8, a9, smem, twiddles, x5, block_size); + } + + barrier(CLK_LOCAL_MEM_FENCE); +} + +__attribute__((always_inline)) +void fft_radix4(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) +{ + float2 a0, a1, a2, a3; + + if (x < t) + { + a0 = smem[x]; a1 = smem[x+t]; a2 = smem[x+2*t]; a3 = smem[x+3*t]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x < t) + butterfly4(a0, a1, a2, a3, smem, twiddles, x, block_size); + + barrier(CLK_LOCAL_MEM_FENCE); +} + +__attribute__((always_inline)) +void fft_radix4_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) +{ + const int x2 = x1 + t/2; + float2 a0, a1, a2, a3, a4, a5, a6, a7; + + if (x1 < t/2) + { + a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t]; + a4 = smem[x2]; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x1 < t/2) + { + butterfly4(a0, a1, a2, a3, smem, twiddles, x1, block_size); + butterfly4(a4, a5, a6, a7, smem, twiddles, x2, block_size); + } + + barrier(CLK_LOCAL_MEM_FENCE); +} + +__attribute__((always_inline)) +void fft_radix4_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) +{ + const int x2 = x1 + t/3; + const int x3 = x2 + t/3; + float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11; + + if (x1 < t/3) + { + a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t]; + a4 = smem[x2]; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t]; + a8 = smem[x3]; a9 = smem[x3+t]; a10 = smem[x3+2*t]; a11 = smem[x3+3*t]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x1 < t/3) + { + butterfly4(a0, a1, a2, a3, smem, twiddles, x1, block_size); + butterfly4(a4, a5, a6, a7, smem, twiddles, x2, block_size); + butterfly4(a8, a9, a10, a11, smem, twiddles, x3, block_size); + } + + barrier(CLK_LOCAL_MEM_FENCE); +} + +__attribute__((always_inline)) +void fft_radix8(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) +{ + const int k = x % block_size; + float2 a0, a1, a2, a3, a4, a5, a6, a7; + + if (x < t) + { + int tw_ind = block_size / 8; + + a0 = smem[x]; + a1 = mul_float2(twiddles[k], smem[x + t]); + a2 = mul_float2(twiddles[k + block_size],smem[x+2*t]); + a3 = mul_float2(twiddles[k+2*block_size],smem[x+3*t]); + a4 = mul_float2(twiddles[k+3*block_size],smem[x+4*t]); + a5 = mul_float2(twiddles[k+4*block_size],smem[x+5*t]); + a6 = mul_float2(twiddles[k+5*block_size],smem[x+6*t]); + a7 = mul_float2(twiddles[k+6*block_size],smem[x+7*t]); + + float2 b0, b1, b6, b7; + + b0 = a0 + a4; + a4 = a0 - a4; + b1 = a1 + a5; + a5 = a1 - a5; + a5 = (float2)(SQRT_2) * (float2)(a5.x + a5.y, -a5.x + a5.y); + b6 = twiddle(a2 - a6); + a2 = a2 + a6; + b7 = a3 - a7; + b7 = (float2)(SQRT_2) * (float2)(-b7.x + b7.y, -b7.x - b7.y); + a3 = a3 + a7; + + a0 = b0 + a2; + a2 = b0 - a2; + a1 = b1 + a3; + a3 = twiddle(b1 - a3); + a6 = a4 - b6; + a4 = a4 + b6; + a7 = twiddle(a5 - b7); + a5 = a5 + b7; + + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x < t) + { + const int dst_ind = ((x - k) << 3) + k; + __local float2* dst = smem + dst_ind; + + dst[0] = a0 + a1; + dst[block_size] = a4 + a5; + dst[2 * block_size] = a2 + a3; + dst[3 * block_size] = a6 + a7; + dst[4 * block_size] = a0 - a1; + dst[5 * block_size] = a4 - a5; + dst[6 * block_size] = a2 - a3; + dst[7 * block_size] = a6 - a7; + } + + barrier(CLK_LOCAL_MEM_FENCE); +} + +__attribute__((always_inline)) +void fft_radix3(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) +{ + float2 a0, a1, a2; + + if (x < t) + { + a0 = smem[x]; a1 = smem[x+t]; a2 = smem[x+2*t]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x < t) + butterfly3(a0, a1, a2, smem, twiddles, x, block_size); + + barrier(CLK_LOCAL_MEM_FENCE); +} + +__attribute__((always_inline)) +void fft_radix3_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) +{ + const int x2 = x1 + t/2; + float2 a0, a1, a2, a3, a4, a5; + + if (x1 < t/2) + { + a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; + a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x1 < t/2) + { + butterfly3(a0, a1, a2, smem, twiddles, x1, block_size); + butterfly3(a3, a4, a5, smem, twiddles, x2, block_size); + } + + barrier(CLK_LOCAL_MEM_FENCE); +} + +__attribute__((always_inline)) +void fft_radix3_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) +{ + const int x2 = x1 + t/3; + const int x3 = x2 + t/3; + float2 a0, a1, a2, a3, a4, a5, a6, a7, a8; + + if (x1 < t/2) + { + a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; + a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t]; + a6 = smem[x3]; a7 = smem[x3+t]; a8 = smem[x3+2*t]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x1 < t/2) + { + butterfly3(a0, a1, a2, smem, twiddles, x1, block_size); + butterfly3(a3, a4, a5, smem, twiddles, x2, block_size); + butterfly3(a6, a7, a8, smem, twiddles, x3, block_size); + } + + barrier(CLK_LOCAL_MEM_FENCE); +} + +__attribute__((always_inline)) +void fft_radix3_B4(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) +{ + const int thread_block = t/4; + const int x2 = x1 + thread_block; + const int x3 = x1 + 2*thread_block; + const int x4 = x1 + 3*thread_block; + float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11; + + if (x1 < t/4) + { + a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; + a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t]; + a6 = smem[x3]; a7 = smem[x3+t]; a8 = smem[x3+2*t]; + a9 = smem[x4]; a10 = smem[x4+t]; a11 = smem[x4+2*t]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x1 < t/4) + { + butterfly3(a0, a1, a2, smem, twiddles, x1, block_size); + butterfly3(a3, a4, a5, smem, twiddles, x2, block_size); + butterfly3(a6, a7, a8, smem, twiddles, x3, block_size); + butterfly3(a9, a10, a11, smem, twiddles, x4, block_size); + } + + barrier(CLK_LOCAL_MEM_FENCE); +} + +__attribute__((always_inline)) +void fft_radix5(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) +{ + const int k = x % block_size; + float2 a0, a1, a2, a3, a4; + + if (x < t) + { + a0 = smem[x]; a1 = smem[x + t]; a2 = smem[x+2*t]; a3 = smem[x+3*t]; a4 = smem[x+4*t]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x < t) + butterfly5(a0, a1, a2, a3, a4, smem, twiddles, x, block_size); + + barrier(CLK_LOCAL_MEM_FENCE); +} + +__attribute__((always_inline)) +void fft_radix5_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) +{ + const int x2 = x1+t/2; + float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9; + + if (x1 < t/2) + { + a0 = smem[x1]; a1 = smem[x1 + t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t]; a4 = smem[x1+4*t]; + a5 = smem[x2]; a6 = smem[x2 + t]; a7 = smem[x2+2*t]; a8 = smem[x2+3*t]; a9 = smem[x2+4*t]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x1 < t/2) + { + butterfly5(a0, a1, a2, a3, a4, smem, twiddles, x1, block_size); + butterfly5(a5, a6, a7, a8, a9, smem, twiddles, x2, block_size); + } + + barrier(CLK_LOCAL_MEM_FENCE); +} + +#ifdef DFT_SCALE +#define SCALE_VAL(x, scale) x*scale +#else +#define SCALE_VAL(x, scale) x +#endif + +__kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols, + __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, + __global float2* twiddles_ptr, const int t, const int nz) +{ + const int x = get_global_id(0); + const int y = get_group_id(1); + const int block_size = LOCAL_SIZE/kercn; + if (y < nz) + { + __local float2 smem[LOCAL_SIZE]; + __global const float2* twiddles = (__global float2*) twiddles_ptr; + const int ind = x; +#ifdef IS_1D + float scale = 1.f/dst_cols; +#else + float scale = 1.f/(dst_cols*dst_rows); +#endif + +#ifdef COMPLEX_INPUT + __global const float2* src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset))); + #pragma unroll + for (int i=0; i