From 5dd92638485085805fc31034a7a64dc4a5e4e178 Mon Sep 17 00:00:00 2001 From: Alexander Karsakov Date: Thu, 26 Jun 2014 10:09:15 +0400 Subject: [PATCH 01/13] Multi-radix with kernel generation --- modules/core/perf/opencl/perf_dxt.cpp | 6 +- modules/core/src/dxt.cpp | 257 +++++++++++++++++++++- modules/core/src/opencl/fft.cl | 297 ++++++++++++++++++++++++++ modules/core/test/ocl/test_dft.cpp | 73 +++++-- samples/cpp/dft.cpp | 69 +++++- 5 files changed, 666 insertions(+), 36 deletions(-) create mode 100644 modules/core/src/opencl/fft.cl diff --git a/modules/core/perf/opencl/perf_dxt.cpp b/modules/core/perf/opencl/perf_dxt.cpp index d0219913b5..09d657d7a1 100644 --- a/modules/core/perf/opencl/perf_dxt.cpp +++ b/modules/core/perf/opencl/perf_dxt.cpp @@ -57,9 +57,9 @@ namespace ocl { typedef tuple DftParams; typedef TestBaseWithParam DftFixture; -OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), - Values((int)DFT_ROWS, (int)DFT_SCALE, (int)DFT_INVERSE, - (int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE))) +OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(/*OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3, */Size(1024, 1024), Size(1024, 2048), Size(512, 512), Size(2048, 2048)), + Values((int)DFT_ROWS/*, (int) 0/*, (int)DFT_SCALE, (int)DFT_INVERSE, + (int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE*/))) { const DftParams params = GetParam(); const Size srcSize = get<0>(params); diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp index 2a08899167..68256eaa07 100644 --- a/modules/core/src/dxt.cpp +++ b/modules/core/src/dxt.cpp @@ -1960,7 +1960,7 @@ static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p) } -static bool ocl_dft(InputArray _src, OutputArray _dst, int flags) +static bool ocl_dft_amdfft(InputArray _src, OutputArray _dst, int flags) { int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); Size ssize = _src.size(); @@ -2029,12 +2029,257 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags) #endif // HAVE_CLAMDFFT +namespace cv +{ + +#ifdef HAVE_OPENCL + +static bool fft_radixN(InputArray _src, OutputArray _dst, int radix, int block_size, int nonzero_rows, int flags) +{ + int N = _src.size().width; + if (N % radix) + return false; + + UMat src = _src.getUMat(); + UMat dst = _dst.getUMat(); + + int thread_count = N / radix; + size_t globalsize[2] = { thread_count, nonzero_rows }; + String kernel_name = format("fft_radix%d", radix); + ocl::Kernel k(kernel_name.c_str(), ocl::core::fft_oclsrc, (flags & DFT_INVERSE) != 0 ? "-D INVERSE" : ""); + if (k.empty()) + return false; + + k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnlyNoSize(dst), block_size, thread_count, nonzero_rows); + return k.run(2, globalsize, NULL, false); +} + +static bool ocl_packToCCS(InputArray _buffer, OutputArray _dst, int flags) +{ + UMat buffer = _buffer.getUMat(); + UMat dst = _dst.getUMat(); + + buffer = buffer.reshape(1); + if ((flags & DFT_ROWS) == 0 && buffer.rows > 1) + { + // pack to CCS by rows + if (dst.cols > 2) + buffer.colRange(2, dst.cols + (dst.cols % 2)).copyTo(dst.colRange(1, dst.cols-1 + (dst.cols % 2))); + + Mat dst_mat = dst.getMat(ACCESS_WRITE); + Mat buffer_mat = buffer.getMat(ACCESS_READ); + + dst_mat.at(0,0) = buffer_mat.at(0,0); + dst_mat.at(dst_mat.rows-1,0) = buffer_mat.at(buffer.rows/2,0); + for (int i=1; i(i,0) = buffer_mat.at((i+1)/2,0); + dst_mat.at(i+1,0) = buffer_mat.at((i+1)/2,1); + } + + if (dst_mat.cols % 2 == 0) + { + dst_mat.at(0,dst_mat.cols-1) = buffer_mat.at(0,buffer.cols/2); + dst_mat.at(dst_mat.rows-1,dst_mat.cols-1) = buffer_mat.at(buffer.rows/2,buffer.cols/2); + + for (int i=1; i(i,dst_mat.cols-1) = buffer_mat.at((i+1)/2,buffer.cols/2); + dst_mat.at(i+1,dst_mat.cols-1) = buffer_mat.at((i+1)/2,buffer.cols/2+1); + } + } + } + else + { + // pack to CCS each row + buffer.colRange(0,1).copyTo(dst.colRange(0,1)); + buffer.colRange(2, (dst.cols+1)).copyTo(dst.colRange(1, dst.cols)); + } + return true; +} + +static bool ocl_dft_C2C_row(InputArray _src, OutputArray _dst, int nonzero_rows, int flags) +{ + int type = _src.type(), depth = CV_MAT_DEPTH(type), channels = CV_MAT_CN(type); + UMat src = _src.getUMat(); + + bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + if (depth == CV_64F && !doubleSupport) + return false; + + int factors[34]; + int nf = DFTFactorize( src.cols, factors ); + + int n = 1; + int factor_index = 0; + + String radix_processing; + int min_radix = INT_MAX; + // 1. 2^n transforms + if ( (factors[factor_index] & 1) == 0 ) + { + for( ; n < factors[factor_index]; ) + { + int radix = 2; + if (8*n <= factors[0]) + radix = 8; + else if (4*n <= factors[0]) + radix = 4; + + radix_processing += format("fft_radix%d(smem,x,%d,%d);", radix, n, src.cols/radix); + min_radix = min(radix, min_radix); + n *= radix; + } + factor_index++; + } + + // 2. all the other transforms + for( ; factor_index < nf; factor_index++ ) + { + int radix = factors[factor_index]; + radix_processing += format("fft_radix%d(smem,x,%d,%d);", radix, n, src.cols/radix); + min_radix = min(radix, min_radix); + n *= radix; + } + + UMat dst = _dst.getUMat(); + + int thread_count = src.cols / min_radix; + size_t globalsize[2] = { thread_count, nonzero_rows }; + size_t localsize[2] = { thread_count, 1 }; + + String buildOptions = format("-D LOCAL_SIZE=%d -D kercn=%d -D RADIX_PROCESS=%s", + src.cols, src.cols/thread_count, radix_processing.c_str()); + ocl::Kernel k("fft_multi_radix", ocl::core::fft_oclsrc, buildOptions); + if (k.empty()) + return false; + + k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnlyNoSize(dst), thread_count, nonzero_rows); + return k.run(2, globalsize, localsize, false); +} + +static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_rows) +{ + int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + Size ssize = _src.size(); + bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + if ( (!doubleSupport && depth == CV_64F) || + !(type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2)) + return false; + + // if is not a multiplication of prime numbers { 2, 3, 5 } + if (ssize.area() != getOptimalDFTSize(ssize.area())) + return false; + + UMat src = _src.getUMat(); + int complex_input = cn == 2 ? 1 : 0; + int complex_output = (flags & DFT_COMPLEX_OUTPUT) != 0; + int real_input = cn == 1 ? 1 : 0; + int real_output = (flags & DFT_REAL_OUTPUT) != 0; + bool inv = (flags & DFT_INVERSE) != 0 ? 1 : 0; + bool is1d = (flags & DFT_ROWS) != 0 || src.rows == 1; + + // if output format is not specified + if (complex_output + real_output == 0) + { + if (!inv) + { + if (real_input) + real_output = 1; + else + complex_output = 1; + } + } + + if (complex_output) + { + //if (is1d) + // _dst.create(Size(src.cols/2+1, src.rows), CV_MAKE_TYPE(depth, 2)); + //else + _dst.create(src.size(), CV_MAKE_TYPE(depth, 2)); + } + else + _dst.create(src.size(), CV_MAKE_TYPE(depth, 1)); + UMat dst = _dst.getUMat(); + + bool inplace = src.u == dst.u; + //UMat buffer; + + //if (complex_input) + //{ + // if (inplace) + // buffer = src; + // else + // src.copyTo(buffer); + //} + //else + //{ + // if (!inv) + // { + // // in case real input convert it to complex + // buffer.create(src.size(), CV_MAKE_TYPE(depth, 2)); + // std::vector planes; + // planes.push_back(src); + // planes.push_back(UMat::zeros(src.size(), CV_32F)); + // merge(planes, buffer); + // } + // else + // { + // // TODO: unpack from CCS format + // } + //} + + if( nonzero_rows <= 0 || nonzero_rows > _src.rows() ) + nonzero_rows = _src.rows(); + + + if (!ocl_dft_C2C_row(src, dst, nonzero_rows, flags)) + return false; + + if ((flags & DFT_ROWS) == 0 && nonzero_rows > 1) + { + transpose(dst, dst); + if (!ocl_dft_C2C_row(dst, dst, dst.rows, flags)) + return false; + transpose(dst, dst); + } + + //if (complex_output) + //{ + // if (real_input && is1d) + // _dst.assign(buffer.colRange(0, buffer.cols/2+1)); + // else + // _dst.assign(buffer); + //} + //else + //{ + // if (!inv) + // ocl_packToCCS(buffer, _dst, flags); + // else + // { + // // copy real part to dst + // } + //} + return true; +} + +#endif + +} // namespace cv; + + + void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows ) { #ifdef HAVE_CLAMDFFT CV_OCL_RUN(ocl::haveAmdFft() && ocl::Device::getDefault().type() != ocl::Device::TYPE_CPU && _dst.isUMat() && _src0.dims() <= 2 && nonzero_rows == 0, - ocl_dft(_src0, _dst, flags)) + ocl_dft_amdfft(_src0, _dst, flags)) +#endif + +#ifdef HAVE_OPENCL + CV_OCL_RUN(_dst.isUMat() && _src0.dims() <= 2, + ocl_dft(_src0, _dst, flags, nonzero_rows)) #endif static DFTFunc dft_tbl[6] = @@ -2046,10 +2291,8 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows ) (DFTFunc)RealDFT_64f, (DFTFunc)CCSIDFT_64f }; - AutoBuffer buf; void *spec = 0; - Mat src0 = _src0.getMat(), src = src0; int prev_len = 0, stage = 0; bool inv = (flags & DFT_INVERSE) != 0; @@ -2058,6 +2301,7 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows ) int elem_size = (int)src.elemSize1(), complex_elem_size = elem_size*2; int factors[34]; bool inplace_transform = false; + bool is1d = (flags & DFT_ROWS) != 0 || src.rows == 1; #ifdef USE_IPP_DFT AutoBuffer ippbuf; int ipp_norm_flag = !(flags & DFT_SCALE) ? 8 : inv ? 2 : 1; @@ -2066,7 +2310,10 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows ) CV_Assert( type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2 ); if( !inv && src.channels() == 1 && (flags & DFT_COMPLEX_OUTPUT) ) - _dst.create( src.size(), CV_MAKETYPE(depth, 2) ); + if (!is1d) + _dst.create( src.size(), CV_MAKETYPE(depth, 2) ); + else + _dst.create( Size(src.cols/2+1, src.rows), CV_MAKETYPE(depth, 2) ); else if( inv && src.channels() == 2 && (flags & DFT_REAL_OUTPUT) ) _dst.create( src.size(), depth ); else diff --git a/modules/core/src/opencl/fft.cl b/modules/core/src/opencl/fft.cl new file mode 100644 index 0000000000..7006b92e68 --- /dev/null +++ b/modules/core/src/opencl/fft.cl @@ -0,0 +1,297 @@ +__constant float PI = 3.14159265f; +__constant float SQRT_2 = 0.707106781188f; + +__constant float sin_120 = 0.866025403784f; +__constant float fft5_2 = 0.559016994374f; +__constant float fft5_3 = -0.951056516295f; +__constant float fft5_4 = -1.538841768587f; +__constant float fft5_5 = 0.363271264002f; + +inline float2 mul_float2(float2 a, float2 b){ + float2 res; + res.x = a.x * b.x - a.y * b.y; + res.y = a.x * b.y + a.y * b.x; + return res; +} + +inline float2 sincos_float2(float alpha) { + float cs, sn; + sn = sincos(alpha, &cs); // sincos + return (float2)(cs, sn); +} + +inline float2 twiddle(float2 a) { + return (float2)(a.y, -a.x); +} + +inline float2 square(float2 a) { + return (float2)(a.x * a.x - a.y * a.y, 2.0f * a.x * a.y); +} + +inline float2 square3(float2 a) { + return (float2)(a.x * a.x - a.y * a.y, 3.0f * a.x * a.y); +} + +inline float2 mul_p1q4(float2 a) { + return (float2)(SQRT_2) * (float2)(a.x + a.y, -a.x + a.y); +} + +inline float2 mul_p3q4(float2 a) { + return (float2)(SQRT_2) * (float2)(-a.x + a.y, -a.x - a.y); +} + +__attribute__((always_inline)) +void fft_radix2(__local float2* smem, const int x, const int block_size, const int t) +{ + const int k = x & (block_size - 1); + float2 in1, temp; + + if (x < t) + { + in1 = smem[x]; + float2 in2 = smem[x+t]; + + float theta = -PI * k / block_size; + float cs; + float sn = sincos(theta, &cs); + temp = (float2) (in2.x * cs - in2.y * sn, + in2.y * cs + in2.x * sn); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x < t) + { + const int dst_ind = (x << 1) - k; + + smem[dst_ind] = in1 + temp; + smem[dst_ind+block_size] = in1 - temp; + } + + barrier(CLK_LOCAL_MEM_FENCE); +} + +__attribute__((always_inline)) +void fft_radix4(__local float2* smem, const int x, const int block_size, const int t) +{ + const int k = x & (block_size - 1); + float2 b0, b1, b2, b3; + + if (x < t) + { + float theta = -PI * k / (2 * block_size); + + float2 tw = sincos_float2(theta); + float2 a0 = smem[x]; + float2 a1 = mul_float2(tw, smem[x+t]); + float2 a2 = smem[x + 2*t]; + float2 a3 = mul_float2(tw, smem[x + 3*t]); + tw = square(tw); + a2 = mul_float2(tw, a2); + a3 = mul_float2(tw, a3); + + b0 = a0 + a2; + b1 = a0 - a2; + b2 = a1 + a3; + b3 = twiddle(a1 - a3); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x < t) + { + const int dst_ind = ((x - k) << 2) + k; + smem[dst_ind] = b0 + b2; + smem[dst_ind + block_size] = b1 + b3; + smem[dst_ind + 2*block_size] = b0 - b2; + smem[dst_ind + 3*block_size] = b1 - b3; + } + + barrier(CLK_LOCAL_MEM_FENCE); +} + +__attribute__((always_inline)) +void fft_radix8(__local float2* smem, const int x, const int block_size, const int t) +{ + const int k = x % block_size; + float2 a0, a1, a2, a3, a4, a5, a6, a7; + + if (x < t) + { + float theta = -PI * k / (4 * block_size); + + float2 tw = sincos_float2(theta); // W + a0 = smem[x]; + a1 = mul_float2(tw, smem[x + t]); + a2 = smem[x + 2 * t]; + a3 = mul_float2(tw, smem[x + 3 * t]); + a4 = smem[x + 4 * t]; + a5 = mul_float2(tw, smem[x + 5 * t]); + a6 = smem[x + 6 * t]; + a7 = mul_float2(tw, smem[x + 7 * t]); + + tw = square(tw); // W^2 + a2 = mul_float2(tw, a2); + a3 = mul_float2(tw, a3); + a6 = mul_float2(tw, a6); + a7 = mul_float2(tw, a7); + tw = square(tw); // W^4 + a4 = mul_float2(tw, a4); + a5 = mul_float2(tw, a5); + a6 = mul_float2(tw, a6); + a7 = mul_float2(tw, a7); + + float2 b0 = a0 + a4; + float2 b4 = a0 - a4; + float2 b1 = a1 + a5; + float2 b5 = mul_p1q4(a1 - a5); + float2 b2 = a2 + a6; + float2 b6 = twiddle(a2 - a6); + float2 b3 = a3 + a7; + float2 b7 = mul_p3q4(a3 - a7); + + a0 = b0 + b2; + a2 = b0 - b2; + a1 = b1 + b3; + a3 = twiddle(b1 - b3); + a4 = b4 + b6; + a6 = b4 - b6; + a5 = b5 + b7; + a7 = twiddle(b5 - b7); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x < t) + { + const int dst_ind = ((x - k) << 3) + k; + __local float2* dst = smem + dst_ind; + + dst[0] = a0 + a1; + dst[block_size] = a4 + a5; + dst[2 * block_size] = a2 + a3; + dst[3 * block_size] = a6 + a7; + dst[4 * block_size] = a0 - a1; + dst[5 * block_size] = a4 - a5; + dst[6 * block_size] = a2 - a3; + dst[7 * block_size] = a6 - a7; + } + + barrier(CLK_LOCAL_MEM_FENCE); +} + +__attribute__((always_inline)) +void fft_radix3(__local float2* smem, const int x, const int block_size, const int t) +{ + const int k = x % block_size; + float2 a0, a1, a2, b0, b1; + + if (x < t) + { + const float theta = -PI * k * 2 / (3 * block_size); + + a0 = smem[x]; + a1 = mul_float2(sincos_float2(theta), smem[x+t]); + a2 = mul_float2(sincos_float2(2 * theta), smem[x+2*t]); + b1 = a1 + a2; + a2 = twiddle((float2)sin_120*(a1 - a2)); + b0 = a0 - (float2)(0.5f)*b1; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x < t) + { + const int dst_ind = ((x - k) * 3) + k; + + smem[dst_ind] = a0 + b1; + smem[dst_ind + block_size] = b0 + a2; + smem[dst_ind + 2*block_size] = b0 - a2; + } + + barrier(CLK_LOCAL_MEM_FENCE); +} + +__attribute__((always_inline)) +void fft_radix5(__local float2* smem, const int x, const int block_size, const int t) +{ + const int k = x % block_size; + float2 a0, a1, a2, a3, a4, b0, b1, b2, b5; + + if (x < t) + { + const float theta = -PI * k * 2 / (5 * block_size); + + a0 = smem[x]; + a1 = mul_float2(sincos_float2(theta), smem[x + t]); + a2 = mul_float2(sincos_float2(theta*2),smem[x+2*t]); + a3 = mul_float2(sincos_float2(theta*3),smem[x+3*t]); + a4 = mul_float2(sincos_float2(theta*4),smem[x+4*t]); + + b1 = a1 + a4; + a1 -= a4; + + a4 = a3 + a2; + a3 -= a2; + + b2 = b1 + a4; + b0 = a0 - (float2)0.25f * b2; + + b1 = (float2)fft5_2 * (b1 - a4); + a4 = -(float2)fft5_3 * (a1 + a3); + a4 = twiddle(a4); + + b5 = (float2)(a4.x - fft5_5 * a1.y, a4.y + fft5_5 * a1.x); + + a4.x += fft5_4 * a3.y; + a4.y -= fft5_4 * a3.x; + + a1 = b0 + b1; + b0 -= b1; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x < t) + { + const int dst_ind = ((x - k) * 5) + k; + __local float2* dst = smem + dst_ind; + + dst[0] = a0 + b2; + dst[block_size] = a1 + a4; + dst[2 * block_size] = b0 + b5; + dst[3 * block_size] = b0 - b5; + dst[4 * block_size] = a1 - a4; + } + + barrier(CLK_LOCAL_MEM_FENCE); +} + +__kernel void fft_multi_radix(__global const uchar* srcptr, int src_step, int src_offset, + __global uchar* dstptr, int dst_step, int dst_offset, + const int t, const int nz) +{ + const int x = get_global_id(0); + const int y = get_group_id(1); + + if (y < nz) + { + __local float2 smem[LOCAL_SIZE]; + __global const float2* src = (__global const float2*)(srcptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset))); + __global float2* dst = (__global float2*)(dstptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)*2), dst_offset))); + + const int block_size = LOCAL_SIZE/kercn; + #pragma unroll + for (int i=0; i +#include +#include using namespace cv; using namespace std; @@ -24,6 +26,31 @@ const char* keys = int main(int argc, const char ** argv) { + //int cols = 4; + //int rows = 768; + //srand(0); + //Mat input(Size(cols, rows), CV_32FC2); + //for (int i=0; i(j,i) = Vec2f((float) rand() / RAND_MAX, (float) rand() / RAND_MAX); + //Mat dst; + // + //UMat gpu_input, gpu_dst; + //input.copyTo(gpu_input); + //auto start = std::chrono::system_clock::now(); + //dft(input, dst, DFT_ROWS); + //auto cpu_duration = chrono::duration_cast(chrono::system_clock::now() - start); + // + //start = std::chrono::system_clock::now(); + //dft(gpu_input, gpu_dst, DFT_ROWS); + //auto gpu_duration = chrono::duration_cast(chrono::system_clock::now() - start); + + //double n = norm(dst, gpu_dst); + //cout << "norm = " << n << endl; + //cout << "CPU time: " << cpu_duration.count() << "ms" << endl; + //cout << "GPU time: " << gpu_duration.count() << "ms" << endl; + + help(); CommandLineParser parser(argc, argv, keys); string filename = parser.get(0); @@ -35,16 +62,46 @@ int main(int argc, const char ** argv) printf("Cannot read image file: %s\n", filename.c_str()); return -1; } - int M = getOptimalDFTSize( img.rows ); - int N = getOptimalDFTSize( img.cols ); + + Mat small_img = img(Rect(0,0,6,6)); + + int M = getOptimalDFTSize( small_img.rows ); + int N = getOptimalDFTSize( small_img.cols ); Mat padded; - copyMakeBorder(img, padded, 0, M - img.rows, 0, N - img.cols, BORDER_CONSTANT, Scalar::all(0)); + copyMakeBorder(small_img, padded, 0, M - small_img.rows, 0, N - small_img.cols, BORDER_CONSTANT, Scalar::all(0)); - Mat planes[] = {Mat_(padded), Mat::zeros(padded.size(), CV_32F)}; - Mat complexImg; + Mat planes[] = {Mat_(padded), Mat::ones(padded.size(), CV_32F)}; + Mat complexImg, complexImg1, complexInput; merge(planes, 2, complexImg); - dft(complexImg, complexImg); + Mat realInput; + padded.convertTo(realInput, CV_32F); + complexInput = complexImg; + //cout << complexImg << endl; + //dft(complexImg, complexImg, DFT_REAL_OUTPUT); + //cout << "Complex to Complex" << endl; + //cout << complexImg << endl; + cout << "Complex input" << endl << complexInput << endl; + cout << "Real input" << endl << realInput << endl; + + dft(complexInput, complexImg1, DFT_COMPLEX_OUTPUT); + cout << "Complex to Complex image: " << endl; + cout << endl << complexImg1 << endl; + + Mat realImg1; + dft(complexInput, realImg1, DFT_REAL_OUTPUT); + cout << "Complex to Real image: " << endl; + cout << endl << realImg1 << endl; + + Mat realOut; + dft(complexImg1, realOut, DFT_INVERSE | DFT_COMPLEX_OUTPUT); + cout << "Complex to Complex (inverse):" << endl; + cout << realOut << endl; + + Mat complexOut; + dft(realImg1, complexOut, DFT_INVERSE | DFT_REAL_OUTPUT | DFT_SCALE); + cout << "Complex to Real (inverse):" << endl; + cout << complexOut << endl; // compute log(1 + sqrt(Re(DFT(img))**2 + Im(DFT(img))**2)) split(complexImg, planes); From 0318d2772086d5aaf6ca07bc994e2bed0943b64b Mon Sep 17 00:00:00 2001 From: Alexander Karsakov Date: Thu, 10 Jul 2014 18:10:46 +0400 Subject: [PATCH 02/13] Enabled precalculated wave --- modules/core/include/opencv2/core/cvdef.h | 1 + modules/core/perf/opencl/perf_dxt.cpp | 4 +- modules/core/src/dxt.cpp | 127 +++++++----- modules/core/src/opencl/fft.cl | 226 ++++++++++------------ modules/core/test/ocl/test_dft.cpp | 8 +- 5 files changed, 183 insertions(+), 183 deletions(-) diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index 8108a61e60..765c54cbe1 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -244,6 +244,7 @@ typedef signed char schar; /* fundamental constants */ #define CV_PI 3.1415926535897932384626433832795 +#define CV_TWO_PI 6.283185307179586476925286766559 #define CV_LOG2 0.69314718055994530941723212145818 /****************************************************************************************\ diff --git a/modules/core/perf/opencl/perf_dxt.cpp b/modules/core/perf/opencl/perf_dxt.cpp index 09d657d7a1..c0da96b373 100644 --- a/modules/core/perf/opencl/perf_dxt.cpp +++ b/modules/core/perf/opencl/perf_dxt.cpp @@ -57,8 +57,8 @@ namespace ocl { typedef tuple DftParams; typedef TestBaseWithParam DftFixture; -OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(/*OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3, */Size(1024, 1024), Size(1024, 2048), Size(512, 512), Size(2048, 2048)), - Values((int)DFT_ROWS/*, (int) 0/*, (int)DFT_SCALE, (int)DFT_INVERSE, +OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3, Size(1024, 1024), Size(1024, 2048), Size(512, 512), Size(2048, 2048)), + Values((int)DFT_ROWS, (int) 0/*, (int)DFT_SCALE, (int)DFT_INVERSE, (int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE*/))) { const DftParams params = GetParam(); diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp index 68256eaa07..de17f07b23 100644 --- a/modules/core/src/dxt.cpp +++ b/modules/core/src/dxt.cpp @@ -2034,26 +2034,6 @@ namespace cv #ifdef HAVE_OPENCL -static bool fft_radixN(InputArray _src, OutputArray _dst, int radix, int block_size, int nonzero_rows, int flags) -{ - int N = _src.size().width; - if (N % radix) - return false; - - UMat src = _src.getUMat(); - UMat dst = _dst.getUMat(); - - int thread_count = N / radix; - size_t globalsize[2] = { thread_count, nonzero_rows }; - String kernel_name = format("fft_radix%d", radix); - ocl::Kernel k(kernel_name.c_str(), ocl::core::fft_oclsrc, (flags & DFT_INVERSE) != 0 ? "-D INVERSE" : ""); - if (k.empty()) - return false; - - k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnlyNoSize(dst), block_size, thread_count, nonzero_rows); - return k.run(2, globalsize, NULL, false); -} - static bool ocl_packToCCS(InputArray _buffer, OutputArray _dst, int flags) { UMat buffer = _buffer.getUMat(); @@ -2098,24 +2078,18 @@ static bool ocl_packToCCS(InputArray _buffer, OutputArray _dst, int flags) return true; } -static bool ocl_dft_C2C_row(InputArray _src, OutputArray _dst, int nonzero_rows, int flags) +static std::vector ocl_getRadixes(int cols, int& min_radix) { - int type = _src.type(), depth = CV_MAT_DEPTH(type), channels = CV_MAT_CN(type); - UMat src = _src.getUMat(); - - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; - if (depth == CV_64F && !doubleSupport) - return false; - int factors[34]; - int nf = DFTFactorize( src.cols, factors ); + int nf = DFTFactorize( cols, factors ); int n = 1; int factor_index = 0; - - String radix_processing; - int min_radix = INT_MAX; - // 1. 2^n transforms + + // choose radix order + std::vector radixes; + + // 2^n transforms if ( (factors[factor_index] & 1) == 0 ) { for( ; n < factors[factor_index]; ) @@ -2126,24 +2100,76 @@ static bool ocl_dft_C2C_row(InputArray _src, OutputArray _dst, int nonzero_rows, else if (4*n <= factors[0]) radix = 4; - radix_processing += format("fft_radix%d(smem,x,%d,%d);", radix, n, src.cols/radix); - min_radix = min(radix, min_radix); + radixes.push_back(radix); + min_radix = min(min_radix, radix); n *= radix; } factor_index++; } - // 2. all the other transforms + // all the other transforms for( ; factor_index < nf; factor_index++ ) { - int radix = factors[factor_index]; - radix_processing += format("fft_radix%d(smem,x,%d,%d);", radix, n, src.cols/radix); - min_radix = min(radix, min_radix); + radixes.push_back(factors[factor_index]); + min_radix = min(min_radix, factors[factor_index]); + } + return radixes; +} + +static bool ocl_dft_C2C_row(InputArray _src, OutputArray _dst, InputOutputArray _twiddles, int nonzero_rows, int flags) +{ + int type = _src.type(), depth = CV_MAT_DEPTH(type), channels = CV_MAT_CN(type); + UMat src = _src.getUMat(); + + bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + if (depth == CV_64F && !doubleSupport) + return false; + + int min_radix = INT_MAX; + std::vector radixes = ocl_getRadixes(src.cols, min_radix); + + // generate string with radix calls + String radix_processing; + int n = 1, twiddle_index = 0; + for (size_t i=0; i(); + int ptr_index = 0; + + int n = 1; + for (size_t i=0; i _src.rows() ) nonzero_rows = _src.rows(); + UMat buffer; - if (!ocl_dft_C2C_row(src, dst, nonzero_rows, flags)) + if (!ocl_dft_C2C_row(src, dst, buffer, nonzero_rows, flags)) return false; if ((flags & DFT_ROWS) == 0 && nonzero_rows > 1) { transpose(dst, dst); - if (!ocl_dft_C2C_row(dst, dst, dst.rows, flags)) + if (!ocl_dft_C2C_row(dst, dst, buffer, dst.rows, flags)) return false; transpose(dst, dst); } - //if (complex_output) - //{ - // if (real_input && is1d) - // _dst.assign(buffer.colRange(0, buffer.cols/2+1)); - // else - // _dst.assign(buffer); - //} + if (complex_output) + { + if (real_input && is1d) + _dst.assign(dst.colRange(0, dst.cols/2+1)); + else + _dst.assign(dst); + } //else //{ // if (!inv) diff --git a/modules/core/src/opencl/fft.cl b/modules/core/src/opencl/fft.cl index 7006b92e68..bd2b863c6c 100644 --- a/modules/core/src/opencl/fft.cl +++ b/modules/core/src/opencl/fft.cl @@ -7,55 +7,36 @@ __constant float fft5_3 = -0.951056516295f; __constant float fft5_4 = -1.538841768587f; __constant float fft5_5 = 0.363271264002f; -inline float2 mul_float2(float2 a, float2 b){ +__attribute__((always_inline)) +float2 mul_float2(float2 a, float2 b){ float2 res; res.x = a.x * b.x - a.y * b.y; res.y = a.x * b.y + a.y * b.x; return res; } -inline float2 sincos_float2(float alpha) { +__attribute__((always_inline)) +float2 sincos_float2(float alpha) { float cs, sn; sn = sincos(alpha, &cs); // sincos return (float2)(cs, sn); } -inline float2 twiddle(float2 a) { +__attribute__((always_inline)) +float2 twiddle(float2 a) { return (float2)(a.y, -a.x); } -inline float2 square(float2 a) { - return (float2)(a.x * a.x - a.y * a.y, 2.0f * a.x * a.y); -} - -inline float2 square3(float2 a) { - return (float2)(a.x * a.x - a.y * a.y, 3.0f * a.x * a.y); -} - -inline float2 mul_p1q4(float2 a) { - return (float2)(SQRT_2) * (float2)(a.x + a.y, -a.x + a.y); -} - -inline float2 mul_p3q4(float2 a) { - return (float2)(SQRT_2) * (float2)(-a.x + a.y, -a.x - a.y); -} - __attribute__((always_inline)) -void fft_radix2(__local float2* smem, const int x, const int block_size, const int t) +void fft_radix2(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) { const int k = x & (block_size - 1); - float2 in1, temp; + float2 a0, a1; if (x < t) { - in1 = smem[x]; - float2 in2 = smem[x+t]; - - float theta = -PI * k / block_size; - float cs; - float sn = sincos(theta, &cs); - temp = (float2) (in2.x * cs - in2.y * sn, - in2.y * cs + in2.x * sn); + a0 = smem[x]; + a1 = mul_float2(twiddles[k],smem[x+t]); } barrier(CLK_LOCAL_MEM_FENCE); @@ -64,36 +45,25 @@ void fft_radix2(__local float2* smem, const int x, const int block_size, const i { const int dst_ind = (x << 1) - k; - smem[dst_ind] = in1 + temp; - smem[dst_ind+block_size] = in1 - temp; + smem[dst_ind] = a0 + a1; + smem[dst_ind+block_size] = a0 - a1; } barrier(CLK_LOCAL_MEM_FENCE); } __attribute__((always_inline)) -void fft_radix4(__local float2* smem, const int x, const int block_size, const int t) +void fft_radix4(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) { const int k = x & (block_size - 1); - float2 b0, b1, b2, b3; + float2 a0, a1, a2, a3; if (x < t) { - float theta = -PI * k / (2 * block_size); - - float2 tw = sincos_float2(theta); - float2 a0 = smem[x]; - float2 a1 = mul_float2(tw, smem[x+t]); - float2 a2 = smem[x + 2*t]; - float2 a3 = mul_float2(tw, smem[x + 3*t]); - tw = square(tw); - a2 = mul_float2(tw, a2); - a3 = mul_float2(tw, a3); - - b0 = a0 + a2; - b1 = a0 - a2; - b2 = a1 + a3; - b3 = twiddle(a1 - a3); + a0 = smem[x]; + a1 = mul_float2(twiddles[3*k],smem[x+t]); + a2 = mul_float2(twiddles[3*k + 1],smem[x+2*t]); + a3 = mul_float2(twiddles[3*k + 2],smem[x+3*t]); } barrier(CLK_LOCAL_MEM_FENCE); @@ -101,63 +71,62 @@ void fft_radix4(__local float2* smem, const int x, const int block_size, const i if (x < t) { const int dst_ind = ((x - k) << 2) + k; - smem[dst_ind] = b0 + b2; - smem[dst_ind + block_size] = b1 + b3; - smem[dst_ind + 2*block_size] = b0 - b2; - smem[dst_ind + 3*block_size] = b1 - b3; + + float2 b0 = a0 + a2; + a2 = a0 - a2; + float2 b1 = a1 + a3; + a3 = twiddle(a1 - a3); + + smem[dst_ind] = b0 + b1; + smem[dst_ind + block_size] = a2 + a3; + smem[dst_ind + 2*block_size] = b0 - b1; + smem[dst_ind + 3*block_size] = a2 - a3; } barrier(CLK_LOCAL_MEM_FENCE); } __attribute__((always_inline)) -void fft_radix8(__local float2* smem, const int x, const int block_size, const int t) +void fft_radix8(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) { const int k = x % block_size; float2 a0, a1, a2, a3, a4, a5, a6, a7; if (x < t) { - float theta = -PI * k / (4 * block_size); + int tw_ind = block_size / 8; - float2 tw = sincos_float2(theta); // W a0 = smem[x]; - a1 = mul_float2(tw, smem[x + t]); - a2 = smem[x + 2 * t]; - a3 = mul_float2(tw, smem[x + 3 * t]); - a4 = smem[x + 4 * t]; - a5 = mul_float2(tw, smem[x + 5 * t]); - a6 = smem[x + 6 * t]; - a7 = mul_float2(tw, smem[x + 7 * t]); - - tw = square(tw); // W^2 - a2 = mul_float2(tw, a2); - a3 = mul_float2(tw, a3); - a6 = mul_float2(tw, a6); - a7 = mul_float2(tw, a7); - tw = square(tw); // W^4 - a4 = mul_float2(tw, a4); - a5 = mul_float2(tw, a5); - a6 = mul_float2(tw, a6); - a7 = mul_float2(tw, a7); - - float2 b0 = a0 + a4; - float2 b4 = a0 - a4; - float2 b1 = a1 + a5; - float2 b5 = mul_p1q4(a1 - a5); - float2 b2 = a2 + a6; - float2 b6 = twiddle(a2 - a6); - float2 b3 = a3 + a7; - float2 b7 = mul_p3q4(a3 - a7); - - a0 = b0 + b2; - a2 = b0 - b2; - a1 = b1 + b3; - a3 = twiddle(b1 - b3); - a4 = b4 + b6; - a6 = b4 - b6; - a5 = b5 + b7; - a7 = twiddle(b5 - b7); + a1 = mul_float2(twiddles[7*k], smem[x + t]); + a2 = mul_float2(twiddles[7*k+1],smem[x+2*t]); + a3 = mul_float2(twiddles[7*k+2],smem[x+3*t]); + a4 = mul_float2(twiddles[7*k+3],smem[x+4*t]); + a5 = mul_float2(twiddles[7*k+4],smem[x+5*t]); + a6 = mul_float2(twiddles[7*k+5],smem[x+6*t]); + a7 = mul_float2(twiddles[7*k+6],smem[x+7*t]); + + float2 b0, b1, b6, b7; + + b0 = a0 + a4; + a4 = a0 - a4; + b1 = a1 + a5; + a5 = a1 - a5; + a5 = (float2)(SQRT_2) * (float2)(a5.x + a5.y, -a5.x + a5.y); + b6 = twiddle(a2 - a6); + a2 = a2 + a6; + b7 = a3 - a7; + b7 = (float2)(SQRT_2) * (float2)(-b7.x + b7.y, -b7.x - b7.y); + a3 = a3 + a7; + + a0 = b0 + a2; + a2 = b0 - a2; + a1 = b1 + a3; + a3 = twiddle(b1 - a3); + a6 = a4 - b6; + a4 = a4 + b6; + a7 = twiddle(a5 - b7); + a5 = a5 + b7; + } barrier(CLK_LOCAL_MEM_FENCE); @@ -181,21 +150,16 @@ void fft_radix8(__local float2* smem, const int x, const int block_size, const i } __attribute__((always_inline)) -void fft_radix3(__local float2* smem, const int x, const int block_size, const int t) +void fft_radix3(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) { const int k = x % block_size; - float2 a0, a1, a2, b0, b1; + float2 a0, a1, a2; if (x < t) { - const float theta = -PI * k * 2 / (3 * block_size); - a0 = smem[x]; - a1 = mul_float2(sincos_float2(theta), smem[x+t]); - a2 = mul_float2(sincos_float2(2 * theta), smem[x+2*t]); - b1 = a1 + a2; - a2 = twiddle((float2)sin_120*(a1 - a2)); - b0 = a0 - (float2)(0.5f)*b1; + a1 = mul_float2(twiddles[2*k], smem[x+t]); + a2 = mul_float2(twiddles[2*k+1], smem[x+2*t]); } barrier(CLK_LOCAL_MEM_FENCE); @@ -204,6 +168,10 @@ void fft_radix3(__local float2* smem, const int x, const int block_size, const i { const int dst_ind = ((x - k) * 3) + k; + float2 b1 = a1 + a2; + a2 = twiddle((float2)sin_120*(a1 - a2)); + float2 b0 = a0 - (float2)(0.5f)*b1; + smem[dst_ind] = a0 + b1; smem[dst_ind + block_size] = b0 + a2; smem[dst_ind + 2*block_size] = b0 - a2; @@ -213,20 +181,30 @@ void fft_radix3(__local float2* smem, const int x, const int block_size, const i } __attribute__((always_inline)) -void fft_radix5(__local float2* smem, const int x, const int block_size, const int t) +void fft_radix5(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) { const int k = x % block_size; - float2 a0, a1, a2, a3, a4, b0, b1, b2, b5; + float2 a0, a1, a2, a3, a4; if (x < t) { - const float theta = -PI * k * 2 / (5 * block_size); - + int tw_ind = block_size / 5; + a0 = smem[x]; - a1 = mul_float2(sincos_float2(theta), smem[x + t]); - a2 = mul_float2(sincos_float2(theta*2),smem[x+2*t]); - a3 = mul_float2(sincos_float2(theta*3),smem[x+3*t]); - a4 = mul_float2(sincos_float2(theta*4),smem[x+4*t]); + a1 = mul_float2(twiddles[4*k], smem[x + t]); + a2 = mul_float2(twiddles[4*k+1],smem[x+2*t]); + a3 = mul_float2(twiddles[4*k+2],smem[x+3*t]); + a4 = mul_float2(twiddles[4*k+3],smem[x+4*t]); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x < t) + { + const int dst_ind = ((x - k) * 5) + k; + __local float2* dst = smem + dst_ind; + + float2 b0, b1, b5; b1 = a1 + a4; a1 -= a4; @@ -234,13 +212,11 @@ void fft_radix5(__local float2* smem, const int x, const int block_size, const i a4 = a3 + a2; a3 -= a2; - b2 = b1 + a4; - b0 = a0 - (float2)0.25f * b2; - - b1 = (float2)fft5_2 * (b1 - a4); - a4 = -(float2)fft5_3 * (a1 + a3); - a4 = twiddle(a4); + a2 = b1 + a4; + b0 = a0 - (float2)0.25f * a2; + b1 = (float2)fft5_2 * (b1 - a4); + a4 = (float2)fft5_3 * (float2)(-a1.y - a3.y, a1.x + a3.x); b5 = (float2)(a4.x - fft5_5 * a1.y, a4.y + fft5_5 * a1.x); a4.x += fft5_4 * a3.y; @@ -248,16 +224,8 @@ void fft_radix5(__local float2* smem, const int x, const int block_size, const i a1 = b0 + b1; b0 -= b1; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if (x < t) - { - const int dst_ind = ((x - k) * 5) + k; - __local float2* dst = smem + dst_ind; - dst[0] = a0 + b2; + dst[0] = a0 + a2; dst[block_size] = a1 + a4; dst[2 * block_size] = b0 + b5; dst[3 * block_size] = b0 - b5; @@ -267,8 +235,9 @@ void fft_radix5(__local float2* smem, const int x, const int block_size, const i barrier(CLK_LOCAL_MEM_FENCE); } -__kernel void fft_multi_radix(__global const uchar* srcptr, int src_step, int src_offset, - __global uchar* dstptr, int dst_step, int dst_offset, +__kernel void fft_multi_radix(__global const uchar* src_ptr, int src_step, int src_offset, + __global uchar* dst_ptr, int dst_step, int dst_offset, + __global const uchar* twiddles_ptr, int twiddles_step, int twiddles_offset, const int t, const int nz) { const int x = get_global_id(0); @@ -277,8 +246,9 @@ __kernel void fft_multi_radix(__global const uchar* srcptr, int src_step, int sr if (y < nz) { __local float2 smem[LOCAL_SIZE]; - __global const float2* src = (__global const float2*)(srcptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset))); - __global float2* dst = (__global float2*)(dstptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)*2), dst_offset))); + __global const float2* src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset))); + __global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)*2), dst_offset))); + __global const float2* twiddles = (__global float2*) twiddles_ptr; const int block_size = LOCAL_SIZE/kercn; #pragma unroll @@ -292,6 +262,8 @@ __kernel void fft_multi_radix(__global const uchar* srcptr, int src_step, int sr // copy data to dst #pragma unroll for (int i=0; i Date: Fri, 11 Jul 2014 15:01:46 +0400 Subject: [PATCH 03/13] Added fftplan cache --- modules/core/src/dxt.cpp | 332 +++++++++++++++++------------ modules/core/src/opencl/fft.cl | 55 ++--- modules/core/test/ocl/test_dft.cpp | 4 +- 3 files changed, 226 insertions(+), 165 deletions(-) diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp index de17f07b23..c11b699503 100644 --- a/modules/core/src/dxt.cpp +++ b/modules/core/src/dxt.cpp @@ -2034,50 +2034,6 @@ namespace cv #ifdef HAVE_OPENCL -static bool ocl_packToCCS(InputArray _buffer, OutputArray _dst, int flags) -{ - UMat buffer = _buffer.getUMat(); - UMat dst = _dst.getUMat(); - - buffer = buffer.reshape(1); - if ((flags & DFT_ROWS) == 0 && buffer.rows > 1) - { - // pack to CCS by rows - if (dst.cols > 2) - buffer.colRange(2, dst.cols + (dst.cols % 2)).copyTo(dst.colRange(1, dst.cols-1 + (dst.cols % 2))); - - Mat dst_mat = dst.getMat(ACCESS_WRITE); - Mat buffer_mat = buffer.getMat(ACCESS_READ); - - dst_mat.at(0,0) = buffer_mat.at(0,0); - dst_mat.at(dst_mat.rows-1,0) = buffer_mat.at(buffer.rows/2,0); - for (int i=1; i(i,0) = buffer_mat.at((i+1)/2,0); - dst_mat.at(i+1,0) = buffer_mat.at((i+1)/2,1); - } - - if (dst_mat.cols % 2 == 0) - { - dst_mat.at(0,dst_mat.cols-1) = buffer_mat.at(0,buffer.cols/2); - dst_mat.at(dst_mat.rows-1,dst_mat.cols-1) = buffer_mat.at(buffer.rows/2,buffer.cols/2); - - for (int i=1; i(i,dst_mat.cols-1) = buffer_mat.at((i+1)/2,buffer.cols/2); - dst_mat.at(i+1,dst_mat.cols-1) = buffer_mat.at((i+1)/2,buffer.cols/2+1); - } - } - } - else - { - // pack to CCS each row - buffer.colRange(0,1).copyTo(dst.colRange(0,1)); - buffer.colRange(2, (dst.cols+1)).copyTo(dst.colRange(1, dst.cols)); - } - return true; -} - static std::vector ocl_getRadixes(int cols, int& min_radix) { int factors[34]; @@ -2116,72 +2072,175 @@ static std::vector ocl_getRadixes(int cols, int& min_radix) return radixes; } -static bool ocl_dft_C2C_row(InputArray _src, OutputArray _dst, InputOutputArray _twiddles, int nonzero_rows, int flags) +struct OCL_FftPlan { - int type = _src.type(), depth = CV_MAT_DEPTH(type), channels = CV_MAT_CN(type); - UMat src = _src.getUMat(); + UMat twiddles; + String buildOptions; + int thread_count; - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; - if (depth == CV_64F && !doubleSupport) - return false; - - int min_radix = INT_MAX; - std::vector radixes = ocl_getRadixes(src.cols, min_radix); - - // generate string with radix calls - String radix_processing; - int n = 1, twiddle_index = 0; - for (size_t i=0; i radixes = ocl_getRadixes(dft_size, min_radix); + thread_count = dft_size / min_radix; + + // generate string with radix calls + String radix_processing; + int n = 1, twiddle_size = 0; + for (size_t i=0; i(); int ptr_index = 0; - int n = 1; + n = 1; for (size_t i=0; idft_size == dft_size) + { + return plan; + } + } - k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnlyNoSize(dst), ocl::KernelArg::ReadOnlyNoSize(twiddles), thread_count, nonzero_rows); - return k.run(2, globalsize, localsize, false); + OCL_FftPlan * newPlan = new OCL_FftPlan(dft_size, flags); + planStorage.push_back(newPlan); + return newPlan; + } + + ~OCL_FftPlanCache() + { + for (std::vector::iterator i = planStorage.begin(), end = planStorage.end(); i != end; ++i) + delete (*i); + planStorage.clear(); + } + +protected: + OCL_FftPlanCache() : + planStorage() + { + } + + std::vector planStorage; +}; + +static bool ocl_packToCCS(InputArray _src, OutputArray _dst, int flags) +{ + UMat src = _src.getUMat(); + _dst.create(src.size(), CV_32F); + UMat dst = _dst.getUMat(); + + src = src.reshape(1); + if ((flags & DFT_ROWS) == 0 && src.rows > 1) + { + // pack to CCS by rows + if (dst.cols > 2) + src.colRange(2, dst.cols + (dst.cols % 2)).copyTo(dst.colRange(1, dst.cols-1 + (dst.cols % 2))); + + Mat dst_mat = dst.getMat(ACCESS_WRITE); + Mat buffer_mat = src.getMat(ACCESS_READ); + + dst_mat.at(0,0) = buffer_mat.at(0,0); + dst_mat.at(dst_mat.rows-1,0) = buffer_mat.at(src.rows/2,0); + for (int i=1; i(i,0) = buffer_mat.at((i+1)/2,0); + dst_mat.at(i+1,0) = buffer_mat.at((i+1)/2,1); + } + + if (dst_mat.cols % 2 == 0) + { + dst_mat.at(0,dst_mat.cols-1) = buffer_mat.at(0,src.cols/2); + dst_mat.at(dst_mat.rows-1,dst_mat.cols-1) = buffer_mat.at(src.rows/2,src.cols/2); + + for (int i=1; i(i,dst_mat.cols-1) = buffer_mat.at((i+1)/2,src.cols/2); + dst_mat.at(i+1,dst_mat.cols-1) = buffer_mat.at((i+1)/2,src.cols/2+1); + } + } + } + else + { + // pack to CCS each row + src.colRange(0,1).copyTo(dst.colRange(0,1)); + src.colRange(2, (dst.cols+1)).copyTo(dst.colRange(1, dst.cols)); + } + return true; +} + +static bool ocl_dft_C2C_row(InputArray _src, OutputArray _dst, int nonzero_rows, int flags) +{ + int type = _src.type(), depth = CV_MAT_DEPTH(type), channels = CV_MAT_CN(type); + + bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + if (depth == CV_64F && !doubleSupport) + return false; + + const OCL_FftPlan* plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.cols(), flags); + return plan->enqueueTransform(_src, _dst, nonzero_rows); } static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_rows) @@ -2217,76 +2276,71 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro } } - if (complex_output) + UMat input, output; + if (complex_input) { - //if (is1d) - // _dst.create(Size(src.cols/2+1, src.rows), CV_MAKE_TYPE(depth, 2)); - //else - _dst.create(src.size(), CV_MAKE_TYPE(depth, 2)); + input = src; } else - _dst.create(src.size(), CV_MAKE_TYPE(depth, 1)); - UMat dst = _dst.getUMat(); + { + if (!inv) + { + // in case real input convert it to complex + input.create(src.size(), CV_MAKE_TYPE(depth, 2)); + std::vector planes; + planes.push_back(src); + planes.push_back(UMat::zeros(src.size(), CV_32F)); + merge(planes, input); + } + else + { + // TODO: unpack from CCS format + } + } - bool inplace = src.u == dst.u; - //UMat buffer; - - //if (complex_input) - //{ - // if (inplace) - // buffer = src; - // else - // src.copyTo(buffer); - //} - //else - //{ - // if (!inv) - // { - // // in case real input convert it to complex - // buffer.create(src.size(), CV_MAKE_TYPE(depth, 2)); - // std::vector planes; - // planes.push_back(src); - // planes.push_back(UMat::zeros(src.size(), CV_32F)); - // merge(planes, buffer); - // } - // else - // { - // // TODO: unpack from CCS format - // } - //} + + UMat dst = _dst.getUMat(); + if (complex_output) + { + if (real_input && is1d && !inv) + output.create(src.size(), CV_32FC2); + else + output = dst; + } else + { + output.create(src.size(), CV_32FC2); + } if( nonzero_rows <= 0 || nonzero_rows > _src.rows() ) nonzero_rows = _src.rows(); - UMat buffer; - - if (!ocl_dft_C2C_row(src, dst, buffer, nonzero_rows, flags)) + if (!ocl_dft_C2C_row(input, output, nonzero_rows, flags)) return false; if ((flags & DFT_ROWS) == 0 && nonzero_rows > 1) { - transpose(dst, dst); - if (!ocl_dft_C2C_row(dst, dst, buffer, dst.rows, flags)) + transpose(output, output); + if (!ocl_dft_C2C_row(output, output, output.rows, flags)) return false; - transpose(dst, dst); + transpose(output, output); } if (complex_output) { - if (real_input && is1d) - _dst.assign(dst.colRange(0, dst.cols/2+1)); + if (real_input && is1d && !inv) + _dst.assign(output.colRange(0, output.cols/2+1)); else - _dst.assign(dst); - } - //else - //{ - // if (!inv) - // ocl_packToCCS(buffer, _dst, flags); - // else - // { - // // copy real part to dst - // } - //} + _dst.assign(output); + } + else + { + if (!inv) + ocl_packToCCS(output, _dst, flags); + else + { + // copy real part to dst + } + } return true; } diff --git a/modules/core/src/opencl/fft.cl b/modules/core/src/opencl/fft.cl index bd2b863c6c..34da79fafb 100644 --- a/modules/core/src/opencl/fft.cl +++ b/modules/core/src/opencl/fft.cl @@ -28,7 +28,7 @@ float2 twiddle(float2 a) { } __attribute__((always_inline)) -void fft_radix2(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) +void fft_radix2(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) { const int k = x & (block_size - 1); float2 a0, a1; @@ -53,17 +53,18 @@ void fft_radix2(__local float2* smem, __global const float2* twiddles, const int } __attribute__((always_inline)) -void fft_radix4(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) +void fft_radix4(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) { const int k = x & (block_size - 1); float2 a0, a1, a2, a3; if (x < t) { + const int twiddle_block = block_size / 4; a0 = smem[x]; - a1 = mul_float2(twiddles[3*k],smem[x+t]); - a2 = mul_float2(twiddles[3*k + 1],smem[x+2*t]); - a3 = mul_float2(twiddles[3*k + 2],smem[x+3*t]); + a1 = mul_float2(twiddles[k],smem[x+t]); + a2 = mul_float2(twiddles[k + block_size],smem[x+2*t]); + a3 = mul_float2(twiddles[k + 2*block_size],smem[x+3*t]); } barrier(CLK_LOCAL_MEM_FENCE); @@ -87,7 +88,7 @@ void fft_radix4(__local float2* smem, __global const float2* twiddles, const int } __attribute__((always_inline)) -void fft_radix8(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) +void fft_radix8(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) { const int k = x % block_size; float2 a0, a1, a2, a3, a4, a5, a6, a7; @@ -97,13 +98,13 @@ void fft_radix8(__local float2* smem, __global const float2* twiddles, const int int tw_ind = block_size / 8; a0 = smem[x]; - a1 = mul_float2(twiddles[7*k], smem[x + t]); - a2 = mul_float2(twiddles[7*k+1],smem[x+2*t]); - a3 = mul_float2(twiddles[7*k+2],smem[x+3*t]); - a4 = mul_float2(twiddles[7*k+3],smem[x+4*t]); - a5 = mul_float2(twiddles[7*k+4],smem[x+5*t]); - a6 = mul_float2(twiddles[7*k+5],smem[x+6*t]); - a7 = mul_float2(twiddles[7*k+6],smem[x+7*t]); + a1 = mul_float2(twiddles[k], smem[x + t]); + a2 = mul_float2(twiddles[k + block_size],smem[x+2*t]); + a3 = mul_float2(twiddles[k+2*block_size],smem[x+3*t]); + a4 = mul_float2(twiddles[k+3*block_size],smem[x+4*t]); + a5 = mul_float2(twiddles[k+4*block_size],smem[x+5*t]); + a6 = mul_float2(twiddles[k+5*block_size],smem[x+6*t]); + a7 = mul_float2(twiddles[k+6*block_size],smem[x+7*t]); float2 b0, b1, b6, b7; @@ -150,16 +151,23 @@ void fft_radix8(__local float2* smem, __global const float2* twiddles, const int } __attribute__((always_inline)) -void fft_radix3(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) +void fft_radix3(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) { const int k = x % block_size; float2 a0, a1, a2; if (x < t) { + //const int twiddle_block = block_size / 3; + //const float theta = -PI * k * 2 / (3 * block_size); + //float2 tw = sincos_float2(theta); + //printf("radix3 %d (%f,%f)(%f,%f)\n", k, tw.x, tw.y, twiddles[k].x, twiddles[k].y); + //tw = sincos_float2(2*theta); + //printf("radix3- %d %d (%f,%f)(%f,%f)\n", k, twiddle_block, tw.x, tw.y, twiddles[k+block_size].x, twiddles[k+block_size].y); + a0 = smem[x]; - a1 = mul_float2(twiddles[2*k], smem[x+t]); - a2 = mul_float2(twiddles[2*k+1], smem[x+2*t]); + a1 = mul_float2(twiddles[k], smem[x+t]); + a2 = mul_float2(twiddles[k+block_size], smem[x+2*t]); } barrier(CLK_LOCAL_MEM_FENCE); @@ -181,7 +189,7 @@ void fft_radix3(__local float2* smem, __global const float2* twiddles, const int } __attribute__((always_inline)) -void fft_radix5(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) +void fft_radix5(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) { const int k = x % block_size; float2 a0, a1, a2, a3, a4; @@ -191,10 +199,10 @@ void fft_radix5(__local float2* smem, __global const float2* twiddles, const int int tw_ind = block_size / 5; a0 = smem[x]; - a1 = mul_float2(twiddles[4*k], smem[x + t]); - a2 = mul_float2(twiddles[4*k+1],smem[x+2*t]); - a3 = mul_float2(twiddles[4*k+2],smem[x+3*t]); - a4 = mul_float2(twiddles[4*k+3],smem[x+4*t]); + a1 = mul_float2(twiddles[k], smem[x + t]); + a2 = mul_float2(twiddles[k + block_size],smem[x+2*t]); + a3 = mul_float2(twiddles[k+2*block_size],smem[x+3*t]); + a4 = mul_float2(twiddles[k+3*block_size],smem[x+4*t]); } barrier(CLK_LOCAL_MEM_FENCE); @@ -237,8 +245,7 @@ void fft_radix5(__local float2* smem, __global const float2* twiddles, const int __kernel void fft_multi_radix(__global const uchar* src_ptr, int src_step, int src_offset, __global uchar* dst_ptr, int dst_step, int dst_offset, - __global const uchar* twiddles_ptr, int twiddles_step, int twiddles_offset, - const int t, const int nz) + __constant float2 * twiddles_ptr, const int t, const int nz) { const int x = get_global_id(0); const int y = get_group_id(1); @@ -248,7 +255,7 @@ __kernel void fft_multi_radix(__global const uchar* src_ptr, int src_step, int s __local float2 smem[LOCAL_SIZE]; __global const float2* src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset))); __global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)*2), dst_offset))); - __global const float2* twiddles = (__global float2*) twiddles_ptr; + __constant const float2* twiddles = (__constant float2*) twiddles_ptr; const int block_size = LOCAL_SIZE/kercn; #pragma unroll diff --git a/modules/core/test/ocl/test_dft.cpp b/modules/core/test/ocl/test_dft.cpp index 7a7a98852a..2529e949e0 100644 --- a/modules/core/test/ocl/test_dft.cpp +++ b/modules/core/test/ocl/test_dft.cpp @@ -181,9 +181,9 @@ OCL_TEST_P(MulSpectrums, Mat) OCL_INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MulSpectrums, testing::Combine(Bool(), Bool())); -OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(2, 3), cv::Size(5, 4), cv::Size(30, 20), +OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(1920, 1), cv::Size(5, 4), cv::Size(30, 20), cv::Size(512, 1), cv::Size(1024, 1024)), - Values((OCL_FFT_TYPE) C2C/*, (OCL_FFT_TYPE) R2R, (OCL_FFT_TYPE) R2C/*, (OCL_FFT_TYPE) C2R*/), + Values(/*(OCL_FFT_TYPE) C2C, (OCL_FFT_TYPE) R2C,*/ (OCL_FFT_TYPE) R2R/*, (OCL_FFT_TYPE) C2R*/), Bool() // DFT_ROWS ) ); From ed07241f89849c4d91a0c78494ed9a5823c09342 Mon Sep 17 00:00:00 2001 From: Alexander Karsakov Date: Tue, 15 Jul 2014 18:25:46 +0400 Subject: [PATCH 04/13] Completed all forward transforms. --- modules/core/perf/opencl/perf_arithm.cpp | 2 +- modules/core/perf/opencl/perf_dxt.cpp | 33 +++- modules/core/src/dxt.cpp | 139 +++++++++++------ modules/core/src/opencl/fft.cl | 182 ++++++++++++++++++----- modules/core/test/ocl/test_dft.cpp | 21 +-- 5 files changed, 276 insertions(+), 101 deletions(-) diff --git a/modules/core/perf/opencl/perf_arithm.cpp b/modules/core/perf/opencl/perf_arithm.cpp index 17badca765..ba808b494f 100644 --- a/modules/core/perf/opencl/perf_arithm.cpp +++ b/modules/core/perf/opencl/perf_arithm.cpp @@ -292,7 +292,7 @@ OCL_PERF_TEST_P(MagnitudeFixture, Magnitude, ::testing::Combine( typedef Size_MatType TransposeFixture; OCL_PERF_TEST_P(TransposeFixture, Transpose, ::testing::Combine( - OCL_TEST_SIZES, OCL_TEST_TYPES_134)) + OCL_TEST_SIZES, Values(CV_8UC1, CV_32FC1, CV_8UC2, CV_32FC2, CV_8UC4, CV_32FC4))) { const Size_MatType_t params = GetParam(); const Size srcSize = get<0>(params); diff --git a/modules/core/perf/opencl/perf_dxt.cpp b/modules/core/perf/opencl/perf_dxt.cpp index c0da96b373..edeeda7f0c 100644 --- a/modules/core/perf/opencl/perf_dxt.cpp +++ b/modules/core/perf/opencl/perf_dxt.cpp @@ -54,21 +54,40 @@ namespace ocl { ///////////// dft //////////////////////// -typedef tuple DftParams; +enum OCL_FFT_TYPE +{ + R2R = 0, // real to real (CCS) + C2R = 1, // complex to real + R2C = 2, // real to complex + C2C = 3 // complex to complex +}; + +typedef tuple DftParams; typedef TestBaseWithParam DftFixture; -OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3, Size(1024, 1024), Size(1024, 2048), Size(512, 512), Size(2048, 2048)), +OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(C2C, R2R, C2R, R2C), + Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3, Size(1024, 1024), Size(512, 512), Size(2048, 2048)), Values((int)DFT_ROWS, (int) 0/*, (int)DFT_SCALE, (int)DFT_INVERSE, (int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE*/))) { const DftParams params = GetParam(); - const Size srcSize = get<0>(params); - const int flags = get<1>(params); - - UMat src(srcSize, CV_32FC2), dst(srcSize, CV_32FC2); + const int dft_type = get<0>(params); + const Size srcSize = get<1>(params); + int flags = get<2>(params); + + int in_cn, out_cn; + switch (dft_type) + { + case R2R: flags |= cv::DFT_REAL_OUTPUT; in_cn = 1; out_cn = 1; break; + case C2R: flags |= cv::DFT_REAL_OUTPUT; in_cn = 2; out_cn = 2; break; + case R2C: flags |= cv::DFT_COMPLEX_OUTPUT; in_cn = 1; out_cn = 2; break; + case C2C: flags |= cv::DFT_COMPLEX_OUTPUT; in_cn = 2; out_cn = 2; break; + } + + UMat src(srcSize, CV_MAKE_TYPE(CV_32F, in_cn)), dst(srcSize, CV_MAKE_TYPE(CV_32F, out_cn)); declare.in(src, WARMUP_RNG).out(dst); - OCL_TEST_CYCLE() cv::dft(src, dst, flags | DFT_COMPLEX_OUTPUT); + OCL_TEST_CYCLE() cv::dft(src, dst, flags); SANITY_CHECK(dst, 1e-3); } diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp index c11b699503..a3df694364 100644 --- a/modules/core/src/dxt.cpp +++ b/modules/core/src/dxt.cpp @@ -2034,7 +2034,7 @@ namespace cv #ifdef HAVE_OPENCL -static std::vector ocl_getRadixes(int cols, int& min_radix) +static std::vector ocl_getRadixes(int cols, std::vector& radixes, std::vector& blocks, int& min_radix) { int factors[34]; int nf = DFTFactorize( cols, factors ); @@ -2042,9 +2042,6 @@ static std::vector ocl_getRadixes(int cols, int& min_radix) int n = 1; int factor_index = 0; - // choose radix order - std::vector radixes; - // 2^n transforms if ( (factors[factor_index] & 1) == 0 ) { @@ -2057,7 +2054,10 @@ static std::vector ocl_getRadixes(int cols, int& min_radix) radix = 4; radixes.push_back(radix); - min_radix = min(min_radix, radix); + if (radix == 2 && cols % 4 == 0) + min_radix = min(min_radix, 2*radix); + else + min_radix = min(min_radix, radix); n *= radix; } factor_index++; @@ -2067,7 +2067,10 @@ static std::vector ocl_getRadixes(int cols, int& min_radix) for( ; factor_index < nf; factor_index++ ) { radixes.push_back(factors[factor_index]); - min_radix = min(min_radix, factors[factor_index]); + if (factors[factor_index] == 3 && cols % 6 == 0) + min_radix = min(min_radix, 2*factors[factor_index]); + else + min_radix = min(min_radix, factors[factor_index]); } return radixes; } @@ -2084,8 +2087,16 @@ struct OCL_FftPlan OCL_FftPlan(int _size, int _flags): dft_size(_size), flags(_flags) { int min_radix = INT_MAX; - std::vector radixes = ocl_getRadixes(dft_size, min_radix); - thread_count = dft_size / min_radix; + std::vector radixes, blocks; + ocl_getRadixes(dft_size, radixes, blocks, min_radix); + thread_count = (dft_size + min_radix-1) / min_radix; + + printf("cols: %d - ", dft_size); + for (int i=0; i 0; - if (depth == CV_64F && !doubleSupport) - return false; - const OCL_FftPlan* plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.cols(), flags); - return plan->enqueueTransform(_src, _dst, nonzero_rows); + return plan->enqueueTransform(_src, _dst, nonzero_rows, flags, true); +} + +static bool ocl_dft_C2C_cols(InputArray _src, OutputArray _dst, int flags) +{ + const OCL_FftPlan* plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.rows(), flags); + return plan->enqueueTransform(_src, _dst, _src.cols(), flags, false); } static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_rows) @@ -2262,7 +2295,10 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro int real_input = cn == 1 ? 1 : 0; int real_output = (flags & DFT_REAL_OUTPUT) != 0; bool inv = (flags & DFT_INVERSE) != 0 ? 1 : 0; - bool is1d = (flags & DFT_ROWS) != 0 || src.rows == 1; + + if( nonzero_rows <= 0 || nonzero_rows > _src.rows() ) + nonzero_rows = _src.rows(); + bool is1d = (flags & DFT_ROWS) != 0 || nonzero_rows == 1; // if output format is not specified if (complex_output + real_output == 0) @@ -2276,6 +2312,19 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro } } + // Forward Complex to CCS not supported + if (complex_input && real_output && !inv) + { + real_output = 0; + complex_output = 1; + } + // Inverse CCS to Complex not supported + if (real_input && complex_output && inv) + { + complex_output = 0; + real_output = 1; + } + UMat input, output; if (complex_input) { @@ -2285,12 +2334,7 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro { if (!inv) { - // in case real input convert it to complex - input.create(src.size(), CV_MAKE_TYPE(depth, 2)); - std::vector planes; - planes.push_back(src); - planes.push_back(UMat::zeros(src.size(), CV_32F)); - merge(planes, input); + input = src; } else { @@ -2298,31 +2342,34 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro } } - - UMat dst = _dst.getUMat(); if (complex_output) { if (real_input && is1d && !inv) output.create(src.size(), CV_32FC2); else - output = dst; + { + _dst.create(src.size(), CV_32FC2); + output = _dst.getUMat(); + } } else { - output.create(src.size(), CV_32FC2); + // CCS + if (is1d) + { + _dst.create(src.size(), CV_32FC1); + output = _dst.getUMat(); + } + else + output.create(src.size(), CV_32FC2); } - if( nonzero_rows <= 0 || nonzero_rows > _src.rows() ) - nonzero_rows = _src.rows(); - - if (!ocl_dft_C2C_row(input, output, nonzero_rows, flags)) + if (!ocl_dft_C2C_rows(input, output, nonzero_rows, flags)) return false; - if ((flags & DFT_ROWS) == 0 && nonzero_rows > 1) + if (!is1d) { - transpose(output, output); - if (!ocl_dft_C2C_row(output, output, output.rows, flags)) + if (!ocl_dft_C2C_cols(output, output, flags)) return false; - transpose(output, output); } if (complex_output) @@ -2335,12 +2382,18 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro else { if (!inv) - ocl_packToCCS(output, _dst, flags); + { + if (!is1d) + ocl_packToCCS(output, _dst, flags); + else + _dst.assign(output); + } else { // copy real part to dst } } + //printf("OCL!\n"); return true; } diff --git a/modules/core/src/opencl/fft.cl b/modules/core/src/opencl/fft.cl index 34da79fafb..7803cdbc21 100644 --- a/modules/core/src/opencl/fft.cl +++ b/modules/core/src/opencl/fft.cl @@ -1,25 +1,13 @@ -__constant float PI = 3.14159265f; -__constant float SQRT_2 = 0.707106781188f; - -__constant float sin_120 = 0.866025403784f; -__constant float fft5_2 = 0.559016994374f; -__constant float fft5_3 = -0.951056516295f; -__constant float fft5_4 = -1.538841768587f; -__constant float fft5_5 = 0.363271264002f; +#define SQRT_2 0.707106781188f +#define sin_120 0.866025403784f +#define fft5_2 0.559016994374f +#define fft5_3 -0.951056516295f +#define fft5_4 -1.538841768587f +#define fft5_5 0.363271264002f __attribute__((always_inline)) -float2 mul_float2(float2 a, float2 b){ - float2 res; - res.x = a.x * b.x - a.y * b.y; - res.y = a.x * b.y + a.y * b.x; - return res; -} - -__attribute__((always_inline)) -float2 sincos_float2(float alpha) { - float cs, sn; - sn = sincos(alpha, &cs); // sincos - return (float2)(cs, sn); +float2 mul_float2(float2 a, float2 b) { + return (float2)(fma(a.x, b.x, -a.y * b.y), fma(a.x, b.y, a.y * b.x)); } __attribute__((always_inline)) @@ -52,6 +40,38 @@ void fft_radix2(__local float2* smem, __constant const float2* twiddles, const i barrier(CLK_LOCAL_MEM_FENCE); } +__attribute__((always_inline)) +void fft_radix2_B2(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) +{ + const int k1 = x & (block_size - 1); + const int x2 = x + (t+1)/2; + const int k2 = x2 & (block_size - 1); + float2 a0, a1, a2, a3; + + if (x < (t+1)/2) + { + a0 = smem[x]; + a1 = mul_float2(twiddles[k1],smem[x+t]); + a2 = smem[x2]; + a3 = mul_float2(twiddles[k2],smem[x2+t]); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x < (t+1)/2) + { + int dst_ind = (x << 1) - k1; + smem[dst_ind] = a0 + a1; + smem[dst_ind+block_size] = a0 - a1; + + dst_ind = (x2 << 1) - k2; + smem[dst_ind] = a2 + a3; + smem[dst_ind+block_size] = a2 - a3; + } + + barrier(CLK_LOCAL_MEM_FENCE); +} + __attribute__((always_inline)) void fft_radix4(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) { @@ -158,13 +178,6 @@ void fft_radix3(__local float2* smem, __constant const float2* twiddles, const i if (x < t) { - //const int twiddle_block = block_size / 3; - //const float theta = -PI * k * 2 / (3 * block_size); - //float2 tw = sincos_float2(theta); - //printf("radix3 %d (%f,%f)(%f,%f)\n", k, tw.x, tw.y, twiddles[k].x, twiddles[k].y); - //tw = sincos_float2(2*theta); - //printf("radix3- %d %d (%f,%f)(%f,%f)\n", k, twiddle_block, tw.x, tw.y, twiddles[k+block_size].x, twiddles[k+block_size].y); - a0 = smem[x]; a1 = mul_float2(twiddles[k], smem[x+t]); a2 = mul_float2(twiddles[k+block_size], smem[x+2*t]); @@ -177,7 +190,7 @@ void fft_radix3(__local float2* smem, __constant const float2* twiddles, const i const int dst_ind = ((x - k) * 3) + k; float2 b1 = a1 + a2; - a2 = twiddle((float2)sin_120*(a1 - a2)); + a2 = twiddle(sin_120*(a1 - a2)); float2 b0 = a0 - (float2)(0.5f)*b1; smem[dst_ind] = a0 + b1; @@ -188,6 +201,53 @@ void fft_radix3(__local float2* smem, __constant const float2* twiddles, const i barrier(CLK_LOCAL_MEM_FENCE); } +__attribute__((always_inline)) +void fft_radix3_B2(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) +{ + const int k = x % block_size; + const int x2 = x + (t+1)/2; + const int k2 = x2 % block_size; + float2 a0, a1, a2, a3, a4, a5; + + if (x < (t+1)/2) + { + a0 = smem[x]; + a1 = mul_float2(twiddles[k], smem[x+t]); + a2 = mul_float2(twiddles[k+block_size], smem[x+2*t]); + + a3 = smem[x2]; + a4 = mul_float2(twiddles[k2], smem[x2+t]); + a5 = mul_float2(twiddles[k2+block_size], smem[x2+2*t]); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x < (t+1)/2) + { + int dst_ind = ((x - k) * 3) + k; + + float2 b1 = a1 + a2; + a2 = twiddle(sin_120*(a1 - a2)); + float2 b0 = a0 - (float2)(0.5f)*b1; + + smem[dst_ind] = a0 + b1; + smem[dst_ind + block_size] = b0 + a2; + smem[dst_ind + 2*block_size] = b0 - a2; + + dst_ind = ((x2 - k2) * 3) + k2; + + b1 = a4 + a5; + a5 = twiddle(sin_120*(a4 - a5)); + b0 = a3 - (float2)(0.5f)*b1; + + smem[dst_ind] = a3 + b1; + smem[dst_ind + block_size] = b0 + a5; + smem[dst_ind + 2*block_size] = b0 - a5; + } + + barrier(CLK_LOCAL_MEM_FENCE); +} + __attribute__((always_inline)) void fft_radix5(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) { @@ -196,8 +256,6 @@ void fft_radix5(__local float2* smem, __constant const float2* twiddles, const i if (x < t) { - int tw_ind = block_size / 5; - a0 = smem[x]; a1 = mul_float2(twiddles[k], smem[x + t]); a2 = mul_float2(twiddles[k + block_size],smem[x+2*t]); @@ -223,8 +281,8 @@ void fft_radix5(__local float2* smem, __constant const float2* twiddles, const i a2 = b1 + a4; b0 = a0 - (float2)0.25f * a2; - b1 = (float2)fft5_2 * (b1 - a4); - a4 = (float2)fft5_3 * (float2)(-a1.y - a3.y, a1.x + a3.x); + b1 = fft5_2 * (b1 - a4); + a4 = fft5_3 * (float2)(-a1.y - a3.y, a1.x + a3.x); b5 = (float2)(a4.x - fft5_5 * a1.y, a4.y + fft5_5 * a1.x); a4.x += fft5_4 * a3.y; @@ -243,9 +301,9 @@ void fft_radix5(__local float2* smem, __constant const float2* twiddles, const i barrier(CLK_LOCAL_MEM_FENCE); } -__kernel void fft_multi_radix(__global const uchar* src_ptr, int src_step, int src_offset, - __global uchar* dst_ptr, int dst_step, int dst_offset, - __constant float2 * twiddles_ptr, const int t, const int nz) +__kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols, + __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, + __constant float2 * twiddles_ptr, const int t, const int nz) { const int x = get_global_id(0); const int y = get_group_id(1); @@ -253,14 +311,60 @@ __kernel void fft_multi_radix(__global const uchar* src_ptr, int src_step, int s if (y < nz) { __local float2 smem[LOCAL_SIZE]; + __constant const float2* twiddles = (__constant float2*) twiddles_ptr; + const int ind = x; + const int block_size = LOCAL_SIZE/kercn; + +#ifndef REAL_INPUT __global const float2* src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset))); + #pragma unroll + for (int i=0; i Date: Thu, 17 Jul 2014 12:31:41 +0400 Subject: [PATCH 05/13] Added packing to CCS format --- modules/core/src/dxt.cpp | 117 +++++++---------------------- modules/core/src/opencl/fft.cl | 48 ++++++++++-- modules/core/test/ocl/test_dft.cpp | 18 +++-- 3 files changed, 82 insertions(+), 101 deletions(-) diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp index a3df694364..69ec2c9efe 100644 --- a/modules/core/src/dxt.cpp +++ b/modules/core/src/dxt.cpp @@ -2083,20 +2083,19 @@ struct OCL_FftPlan int dft_size; int flags; - - OCL_FftPlan(int _size, int _flags): dft_size(_size), flags(_flags) + bool status; + OCL_FftPlan(int _size, int _flags): dft_size(_size), flags(_flags), status(true) { int min_radix = INT_MAX; std::vector radixes, blocks; ocl_getRadixes(dft_size, radixes, blocks, min_radix); thread_count = (dft_size + min_radix-1) / min_radix; - printf("cols: %d - ", dft_size); - for (int i=0; i ocl::Device::getDefault().maxWorkGroupSize()) { - printf("%d ", radixes[i]); + status = false; + return; } - printf("min radix - %d\n", min_radix); // generate string with radix calls String radix_processing; @@ -2142,6 +2141,9 @@ struct OCL_FftPlan bool enqueueTransform(InputArray _src, OutputArray _dst, int dft_size, int flags, bool rows = true) const { + if (!status) + return false; + UMat src = _src.getUMat(); UMat dst = _dst.getUMat(); @@ -2162,11 +2164,14 @@ struct OCL_FftPlan kernel_name = "fft_multi_radix_cols"; } + bool is1d = (flags & DFT_ROWS) != 0 || dft_size == 1; String options = buildOptions; if (src.channels() == 1) options += " -D REAL_INPUT"; if (dst.channels() == 1) options += " -D CCS_OUTPUT"; + if ((is1d && src.channels() == 1) || (rows && (flags & DFT_REAL_OUTPUT))) + options += " -D NO_CONJUGATE"; ocl::Kernel k(kernel_name.c_str(), ocl::core::fft_oclsrc, options); if (k.empty()) @@ -2219,61 +2224,16 @@ protected: std::vector planStorage; }; -static bool ocl_packToCCS(InputArray _src, OutputArray _dst, int flags) -{ - UMat src = _src.getUMat(); - _dst.create(src.size(), CV_32F); - UMat dst = _dst.getUMat(); - - src = src.reshape(1); - if ((flags & DFT_ROWS) == 0 && src.rows > 1) - { - // pack to CCS by rows - if (dst.cols > 2) - src.colRange(2, dst.cols + (dst.cols % 2)).copyTo(dst.colRange(1, dst.cols-1 + (dst.cols % 2))); - - Mat dst_mat = dst.getMat(ACCESS_WRITE); - Mat buffer_mat = src.getMat(ACCESS_READ); - - dst_mat.at(0,0) = buffer_mat.at(0,0); - dst_mat.at(dst_mat.rows-1,0) = buffer_mat.at(src.rows/2,0); - for (int i=1; i(i,0) = buffer_mat.at((i+1)/2,0); - dst_mat.at(i+1,0) = buffer_mat.at((i+1)/2,1); - } - - if (dst_mat.cols % 2 == 0) - { - dst_mat.at(0,dst_mat.cols-1) = buffer_mat.at(0,src.cols/2); - dst_mat.at(dst_mat.rows-1,dst_mat.cols-1) = buffer_mat.at(src.rows/2,src.cols/2); - - for (int i=1; i(i,dst_mat.cols-1) = buffer_mat.at((i+1)/2,src.cols/2); - dst_mat.at(i+1,dst_mat.cols-1) = buffer_mat.at((i+1)/2,src.cols/2+1); - } - } - } - else - { - // pack to CCS each row - src.colRange(0,1).copyTo(dst.colRange(0,1)); - src.colRange(2, (dst.cols+1)).copyTo(dst.colRange(1, dst.cols)); - } - return true; -} - static bool ocl_dft_C2C_rows(InputArray _src, OutputArray _dst, int nonzero_rows, int flags) { const OCL_FftPlan* plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.cols(), flags); return plan->enqueueTransform(_src, _dst, nonzero_rows, flags, true); } -static bool ocl_dft_C2C_cols(InputArray _src, OutputArray _dst, int flags) +static bool ocl_dft_C2C_cols(InputArray _src, OutputArray _dst, int nonzero_cols, int flags) { const OCL_FftPlan* plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.rows(), flags); - return plan->enqueueTransform(_src, _dst, _src.cols(), flags, false); + return plan->enqueueTransform(_src, _dst, nonzero_cols, flags, false); } static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_rows) @@ -2315,6 +2275,8 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro // Forward Complex to CCS not supported if (complex_input && real_output && !inv) { + flags ^= DFT_REAL_OUTPUT; + flags |= DFT_COMPLEX_OUTPUT; real_output = 0; complex_output = 1; } @@ -2344,23 +2306,21 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro if (complex_output) { - if (real_input && is1d && !inv) - output.create(src.size(), CV_32FC2); - else - { - _dst.create(src.size(), CV_32FC2); - output = _dst.getUMat(); - } - } else + _dst.create(src.size(), CV_32FC2); + output = _dst.getUMat(); + } + else { - // CCS if (is1d) { _dst.create(src.size(), CV_32FC1); output = _dst.getUMat(); } else + { + _dst.create(src.size(), CV_32FC1); output.create(src.size(), CV_32FC2); + } } if (!ocl_dft_C2C_rows(input, output, nonzero_rows, flags)) @@ -2368,32 +2328,13 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro if (!is1d) { - if (!ocl_dft_C2C_cols(output, output, flags)) + int nonzero_cols = real_input && real_output ? output.cols/2 + 1 : output.cols; + if (!ocl_dft_C2C_cols(output, _dst, nonzero_cols, flags)) return false; - } - - if (complex_output) + } else { - if (real_input && is1d && !inv) - _dst.assign(output.colRange(0, output.cols/2+1)); - else - _dst.assign(output); + _dst.assign(output); } - else - { - if (!inv) - { - if (!is1d) - ocl_packToCCS(output, _dst, flags); - else - _dst.assign(output); - } - else - { - // copy real part to dst - } - } - //printf("OCL!\n"); return true; } @@ -2435,7 +2376,6 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows ) int elem_size = (int)src.elemSize1(), complex_elem_size = elem_size*2; int factors[34]; bool inplace_transform = false; - bool is1d = (flags & DFT_ROWS) != 0 || src.rows == 1; #ifdef USE_IPP_DFT AutoBuffer ippbuf; int ipp_norm_flag = !(flags & DFT_SCALE) ? 8 : inv ? 2 : 1; @@ -2444,10 +2384,7 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows ) CV_Assert( type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2 ); if( !inv && src.channels() == 1 && (flags & DFT_COMPLEX_OUTPUT) ) - if (!is1d) - _dst.create( src.size(), CV_MAKETYPE(depth, 2) ); - else - _dst.create( Size(src.cols/2+1, src.rows), CV_MAKETYPE(depth, 2) ); + _dst.create( src.size(), CV_MAKETYPE(depth, 2) ); else if( inv && src.channels() == 2 && (flags & DFT_REAL_OUTPUT) ) _dst.create( src.size(), depth ); else diff --git a/modules/core/src/opencl/fft.cl b/modules/core/src/opencl/fft.cl index 7803cdbc21..a778d59f22 100644 --- a/modules/core/src/opencl/fft.cl +++ b/modules/core/src/opencl/fft.cl @@ -331,10 +331,17 @@ __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, RADIX_PROCESS; #ifndef CCS_OUTPUT - __global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)*2), dst_offset))); +#ifdef NO_CONJUGATE + // copy result without complex conjugate + const int cols = dst_cols/2 + 1; +#else + const int cols = dst_cols; +#endif + + __global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, dst_offset)); #pragma unroll - for (int i=0; i(df) << std::endl; double eps = src.size().area() * 1e-4; EXPECT_MAT_NEAR(dst, udst, eps); @@ -181,9 +189,9 @@ OCL_TEST_P(MulSpectrums, Mat) OCL_INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MulSpectrums, testing::Combine(Bool(), Bool())); -OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(6, 1), cv::Size(5, 8), cv::Size(30, 20), +OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(6, 4), cv::Size(5, 8), cv::Size(6, 6), cv::Size(512, 1), cv::Size(1280, 768)), - Values((OCL_FFT_TYPE) R2C, (OCL_FFT_TYPE) C2C, (OCL_FFT_TYPE) R2R/*, (OCL_FFT_TYPE) C2R*/), + Values((OCL_FFT_TYPE) R2C, (OCL_FFT_TYPE) C2C, (OCL_FFT_TYPE) R2R, (OCL_FFT_TYPE) C2R), Bool(), // DFT_ROWS Bool() // inplace ) From b17bf031f696f1bc89b871466360d675f5a1b3fb Mon Sep 17 00:00:00 2001 From: Alexander Karsakov Date: Thu, 17 Jul 2014 16:20:04 +0400 Subject: [PATCH 06/13] Added DFT_SCALE for forward transforms --- modules/core/src/dxt.cpp | 13 ++++-- modules/core/src/opencl/fft.cl | 31 ++++++++++---- modules/core/test/ocl/test_dft.cpp | 22 +++++----- samples/cpp/dft.cpp | 69 +++--------------------------- 4 files changed, 49 insertions(+), 86 deletions(-) diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp index 69ec2c9efe..449e19db4b 100644 --- a/modules/core/src/dxt.cpp +++ b/modules/core/src/dxt.cpp @@ -2151,27 +2151,34 @@ struct OCL_FftPlan size_t localsize[2]; String kernel_name; + bool is1d = (flags & DFT_ROWS) != 0 || dft_size == 1; + String options = buildOptions; + if (rows) { globalsize[0] = thread_count; globalsize[1] = dft_size; localsize[0] = thread_count; localsize[1] = 1; kernel_name = "fft_multi_radix_rows"; + if (is1d && (flags & DFT_SCALE)) + options += " -D DFT_SCALE"; } else { globalsize[0] = dft_size; globalsize[1] = thread_count; localsize[0] = 1; localsize[1] = thread_count; kernel_name = "fft_multi_radix_cols"; + if (flags & DFT_SCALE) + options += " -D DFT_SCALE"; } - - bool is1d = (flags & DFT_ROWS) != 0 || dft_size == 1; - String options = buildOptions; + if (src.channels() == 1) options += " -D REAL_INPUT"; if (dst.channels() == 1) options += " -D CCS_OUTPUT"; if ((is1d && src.channels() == 1) || (rows && (flags & DFT_REAL_OUTPUT))) options += " -D NO_CONJUGATE"; + if (is1d) + options += " -D IS_1D"; ocl::Kernel k(kernel_name.c_str(), ocl::core::fft_oclsrc, options); if (k.empty()) diff --git a/modules/core/src/opencl/fft.cl b/modules/core/src/opencl/fft.cl index a778d59f22..d59e0d9b48 100644 --- a/modules/core/src/opencl/fft.cl +++ b/modules/core/src/opencl/fft.cl @@ -301,6 +301,12 @@ void fft_radix5(__local float2* smem, __constant const float2* twiddles, const i barrier(CLK_LOCAL_MEM_FENCE); } +#ifdef DFT_SCALE +#define VAL(x, scale) x*scale +#else +#define VAL(x, scale) x +#endif + __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols, __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, __constant float2 * twiddles_ptr, const int t, const int nz) @@ -314,6 +320,11 @@ __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, __constant const float2* twiddles = (__constant float2*) twiddles_ptr; const int ind = x; const int block_size = LOCAL_SIZE/kercn; +#ifdef IS_1D + float scale = 1.f/dst_cols; +#else + float scale = 1.f/(dst_cols*dst_rows); +#endif #ifndef REAL_INPUT __global const float2* src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset))); @@ -341,15 +352,15 @@ __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, __global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, dst_offset)); #pragma unroll for (int i=x; i -#include -#include using namespace cv; using namespace std; @@ -26,31 +24,6 @@ const char* keys = int main(int argc, const char ** argv) { - //int cols = 4; - //int rows = 768; - //srand(0); - //Mat input(Size(cols, rows), CV_32FC2); - //for (int i=0; i(j,i) = Vec2f((float) rand() / RAND_MAX, (float) rand() / RAND_MAX); - //Mat dst; - // - //UMat gpu_input, gpu_dst; - //input.copyTo(gpu_input); - //auto start = std::chrono::system_clock::now(); - //dft(input, dst, DFT_ROWS); - //auto cpu_duration = chrono::duration_cast(chrono::system_clock::now() - start); - // - //start = std::chrono::system_clock::now(); - //dft(gpu_input, gpu_dst, DFT_ROWS); - //auto gpu_duration = chrono::duration_cast(chrono::system_clock::now() - start); - - //double n = norm(dst, gpu_dst); - //cout << "norm = " << n << endl; - //cout << "CPU time: " << cpu_duration.count() << "ms" << endl; - //cout << "GPU time: " << gpu_duration.count() << "ms" << endl; - - help(); CommandLineParser parser(argc, argv, keys); string filename = parser.get(0); @@ -62,46 +35,16 @@ int main(int argc, const char ** argv) printf("Cannot read image file: %s\n", filename.c_str()); return -1; } - - Mat small_img = img(Rect(0,0,6,6)); - - int M = getOptimalDFTSize( small_img.rows ); - int N = getOptimalDFTSize( small_img.cols ); + int M = getOptimalDFTSize( img.rows ); + int N = getOptimalDFTSize( img.cols ); Mat padded; - copyMakeBorder(small_img, padded, 0, M - small_img.rows, 0, N - small_img.cols, BORDER_CONSTANT, Scalar::all(0)); + copyMakeBorder(img, padded, 0, M - img.rows, 0, N - img.cols, BORDER_CONSTANT, Scalar::all(0)); - Mat planes[] = {Mat_(padded), Mat::ones(padded.size(), CV_32F)}; - Mat complexImg, complexImg1, complexInput; + Mat planes[] = {Mat_(padded), Mat::zeros(padded.size(), CV_32F)}; + Mat complexImg; merge(planes, 2, complexImg); - Mat realInput; - padded.convertTo(realInput, CV_32F); - complexInput = complexImg; - //cout << complexImg << endl; - //dft(complexImg, complexImg, DFT_REAL_OUTPUT); - //cout << "Complex to Complex" << endl; - //cout << complexImg << endl; - cout << "Complex input" << endl << complexInput << endl; - cout << "Real input" << endl << realInput << endl; - - dft(complexInput, complexImg1, DFT_COMPLEX_OUTPUT); - cout << "Complex to Complex image: " << endl; - cout << endl << complexImg1 << endl; - - Mat realImg1; - dft(complexInput, realImg1, DFT_REAL_OUTPUT); - cout << "Complex to Real image: " << endl; - cout << endl << realImg1 << endl; - - Mat realOut; - dft(complexImg1, realOut, DFT_INVERSE | DFT_COMPLEX_OUTPUT); - cout << "Complex to Complex (inverse):" << endl; - cout << realOut << endl; - - Mat complexOut; - dft(realImg1, complexOut, DFT_INVERSE | DFT_REAL_OUTPUT | DFT_SCALE); - cout << "Complex to Real (inverse):" << endl; - cout << complexOut << endl; + dft(complexImg, complexImg); // compute log(1 + sqrt(Re(DFT(img))**2 + Im(DFT(img))**2)) split(complexImg, planes); From 2b9e5560556541b5331e41413be17c0cca98af52 Mon Sep 17 00:00:00 2001 From: Alexander Karsakov Date: Fri, 18 Jul 2014 13:41:57 +0400 Subject: [PATCH 07/13] Added Elena's changes with implemented DFT_INVERSE C2C mode. --- modules/core/perf/opencl/perf_dxt.cpp | 2 +- modules/core/src/dxt.cpp | 40 +++------ modules/core/src/opencl/fft.cl | 113 ++++++++++++++++++++++++++ modules/core/test/ocl/test_dft.cpp | 11 +-- 4 files changed, 131 insertions(+), 35 deletions(-) diff --git a/modules/core/perf/opencl/perf_dxt.cpp b/modules/core/perf/opencl/perf_dxt.cpp index edeeda7f0c..3980a191fa 100644 --- a/modules/core/perf/opencl/perf_dxt.cpp +++ b/modules/core/perf/opencl/perf_dxt.cpp @@ -67,7 +67,7 @@ typedef TestBaseWithParam DftFixture; OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(C2C, R2R, C2R, R2C), Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3, Size(1024, 1024), Size(512, 512), Size(2048, 2048)), - Values((int)DFT_ROWS, (int) 0/*, (int)DFT_SCALE, (int)DFT_INVERSE, + Values((int)DFT_ROWS, (int) 0, (int)DFT_SCALE/*, (int)DFT_INVERSE, (int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE*/))) { const DftParams params = GetParam(); diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp index 449e19db4b..879a70613f 100644 --- a/modules/core/src/dxt.cpp +++ b/modules/core/src/dxt.cpp @@ -2129,8 +2129,8 @@ struct OCL_FftPlan for (int k=0; k<(n/radix); k++) { - ptr[ptr_index++] = cos(k*theta); - ptr[ptr_index++] = sin(k*theta); + ptr[ptr_index++] = (float) cos(k*theta); + ptr[ptr_index++] = (float) sin(k*theta); } } } @@ -2152,13 +2152,14 @@ struct OCL_FftPlan String kernel_name; bool is1d = (flags & DFT_ROWS) != 0 || dft_size == 1; + bool inv = (flags & DFT_INVERSE) != 0; String options = buildOptions; if (rows) { globalsize[0] = thread_count; globalsize[1] = dft_size; localsize[0] = thread_count; localsize[1] = 1; - kernel_name = "fft_multi_radix_rows"; + kernel_name = !inv ? "fft_multi_radix_rows" : "ifft_multi_radix_rows"; if (is1d && (flags & DFT_SCALE)) options += " -D DFT_SCALE"; } @@ -2166,7 +2167,7 @@ struct OCL_FftPlan { globalsize[0] = dft_size; globalsize[1] = thread_count; localsize[0] = 1; localsize[1] = thread_count; - kernel_name = "fft_multi_radix_cols"; + kernel_name = !inv ? "fft_multi_radix_cols" : "ifft_multi_radix_cols"; if (flags & DFT_SCALE) options += " -D DFT_SCALE"; } @@ -2270,13 +2271,10 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro // if output format is not specified if (complex_output + real_output == 0) { - if (!inv) - { - if (real_input) - real_output = 1; - else - complex_output = 1; - } + if (real_input) + real_output = 1; + else + complex_output = 1; } // Forward Complex to CCS not supported @@ -2294,23 +2292,7 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro real_output = 1; } - UMat input, output; - if (complex_input) - { - input = src; - } - else - { - if (!inv) - { - input = src; - } - else - { - // TODO: unpack from CCS format - } - } - + UMat output; if (complex_output) { _dst.create(src.size(), CV_32FC2); @@ -2330,7 +2312,7 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro } } - if (!ocl_dft_C2C_rows(input, output, nonzero_rows, flags)) + if (!ocl_dft_C2C_rows(src, output, nonzero_rows, flags)) return false; if (!is1d) diff --git a/modules/core/src/opencl/fft.cl b/modules/core/src/opencl/fft.cl index d59e0d9b48..8aecfc056a 100644 --- a/modules/core/src/opencl/fft.cl +++ b/modules/core/src/opencl/fft.cl @@ -424,4 +424,117 @@ __kernel void fft_multi_radix_cols(__global const uchar* src_ptr, int src_step, } #endif } +} + +__kernel void ifft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols, + __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, + __constant float2 * twiddles_ptr, const int t, const int nz) +{ + const int x = get_global_id(0); + const int y = get_group_id(1); + + if (y < nz) + { + __local float2 smem[LOCAL_SIZE]; + __constant const float2* twiddles = (__constant float2*) twiddles_ptr; + const int ind = x; + const int block_size = LOCAL_SIZE/kercn; +#ifdef IS_1D + float scale = 1.f/dst_cols; +#else + float scale = 1.f/(dst_cols*dst_rows); +#endif + +#ifndef REAL_INPUT + __global const float2* src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset))); + #pragma unroll + for (int i=0; i Date: Mon, 21 Jul 2014 10:42:56 +0400 Subject: [PATCH 08/13] Added multi-block scheme --- modules/core/perf/opencl/perf_dxt.cpp | 6 +- modules/core/src/dxt.cpp | 49 +++-- modules/core/src/opencl/fft.cl | 282 +++++++++++++++++++++++++- modules/core/test/ocl/test_dft.cpp | 14 +- 4 files changed, 321 insertions(+), 30 deletions(-) diff --git a/modules/core/perf/opencl/perf_dxt.cpp b/modules/core/perf/opencl/perf_dxt.cpp index 3980a191fa..f4b6b49a9f 100644 --- a/modules/core/perf/opencl/perf_dxt.cpp +++ b/modules/core/perf/opencl/perf_dxt.cpp @@ -65,10 +65,10 @@ enum OCL_FFT_TYPE typedef tuple DftParams; typedef TestBaseWithParam DftFixture; -OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(C2C, R2R, C2R, R2C), +OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(C2C/*, R2R, C2R, R2C*/), Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3, Size(1024, 1024), Size(512, 512), Size(2048, 2048)), - Values((int)DFT_ROWS, (int) 0, (int)DFT_SCALE/*, (int)DFT_INVERSE, - (int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE*/))) + Values((int) 0, (int)DFT_ROWS, (int)DFT_SCALE, (int)DFT_INVERSE, + /*(int)DFT_INVERSE | DFT_SCALE,*/ (int)DFT_ROWS | DFT_INVERSE))) { const DftParams params = GetParam(); const int dft_type = get<0>(params); diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp index 879a70613f..d5b1cb383d 100644 --- a/modules/core/src/dxt.cpp +++ b/modules/core/src/dxt.cpp @@ -2041,23 +2041,33 @@ static std::vector ocl_getRadixes(int cols, std::vector& radixes, std: int n = 1; int factor_index = 0; + min_radix = INT_MAX; // 2^n transforms if ( (factors[factor_index] & 1) == 0 ) { for( ; n < factors[factor_index]; ) { - int radix = 2; + int radix = 2, block = 1; if (8*n <= factors[0]) radix = 8; else if (4*n <= factors[0]) + { radix = 4; + if (cols % 8 == 0) + block = 2; + } + else + { + if (cols % 8 == 0) + block = 4; + else if (cols % 4 == 0) + block = 2; + } radixes.push_back(radix); - if (radix == 2 && cols % 4 == 0) - min_radix = min(min_radix, 2*radix); - else - min_radix = min(min_radix, radix); + blocks.push_back(block); + min_radix = min(min_radix, block*radix); n *= radix; } factor_index++; @@ -2066,11 +2076,22 @@ static std::vector ocl_getRadixes(int cols, std::vector& radixes, std: // all the other transforms for( ; factor_index < nf; factor_index++ ) { - radixes.push_back(factors[factor_index]); - if (factors[factor_index] == 3 && cols % 6 == 0) - min_radix = min(min_radix, 2*factors[factor_index]); - else - min_radix = min(min_radix, factors[factor_index]); + int radix = factors[factor_index], block = 1; + if (radix == 3) + { + if (cols % 12 == 0) + block = 4; + else if (cols % 6 == 0) + block = 2; + } + else if (radix == 5) + { + if (cols % 10 == 0) + block = 2; + } + radixes.push_back(radix); + blocks.push_back(block); + min_radix = min(min_radix, block*radix); } return radixes; } @@ -2086,7 +2107,7 @@ struct OCL_FftPlan bool status; OCL_FftPlan(int _size, int _flags): dft_size(_size), flags(_flags), status(true) { - int min_radix = INT_MAX; + int min_radix; std::vector radixes, blocks; ocl_getRadixes(dft_size, radixes, blocks, min_radix); thread_count = (dft_size + min_radix-1) / min_radix; @@ -2102,9 +2123,9 @@ struct OCL_FftPlan int n = 1, twiddle_size = 0; for (size_t i=0; i 1) + radix_processing += format("fft_radix%d_B%d(smem,twiddles+%d,ind,%d,%d);", radix, block, twiddle_size, n, dft_size/radix); else radix_processing += format("fft_radix%d(smem,twiddles+%d,ind,%d,%d);", radix, twiddle_size, n, dft_size/radix); twiddle_size += (radix-1)*n; diff --git a/modules/core/src/opencl/fft.cl b/modules/core/src/opencl/fft.cl index 8aecfc056a..fdbad19ad0 100644 --- a/modules/core/src/opencl/fft.cl +++ b/modules/core/src/opencl/fft.cl @@ -44,11 +44,11 @@ __attribute__((always_inline)) void fft_radix2_B2(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) { const int k1 = x & (block_size - 1); - const int x2 = x + (t+1)/2; + const int x2 = x + t/2; const int k2 = x2 & (block_size - 1); float2 a0, a1, a2, a3; - if (x < (t+1)/2) + if (x < t/2) { a0 = smem[x]; a1 = mul_float2(twiddles[k1],smem[x+t]); @@ -58,7 +58,7 @@ void fft_radix2_B2(__local float2* smem, __constant const float2* twiddles, cons barrier(CLK_LOCAL_MEM_FENCE); - if (x < (t+1)/2) + if (x < t/2) { int dst_ind = (x << 1) - k1; smem[dst_ind] = a0 + a1; @@ -72,6 +72,55 @@ void fft_radix2_B2(__local float2* smem, __constant const float2* twiddles, cons barrier(CLK_LOCAL_MEM_FENCE); } +__attribute__((always_inline)) +void fft_radix2_B4(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) +{ + const int thread_block = t/4; + const int k1 = x & (block_size - 1); + const int x2 = x + thread_block; + const int k2 = x2 & (block_size - 1); + const int x3 = x + 2*thread_block; + const int k3 = x3 & (block_size - 1); + const int x4 = x + 3*thread_block; + const int k4 = x4 & (block_size - 1); + float2 a0, a1, a2, a3, a4, a5, a6, a7; + + if (x < t/4) + { + a0 = smem[x]; + a1 = mul_float2(twiddles[k1],smem[x+t]); + a2 = smem[x2]; + a3 = mul_float2(twiddles[k2],smem[x2+t]); + a4 = smem[x3]; + a5 = mul_float2(twiddles[k3],smem[x3+t]); + a6 = smem[x4]; + a7 = mul_float2(twiddles[k4],smem[x4+t]); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x < t/4) + { + int dst_ind = (x << 1) - k1; + smem[dst_ind] = a0 + a1; + smem[dst_ind+block_size] = a0 - a1; + + dst_ind = (x2 << 1) - k2; + smem[dst_ind] = a2 + a3; + smem[dst_ind+block_size] = a2 - a3; + + dst_ind = (x3 << 1) - k3; + smem[dst_ind] = a4 + a5; + smem[dst_ind+block_size] = a4 - a5; + + dst_ind = (x4 << 1) - k4; + smem[dst_ind] = a6 + a7; + smem[dst_ind+block_size] = a6 - a7; + } + + barrier(CLK_LOCAL_MEM_FENCE); +} + __attribute__((always_inline)) void fft_radix4(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) { @@ -107,6 +156,58 @@ void fft_radix4(__local float2* smem, __constant const float2* twiddles, const i barrier(CLK_LOCAL_MEM_FENCE); } +__attribute__((always_inline)) +void fft_radix4_B2(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) +{ + const int k = x & (block_size - 1); + const int x2 = x + t/2; + const int k2 = x2 & (block_size - 1); + float2 a0, a1, a2, a3, a4, a5, a6, a7; + + if (x < t/2) + { + a0 = smem[x]; + a1 = mul_float2(twiddles[k], smem[x+t]); + a2 = mul_float2(twiddles[k + block_size],smem[x+2*t]); + a3 = mul_float2(twiddles[k + 2*block_size],smem[x+3*t]); + + a4 = smem[x2]; + a5 = mul_float2(twiddles[k2], smem[x2+t]); + a6 = mul_float2(twiddles[k2 + block_size],smem[x2+2*t]); + a7 = mul_float2(twiddles[k2 + 2*block_size],smem[x2+3*t]); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x < t/2) + { + int dst_ind = ((x - k) << 2) + k; + + float2 b0 = a0 + a2; + a2 = a0 - a2; + float2 b1 = a1 + a3; + a3 = twiddle(a1 - a3); + + smem[dst_ind] = b0 + b1; + smem[dst_ind + block_size] = a2 + a3; + smem[dst_ind + 2*block_size] = b0 - b1; + smem[dst_ind + 3*block_size] = a2 - a3; + + dst_ind = ((x2 - k2) << 2) + k2; + b0 = a4 + a6; + a6 = a4 - a6; + b1 = a5 + a7; + a7 = twiddle(a5 - a7); + + smem[dst_ind] = b0 + b1; + smem[dst_ind + block_size] = a6 + a7; + smem[dst_ind + 2*block_size] = b0 - b1; + smem[dst_ind + 3*block_size] = a6 - a7; + } + + barrier(CLK_LOCAL_MEM_FENCE); +} + __attribute__((always_inline)) void fft_radix8(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) { @@ -205,11 +306,11 @@ __attribute__((always_inline)) void fft_radix3_B2(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) { const int k = x % block_size; - const int x2 = x + (t+1)/2; + const int x2 = x + t/2; const int k2 = x2 % block_size; float2 a0, a1, a2, a3, a4, a5; - if (x < (t+1)/2) + if (x < t/2) { a0 = smem[x]; a1 = mul_float2(twiddles[k], smem[x+t]); @@ -222,7 +323,7 @@ void fft_radix3_B2(__local float2* smem, __constant const float2* twiddles, cons barrier(CLK_LOCAL_MEM_FENCE); - if (x < (t+1)/2) + if (x < t/2) { int dst_ind = ((x - k) * 3) + k; @@ -248,6 +349,86 @@ void fft_radix3_B2(__local float2* smem, __constant const float2* twiddles, cons barrier(CLK_LOCAL_MEM_FENCE); } +__attribute__((always_inline)) +void fft_radix3_B4(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) +{ + const int thread_block = t/4; + const int k = x % block_size; + const int x2 = x + thread_block; + const int k2 = x2 % block_size; + const int x3 = x + 2*thread_block; + const int k3 = x3 % block_size; + const int x4 = x + 3*thread_block; + const int k4 = x4 % block_size; + float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11; + + if (x < t/4) + { + a0 = smem[x]; + a1 = mul_float2(twiddles[k], smem[x+t]); + a2 = mul_float2(twiddles[k+block_size], smem[x+2*t]); + + a3 = smem[x2]; + a4 = mul_float2(twiddles[k2], smem[x2+t]); + a5 = mul_float2(twiddles[k2+block_size], smem[x2+2*t]); + + a6 = smem[x3]; + a7 = mul_float2(twiddles[k3], smem[x3+t]); + a8 = mul_float2(twiddles[k3+block_size], smem[x3+2*t]); + + a9 = smem[x4]; + a10 = mul_float2(twiddles[k4], smem[x4+t]); + a11 = mul_float2(twiddles[k4+block_size], smem[x4+2*t]); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x < t/4) + { + int dst_ind = ((x - k) * 3) + k; + + float2 b1 = a1 + a2; + a2 = twiddle(sin_120*(a1 - a2)); + float2 b0 = a0 - (float2)(0.5f)*b1; + + smem[dst_ind] = a0 + b1; + smem[dst_ind + block_size] = b0 + a2; + smem[dst_ind + 2*block_size] = b0 - a2; + + dst_ind = ((x2 - k2) * 3) + k2; + + b1 = a4 + a5; + a5 = twiddle(sin_120*(a4 - a5)); + b0 = a3 - (float2)(0.5f)*b1; + + smem[dst_ind] = a3 + b1; + smem[dst_ind + block_size] = b0 + a5; + smem[dst_ind + 2*block_size] = b0 - a5; + + dst_ind = ((x3 - k3) * 3) + k3; + + b1 = a7 + a8; + a8 = twiddle(sin_120*(a7 - a8)); + b0 = a6 - (float2)(0.5f)*b1; + + smem[dst_ind] = a6 + b1; + smem[dst_ind + block_size] = b0 + a8; + smem[dst_ind + 2*block_size] = b0 - a8; + + dst_ind = ((x4 - k4) * 3) + k4; + + b1 = a10 + a11; + a11 = twiddle(sin_120*(a10 - a11)); + b0 = a9 - (float2)(0.5f)*b1; + + smem[dst_ind] = a9 + b1; + smem[dst_ind + block_size] = b0 + a11; + smem[dst_ind + 2*block_size] = b0 - a11; + } + + barrier(CLK_LOCAL_MEM_FENCE); +} + __attribute__((always_inline)) void fft_radix5(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) { @@ -301,6 +482,95 @@ void fft_radix5(__local float2* smem, __constant const float2* twiddles, const i barrier(CLK_LOCAL_MEM_FENCE); } +__attribute__((always_inline)) +void fft_radix5_B2(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) +{ + const int k = x % block_size; + const int x2 = x+t/2; + const int k2 = x2 % block_size; + float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9; + + if (x < t/2) + { + a0 = smem[x]; + a1 = mul_float2(twiddles[k], smem[x + t]); + a2 = mul_float2(twiddles[k + block_size],smem[x+2*t]); + a3 = mul_float2(twiddles[k+2*block_size],smem[x+3*t]); + a4 = mul_float2(twiddles[k+3*block_size],smem[x+4*t]); + + a5 = smem[x2]; + a6 = mul_float2(twiddles[k2], smem[x2 + t]); + a7 = mul_float2(twiddles[k2 + block_size],smem[x2+2*t]); + a8 = mul_float2(twiddles[k2+2*block_size],smem[x2+3*t]); + a9 = mul_float2(twiddles[k2+3*block_size],smem[x2+4*t]); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x < t/2) + { + int dst_ind = ((x - k) * 5) + k; + __local float2* dst = smem + dst_ind; + + float2 b0, b1, b5; + + b1 = a1 + a4; + a1 -= a4; + + a4 = a3 + a2; + a3 -= a2; + + a2 = b1 + a4; + b0 = a0 - (float2)0.25f * a2; + + b1 = fft5_2 * (b1 - a4); + a4 = fft5_3 * (float2)(-a1.y - a3.y, a1.x + a3.x); + b5 = (float2)(a4.x - fft5_5 * a1.y, a4.y + fft5_5 * a1.x); + + a4.x += fft5_4 * a3.y; + a4.y -= fft5_4 * a3.x; + + a1 = b0 + b1; + b0 -= b1; + + dst[0] = a0 + a2; + dst[block_size] = a1 + a4; + dst[2 * block_size] = b0 + b5; + dst[3 * block_size] = b0 - b5; + dst[4 * block_size] = a1 - a4; + + dst_ind = ((x2 - k2) * 5) + k2; + dst = smem + dst_ind; + + b1 = a6 + a9; + a6 -= a9; + + a9 = a8 + a7; + a8 -= a7; + + a7 = b1 + a9; + b0 = a5 - (float2)0.25f * a7; + + b1 = fft5_2 * (b1 - a9); + a9 = fft5_3 * (float2)(-a6.y - a8.y, a6.x + a8.x); + b5 = (float2)(a9.x - fft5_5 * a6.y, a9.y + fft5_5 * a6.x); + + a9.x += fft5_4 * a8.y; + a9.y -= fft5_4 * a8.x; + + a6 = b0 + b1; + b0 -= b1; + + dst[0] = a5 + a7; + dst[block_size] = a6 + a9; + dst[2 * block_size] = b0 + b5; + dst[3 * block_size] = b0 - b5; + dst[4 * block_size] = a6 - a9; + } + + barrier(CLK_LOCAL_MEM_FENCE); +} + #ifdef DFT_SCALE #define VAL(x, scale) x*scale #else diff --git a/modules/core/test/ocl/test_dft.cpp b/modules/core/test/ocl/test_dft.cpp index 64f6c63843..1fa03ae472 100644 --- a/modules/core/test/ocl/test_dft.cpp +++ b/modules/core/test/ocl/test_dft.cpp @@ -62,7 +62,7 @@ namespace ocl { //////////////////////////////////////////////////////////////////////////// // Dft -PARAM_TEST_CASE(Dft, cv::Size, OCL_FFT_TYPE, bool, bool, bool, bool) +PARAM_TEST_CASE(Dft, cv::Size, OCL_FFT_TYPE, bool, bool, bool) { cv::Size dft_size; int dft_flags, depth, cn, dft_type; @@ -91,9 +91,9 @@ PARAM_TEST_CASE(Dft, cv::Size, OCL_FFT_TYPE, bool, bool, bool, bool) dft_flags |= cv::DFT_ROWS; if (GET_PARAM(3)) dft_flags |= cv::DFT_SCALE; - if (GET_PARAM(4)) - dft_flags |= cv::DFT_INVERSE; - inplace = GET_PARAM(5); + /*if (GET_PARAM(4)) + dft_flags |= cv::DFT_INVERSE;*/ + inplace = GET_PARAM(4); is1d = (dft_flags & DFT_ROWS) != 0 || dft_size.height == 1; @@ -188,12 +188,12 @@ OCL_TEST_P(MulSpectrums, Mat) OCL_INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MulSpectrums, testing::Combine(Bool(), Bool())); -OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(6, 4), cv::Size(5, 8), cv::Size(6, 6), +OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(16, 4), cv::Size(5, 8), cv::Size(6, 6), cv::Size(512, 1), cv::Size(1280, 768)), - Values(/*(OCL_FFT_TYPE) R2C, */(OCL_FFT_TYPE) C2C/*, (OCL_FFT_TYPE) R2R, (OCL_FFT_TYPE) C2R*/), + Values((OCL_FFT_TYPE) R2C, (OCL_FFT_TYPE) C2C, (OCL_FFT_TYPE) R2R, (OCL_FFT_TYPE) C2R), Bool(), // DFT_ROWS Bool(), // DFT_SCALE - Bool(), // DFT_INVERSE + //Bool(), // DFT_INVERSE Bool() // inplace ) ); From 52f76a32838019b4eb8e3f889dd25623a5983c74 Mon Sep 17 00:00:00 2001 From: Alexander Karsakov Date: Tue, 22 Jul 2014 11:24:19 +0400 Subject: [PATCH 09/13] Added rest Elena's changes --- modules/core/perf/opencl/perf_dxt.cpp | 6 +- modules/core/src/dxt.cpp | 131 +++-- modules/core/src/opencl/fft.cl | 692 +++++++++++++------------- modules/core/test/ocl/test_dft.cpp | 32 +- 4 files changed, 457 insertions(+), 404 deletions(-) diff --git a/modules/core/perf/opencl/perf_dxt.cpp b/modules/core/perf/opencl/perf_dxt.cpp index f4b6b49a9f..797b2c5334 100644 --- a/modules/core/perf/opencl/perf_dxt.cpp +++ b/modules/core/perf/opencl/perf_dxt.cpp @@ -65,10 +65,10 @@ enum OCL_FFT_TYPE typedef tuple DftParams; typedef TestBaseWithParam DftFixture; -OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(C2C/*, R2R, C2R, R2C*/), +OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(C2C, R2R, C2R, R2C), Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3, Size(1024, 1024), Size(512, 512), Size(2048, 2048)), - Values((int) 0, (int)DFT_ROWS, (int)DFT_SCALE, (int)DFT_INVERSE, - /*(int)DFT_INVERSE | DFT_SCALE,*/ (int)DFT_ROWS | DFT_INVERSE))) + Values((int) 0, (int)DFT_ROWS, (int)DFT_SCALE/*, (int)DFT_INVERSE, + (int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE*/))) { const DftParams params = GetParam(); const int dft_type = get<0>(params); diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp index d5b1cb383d..eaef53ad23 100644 --- a/modules/core/src/dxt.cpp +++ b/modules/core/src/dxt.cpp @@ -1791,14 +1791,6 @@ namespace cv { CV_Assert(s == CLFFT_SUCCESS); \ } -enum FftType -{ - R2R = 0, // real to real - C2R = 1, // opencl HERMITIAN_INTERLEAVED to real - R2C = 2, // real to opencl HERMITIAN_INTERLEAVED - C2C = 3 // complex to complex -}; - class PlanCache { struct FftPlan @@ -2034,6 +2026,14 @@ namespace cv #ifdef HAVE_OPENCL +enum FftType +{ + R2R = 0, + C2R = 1, + R2C = 2, + C2C = 3 +}; + static std::vector ocl_getRadixes(int cols, std::vector& radixes, std::vector& blocks, int& min_radix) { int factors[34]; @@ -2054,13 +2054,19 @@ static std::vector ocl_getRadixes(int cols, std::vector& radixes, std: else if (4*n <= factors[0]) { radix = 4; - if (cols % 8 == 0) + if (cols % 12 == 0) + block = 3; + else if (cols % 8 == 0) block = 2; } else { - if (cols % 8 == 0) + if (cols % 10 == 0) + block = 5; + else if (cols % 8 == 0) block = 4; + else if (cols % 6 == 0) + block = 3; else if (cols % 4 == 0) block = 2; } @@ -2081,6 +2087,8 @@ static std::vector ocl_getRadixes(int cols, std::vector& radixes, std: { if (cols % 12 == 0) block = 4; + else if (cols % 9 == 0) + block = 3; else if (cols % 6 == 0) block = 2; } @@ -2142,7 +2150,6 @@ struct OCL_FftPlan { int radix = radixes[i]; n *= radix; - for (int j=1; j planStorage; }; -static bool ocl_dft_C2C_rows(InputArray _src, OutputArray _dst, int nonzero_rows, int flags) +static bool ocl_dft_C2C_rows(InputArray _src, OutputArray _dst, int nonzero_rows, int flags, int fftType) { const OCL_FftPlan* plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.cols(), flags); - return plan->enqueueTransform(_src, _dst, nonzero_rows, flags, true); + return plan->enqueueTransform(_src, _dst, nonzero_rows, flags, fftType, true); } -static bool ocl_dft_C2C_cols(InputArray _src, OutputArray _dst, int nonzero_cols, int flags) +static bool ocl_dft_C2C_cols(InputArray _src, OutputArray _dst, int nonzero_cols, int flags, int fftType) { const OCL_FftPlan* plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.rows(), flags); - return plan->enqueueTransform(_src, _dst, nonzero_cols, flags, false); + return plan->enqueueTransform(_src, _dst, nonzero_cols, flags, fftType, false); } static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_rows) @@ -2298,29 +2318,26 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro complex_output = 1; } + FftType fftType = (FftType)(complex_input << 0 | complex_output << 1); + // Forward Complex to CCS not supported - if (complex_input && real_output && !inv) - { - flags ^= DFT_REAL_OUTPUT; - flags |= DFT_COMPLEX_OUTPUT; - real_output = 0; - complex_output = 1; - } + if (fftType == C2R && !inv) + fftType = C2C; + // Inverse CCS to Complex not supported - if (real_input && complex_output && inv) - { - complex_output = 0; - real_output = 1; - } + if (fftType == R2C && inv) + fftType = R2R; UMat output; - if (complex_output) + if (fftType == C2C || fftType == R2C) { + // complex output _dst.create(src.size(), CV_32FC2); output = _dst.getUMat(); - } + } else { + // real output if (is1d) { _dst.create(src.size(), CV_32FC1); @@ -2333,17 +2350,49 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro } } - if (!ocl_dft_C2C_rows(src, output, nonzero_rows, flags)) - return false; - - if (!is1d) + if (!inv) { - int nonzero_cols = real_input && real_output ? output.cols/2 + 1 : output.cols; - if (!ocl_dft_C2C_cols(output, _dst, nonzero_cols, flags)) + if (!ocl_dft_C2C_rows(src, output, nonzero_rows, flags, fftType)) return false; - } else + + if (!is1d) + { + int nonzero_cols = fftType == R2R ? output.cols/2 + 1 : output.cols; + if (!ocl_dft_C2C_cols(output, _dst, nonzero_cols, flags, fftType)) + return false; + } + } + else { - _dst.assign(output); + if (fftType == C2C) + { + // complex output + if (!ocl_dft_C2C_rows(src, output, nonzero_rows, flags, fftType)) + return false; + + if (!is1d) + { + if (!ocl_dft_C2C_cols(output, output, output.cols, flags, fftType)) + return false; + } + } + else + { + if (is1d) + { + if (!ocl_dft_C2C_rows(src, output, nonzero_rows, flags, fftType)) + return false; + } + else + { + int nonzero_cols = src.cols/2 + 1;// : src.cols; + if (!ocl_dft_C2C_cols(src, output, nonzero_cols, flags, fftType)) + return false; + + if (!ocl_dft_C2C_rows(output, _dst, nonzero_rows, flags, fftType)) + return false; + } + } } return true; } diff --git a/modules/core/src/opencl/fft.cl b/modules/core/src/opencl/fft.cl index fdbad19ad0..dd8ff59850 100644 --- a/modules/core/src/opencl/fft.cl +++ b/modules/core/src/opencl/fft.cl @@ -16,106 +16,224 @@ float2 twiddle(float2 a) { } __attribute__((always_inline)) -void fft_radix2(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) +void butterfly2(float2 a0, float2 a1, __local float2* smem, __constant const float2* twiddles, + const int x, const int block_size) +{ + const int k = x & (block_size - 1); + a1 = mul_float2(twiddles[k], a1); + const int dst_ind = (x << 1) - k; + + smem[dst_ind] = a0 + a1; + smem[dst_ind+block_size] = a0 - a1; +} + +__attribute__((always_inline)) +void butterfly4(float2 a0, float2 a1, float2 a2, float2 a3, __local float2* smem, __constant const float2* twiddles, + const int x, const int block_size) { const int k = x & (block_size - 1); + a1 = mul_float2(twiddles[k], a1); + a2 = mul_float2(twiddles[k + block_size], a2); + a3 = mul_float2(twiddles[k + 2*block_size], a3); + + const int dst_ind = ((x - k) << 2) + k; + + float2 b0 = a0 + a2; + a2 = a0 - a2; + float2 b1 = a1 + a3; + a3 = twiddle(a1 - a3); + + smem[dst_ind] = b0 + b1; + smem[dst_ind + block_size] = a2 + a3; + smem[dst_ind + 2*block_size] = b0 - b1; + smem[dst_ind + 3*block_size] = a2 - a3; +} + +__attribute__((always_inline)) +void butterfly3(float2 a0, float2 a1, float2 a2, __local float2* smem, __constant const float2* twiddles, + const int x, const int block_size) +{ + const int k = x & (block_size - 1); + a1 = mul_float2(twiddles[k], a1); + a2 = mul_float2(twiddles[k+block_size], a2); + const int dst_ind = ((x - k) * 3) + k; + + float2 b1 = a1 + a2; + a2 = twiddle(sin_120*(a1 - a2)); + float2 b0 = a0 - (float2)(0.5f)*b1; + + smem[dst_ind] = a0 + b1; + smem[dst_ind + block_size] = b0 + a2; + smem[dst_ind + 2*block_size] = b0 - a2; +} + +__attribute__((always_inline)) +void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local float2* smem, __constant const float2* twiddles, + const int x, const int block_size) +{ + const int k = x & (block_size - 1); + a1 = mul_float2(twiddles[k], a1); + a2 = mul_float2(twiddles[k + block_size], a2); + a3 = mul_float2(twiddles[k+2*block_size], a3); + a4 = mul_float2(twiddles[k+3*block_size], a4); + + const int dst_ind = ((x - k) * 5) + k; + __local float2* dst = smem + dst_ind; + + float2 b0, b1, b5; + + b1 = a1 + a4; + a1 -= a4; + + a4 = a3 + a2; + a3 -= a2; + + a2 = b1 + a4; + b0 = a0 - (float2)0.25f * a2; + + b1 = fft5_2 * (b1 - a4); + a4 = fft5_3 * (float2)(-a1.y - a3.y, a1.x + a3.x); + b5 = (float2)(a4.x - fft5_5 * a1.y, a4.y + fft5_5 * a1.x); + + a4.x += fft5_4 * a3.y; + a4.y -= fft5_4 * a3.x; + + a1 = b0 + b1; + b0 -= b1; + + dst[0] = a0 + a2; + dst[block_size] = a1 + a4; + dst[2 * block_size] = b0 + b5; + dst[3 * block_size] = b0 - b5; + dst[4 * block_size] = a1 - a4; +} + +__attribute__((always_inline)) +void fft_radix2(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) +{ float2 a0, a1; if (x < t) { a0 = smem[x]; - a1 = mul_float2(twiddles[k],smem[x+t]); + a1 = smem[x+t]; } barrier(CLK_LOCAL_MEM_FENCE); if (x < t) - { - const int dst_ind = (x << 1) - k; - - smem[dst_ind] = a0 + a1; - smem[dst_ind+block_size] = a0 - a1; - } + butterfly2(a0, a1, smem, twiddles, x, block_size); barrier(CLK_LOCAL_MEM_FENCE); } __attribute__((always_inline)) -void fft_radix2_B2(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) +void fft_radix2_B2(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) { - const int k1 = x & (block_size - 1); - const int x2 = x + t/2; - const int k2 = x2 & (block_size - 1); + const int x2 = x1 + t/2; float2 a0, a1, a2, a3; - if (x < t/2) + if (x1 < t/2) { - a0 = smem[x]; - a1 = mul_float2(twiddles[k1],smem[x+t]); - a2 = smem[x2]; - a3 = mul_float2(twiddles[k2],smem[x2+t]); + a0 = smem[x1]; a1 = smem[x1+t]; + a2 = smem[x2]; a3 = smem[x2+t]; } barrier(CLK_LOCAL_MEM_FENCE); - if (x < t/2) + if (x1 < t/2) { - int dst_ind = (x << 1) - k1; - smem[dst_ind] = a0 + a1; - smem[dst_ind+block_size] = a0 - a1; + butterfly2(a0, a1, smem, twiddles, x1, block_size); + butterfly2(a2, a3, smem, twiddles, x2, block_size); + } - dst_ind = (x2 << 1) - k2; - smem[dst_ind] = a2 + a3; - smem[dst_ind+block_size] = a2 - a3; + barrier(CLK_LOCAL_MEM_FENCE); +} + +__attribute__((always_inline)) +void fft_radix2_B3(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) +{ + const int x2 = x1 + t/3; + const int x3 = x1 + 2*t/3; + float2 a0, a1, a2, a3, a4, a5; + + if (x1 < t/3) + { + a0 = smem[x1]; a1 = smem[x1+t]; + a2 = smem[x2]; a3 = smem[x2+t]; + a4 = smem[x3]; a5 = smem[x3+t]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x1 < t/3) + { + butterfly2(a0, a1, smem, twiddles, x1, block_size); + butterfly2(a2, a3, smem, twiddles, x2, block_size); + butterfly2(a4, a5, smem, twiddles, x3, block_size); } barrier(CLK_LOCAL_MEM_FENCE); } __attribute__((always_inline)) -void fft_radix2_B4(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) +void fft_radix2_B4(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) { const int thread_block = t/4; - const int k1 = x & (block_size - 1); - const int x2 = x + thread_block; - const int k2 = x2 & (block_size - 1); - const int x3 = x + 2*thread_block; - const int k3 = x3 & (block_size - 1); - const int x4 = x + 3*thread_block; - const int k4 = x4 & (block_size - 1); + const int x2 = x1 + thread_block; + const int x3 = x1 + 2*thread_block; + const int x4 = x1 + 3*thread_block; float2 a0, a1, a2, a3, a4, a5, a6, a7; - if (x < t/4) + if (x1 < t/4) { - a0 = smem[x]; - a1 = mul_float2(twiddles[k1],smem[x+t]); - a2 = smem[x2]; - a3 = mul_float2(twiddles[k2],smem[x2+t]); - a4 = smem[x3]; - a5 = mul_float2(twiddles[k3],smem[x3+t]); - a6 = smem[x4]; - a7 = mul_float2(twiddles[k4],smem[x4+t]); + a0 = smem[x1]; a1 = smem[x1+t]; + a2 = smem[x2]; a3 = smem[x2+t]; + a4 = smem[x3]; a5 = smem[x3+t]; + a6 = smem[x4]; a7 = smem[x4+t]; } barrier(CLK_LOCAL_MEM_FENCE); - if (x < t/4) + if (x1 < t/4) { - int dst_ind = (x << 1) - k1; - smem[dst_ind] = a0 + a1; - smem[dst_ind+block_size] = a0 - a1; + butterfly2(a0, a1, smem, twiddles, x1, block_size); + butterfly2(a2, a3, smem, twiddles, x2, block_size); + butterfly2(a4, a5, smem, twiddles, x3, block_size); + butterfly2(a6, a7, smem, twiddles, x4, block_size); + } - dst_ind = (x2 << 1) - k2; - smem[dst_ind] = a2 + a3; - smem[dst_ind+block_size] = a2 - a3; + barrier(CLK_LOCAL_MEM_FENCE); +} - dst_ind = (x3 << 1) - k3; - smem[dst_ind] = a4 + a5; - smem[dst_ind+block_size] = a4 - a5; +__attribute__((always_inline)) +void fft_radix2_B5(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) +{ + const int thread_block = t/5; + const int x2 = x1 + thread_block; + const int x3 = x1 + 2*thread_block; + const int x4 = x1 + 3*thread_block; + const int x5 = x1 + 4*thread_block; + float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9; - dst_ind = (x4 << 1) - k4; - smem[dst_ind] = a6 + a7; - smem[dst_ind+block_size] = a6 - a7; + if (x1 < t/5) + { + a0 = smem[x1]; a1 = smem[x1+t]; + a2 = smem[x2]; a3 = smem[x2+t]; + a4 = smem[x3]; a5 = smem[x3+t]; + a6 = smem[x4]; a7 = smem[x4+t]; + a8 = smem[x5]; a9 = smem[x5+t]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (x1 < t/5) + { + butterfly2(a0, a1, smem, twiddles, x1, block_size); + butterfly2(a2, a3, smem, twiddles, x2, block_size); + butterfly2(a4, a5, smem, twiddles, x3, block_size); + butterfly2(a6, a7, smem, twiddles, x4, block_size); + butterfly2(a8, a9, smem, twiddles, x5, block_size); } barrier(CLK_LOCAL_MEM_FENCE); @@ -124,85 +242,65 @@ void fft_radix2_B4(__local float2* smem, __constant const float2* twiddles, cons __attribute__((always_inline)) void fft_radix4(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) { - const int k = x & (block_size - 1); float2 a0, a1, a2, a3; if (x < t) { - const int twiddle_block = block_size / 4; - a0 = smem[x]; - a1 = mul_float2(twiddles[k],smem[x+t]); - a2 = mul_float2(twiddles[k + block_size],smem[x+2*t]); - a3 = mul_float2(twiddles[k + 2*block_size],smem[x+3*t]); + a0 = smem[x]; a1 = smem[x+t]; a2 = smem[x+2*t]; a3 = smem[x+3*t]; } barrier(CLK_LOCAL_MEM_FENCE); if (x < t) + butterfly4(a0, a1, a2, a3, smem, twiddles, x, block_size); + + barrier(CLK_LOCAL_MEM_FENCE); +} + +__attribute__((always_inline)) +void fft_radix4_B2(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) +{ + const int x2 = x1 + t/2; + float2 a0, a1, a2, a3, a4, a5, a6, a7; + + if (x1 < t/2) { - const int dst_ind = ((x - k) << 2) + k; + a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t]; + a4 = smem[x2]; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t]; + } - float2 b0 = a0 + a2; - a2 = a0 - a2; - float2 b1 = a1 + a3; - a3 = twiddle(a1 - a3); + barrier(CLK_LOCAL_MEM_FENCE); - smem[dst_ind] = b0 + b1; - smem[dst_ind + block_size] = a2 + a3; - smem[dst_ind + 2*block_size] = b0 - b1; - smem[dst_ind + 3*block_size] = a2 - a3; + if (x1 < t/2) + { + butterfly4(a0, a1, a2, a3, smem, twiddles, x1, block_size); + butterfly4(a4, a5, a6, a7, smem, twiddles, x2, block_size); } barrier(CLK_LOCAL_MEM_FENCE); } __attribute__((always_inline)) -void fft_radix4_B2(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) +void fft_radix4_B3(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) { - const int k = x & (block_size - 1); - const int x2 = x + t/2; - const int k2 = x2 & (block_size - 1); - float2 a0, a1, a2, a3, a4, a5, a6, a7; + const int x2 = x1 + t/3; + const int x3 = x2 + t/3; + float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11; - if (x < t/2) + if (x1 < t/3) { - a0 = smem[x]; - a1 = mul_float2(twiddles[k], smem[x+t]); - a2 = mul_float2(twiddles[k + block_size],smem[x+2*t]); - a3 = mul_float2(twiddles[k + 2*block_size],smem[x+3*t]); - - a4 = smem[x2]; - a5 = mul_float2(twiddles[k2], smem[x2+t]); - a6 = mul_float2(twiddles[k2 + block_size],smem[x2+2*t]); - a7 = mul_float2(twiddles[k2 + 2*block_size],smem[x2+3*t]); + a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t]; + a4 = smem[x2]; a5 = smem[x2+t]; a6 = smem[x2+2*t]; a7 = smem[x2+3*t]; + a8 = smem[x3]; a9 = smem[x3+t]; a10 = smem[x3+2*t]; a11 = smem[x3+3*t]; } barrier(CLK_LOCAL_MEM_FENCE); - if (x < t/2) + if (x1 < t/3) { - int dst_ind = ((x - k) << 2) + k; - - float2 b0 = a0 + a2; - a2 = a0 - a2; - float2 b1 = a1 + a3; - a3 = twiddle(a1 - a3); - - smem[dst_ind] = b0 + b1; - smem[dst_ind + block_size] = a2 + a3; - smem[dst_ind + 2*block_size] = b0 - b1; - smem[dst_ind + 3*block_size] = a2 - a3; - - dst_ind = ((x2 - k2) << 2) + k2; - b0 = a4 + a6; - a6 = a4 - a6; - b1 = a5 + a7; - a7 = twiddle(a5 - a7); - - smem[dst_ind] = b0 + b1; - smem[dst_ind + block_size] = a6 + a7; - smem[dst_ind + 2*block_size] = b0 - b1; - smem[dst_ind + 3*block_size] = a6 - a7; + butterfly4(a0, a1, a2, a3, smem, twiddles, x1, block_size); + butterfly4(a4, a5, a6, a7, smem, twiddles, x2, block_size); + butterfly4(a8, a9, a10, a11, smem, twiddles, x3, block_size); } barrier(CLK_LOCAL_MEM_FENCE); @@ -274,156 +372,95 @@ void fft_radix8(__local float2* smem, __constant const float2* twiddles, const i __attribute__((always_inline)) void fft_radix3(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) { - const int k = x % block_size; float2 a0, a1, a2; if (x < t) { - a0 = smem[x]; - a1 = mul_float2(twiddles[k], smem[x+t]); - a2 = mul_float2(twiddles[k+block_size], smem[x+2*t]); + a0 = smem[x]; a1 = smem[x+t]; a2 = smem[x+2*t]; } barrier(CLK_LOCAL_MEM_FENCE); if (x < t) - { - const int dst_ind = ((x - k) * 3) + k; - - float2 b1 = a1 + a2; - a2 = twiddle(sin_120*(a1 - a2)); - float2 b0 = a0 - (float2)(0.5f)*b1; - - smem[dst_ind] = a0 + b1; - smem[dst_ind + block_size] = b0 + a2; - smem[dst_ind + 2*block_size] = b0 - a2; - } + butterfly3(a0, a1, a2, smem, twiddles, x, block_size); barrier(CLK_LOCAL_MEM_FENCE); } __attribute__((always_inline)) -void fft_radix3_B2(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) +void fft_radix3_B2(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) { - const int k = x % block_size; - const int x2 = x + t/2; - const int k2 = x2 % block_size; + const int x2 = x1 + t/2; float2 a0, a1, a2, a3, a4, a5; - if (x < t/2) + if (x1 < t/2) { - a0 = smem[x]; - a1 = mul_float2(twiddles[k], smem[x+t]); - a2 = mul_float2(twiddles[k+block_size], smem[x+2*t]); - - a3 = smem[x2]; - a4 = mul_float2(twiddles[k2], smem[x2+t]); - a5 = mul_float2(twiddles[k2+block_size], smem[x2+2*t]); + a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; + a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t]; } barrier(CLK_LOCAL_MEM_FENCE); - if (x < t/2) + if (x1 < t/2) { - int dst_ind = ((x - k) * 3) + k; + butterfly3(a0, a1, a2, smem, twiddles, x1, block_size); + butterfly3(a3, a4, a5, smem, twiddles, x2, block_size); + } - float2 b1 = a1 + a2; - a2 = twiddle(sin_120*(a1 - a2)); - float2 b0 = a0 - (float2)(0.5f)*b1; + barrier(CLK_LOCAL_MEM_FENCE); +} - smem[dst_ind] = a0 + b1; - smem[dst_ind + block_size] = b0 + a2; - smem[dst_ind + 2*block_size] = b0 - a2; +__attribute__((always_inline)) +void fft_radix3_B3(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) +{ + const int x2 = x1 + t/3; + const int x3 = x2 + t/3; + float2 a0, a1, a2, a3, a4, a5, a6, a7, a8; - dst_ind = ((x2 - k2) * 3) + k2; + if (x1 < t/2) + { + a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; + a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t]; + a6 = smem[x3]; a7 = smem[x3+t]; a8 = smem[x3+2*t]; + } - b1 = a4 + a5; - a5 = twiddle(sin_120*(a4 - a5)); - b0 = a3 - (float2)(0.5f)*b1; + barrier(CLK_LOCAL_MEM_FENCE); - smem[dst_ind] = a3 + b1; - smem[dst_ind + block_size] = b0 + a5; - smem[dst_ind + 2*block_size] = b0 - a5; + if (x1 < t/2) + { + butterfly3(a0, a1, a2, smem, twiddles, x1, block_size); + butterfly3(a3, a4, a5, smem, twiddles, x2, block_size); + butterfly3(a6, a7, a8, smem, twiddles, x3, block_size); } barrier(CLK_LOCAL_MEM_FENCE); } __attribute__((always_inline)) -void fft_radix3_B4(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) +void fft_radix3_B4(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) { const int thread_block = t/4; - const int k = x % block_size; - const int x2 = x + thread_block; - const int k2 = x2 % block_size; - const int x3 = x + 2*thread_block; - const int k3 = x3 % block_size; - const int x4 = x + 3*thread_block; - const int k4 = x4 % block_size; + const int x2 = x1 + thread_block; + const int x3 = x1 + 2*thread_block; + const int x4 = x1 + 3*thread_block; float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11; - if (x < t/4) + if (x1 < t/4) { - a0 = smem[x]; - a1 = mul_float2(twiddles[k], smem[x+t]); - a2 = mul_float2(twiddles[k+block_size], smem[x+2*t]); - - a3 = smem[x2]; - a4 = mul_float2(twiddles[k2], smem[x2+t]); - a5 = mul_float2(twiddles[k2+block_size], smem[x2+2*t]); - - a6 = smem[x3]; - a7 = mul_float2(twiddles[k3], smem[x3+t]); - a8 = mul_float2(twiddles[k3+block_size], smem[x3+2*t]); - - a9 = smem[x4]; - a10 = mul_float2(twiddles[k4], smem[x4+t]); - a11 = mul_float2(twiddles[k4+block_size], smem[x4+2*t]); + a0 = smem[x1]; a1 = smem[x1+t]; a2 = smem[x1+2*t]; + a3 = smem[x2]; a4 = smem[x2+t]; a5 = smem[x2+2*t]; + a6 = smem[x3]; a7 = smem[x3+t]; a8 = smem[x3+2*t]; + a9 = smem[x4]; a10 = smem[x4+t]; a11 = smem[x4+2*t]; } barrier(CLK_LOCAL_MEM_FENCE); - if (x < t/4) + if (x1 < t/4) { - int dst_ind = ((x - k) * 3) + k; - - float2 b1 = a1 + a2; - a2 = twiddle(sin_120*(a1 - a2)); - float2 b0 = a0 - (float2)(0.5f)*b1; - - smem[dst_ind] = a0 + b1; - smem[dst_ind + block_size] = b0 + a2; - smem[dst_ind + 2*block_size] = b0 - a2; - - dst_ind = ((x2 - k2) * 3) + k2; - - b1 = a4 + a5; - a5 = twiddle(sin_120*(a4 - a5)); - b0 = a3 - (float2)(0.5f)*b1; - - smem[dst_ind] = a3 + b1; - smem[dst_ind + block_size] = b0 + a5; - smem[dst_ind + 2*block_size] = b0 - a5; - - dst_ind = ((x3 - k3) * 3) + k3; - - b1 = a7 + a8; - a8 = twiddle(sin_120*(a7 - a8)); - b0 = a6 - (float2)(0.5f)*b1; - - smem[dst_ind] = a6 + b1; - smem[dst_ind + block_size] = b0 + a8; - smem[dst_ind + 2*block_size] = b0 - a8; - - dst_ind = ((x4 - k4) * 3) + k4; - - b1 = a10 + a11; - a11 = twiddle(sin_120*(a10 - a11)); - b0 = a9 - (float2)(0.5f)*b1; - - smem[dst_ind] = a9 + b1; - smem[dst_ind + block_size] = b0 + a11; - smem[dst_ind + 2*block_size] = b0 - a11; + butterfly3(a0, a1, a2, smem, twiddles, x1, block_size); + butterfly3(a3, a4, a5, smem, twiddles, x2, block_size); + butterfly3(a6, a7, a8, smem, twiddles, x3, block_size); + butterfly3(a9, a10, a11, smem, twiddles, x4, block_size); } barrier(CLK_LOCAL_MEM_FENCE); @@ -437,135 +474,35 @@ void fft_radix5(__local float2* smem, __constant const float2* twiddles, const i if (x < t) { - a0 = smem[x]; - a1 = mul_float2(twiddles[k], smem[x + t]); - a2 = mul_float2(twiddles[k + block_size],smem[x+2*t]); - a3 = mul_float2(twiddles[k+2*block_size],smem[x+3*t]); - a4 = mul_float2(twiddles[k+3*block_size],smem[x+4*t]); + a0 = smem[x]; a1 = smem[x + t]; a2 = smem[x+2*t]; a3 = smem[x+3*t]; a4 = smem[x+4*t]; } barrier(CLK_LOCAL_MEM_FENCE); if (x < t) - { - const int dst_ind = ((x - k) * 5) + k; - __local float2* dst = smem + dst_ind; - - float2 b0, b1, b5; - - b1 = a1 + a4; - a1 -= a4; - - a4 = a3 + a2; - a3 -= a2; - - a2 = b1 + a4; - b0 = a0 - (float2)0.25f * a2; - - b1 = fft5_2 * (b1 - a4); - a4 = fft5_3 * (float2)(-a1.y - a3.y, a1.x + a3.x); - b5 = (float2)(a4.x - fft5_5 * a1.y, a4.y + fft5_5 * a1.x); - - a4.x += fft5_4 * a3.y; - a4.y -= fft5_4 * a3.x; - - a1 = b0 + b1; - b0 -= b1; - - dst[0] = a0 + a2; - dst[block_size] = a1 + a4; - dst[2 * block_size] = b0 + b5; - dst[3 * block_size] = b0 - b5; - dst[4 * block_size] = a1 - a4; - } + butterfly5(a0, a1, a2, a3, a4, smem, twiddles, x, block_size); barrier(CLK_LOCAL_MEM_FENCE); } __attribute__((always_inline)) -void fft_radix5_B2(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) +void fft_radix5_B2(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) { - const int k = x % block_size; - const int x2 = x+t/2; - const int k2 = x2 % block_size; + const int x2 = x1+t/2; float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9; - if (x < t/2) + if (x1 < t/2) { - a0 = smem[x]; - a1 = mul_float2(twiddles[k], smem[x + t]); - a2 = mul_float2(twiddles[k + block_size],smem[x+2*t]); - a3 = mul_float2(twiddles[k+2*block_size],smem[x+3*t]); - a4 = mul_float2(twiddles[k+3*block_size],smem[x+4*t]); - - a5 = smem[x2]; - a6 = mul_float2(twiddles[k2], smem[x2 + t]); - a7 = mul_float2(twiddles[k2 + block_size],smem[x2+2*t]); - a8 = mul_float2(twiddles[k2+2*block_size],smem[x2+3*t]); - a9 = mul_float2(twiddles[k2+3*block_size],smem[x2+4*t]); + a0 = smem[x1]; a1 = smem[x1 + t]; a2 = smem[x1+2*t]; a3 = smem[x1+3*t]; a4 = smem[x1+4*t]; + a5 = smem[x2]; a6 = smem[x2 + t]; a7 = smem[x2+2*t]; a8 = smem[x2+3*t]; a9 = smem[x2+4*t]; } barrier(CLK_LOCAL_MEM_FENCE); - if (x < t/2) + if (x1 < t/2) { - int dst_ind = ((x - k) * 5) + k; - __local float2* dst = smem + dst_ind; - - float2 b0, b1, b5; - - b1 = a1 + a4; - a1 -= a4; - - a4 = a3 + a2; - a3 -= a2; - - a2 = b1 + a4; - b0 = a0 - (float2)0.25f * a2; - - b1 = fft5_2 * (b1 - a4); - a4 = fft5_3 * (float2)(-a1.y - a3.y, a1.x + a3.x); - b5 = (float2)(a4.x - fft5_5 * a1.y, a4.y + fft5_5 * a1.x); - - a4.x += fft5_4 * a3.y; - a4.y -= fft5_4 * a3.x; - - a1 = b0 + b1; - b0 -= b1; - - dst[0] = a0 + a2; - dst[block_size] = a1 + a4; - dst[2 * block_size] = b0 + b5; - dst[3 * block_size] = b0 - b5; - dst[4 * block_size] = a1 - a4; - - dst_ind = ((x2 - k2) * 5) + k2; - dst = smem + dst_ind; - - b1 = a6 + a9; - a6 -= a9; - - a9 = a8 + a7; - a8 -= a7; - - a7 = b1 + a9; - b0 = a5 - (float2)0.25f * a7; - - b1 = fft5_2 * (b1 - a9); - a9 = fft5_3 * (float2)(-a6.y - a8.y, a6.x + a8.x); - b5 = (float2)(a9.x - fft5_5 * a6.y, a9.y + fft5_5 * a6.x); - - a9.x += fft5_4 * a8.y; - a9.y -= fft5_4 * a8.x; - - a6 = b0 + b1; - b0 -= b1; - - dst[0] = a5 + a7; - dst[block_size] = a6 + a9; - dst[2 * block_size] = b0 + b5; - dst[3 * block_size] = b0 - b5; - dst[4 * block_size] = a6 - a9; + butterfly5(a0, a1, a2, a3, a4, smem, twiddles, x1, block_size); + butterfly5(a5, a6, a7, a8, a9, smem, twiddles, x2, block_size); } barrier(CLK_LOCAL_MEM_FENCE); @@ -611,7 +548,7 @@ __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, RADIX_PROCESS; -#ifndef CCS_OUTPUT +#ifndef REAL_OUTPUT #ifdef NO_CONJUGATE // copy result without complex conjugate const int cols = dst_cols/2 + 1; @@ -659,7 +596,7 @@ __kernel void fft_multi_radix_cols(__global const uchar* src_ptr, int src_step, RADIX_PROCESS; -#ifndef CCS_OUTPUT +#ifndef REAL_OUTPUT __global uchar* dst = dst_ptr + mad24(y, dst_step, mad24(x, (int)(sizeof(float)*2), dst_offset)); #pragma unroll for (int i=0; i Date: Tue, 22 Jul 2014 14:54:38 +0400 Subject: [PATCH 10/13] Added nonzero_rows support --- modules/core/src/dxt.cpp | 49 +++++------- modules/core/src/ocl.cpp | 8 +- modules/core/src/opencl/fft.cl | 120 +++++++++++++++++------------ modules/core/test/ocl/test_dft.cpp | 32 ++++---- 4 files changed, 109 insertions(+), 100 deletions(-) diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp index eaef53ad23..869409f50d 100644 --- a/modules/core/src/dxt.cpp +++ b/modules/core/src/dxt.cpp @@ -2034,19 +2034,19 @@ enum FftType C2C = 3 }; -static std::vector ocl_getRadixes(int cols, std::vector& radixes, std::vector& blocks, int& min_radix) +static void ocl_getRadixes(int cols, std::vector& radixes, std::vector& blocks, int& min_radix) { int factors[34]; - int nf = DFTFactorize( cols, factors ); + int nf = DFTFactorize(cols, factors); int n = 1; int factor_index = 0; min_radix = INT_MAX; // 2^n transforms - if ( (factors[factor_index] & 1) == 0 ) + if ((factors[factor_index] & 1) == 0) { - for( ; n < factors[factor_index]; ) + for( ; n < factors[factor_index];) { int radix = 2, block = 1; if (8*n <= factors[0]) @@ -2080,7 +2080,7 @@ static std::vector ocl_getRadixes(int cols, std::vector& radixes, std: } // all the other transforms - for( ; factor_index < nf; factor_index++ ) + for( ; factor_index < nf; factor_index++) { int radix = factors[factor_index], block = 1; if (radix == 3) @@ -2101,7 +2101,6 @@ static std::vector ocl_getRadixes(int cols, std::vector& radixes, std: blocks.push_back(block); min_radix = min(min_radix, block*radix); } - return radixes; } struct OCL_FftPlan @@ -2111,14 +2110,13 @@ struct OCL_FftPlan int thread_count; int dft_size; - int flags; bool status; - OCL_FftPlan(int _size, int _flags): dft_size(_size), flags(_flags), status(true) + OCL_FftPlan(int _size): dft_size(_size), status(true) { int min_radix; std::vector radixes, blocks; ocl_getRadixes(dft_size, radixes, blocks, min_radix); - thread_count = (dft_size + min_radix-1) / min_radix; + thread_count = dft_size / min_radix; if (thread_count > ocl::Device::getDefault().maxWorkGroupSize()) { @@ -2140,8 +2138,7 @@ struct OCL_FftPlan n *= radix; } - twiddles.create(1, twiddle_size, CV_32FC2); - Mat tw = twiddles.getMat(ACCESS_WRITE); + Mat tw(1, twiddle_size, CV_32FC2); float* ptr = tw.ptr(); int ptr_index = 0; @@ -2162,6 +2159,7 @@ struct OCL_FftPlan } } } + twiddles = tw.getUMat(ACCESS_READ); buildOptions = format("-D LOCAL_SIZE=%d -D kercn=%d -D RADIX_PROCESS=%s", dft_size, dft_size/thread_count, radix_processing.c_str()); @@ -2185,10 +2183,10 @@ struct OCL_FftPlan if (rows) { - globalsize[0] = thread_count; globalsize[1] = dft_size; + globalsize[0] = thread_count; globalsize[1] = src.rows; localsize[0] = thread_count; localsize[1] = 1; kernel_name = !inv ? "fft_multi_radix_rows" : "ifft_multi_radix_rows"; - if (is1d && (flags & DFT_SCALE)) + if ((is1d || inv) && (flags & DFT_SCALE)) options += " -D DFT_SCALE"; } else @@ -2200,14 +2198,9 @@ struct OCL_FftPlan options += " -D DFT_SCALE"; } - if (src.channels() == 1) - options += " -D REAL_INPUT"; - else - options += " -D COMPLEX_INPUT"; - if (dst.channels() == 1) - options += " -D REAL_OUTPUT"; - if (is1d) - options += " -D IS_1D"; + options += src.channels() == 1 ? " -D REAL_INPUT" : " -D COMPLEX_INPUT"; + options += dst.channels() == 1 ? " -D REAL_OUTPUT" : " -D COMPLEX_OUTPUT"; + options += is1d ? " -D IS_1D" : ""; if (!inv) { @@ -2216,10 +2209,10 @@ struct OCL_FftPlan } else { - if (is1d && fftType == C2R || (rows && fftType == R2R)) + if (rows && (fftType == C2R || fftType == R2R)) options += " -D NO_CONJUGATE"; if (dst.cols % 2 == 0) - options += " -D EVEN"; + options += " -D EVEN"; } ocl::Kernel k(kernel_name.c_str(), ocl::core::fft_oclsrc, options); @@ -2240,7 +2233,7 @@ public: return planCache; } - OCL_FftPlan* getFftPlan(int dft_size, int flags) + OCL_FftPlan* getFftPlan(int dft_size) { for (size_t i = 0, size = planStorage.size(); i < size; ++i) { @@ -2252,7 +2245,7 @@ public: } } - OCL_FftPlan * newPlan = new OCL_FftPlan(dft_size, flags); + OCL_FftPlan * newPlan = new OCL_FftPlan(dft_size); planStorage.push_back(newPlan); return newPlan; } @@ -2275,13 +2268,13 @@ protected: static bool ocl_dft_C2C_rows(InputArray _src, OutputArray _dst, int nonzero_rows, int flags, int fftType) { - const OCL_FftPlan* plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.cols(), flags); + const OCL_FftPlan* plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.cols()); return plan->enqueueTransform(_src, _dst, nonzero_rows, flags, fftType, true); } static bool ocl_dft_C2C_cols(InputArray _src, OutputArray _dst, int nonzero_cols, int flags, int fftType) { - const OCL_FftPlan* plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.rows(), flags); + const OCL_FftPlan* plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.rows()); return plan->enqueueTransform(_src, _dst, nonzero_cols, flags, fftType, false); } @@ -2385,7 +2378,7 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro } else { - int nonzero_cols = src.cols/2 + 1;// : src.cols; + int nonzero_cols = src.cols/2 + 1; if (!ocl_dft_C2C_cols(src, output, nonzero_cols, flags, fftType)) return false; diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index 32db8c91b4..a2110f6cc2 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -3002,7 +3002,8 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[], sync ? 0 : &p->e); if( sync || retval != CL_SUCCESS ) { - CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS); + int a = clFinish(qq); + CV_OclDbgAssert(a == CL_SUCCESS); p->cleanupUMats(); } else @@ -3898,8 +3899,9 @@ public: if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() ) { AlignedDataPtr alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT); - CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0, - u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS ); + int a = clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0, + u->size, alignedPtr.getAlignedPtr(), 0, 0, 0); + CV_Assert( a == CL_SUCCESS ); u->markHostCopyObsolete(false); } } diff --git a/modules/core/src/opencl/fft.cl b/modules/core/src/opencl/fft.cl index dd8ff59850..b8d2c6716d 100644 --- a/modules/core/src/opencl/fft.cl +++ b/modules/core/src/opencl/fft.cl @@ -16,7 +16,7 @@ float2 twiddle(float2 a) { } __attribute__((always_inline)) -void butterfly2(float2 a0, float2 a1, __local float2* smem, __constant const float2* twiddles, +void butterfly2(float2 a0, float2 a1, __local float2* smem, __global const float2* twiddles, const int x, const int block_size) { const int k = x & (block_size - 1); @@ -28,7 +28,7 @@ void butterfly2(float2 a0, float2 a1, __local float2* smem, __constant const flo } __attribute__((always_inline)) -void butterfly4(float2 a0, float2 a1, float2 a2, float2 a3, __local float2* smem, __constant const float2* twiddles, +void butterfly4(float2 a0, float2 a1, float2 a2, float2 a3, __local float2* smem, __global const float2* twiddles, const int x, const int block_size) { const int k = x & (block_size - 1); @@ -50,10 +50,10 @@ void butterfly4(float2 a0, float2 a1, float2 a2, float2 a3, __local float2* smem } __attribute__((always_inline)) -void butterfly3(float2 a0, float2 a1, float2 a2, __local float2* smem, __constant const float2* twiddles, +void butterfly3(float2 a0, float2 a1, float2 a2, __local float2* smem, __global const float2* twiddles, const int x, const int block_size) { - const int k = x & (block_size - 1); + const int k = x % block_size; a1 = mul_float2(twiddles[k], a1); a2 = mul_float2(twiddles[k+block_size], a2); const int dst_ind = ((x - k) * 3) + k; @@ -68,10 +68,10 @@ void butterfly3(float2 a0, float2 a1, float2 a2, __local float2* smem, __constan } __attribute__((always_inline)) -void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local float2* smem, __constant const float2* twiddles, +void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local float2* smem, __global const float2* twiddles, const int x, const int block_size) { - const int k = x & (block_size - 1); + const int k = x % block_size; a1 = mul_float2(twiddles[k], a1); a2 = mul_float2(twiddles[k + block_size], a2); a3 = mul_float2(twiddles[k+2*block_size], a3); @@ -109,7 +109,7 @@ void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local f } __attribute__((always_inline)) -void fft_radix2(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) +void fft_radix2(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) { float2 a0, a1; @@ -128,7 +128,7 @@ void fft_radix2(__local float2* smem, __constant const float2* twiddles, const i } __attribute__((always_inline)) -void fft_radix2_B2(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix2_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int x2 = x1 + t/2; float2 a0, a1, a2, a3; @@ -151,7 +151,7 @@ void fft_radix2_B2(__local float2* smem, __constant const float2* twiddles, cons } __attribute__((always_inline)) -void fft_radix2_B3(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix2_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int x2 = x1 + t/3; const int x3 = x1 + 2*t/3; @@ -177,7 +177,7 @@ void fft_radix2_B3(__local float2* smem, __constant const float2* twiddles, cons } __attribute__((always_inline)) -void fft_radix2_B4(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix2_B4(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int thread_block = t/4; const int x2 = x1 + thread_block; @@ -207,7 +207,7 @@ void fft_radix2_B4(__local float2* smem, __constant const float2* twiddles, cons } __attribute__((always_inline)) -void fft_radix2_B5(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix2_B5(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int thread_block = t/5; const int x2 = x1 + thread_block; @@ -240,7 +240,7 @@ void fft_radix2_B5(__local float2* smem, __constant const float2* twiddles, cons } __attribute__((always_inline)) -void fft_radix4(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) +void fft_radix4(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) { float2 a0, a1, a2, a3; @@ -258,7 +258,7 @@ void fft_radix4(__local float2* smem, __constant const float2* twiddles, const i } __attribute__((always_inline)) -void fft_radix4_B2(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix4_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int x2 = x1 + t/2; float2 a0, a1, a2, a3, a4, a5, a6, a7; @@ -281,7 +281,7 @@ void fft_radix4_B2(__local float2* smem, __constant const float2* twiddles, cons } __attribute__((always_inline)) -void fft_radix4_B3(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix4_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int x2 = x1 + t/3; const int x3 = x2 + t/3; @@ -307,7 +307,7 @@ void fft_radix4_B3(__local float2* smem, __constant const float2* twiddles, cons } __attribute__((always_inline)) -void fft_radix8(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) +void fft_radix8(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) { const int k = x % block_size; float2 a0, a1, a2, a3, a4, a5, a6, a7; @@ -370,7 +370,7 @@ void fft_radix8(__local float2* smem, __constant const float2* twiddles, const i } __attribute__((always_inline)) -void fft_radix3(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) +void fft_radix3(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) { float2 a0, a1, a2; @@ -388,7 +388,7 @@ void fft_radix3(__local float2* smem, __constant const float2* twiddles, const i } __attribute__((always_inline)) -void fft_radix3_B2(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix3_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int x2 = x1 + t/2; float2 a0, a1, a2, a3, a4, a5; @@ -411,7 +411,7 @@ void fft_radix3_B2(__local float2* smem, __constant const float2* twiddles, cons } __attribute__((always_inline)) -void fft_radix3_B3(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix3_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int x2 = x1 + t/3; const int x3 = x2 + t/3; @@ -437,7 +437,7 @@ void fft_radix3_B3(__local float2* smem, __constant const float2* twiddles, cons } __attribute__((always_inline)) -void fft_radix3_B4(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix3_B4(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int thread_block = t/4; const int x2 = x1 + thread_block; @@ -467,7 +467,7 @@ void fft_radix3_B4(__local float2* smem, __constant const float2* twiddles, cons } __attribute__((always_inline)) -void fft_radix5(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) +void fft_radix5(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) { const int k = x % block_size; float2 a0, a1, a2, a3, a4; @@ -486,7 +486,7 @@ void fft_radix5(__local float2* smem, __constant const float2* twiddles, const i } __attribute__((always_inline)) -void fft_radix5_B2(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix5_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int x2 = x1+t/2; float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9; @@ -516,24 +516,23 @@ void fft_radix5_B2(__local float2* smem, __constant const float2* twiddles, cons __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols, __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, - __constant float2 * twiddles_ptr, const int t, const int nz) + __global float2* twiddles_ptr, const int t, const int nz) { const int x = get_global_id(0); const int y = get_group_id(1); - + const int block_size = LOCAL_SIZE/kercn; if (y < nz) { __local float2 smem[LOCAL_SIZE]; - __constant const float2* twiddles = (__constant float2*) twiddles_ptr; + __global const float2* twiddles = (__global float2*) twiddles_ptr; const int ind = x; - const int block_size = LOCAL_SIZE/kercn; #ifdef IS_1D float scale = 1.f/dst_cols; #else float scale = 1.f/(dst_cols*dst_rows); #endif -#ifndef REAL_INPUT +#ifdef COMPLEX_INPUT __global const float2* src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset))); #pragma unroll for (int i=0; i(df) << std::endl; double eps = src.size().area() * 1e-4; @@ -188,13 +185,12 @@ OCL_TEST_P(MulSpectrums, Mat) OCL_INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MulSpectrums, testing::Combine(Bool(), Bool())); -OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(4, 1), cv::Size(5, 8), cv::Size(6, 6), - cv::Size(512, 1), cv::Size(1280, 768)), - Values((OCL_FFT_TYPE) R2C, (OCL_FFT_TYPE) C2C, (OCL_FFT_TYPE) R2R, (OCL_FFT_TYPE) C2R), +OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(10, 10), cv::Size(36, 36), cv::Size(512, 1), cv::Size(1280, 768)), + Values((OCL_FFT_TYPE) R2C, (OCL_FFT_TYPE) C2C, (OCL_FFT_TYPE) R2R, (OCL_FFT_TYPE) C2R), Bool(), // DFT_INVERSE Bool(), // DFT_ROWS Bool(), // DFT_SCALE - Bool() // inplace + Bool() // hint ) ); From 66ac46214d4bb7685b78d0b3aed4c67b5cdb76a7 Mon Sep 17 00:00:00 2001 From: Alexander Karsakov Date: Wed, 23 Jul 2014 12:13:09 +0400 Subject: [PATCH 11/13] Final refactoring, fixes --- modules/core/perf/opencl/perf_arithm.cpp | 2 +- modules/core/perf/opencl/perf_dxt.cpp | 37 +- modules/core/src/dxt.cpp | 513 +++++++++++------------ modules/core/src/ocl.cpp | 8 +- modules/core/src/opencl/fft.cl | 105 +++-- modules/core/test/ocl/test_dft.cpp | 75 +--- 6 files changed, 352 insertions(+), 388 deletions(-) diff --git a/modules/core/perf/opencl/perf_arithm.cpp b/modules/core/perf/opencl/perf_arithm.cpp index ba808b494f..17badca765 100644 --- a/modules/core/perf/opencl/perf_arithm.cpp +++ b/modules/core/perf/opencl/perf_arithm.cpp @@ -292,7 +292,7 @@ OCL_PERF_TEST_P(MagnitudeFixture, Magnitude, ::testing::Combine( typedef Size_MatType TransposeFixture; OCL_PERF_TEST_P(TransposeFixture, Transpose, ::testing::Combine( - OCL_TEST_SIZES, Values(CV_8UC1, CV_32FC1, CV_8UC2, CV_32FC2, CV_8UC4, CV_32FC4))) + OCL_TEST_SIZES, OCL_TEST_TYPES_134)) { const Size_MatType_t params = GetParam(); const Size srcSize = get<0>(params); diff --git a/modules/core/perf/opencl/perf_dxt.cpp b/modules/core/perf/opencl/perf_dxt.cpp index 797b2c5334..d0219913b5 100644 --- a/modules/core/perf/opencl/perf_dxt.cpp +++ b/modules/core/perf/opencl/perf_dxt.cpp @@ -54,40 +54,21 @@ namespace ocl { ///////////// dft //////////////////////// -enum OCL_FFT_TYPE -{ - R2R = 0, // real to real (CCS) - C2R = 1, // complex to real - R2C = 2, // real to complex - C2C = 3 // complex to complex -}; - -typedef tuple DftParams; +typedef tuple DftParams; typedef TestBaseWithParam DftFixture; -OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(C2C, R2R, C2R, R2C), - Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3, Size(1024, 1024), Size(512, 512), Size(2048, 2048)), - Values((int) 0, (int)DFT_ROWS, (int)DFT_SCALE/*, (int)DFT_INVERSE, - (int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE*/))) +OCL_PERF_TEST_P(DftFixture, Dft, ::testing::Combine(Values(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), + Values((int)DFT_ROWS, (int)DFT_SCALE, (int)DFT_INVERSE, + (int)DFT_INVERSE | DFT_SCALE, (int)DFT_ROWS | DFT_INVERSE))) { const DftParams params = GetParam(); - const int dft_type = get<0>(params); - const Size srcSize = get<1>(params); - int flags = get<2>(params); - - int in_cn, out_cn; - switch (dft_type) - { - case R2R: flags |= cv::DFT_REAL_OUTPUT; in_cn = 1; out_cn = 1; break; - case C2R: flags |= cv::DFT_REAL_OUTPUT; in_cn = 2; out_cn = 2; break; - case R2C: flags |= cv::DFT_COMPLEX_OUTPUT; in_cn = 1; out_cn = 2; break; - case C2C: flags |= cv::DFT_COMPLEX_OUTPUT; in_cn = 2; out_cn = 2; break; - } - - UMat src(srcSize, CV_MAKE_TYPE(CV_32F, in_cn)), dst(srcSize, CV_MAKE_TYPE(CV_32F, out_cn)); + const Size srcSize = get<0>(params); + const int flags = get<1>(params); + + UMat src(srcSize, CV_32FC2), dst(srcSize, CV_32FC2); declare.in(src, WARMUP_RNG).out(dst); - OCL_TEST_CYCLE() cv::dft(src, dst, flags); + OCL_TEST_CYCLE() cv::dft(src, dst, flags | DFT_COMPLEX_OUTPUT); SANITY_CHECK(dst, 1e-3); } diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp index 869409f50d..cb0b118bca 100644 --- a/modules/core/src/dxt.cpp +++ b/modules/core/src/dxt.cpp @@ -1781,251 +1781,11 @@ static bool ippi_DFT_R_32F(const Mat& src, Mat& dst, bool inv, int norm_flag) #endif } -#ifdef HAVE_CLAMDFFT - -namespace cv { - -#define CLAMDDFT_Assert(func) \ - { \ - clAmdFftStatus s = (func); \ - CV_Assert(s == CLFFT_SUCCESS); \ - } - -class PlanCache -{ - struct FftPlan - { - FftPlan(const Size & _dft_size, int _src_step, int _dst_step, bool _doubleFP, bool _inplace, int _flags, FftType _fftType) : - dft_size(_dft_size), src_step(_src_step), dst_step(_dst_step), - doubleFP(_doubleFP), inplace(_inplace), flags(_flags), fftType(_fftType), - context((cl_context)ocl::Context::getDefault().ptr()), plHandle(0) - { - bool dft_inverse = (flags & DFT_INVERSE) != 0; - bool dft_scale = (flags & DFT_SCALE) != 0; - bool dft_rows = (flags & DFT_ROWS) != 0; - - clAmdFftLayout inLayout = CLFFT_REAL, outLayout = CLFFT_REAL; - clAmdFftDim dim = dft_size.height == 1 || dft_rows ? CLFFT_1D : CLFFT_2D; - - size_t batchSize = dft_rows ? dft_size.height : 1; - size_t clLengthsIn[3] = { dft_size.width, dft_rows ? 1 : dft_size.height, 1 }; - size_t clStridesIn[3] = { 1, 1, 1 }; - size_t clStridesOut[3] = { 1, 1, 1 }; - int elemSize = doubleFP ? sizeof(double) : sizeof(float); - - switch (fftType) - { - case C2C: - inLayout = CLFFT_COMPLEX_INTERLEAVED; - outLayout = CLFFT_COMPLEX_INTERLEAVED; - clStridesIn[1] = src_step / (elemSize << 1); - clStridesOut[1] = dst_step / (elemSize << 1); - break; - case R2C: - inLayout = CLFFT_REAL; - outLayout = CLFFT_HERMITIAN_INTERLEAVED; - clStridesIn[1] = src_step / elemSize; - clStridesOut[1] = dst_step / (elemSize << 1); - break; - case C2R: - inLayout = CLFFT_HERMITIAN_INTERLEAVED; - outLayout = CLFFT_REAL; - clStridesIn[1] = src_step / (elemSize << 1); - clStridesOut[1] = dst_step / elemSize; - break; - case R2R: - default: - CV_Error(Error::StsNotImplemented, "AMD Fft does not support this type"); - break; - } - - clStridesIn[2] = dft_rows ? clStridesIn[1] : dft_size.width * clStridesIn[1]; - clStridesOut[2] = dft_rows ? clStridesOut[1] : dft_size.width * clStridesOut[1]; - - CLAMDDFT_Assert(clAmdFftCreateDefaultPlan(&plHandle, (cl_context)ocl::Context::getDefault().ptr(), dim, clLengthsIn)) - - // setting plan properties - CLAMDDFT_Assert(clAmdFftSetPlanPrecision(plHandle, doubleFP ? CLFFT_DOUBLE : CLFFT_SINGLE)); - CLAMDDFT_Assert(clAmdFftSetResultLocation(plHandle, inplace ? CLFFT_INPLACE : CLFFT_OUTOFPLACE)) - CLAMDDFT_Assert(clAmdFftSetLayout(plHandle, inLayout, outLayout)) - CLAMDDFT_Assert(clAmdFftSetPlanBatchSize(plHandle, batchSize)) - CLAMDDFT_Assert(clAmdFftSetPlanInStride(plHandle, dim, clStridesIn)) - CLAMDDFT_Assert(clAmdFftSetPlanOutStride(plHandle, dim, clStridesOut)) - CLAMDDFT_Assert(clAmdFftSetPlanDistance(plHandle, clStridesIn[dim], clStridesOut[dim])) - - float scale = dft_scale ? 1.0f / (dft_rows ? dft_size.width : dft_size.area()) : 1.0f; - CLAMDDFT_Assert(clAmdFftSetPlanScale(plHandle, dft_inverse ? CLFFT_BACKWARD : CLFFT_FORWARD, scale)) - - // ready to bake - cl_command_queue queue = (cl_command_queue)ocl::Queue::getDefault().ptr(); - CLAMDDFT_Assert(clAmdFftBakePlan(plHandle, 1, &queue, NULL, NULL)) - } - - ~FftPlan() - { -// clAmdFftDestroyPlan(&plHandle); - } - - friend class PlanCache; - - private: - Size dft_size; - int src_step, dst_step; - bool doubleFP; - bool inplace; - int flags; - FftType fftType; - - cl_context context; - clAmdFftPlanHandle plHandle; - }; - -public: - static PlanCache & getInstance() - { - static PlanCache planCache; - return planCache; - } - - clAmdFftPlanHandle getPlanHandle(const Size & dft_size, int src_step, int dst_step, bool doubleFP, - bool inplace, int flags, FftType fftType) - { - cl_context currentContext = (cl_context)ocl::Context::getDefault().ptr(); - - for (size_t i = 0, size = planStorage.size(); i < size; ++i) - { - const FftPlan * const plan = planStorage[i]; - - if (plan->dft_size == dft_size && - plan->flags == flags && - plan->src_step == src_step && - plan->dst_step == dst_step && - plan->doubleFP == doubleFP && - plan->fftType == fftType && - plan->inplace == inplace) - { - if (plan->context != currentContext) - { - planStorage.erase(planStorage.begin() + i); - break; - } - - return plan->plHandle; - } - } - - // no baked plan is found, so let's create a new one - FftPlan * newPlan = new FftPlan(dft_size, src_step, dst_step, doubleFP, inplace, flags, fftType); - planStorage.push_back(newPlan); - - return newPlan->plHandle; - } - - ~PlanCache() - { - for (std::vector::iterator i = planStorage.begin(), end = planStorage.end(); i != end; ++i) - delete (*i); - planStorage.clear(); - } - -protected: - PlanCache() : - planStorage() - { - } - - std::vector planStorage; -}; - -extern "C" { - -static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p) -{ - UMatData * u = (UMatData *)p; - - if( u && CV_XADD(&u->urefcount, -1) == 1 ) - u->currAllocator->deallocate(u); - u = 0; - - clReleaseEvent(e), e = 0; -} - -} - -static bool ocl_dft_amdfft(InputArray _src, OutputArray _dst, int flags) -{ - int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - Size ssize = _src.size(); - - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; - if ( (!doubleSupport && depth == CV_64F) || - !(type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2) || - _src.offset() != 0) - return false; - - // if is not a multiplication of prime numbers { 2, 3, 5 } - if (ssize.area() != getOptimalDFTSize(ssize.area())) - return false; - - int dst_complex_input = cn == 2 ? 1 : 0; - bool dft_inverse = (flags & DFT_INVERSE) != 0 ? 1 : 0; - int dft_complex_output = (flags & DFT_COMPLEX_OUTPUT) != 0; - bool dft_real_output = (flags & DFT_REAL_OUTPUT) != 0; - - CV_Assert(dft_complex_output + dft_real_output < 2); - FftType fftType = (FftType)(dst_complex_input << 0 | dft_complex_output << 1); - - switch (fftType) - { - case C2C: - _dst.create(ssize.height, ssize.width, CV_MAKE_TYPE(depth, 2)); - break; - case R2C: // TODO implement it if possible - case C2R: // TODO implement it if possible - case R2R: // AMD Fft does not support this type - default: - return false; - } - - UMat src = _src.getUMat(), dst = _dst.getUMat(); - bool inplace = src.u == dst.u; - - clAmdFftPlanHandle plHandle = PlanCache::getInstance(). - getPlanHandle(ssize, (int)src.step, (int)dst.step, - depth == CV_64F, inplace, flags, fftType); - - // get the bufferSize - size_t bufferSize = 0; - CLAMDDFT_Assert(clAmdFftGetTmpBufSize(plHandle, &bufferSize)) - UMat tmpBuffer(1, (int)bufferSize, CV_8UC1); - - cl_mem srcarg = (cl_mem)src.handle(ACCESS_READ); - cl_mem dstarg = (cl_mem)dst.handle(ACCESS_RW); - - cl_command_queue queue = (cl_command_queue)ocl::Queue::getDefault().ptr(); - cl_event e = 0; - - CLAMDDFT_Assert(clAmdFftEnqueueTransform(plHandle, dft_inverse ? CLFFT_BACKWARD : CLFFT_FORWARD, - 1, &queue, 0, NULL, &e, - &srcarg, &dstarg, (cl_mem)tmpBuffer.handle(ACCESS_RW))) - - tmpBuffer.addref(); - clSetEventCallback(e, CL_COMPLETE, oclCleanupCallback, tmpBuffer.u); - - return true; -} - -#undef DFT_ASSERT - -} - -#endif // HAVE_CLAMDFFT +#ifdef HAVE_OPENCL namespace cv { -#ifdef HAVE_OPENCL - enum FftType { R2R = 0, @@ -2038,7 +1798,7 @@ static void ocl_getRadixes(int cols, std::vector& radixes, std::vector { int factors[34]; int nf = DFTFactorize(cols, factors); - + int n = 1; int factor_index = 0; min_radix = INT_MAX; @@ -2118,7 +1878,7 @@ struct OCL_FftPlan ocl_getRadixes(dft_size, radixes, blocks, min_radix); thread_count = dft_size / min_radix; - if (thread_count > ocl::Device::getDefault().maxWorkGroupSize()) + if (thread_count > (int) ocl::Device::getDefault().maxWorkGroupSize()) { status = false; return; @@ -2141,13 +1901,13 @@ struct OCL_FftPlan Mat tw(1, twiddle_size, CV_32FC2); float* ptr = tw.ptr(); int ptr_index = 0; - + n = 1; for (size_t i=0; i 0; - if ( (!doubleSupport && depth == CV_64F) || - !(type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2)) + if ( !(type == CV_32FC1 || type == CV_32FC2) ) return false; // if is not a multiplication of prime numbers { 2, 3, 5 } @@ -2325,7 +2083,7 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro if (fftType == C2C || fftType == R2C) { // complex output - _dst.create(src.size(), CV_32FC2); + _dst.create(src.size(), CV_32FC2); output = _dst.getUMat(); } else @@ -2381,7 +2139,7 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro int nonzero_cols = src.cols/2 + 1; if (!ocl_dft_C2C_cols(src, output, nonzero_cols, flags, fftType)) return false; - + if (!ocl_dft_C2C_rows(output, _dst, nonzero_rows, flags, fftType)) return false; } @@ -2390,11 +2148,248 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro return true; } +} // namespace cv; + #endif -} // namespace cv; +#ifdef HAVE_CLAMDFFT + +namespace cv { + +#define CLAMDDFT_Assert(func) \ + { \ + clAmdFftStatus s = (func); \ + CV_Assert(s == CLFFT_SUCCESS); \ + } + +class PlanCache +{ + struct FftPlan + { + FftPlan(const Size & _dft_size, int _src_step, int _dst_step, bool _doubleFP, bool _inplace, int _flags, FftType _fftType) : + dft_size(_dft_size), src_step(_src_step), dst_step(_dst_step), + doubleFP(_doubleFP), inplace(_inplace), flags(_flags), fftType(_fftType), + context((cl_context)ocl::Context::getDefault().ptr()), plHandle(0) + { + bool dft_inverse = (flags & DFT_INVERSE) != 0; + bool dft_scale = (flags & DFT_SCALE) != 0; + bool dft_rows = (flags & DFT_ROWS) != 0; + clAmdFftLayout inLayout = CLFFT_REAL, outLayout = CLFFT_REAL; + clAmdFftDim dim = dft_size.height == 1 || dft_rows ? CLFFT_1D : CLFFT_2D; + + size_t batchSize = dft_rows ? dft_size.height : 1; + size_t clLengthsIn[3] = { dft_size.width, dft_rows ? 1 : dft_size.height, 1 }; + size_t clStridesIn[3] = { 1, 1, 1 }; + size_t clStridesOut[3] = { 1, 1, 1 }; + int elemSize = doubleFP ? sizeof(double) : sizeof(float); + switch (fftType) + { + case C2C: + inLayout = CLFFT_COMPLEX_INTERLEAVED; + outLayout = CLFFT_COMPLEX_INTERLEAVED; + clStridesIn[1] = src_step / (elemSize << 1); + clStridesOut[1] = dst_step / (elemSize << 1); + break; + case R2C: + inLayout = CLFFT_REAL; + outLayout = CLFFT_HERMITIAN_INTERLEAVED; + clStridesIn[1] = src_step / elemSize; + clStridesOut[1] = dst_step / (elemSize << 1); + break; + case C2R: + inLayout = CLFFT_HERMITIAN_INTERLEAVED; + outLayout = CLFFT_REAL; + clStridesIn[1] = src_step / (elemSize << 1); + clStridesOut[1] = dst_step / elemSize; + break; + case R2R: + default: + CV_Error(Error::StsNotImplemented, "AMD Fft does not support this type"); + break; + } + + clStridesIn[2] = dft_rows ? clStridesIn[1] : dft_size.width * clStridesIn[1]; + clStridesOut[2] = dft_rows ? clStridesOut[1] : dft_size.width * clStridesOut[1]; + + CLAMDDFT_Assert(clAmdFftCreateDefaultPlan(&plHandle, (cl_context)ocl::Context::getDefault().ptr(), dim, clLengthsIn)) + + // setting plan properties + CLAMDDFT_Assert(clAmdFftSetPlanPrecision(plHandle, doubleFP ? CLFFT_DOUBLE : CLFFT_SINGLE)); + CLAMDDFT_Assert(clAmdFftSetResultLocation(plHandle, inplace ? CLFFT_INPLACE : CLFFT_OUTOFPLACE)) + CLAMDDFT_Assert(clAmdFftSetLayout(plHandle, inLayout, outLayout)) + CLAMDDFT_Assert(clAmdFftSetPlanBatchSize(plHandle, batchSize)) + CLAMDDFT_Assert(clAmdFftSetPlanInStride(plHandle, dim, clStridesIn)) + CLAMDDFT_Assert(clAmdFftSetPlanOutStride(plHandle, dim, clStridesOut)) + CLAMDDFT_Assert(clAmdFftSetPlanDistance(plHandle, clStridesIn[dim], clStridesOut[dim])) + + float scale = dft_scale ? 1.0f / (dft_rows ? dft_size.width : dft_size.area()) : 1.0f; + CLAMDDFT_Assert(clAmdFftSetPlanScale(plHandle, dft_inverse ? CLFFT_BACKWARD : CLFFT_FORWARD, scale)) + + // ready to bake + cl_command_queue queue = (cl_command_queue)ocl::Queue::getDefault().ptr(); + CLAMDDFT_Assert(clAmdFftBakePlan(plHandle, 1, &queue, NULL, NULL)) + } + + ~FftPlan() + { +// clAmdFftDestroyPlan(&plHandle); + } + + friend class PlanCache; + + private: + Size dft_size; + int src_step, dst_step; + bool doubleFP; + bool inplace; + int flags; + FftType fftType; + + cl_context context; + clAmdFftPlanHandle plHandle; + }; + +public: + static PlanCache & getInstance() + { + static PlanCache planCache; + return planCache; + } + + clAmdFftPlanHandle getPlanHandle(const Size & dft_size, int src_step, int dst_step, bool doubleFP, + bool inplace, int flags, FftType fftType) + { + cl_context currentContext = (cl_context)ocl::Context::getDefault().ptr(); + + for (size_t i = 0, size = planStorage.size(); i < size; ++i) + { + const FftPlan * const plan = planStorage[i]; + + if (plan->dft_size == dft_size && + plan->flags == flags && + plan->src_step == src_step && + plan->dst_step == dst_step && + plan->doubleFP == doubleFP && + plan->fftType == fftType && + plan->inplace == inplace) + { + if (plan->context != currentContext) + { + planStorage.erase(planStorage.begin() + i); + break; + } + + return plan->plHandle; + } + } + + // no baked plan is found, so let's create a new one + FftPlan * newPlan = new FftPlan(dft_size, src_step, dst_step, doubleFP, inplace, flags, fftType); + planStorage.push_back(newPlan); + + return newPlan->plHandle; + } + + ~PlanCache() + { + for (std::vector::iterator i = planStorage.begin(), end = planStorage.end(); i != end; ++i) + delete (*i); + planStorage.clear(); + } + +protected: + PlanCache() : + planStorage() + { + } + + std::vector planStorage; +}; + +extern "C" { + +static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p) +{ + UMatData * u = (UMatData *)p; + + if( u && CV_XADD(&u->urefcount, -1) == 1 ) + u->currAllocator->deallocate(u); + u = 0; + + clReleaseEvent(e), e = 0; +} + +} + +static bool ocl_dft_amdfft(InputArray _src, OutputArray _dst, int flags) +{ + int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + Size ssize = _src.size(); + + bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + if ( (!doubleSupport && depth == CV_64F) || + !(type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2) || + _src.offset() != 0) + return false; + + // if is not a multiplication of prime numbers { 2, 3, 5 } + if (ssize.area() != getOptimalDFTSize(ssize.area())) + return false; + + int dst_complex_input = cn == 2 ? 1 : 0; + bool dft_inverse = (flags & DFT_INVERSE) != 0 ? 1 : 0; + int dft_complex_output = (flags & DFT_COMPLEX_OUTPUT) != 0; + bool dft_real_output = (flags & DFT_REAL_OUTPUT) != 0; + + CV_Assert(dft_complex_output + dft_real_output < 2); + FftType fftType = (FftType)(dst_complex_input << 0 | dft_complex_output << 1); + + switch (fftType) + { + case C2C: + _dst.create(ssize.height, ssize.width, CV_MAKE_TYPE(depth, 2)); + break; + case R2C: // TODO implement it if possible + case C2R: // TODO implement it if possible + case R2R: // AMD Fft does not support this type + default: + return false; + } + + UMat src = _src.getUMat(), dst = _dst.getUMat(); + bool inplace = src.u == dst.u; + + clAmdFftPlanHandle plHandle = PlanCache::getInstance(). + getPlanHandle(ssize, (int)src.step, (int)dst.step, + depth == CV_64F, inplace, flags, fftType); + + // get the bufferSize + size_t bufferSize = 0; + CLAMDDFT_Assert(clAmdFftGetTmpBufSize(plHandle, &bufferSize)) + UMat tmpBuffer(1, (int)bufferSize, CV_8UC1); + + cl_mem srcarg = (cl_mem)src.handle(ACCESS_READ); + cl_mem dstarg = (cl_mem)dst.handle(ACCESS_RW); + + cl_command_queue queue = (cl_command_queue)ocl::Queue::getDefault().ptr(); + cl_event e = 0; + + CLAMDDFT_Assert(clAmdFftEnqueueTransform(plHandle, dft_inverse ? CLFFT_BACKWARD : CLFFT_FORWARD, + 1, &queue, 0, NULL, &e, + &srcarg, &dstarg, (cl_mem)tmpBuffer.handle(ACCESS_RW))) + + tmpBuffer.addref(); + clSetEventCallback(e, CL_COMPLETE, oclCleanupCallback, tmpBuffer.u); + return true; +} + +#undef DFT_ASSERT + +} + +#endif // HAVE_CLAMDFFT void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows ) { diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index a2110f6cc2..32db8c91b4 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -3002,8 +3002,7 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[], sync ? 0 : &p->e); if( sync || retval != CL_SUCCESS ) { - int a = clFinish(qq); - CV_OclDbgAssert(a == CL_SUCCESS); + CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS); p->cleanupUMats(); } else @@ -3899,9 +3898,8 @@ public: if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() ) { AlignedDataPtr alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT); - int a = clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0, - u->size, alignedPtr.getAlignedPtr(), 0, 0, 0); - CV_Assert( a == CL_SUCCESS ); + CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0, + u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS ); u->markHostCopyObsolete(false); } } diff --git a/modules/core/src/opencl/fft.cl b/modules/core/src/opencl/fft.cl index b8d2c6716d..1cb2278c0d 100644 --- a/modules/core/src/opencl/fft.cl +++ b/modules/core/src/opencl/fft.cl @@ -6,36 +6,36 @@ #define fft5_5 0.363271264002f __attribute__((always_inline)) -float2 mul_float2(float2 a, float2 b) { - return (float2)(fma(a.x, b.x, -a.y * b.y), fma(a.x, b.y, a.y * b.x)); +float2 mul_float2(float2 a, float2 b) { + return (float2)(fma(a.x, b.x, -a.y * b.y), fma(a.x, b.y, a.y * b.x)); } __attribute__((always_inline)) -float2 twiddle(float2 a) { - return (float2)(a.y, -a.x); +float2 twiddle(float2 a) { + return (float2)(a.y, -a.x); } __attribute__((always_inline)) -void butterfly2(float2 a0, float2 a1, __local float2* smem, __global const float2* twiddles, - const int x, const int block_size) -{ +void butterfly2(float2 a0, float2 a1, __local float2* smem, __global const float2* twiddles, + const int x, const int block_size) +{ const int k = x & (block_size - 1); a1 = mul_float2(twiddles[k], a1); const int dst_ind = (x << 1) - k; - + smem[dst_ind] = a0 + a1; smem[dst_ind+block_size] = a0 - a1; } __attribute__((always_inline)) -void butterfly4(float2 a0, float2 a1, float2 a2, float2 a3, __local float2* smem, __global const float2* twiddles, - const int x, const int block_size) +void butterfly4(float2 a0, float2 a1, float2 a2, float2 a3, __local float2* smem, __global const float2* twiddles, + const int x, const int block_size) { const int k = x & (block_size - 1); a1 = mul_float2(twiddles[k], a1); a2 = mul_float2(twiddles[k + block_size], a2); a3 = mul_float2(twiddles[k + 2*block_size], a3); - + const int dst_ind = ((x - k) << 2) + k; float2 b0 = a0 + a2; @@ -50,9 +50,9 @@ void butterfly4(float2 a0, float2 a1, float2 a2, float2 a3, __local float2* smem } __attribute__((always_inline)) -void butterfly3(float2 a0, float2 a1, float2 a2, __local float2* smem, __global const float2* twiddles, - const int x, const int block_size) -{ +void butterfly3(float2 a0, float2 a1, float2 a2, __local float2* smem, __global const float2* twiddles, + const int x, const int block_size) +{ const int k = x % block_size; a1 = mul_float2(twiddles[k], a1); a2 = mul_float2(twiddles[k+block_size], a2); @@ -69,8 +69,8 @@ void butterfly3(float2 a0, float2 a1, float2 a2, __local float2* smem, __global __attribute__((always_inline)) void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local float2* smem, __global const float2* twiddles, - const int x, const int block_size) -{ + const int x, const int block_size) +{ const int k = x % block_size; a1 = mul_float2(twiddles[k], a1); a2 = mul_float2(twiddles[k + block_size], a2); @@ -95,7 +95,7 @@ void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local f a4 = fft5_3 * (float2)(-a1.y - a3.y, a1.x + a3.x); b5 = (float2)(a4.x - fft5_5 * a1.y, a4.y + fft5_5 * a1.x); - a4.x += fft5_4 * a3.y; + a4.x += fft5_4 * a3.y; a4.y -= fft5_4 * a3.x; a1 = b0 + b1; @@ -109,7 +109,7 @@ void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local f } __attribute__((always_inline)) -void fft_radix2(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) +void fft_radix2(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) { float2 a0, a1; @@ -122,13 +122,13 @@ void fft_radix2(__local float2* smem, __global const float2* twiddles, const int barrier(CLK_LOCAL_MEM_FENCE); if (x < t) - butterfly2(a0, a1, smem, twiddles, x, block_size); + butterfly2(a0, a1, smem, twiddles, x, block_size); barrier(CLK_LOCAL_MEM_FENCE); } __attribute__((always_inline)) -void fft_radix2_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix2_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int x2 = x1 + t/2; float2 a0, a1, a2, a3; @@ -151,7 +151,7 @@ void fft_radix2_B2(__local float2* smem, __global const float2* twiddles, const } __attribute__((always_inline)) -void fft_radix2_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix2_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int x2 = x1 + t/3; const int x3 = x1 + 2*t/3; @@ -177,7 +177,7 @@ void fft_radix2_B3(__local float2* smem, __global const float2* twiddles, const } __attribute__((always_inline)) -void fft_radix2_B4(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix2_B4(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int thread_block = t/4; const int x2 = x1 + thread_block; @@ -207,7 +207,7 @@ void fft_radix2_B4(__local float2* smem, __global const float2* twiddles, const } __attribute__((always_inline)) -void fft_radix2_B5(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix2_B5(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int thread_block = t/5; const int x2 = x1 + thread_block; @@ -326,7 +326,7 @@ void fft_radix8(__local float2* smem, __global const float2* twiddles, const int a7 = mul_float2(twiddles[k+6*block_size],smem[x+7*t]); float2 b0, b1, b6, b7; - + b0 = a0 + a4; a4 = a0 - a4; b1 = a1 + a5; @@ -335,7 +335,7 @@ void fft_radix8(__local float2* smem, __global const float2* twiddles, const int b6 = twiddle(a2 - a6); a2 = a2 + a6; b7 = a3 - a7; - b7 = (float2)(SQRT_2) * (float2)(-b7.x + b7.y, -b7.x - b7.y); + b7 = (float2)(SQRT_2) * (float2)(-b7.x + b7.y, -b7.x - b7.y); a3 = a3 + a7; a0 = b0 + a2; @@ -571,10 +571,15 @@ __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, } else { + // fill with zero other rows +#ifdef COMPLEX_OUTPUT __global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, dst_offset)); +#else + __global float* dst = (__global float*)(dst_ptr + mad24(y, dst_step, dst_offset)); +#endif #pragma unroll for (int i=x; i(df) << std::endl; + OCL_OFF(cv::dft(src, dst, dft_flags | cv::DFT_COMPLEX_OUTPUT)); + OCL_ON(cv::dft(usrc, udst, dft_flags | cv::DFT_COMPLEX_OUTPUT)); double eps = src.size().area() * 1e-4; EXPECT_MAT_NEAR(dst, udst, eps); @@ -185,15 +150,15 @@ OCL_TEST_P(MulSpectrums, Mat) OCL_INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MulSpectrums, testing::Combine(Bool(), Bool())); -OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(10, 10), cv::Size(36, 36), cv::Size(512, 1), cv::Size(1280, 768)), - Values((OCL_FFT_TYPE) R2C, (OCL_FFT_TYPE) C2C, (OCL_FFT_TYPE) R2R, (OCL_FFT_TYPE) C2R), - Bool(), // DFT_INVERSE +OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(2, 3), cv::Size(5, 4), cv::Size(25, 20), + cv::Size(512, 1), cv::Size(1024, 768)), + Values(CV_32F, CV_64F), + Bool(), // inplace Bool(), // DFT_ROWS Bool(), // DFT_SCALE - Bool() // hint - ) + Bool()) // DFT_INVERSE ); } } // namespace cvtest::ocl -#endif // HAVE_OPENCL \ No newline at end of file +#endif // HAVE_OPENCL From 37d01e2d27b6ed6eac7f9f179796a7574d43ae71 Mon Sep 17 00:00:00 2001 From: Alexander Karsakov Date: Fri, 25 Jul 2014 13:11:35 +0400 Subject: [PATCH 12/13] Added license header, using cv::Ptr, small fixes. --- modules/core/include/opencv2/core/cvdef.h | 2 +- modules/core/src/dxt.cpp | 195 +++++++++++----------- modules/core/src/opencl/fft.cl | 47 +++--- 3 files changed, 125 insertions(+), 119 deletions(-) diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h index 765c54cbe1..1f64cd2ace 100644 --- a/modules/core/include/opencv2/core/cvdef.h +++ b/modules/core/include/opencv2/core/cvdef.h @@ -244,7 +244,7 @@ typedef signed char schar; /* fundamental constants */ #define CV_PI 3.1415926535897932384626433832795 -#define CV_TWO_PI 6.283185307179586476925286766559 +#define CV_2PI 6.283185307179586476925286766559 #define CV_LOG2 0.69314718055994530941723212145818 /****************************************************************************************\ diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp index cb0b118bca..b57e4e8cc0 100644 --- a/modules/core/src/dxt.cpp +++ b/modules/core/src/dxt.cpp @@ -1788,89 +1788,23 @@ namespace cv enum FftType { - R2R = 0, - C2R = 1, - R2C = 2, - C2C = 3 + R2R = 0, // real to CCS in case forward transform, CCS to real otherwise + C2R = 1, // complex to real in case inverse transform + R2C = 2, // real to complex in case forward transform + C2C = 3 // complex to complex }; -static void ocl_getRadixes(int cols, std::vector& radixes, std::vector& blocks, int& min_radix) -{ - int factors[34]; - int nf = DFTFactorize(cols, factors); - - int n = 1; - int factor_index = 0; - min_radix = INT_MAX; - - // 2^n transforms - if ((factors[factor_index] & 1) == 0) - { - for( ; n < factors[factor_index];) - { - int radix = 2, block = 1; - if (8*n <= factors[0]) - radix = 8; - else if (4*n <= factors[0]) - { - radix = 4; - if (cols % 12 == 0) - block = 3; - else if (cols % 8 == 0) - block = 2; - } - else - { - if (cols % 10 == 0) - block = 5; - else if (cols % 8 == 0) - block = 4; - else if (cols % 6 == 0) - block = 3; - else if (cols % 4 == 0) - block = 2; - } - - radixes.push_back(radix); - blocks.push_back(block); - min_radix = min(min_radix, block*radix); - n *= radix; - } - factor_index++; - } - - // all the other transforms - for( ; factor_index < nf; factor_index++) - { - int radix = factors[factor_index], block = 1; - if (radix == 3) - { - if (cols % 12 == 0) - block = 4; - else if (cols % 9 == 0) - block = 3; - else if (cols % 6 == 0) - block = 2; - } - else if (radix == 5) - { - if (cols % 10 == 0) - block = 2; - } - radixes.push_back(radix); - blocks.push_back(block); - min_radix = min(min_radix, block*radix); - } -} - struct OCL_FftPlan { +private: UMat twiddles; String buildOptions; int thread_count; + bool status; +public: int dft_size; - bool status; + OCL_FftPlan(int _size): dft_size(_size), status(true) { int min_radix; @@ -1910,7 +1844,7 @@ struct OCL_FftPlan for (int j=1; j& radixes, std::vector& blocks, int& min_radix) + { + int factors[34]; + int nf = DFTFactorize(cols, factors); + + int n = 1; + int factor_index = 0; + min_radix = INT_MAX; + + // 2^n transforms + if ((factors[factor_index] & 1) == 0) + { + for( ; n < factors[factor_index];) + { + int radix = 2, block = 1; + if (8*n <= factors[0]) + radix = 8; + else if (4*n <= factors[0]) + { + radix = 4; + if (cols % 12 == 0) + block = 3; + else if (cols % 8 == 0) + block = 2; + } + else + { + if (cols % 10 == 0) + block = 5; + else if (cols % 8 == 0) + block = 4; + else if (cols % 6 == 0) + block = 3; + else if (cols % 4 == 0) + block = 2; + } + + radixes.push_back(radix); + blocks.push_back(block); + min_radix = min(min_radix, block*radix); + n *= radix; + } + factor_index++; + } + + // all the other transforms + for( ; factor_index < nf; factor_index++) + { + int radix = factors[factor_index], block = 1; + if (radix == 3) + { + if (cols % 12 == 0) + block = 4; + else if (cols % 9 == 0) + block = 3; + else if (cols % 6 == 0) + block = 2; + } + else if (radix == 5) + { + if (cols % 10 == 0) + block = 2; + } + radixes.push_back(radix); + blocks.push_back(block); + min_radix = min(min_radix, block*radix); + } + } }; class OCL_FftPlanCache @@ -1993,27 +1997,24 @@ public: return planCache; } - OCL_FftPlan* getFftPlan(int dft_size) + Ptr getFftPlan(int dft_size) { for (size_t i = 0, size = planStorage.size(); i < size; ++i) { - OCL_FftPlan * const plan = planStorage[i]; - + Ptr plan = planStorage[i]; if (plan->dft_size == dft_size) { return plan; } } - OCL_FftPlan * newPlan = new OCL_FftPlan(dft_size); + Ptr newPlan = Ptr(new OCL_FftPlan(dft_size)); planStorage.push_back(newPlan); return newPlan; } ~OCL_FftPlanCache() { - for (std::vector::iterator i = planStorage.begin(), end = planStorage.end(); i != end; ++i) - delete (*i); planStorage.clear(); } @@ -2023,18 +2024,18 @@ protected: { } - std::vector planStorage; + std::vector > planStorage; }; -static bool ocl_dft_C2C_rows(InputArray _src, OutputArray _dst, int nonzero_rows, int flags, int fftType) +static bool ocl_dft_rows(InputArray _src, OutputArray _dst, int nonzero_rows, int flags, int fftType) { - const OCL_FftPlan* plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.cols()); + Ptr plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.cols()); return plan->enqueueTransform(_src, _dst, nonzero_rows, flags, fftType, true); } -static bool ocl_dft_C2C_cols(InputArray _src, OutputArray _dst, int nonzero_cols, int flags, int fftType) +static bool ocl_dft_cols(InputArray _src, OutputArray _dst, int nonzero_cols, int flags, int fftType) { - const OCL_FftPlan* plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.rows()); + Ptr plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.rows()); return plan->enqueueTransform(_src, _dst, nonzero_cols, flags, fftType, false); } @@ -2103,13 +2104,13 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro if (!inv) { - if (!ocl_dft_C2C_rows(src, output, nonzero_rows, flags, fftType)) + if (!ocl_dft_rows(src, output, nonzero_rows, flags, fftType)) return false; if (!is1d) { int nonzero_cols = fftType == R2R ? output.cols/2 + 1 : output.cols; - if (!ocl_dft_C2C_cols(output, _dst, nonzero_cols, flags, fftType)) + if (!ocl_dft_cols(output, _dst, nonzero_cols, flags, fftType)) return false; } } @@ -2118,12 +2119,12 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro if (fftType == C2C) { // complex output - if (!ocl_dft_C2C_rows(src, output, nonzero_rows, flags, fftType)) + if (!ocl_dft_rows(src, output, nonzero_rows, flags, fftType)) return false; if (!is1d) { - if (!ocl_dft_C2C_cols(output, output, output.cols, flags, fftType)) + if (!ocl_dft_cols(output, output, output.cols, flags, fftType)) return false; } } @@ -2131,16 +2132,16 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro { if (is1d) { - if (!ocl_dft_C2C_rows(src, output, nonzero_rows, flags, fftType)) + if (!ocl_dft_rows(src, output, nonzero_rows, flags, fftType)) return false; } else { int nonzero_cols = src.cols/2 + 1; - if (!ocl_dft_C2C_cols(src, output, nonzero_cols, flags, fftType)) + if (!ocl_dft_cols(src, output, nonzero_cols, flags, fftType)) return false; - if (!ocl_dft_C2C_rows(output, _dst, nonzero_rows, flags, fftType)) + if (!ocl_dft_rows(output, _dst, nonzero_rows, flags, fftType)) return false; } } @@ -2286,7 +2287,7 @@ public: } // no baked plan is found, so let's create a new one - FftPlan * newPlan = new FftPlan(dft_size, src_step, dst_step, doubleFP, inplace, flags, fftType); + Ptr newPlan = Ptr(new FftPlan(dft_size, src_step, dst_step, doubleFP, inplace, flags, fftType)); planStorage.push_back(newPlan); return newPlan->plHandle; @@ -2294,8 +2295,6 @@ public: ~PlanCache() { - for (std::vector::iterator i = planStorage.begin(), end = planStorage.end(); i != end; ++i) - delete (*i); planStorage.clear(); } @@ -2305,7 +2304,7 @@ protected: { } - std::vector planStorage; + std::vector > planStorage; }; extern "C" { diff --git a/modules/core/src/opencl/fft.cl b/modules/core/src/opencl/fft.cl index 1cb2278c0d..1268c4d6e4 100644 --- a/modules/core/src/opencl/fft.cl +++ b/modules/core/src/opencl/fft.cl @@ -1,3 +1,10 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2014, Itseez, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. + #define SQRT_2 0.707106781188f #define sin_120 0.866025403784f #define fft5_2 0.559016994374f @@ -509,9 +516,9 @@ void fft_radix5_B2(__local float2* smem, __global const float2* twiddles, const } #ifdef DFT_SCALE -#define VAL(x, scale) x*scale +#define SCALE_VAL(x, scale) x*scale #else -#define VAL(x, scale) x +#define SCALE_VAL(x, scale) x #endif __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols, @@ -558,15 +565,15 @@ __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, __global float2* dst = (__global float2*)(dst_ptr + mad24(y, dst_step, dst_offset)); #pragma unroll for (int i=x; i Date: Sun, 27 Jul 2014 13:31:46 +0400 Subject: [PATCH 13/13] Using std::map in PlanCache --- modules/core/src/dxt.cpp | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp index b57e4e8cc0..69cac1a0fc 100644 --- a/modules/core/src/dxt.cpp +++ b/modules/core/src/dxt.cpp @@ -43,6 +43,7 @@ #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp" #include "opencv2/core/opencl/runtime/opencl_core.hpp" #include "opencl_kernels.hpp" +#include namespace cv { @@ -1801,10 +1802,9 @@ private: String buildOptions; int thread_count; bool status; - -public: int dft_size; +public: OCL_FftPlan(int _size): dft_size(_size), status(true) { int min_radix; @@ -1999,18 +1999,17 @@ public: Ptr getFftPlan(int dft_size) { - for (size_t i = 0, size = planStorage.size(); i < size; ++i) + std::map >::iterator f = planStorage.find(dft_size); + if (f != planStorage.end()) { - Ptr plan = planStorage[i]; - if (plan->dft_size == dft_size) - { - return plan; - } + return f->second; + } + else + { + Ptr newPlan = Ptr(new OCL_FftPlan(dft_size)); + planStorage[dft_size] = newPlan; + return newPlan; } - - Ptr newPlan = Ptr(new OCL_FftPlan(dft_size)); - planStorage.push_back(newPlan); - return newPlan; } ~OCL_FftPlanCache() @@ -2023,8 +2022,7 @@ protected: planStorage() { } - - std::vector > planStorage; + std::map > planStorage; }; static bool ocl_dft_rows(InputArray _src, OutputArray _dst, int nonzero_rows, int flags, int fftType)