used new device layer for cv::gpu::mulSpectrums

pull/1540/head
Vladislav Vinogradov 12 years ago
parent 3f62e78592
commit e820c5c65f
  1. modules/cudaarithm/src/arithm.cpp (89 lines changed)
  2. modules/cudaarithm/src/cuda/mul_spectrums.cu (174 lines changed)

@@ -292,95 +292,6 @@ void cv::cuda::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray
 #endif
 }

-//////////////////////////////////////////////////////////////////////////////
-// mulSpectrums
-
-#ifdef HAVE_CUFFT
-
-namespace cv { namespace cuda { namespace device
-{
-    void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream);
-    void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream);
-}}}
-
-#endif
-
-void cv::cuda::mulSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, bool conjB, Stream& stream)
-{
-#ifndef HAVE_CUFFT
-    (void) _src1;
-    (void) _src2;
-    (void) _dst;
-    (void) flags;
-    (void) conjB;
-    (void) stream;
-    throw_no_cuda();
-#else
-    (void) flags;
-
-    typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, PtrStepSz<cufftComplex>, cudaStream_t stream);
-    static Caller callers[] = { device::mulSpectrums, device::mulSpectrums_CONJ };
-
-    GpuMat src1 = _src1.getGpuMat();
-    GpuMat src2 = _src2.getGpuMat();
-
-    CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2 );
-    CV_Assert( src1.size() == src2.size() );
-
-    _dst.create(src1.size(), CV_32FC2);
-    GpuMat dst = _dst.getGpuMat();
-
-    Caller caller = callers[(int)conjB];
-    caller(src1, src2, dst, StreamAccessor::getStream(stream));
-#endif
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// mulAndScaleSpectrums
-
-#ifdef HAVE_CUFFT
-
-namespace cv { namespace cuda { namespace device
-{
-    void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream);
-    void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream);
-}}}
-
-#endif
-
-void cv::cuda::mulAndScaleSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, float scale, bool conjB, Stream& stream)
-{
-#ifndef HAVE_CUFFT
-    (void) _src1;
-    (void) _src2;
-    (void) _dst;
-    (void) flags;
-    (void) scale;
-    (void) conjB;
-    (void) stream;
-    throw_no_cuda();
-#else
-    (void) flags;
-
-    typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, float scale, PtrStepSz<cufftComplex>, cudaStream_t stream);
-    static Caller callers[] = { device::mulAndScaleSpectrums, device::mulAndScaleSpectrums_CONJ };
-
-    GpuMat src1 = _src1.getGpuMat();
-    GpuMat src2 = _src2.getGpuMat();
-
-    CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2 );
-    CV_Assert( src1.size() == src2.size() );
-
-    _dst.create(src1.size(), CV_32FC2);
-    GpuMat dst = _dst.getGpuMat();
-
-    Caller caller = callers[(int)conjB];
-    caller(src1, src2, scale, dst, StreamAccessor::getStream(stream));
-#endif
-}
-
 //////////////////////////////////////////////////////////////////////////////
 // dft
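
The wrappers deleted above dispatched through a function-pointer table (callers[(int)conjB]) into hand-written kernels exported from mul_spectrums.cu; the rewrite below collapses both layers into functors handed to cv::cudev's gridTransformBinary. For orientation, an elementwise binary grid transform boils down to a kernel like the following sketch (a hypothetical illustration, not the actual cv::cudev implementation, which layers launch tuning and vectorized access on top):

    // Hypothetical sketch of a grid-level binary transform: one thread per
    // element, with the functor supplying the per-element math. Not the
    // actual cv::cudev code.
    #include "opencv2/core/cuda/common.hpp"

    template <class Op>
    __global__ void transformBinary(const cv::cuda::PtrStepSz<float2> a,
                                    const cv::cuda::PtrStep<float2> b,
                                    cv::cuda::PtrStep<float2> dst, Op op)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        // Guard against the partial blocks at the right and bottom edges.
        if (x < a.cols && y < a.rows)
            dst.ptr(y)[x] = op(a.ptr(y)[x], b.ptr(y)[x]);
    }

The comlex_mul family of functors in the diff below plugs into the op slot, so the four removed kernels reduce to four small structs sharing one launch path.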

@@ -40,132 +40,126 @@
 //
 //M*/

-#if !defined CUDA_DISABLER
-
-#include "cvconfig.h"
-
-#ifdef HAVE_CUFFT
-
-#include <cufft.h>
-
-#include "opencv2/core/cuda/common.hpp"
-
-namespace cv { namespace cuda { namespace device
-{
-    //////////////////////////////////////////////////////////////////////////
-    // mulSpectrums
-
-    __global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c)
-    {
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if (x < c.cols && y < c.rows)
-        {
-            c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
-        }
-    }
-
-    void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream)
-    {
-        dim3 threads(256);
-        dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
-
-        mulSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, c);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    // mulSpectrums_CONJ
-
-    __global__ void mulSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c)
-    {
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if (x < c.cols && y < c.rows)
-        {
-            c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
-        }
-    }
-
-    void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream)
-    {
-        dim3 threads(256);
-        dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
-
-        mulSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, c);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    // mulAndScaleSpectrums
-
-    __global__ void mulAndScaleSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c)
-    {
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if (x < c.cols && y < c.rows)
-        {
-            cufftComplex v = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
-            c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
-        }
-    }
-
-    void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream)
-    {
-        dim3 threads(256);
-        dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
-
-        mulAndScaleSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, scale, c);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    // mulAndScaleSpectrums_CONJ
-
-    __global__ void mulAndScaleSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c)
-    {
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if (x < c.cols && y < c.rows)
-        {
-            cufftComplex v = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
-            c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
-        }
-    }
-
-    void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream)
-    {
-        dim3 threads(256);
-        dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
-
-        mulAndScaleSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, scale, c);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-}}} // namespace cv { namespace cuda { namespace cudev
-
-#endif // HAVE_CUFFT
-
-#endif /* CUDA_DISABLER */
+#include "opencv2/opencv_modules.hpp"
+
+#ifndef HAVE_OPENCV_CUDEV
+
+#error "opencv_cudev is required"
+
+#else
+
+#include "opencv2/cudaarithm.hpp"
+#include "opencv2/cudev.hpp"
+
+using namespace cv::cudev;
+
+//////////////////////////////////////////////////////////////////////////////
+// mulSpectrums
+
+namespace
+{
+    __device__ __forceinline__ float real(const float2& val)
+    {
+        return val.x;
+    }
+
+    __device__ __forceinline__ float imag(const float2& val)
+    {
+        return val.y;
+    }
+
+    __device__ __forceinline__ float2 cmul(const float2& a, const float2& b)
+    {
+        return make_float2((real(a) * real(b)) - (imag(a) * imag(b)),
+                           (real(a) * imag(b)) + (imag(a) * real(b)));
+    }
+
+    __device__ __forceinline__ float2 conj(const float2& a)
+    {
+        return make_float2(real(a), -imag(a));
+    }
+
+    struct comlex_mul : binary_function<float2, float2, float2>
+    {
+        __device__ __forceinline__ float2 operator ()(const float2& a, const float2& b) const
+        {
+            return cmul(a, b);
+        }
+    };
+
+    struct comlex_mul_conj : binary_function<float2, float2, float2>
+    {
+        __device__ __forceinline__ float2 operator ()(const float2& a, const float2& b) const
+        {
+            return cmul(a, conj(b));
+        }
+    };
+
+    struct comlex_mul_scale : binary_function<float2, float2, float2>
+    {
+        float scale;
+
+        __device__ __forceinline__ float2 operator ()(const float2& a, const float2& b) const
+        {
+            return scale * cmul(a, b);
+        }
+    };
+
+    struct comlex_mul_conj_scale : binary_function<float2, float2, float2>
+    {
+        float scale;
+
+        __device__ __forceinline__ float2 operator ()(const float2& a, const float2& b) const
+        {
+            return scale * cmul(a, conj(b));
+        }
+    };
+}
+
+void cv::cuda::mulSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, bool conjB, Stream& stream)
+{
+    (void) flags;
+
+    GpuMat src1 = _src1.getGpuMat();
+    GpuMat src2 = _src2.getGpuMat();
+
+    CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2 );
+    CV_Assert( src1.size() == src2.size() );
+
+    _dst.create(src1.size(), CV_32FC2);
+    GpuMat dst = _dst.getGpuMat();
+
+    if (conjB)
+        gridTransformBinary(globPtr<float2>(src1), globPtr<float2>(src2), globPtr<float2>(dst), comlex_mul_conj(), stream);
+    else
+        gridTransformBinary(globPtr<float2>(src1), globPtr<float2>(src2), globPtr<float2>(dst), comlex_mul(), stream);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// mulAndScaleSpectrums
+
+void cv::cuda::mulAndScaleSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, float scale, bool conjB, Stream& stream)
+{
+    (void) flags;
+
+    GpuMat src1 = _src1.getGpuMat();
+    GpuMat src2 = _src2.getGpuMat();
+
+    CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2);
+    CV_Assert( src1.size() == src2.size() );
+
+    _dst.create(src1.size(), CV_32FC2);
+    GpuMat dst = _dst.getGpuMat();
+
+    if (conjB)
+    {
+        comlex_mul_conj_scale op;
+        op.scale = scale;
+        gridTransformBinary(globPtr<float2>(src1), globPtr<float2>(src2), globPtr<float2>(dst), op, stream);
+    }
+    else
+    {
+        comlex_mul_scale op;
+        op.scale = scale;
+        gridTransformBinary(globPtr<float2>(src1), globPtr<float2>(src2), globPtr<float2>(dst), op, stream);
+    }
+}
+
+#endif
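
For context, a typical call site for the rewritten functions is frequency-domain cross-correlation. The helper below is an illustrative sketch, not part of this commit; it assumes only the public cudaarithm API (cv::cuda::dft, cv::cuda::mulAndScaleSpectrums) used as documented:

    #include "opencv2/cudaarithm.hpp"

    // Illustrative helper (not part of the commit): cross-correlate two
    // same-size CV_32FC1 images as dst = IDFT(DFT(a) * conj(DFT(b))) / N.
    void crossCorrelate(const cv::cuda::GpuMat& a, const cv::cuda::GpuMat& b,
                        cv::cuda::GpuMat& dst,
                        cv::cuda::Stream& stream = cv::cuda::Stream::Null())
    {
        CV_Assert( a.type() == CV_32FC1 && a.size() == b.size() );

        cv::cuda::GpuMat specA, specB, specC;
        cv::cuda::dft(a, specA, a.size(), 0, stream);  // packed CV_32FC2 spectra
        cv::cuda::dft(b, specB, b.size(), 0, stream);

        // conjB = true selects the conjugating functor added in this commit;
        // the scale folds the 1/N normalization into the spectrum product.
        cv::cuda::mulAndScaleSpectrums(specA, specB, specC, 0,
                                       1.0f / a.size().area(), true, stream);

        cv::cuda::dft(specC, dst, a.size(),
                      cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT, stream);
    }

With conjB = false the same call multiplies the spectra without conjugation, turning the correlation into a convolution.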
