diff --git a/modules/gpu/src/cuda/match_template.cu b/modules/gpu/src/cuda/match_template.cu index 7eda2c92c9..700baa673d 100644 --- a/modules/gpu/src/cuda/match_template.cu +++ b/modules/gpu/src/cuda/match_template.cu @@ -50,6 +50,7 @@ using namespace cv::gpu; namespace cv { namespace gpu { namespace imgproc { + texture imageTex_8U; texture templTex_8U; @@ -98,6 +99,54 @@ void matchTemplate_8U_SQDIFF(const DevMem2D image, const DevMem2D templ, DevMem2 } +texture<float, 2> imageTex_32F; /* 2D float texture bound to the source image by matchTemplate_32F_SQDIFF */ +texture<float, 2> templTex_32F; /* 2D float texture bound to the template by matchTemplate_32F_SQDIFF */ + + +__global__ void matchTemplateKernel_32F_SQDIFF(int w, int h, DevMem2Df result) /* One thread per result element: thread (x, y) sums the squared differences between the w-by-h template and the image window whose top-left corner is (x, y), reading both through imageTex_32F / templTex_32F. Expects a 2D launch; threads outside result.cols x result.rows do nothing. */ +{ + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; + + if (x < result.cols && y < result.rows) /* the grid is rounded up, so guard the tail */ + { + float sum = 0.f; + float delta; + + for (int i = 0; i < h; ++i) + { + for (int j = 0; j < w; ++j) + { + delta = tex2D(imageTex_32F, x + j, y + i) - + tex2D(templTex_32F, j, i); + sum += delta * delta; + } + } + + result.ptr(y)[x] = sum; + } +} + + +void matchTemplate_32F_SQDIFF(const DevMem2D image, const DevMem2D templ, DevMem2Df result) /* Host wrapper: binds image and templ to the 32F textures, launches matchTemplateKernel_32F_SQDIFF with 32x8 thread blocks over a grid covering (image.cols - templ.cols + 1) x (image.rows - templ.rows + 1) positions, waits for completion, then unbinds both textures. Every CUDA runtime call is checked with cudaSafeCall. */ +{ + dim3 threads(32, 8); + dim3 grid(divUp(image.cols - templ.cols + 1, threads.x), + divUp(image.rows - templ.rows + 1, threads.y)); + + cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>(); + imageTex_32F.filterMode = cudaFilterModePoint; /* configure the textures this function actually binds; done before the bind - NOTE(review): presumably the bind captures the mode, confirm against the CUDA texture reference docs */ + templTex_32F.filterMode = cudaFilterModePoint; + cudaSafeCall(cudaBindTexture2D(0, imageTex_32F, image.data, desc, image.cols, image.rows, image.step)); /* a failed bind must not be silently ignored */ + cudaSafeCall(cudaBindTexture2D(0, templTex_32F, templ.data, desc, templ.cols, templ.rows, templ.step)); + + matchTemplateKernel_32F_SQDIFF<<<grid, threads>>>(templ.cols, templ.rows, result); + cudaSafeCall(cudaThreadSynchronize()); /* block until the kernel finishes so the textures can be unbound safely */ + cudaSafeCall(cudaUnbindTexture(imageTex_32F)); + cudaSafeCall(cudaUnbindTexture(templTex_32F)); +} + + __global__ void multiplyAndNormalizeSpectsKernel(int n, float scale, const cufftComplex* a, const cufftComplex* b, cufftComplex* c) { diff --git a/modules/gpu/src/match_template.cpp b/modules/gpu/src/match_template.cpp index 
6713f7ba57..7ce8838e28 100644 --- a/modules/gpu/src/match_template.cpp +++ b/modules/gpu/src/match_template.cpp @@ -61,6 +61,7 @@ namespace cv { namespace gpu { namespace imgproc void multiplyAndNormalizeSpects(int n, float scale, const cufftComplex* a, const cufftComplex* b, cufftComplex* c); void matchTemplate_8U_SQDIFF(const DevMem2D image, const DevMem2D templ, DevMem2Df result); + void matchTemplate_32F_SQDIFF(const DevMem2D image, const DevMem2D templ, DevMem2Df result); /* forward declaration of the 32F SQDIFF launcher called by the specialization below */ }}} @@ -92,6 +93,14 @@ namespace imgproc::matchTemplate_8U_SQDIFF(image, templ, result); } + + template <> + void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result) /* Specialization that allocates result as a CV_32F matrix of (image.rows - templ.rows + 1) x (image.cols - templ.cols + 1) and forwards to imgproc::matchTemplate_32F_SQDIFF. NOTE(review): the template arguments are not visible in this view; presumably this is the CV_32F / CV_TM_SQDIFF case - confirm. */ + { + result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); + imgproc::matchTemplate_32F_SQDIFF(image, templ, result); + } + #ifdef BLOCK_VERSION template <> @@ -243,7 +252,8 @@ void cv::gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& re typedef void (*Caller)(const GpuMat&, const GpuMat&, GpuMat&); static const Caller callers8U[] = { ::matchTemplate, 0, 0, 0, 0, 0 }; - static const Caller callers32F[] = { 0, 0, ::matchTemplate, 0, 0, 0 }; + static const Caller callers32F[] = { ::matchTemplate, 0, + ::matchTemplate, 0, 0, 0 }; const Caller* callers; switch (image.type()) diff --git a/tests/gpu/src/match_template.cpp b/tests/gpu/src/match_template.cpp index 599f501f58..c87625564e 100644 --- a/tests/gpu/src/match_template.cpp +++ b/tests/gpu/src/match_template.cpp @@ -87,6 +87,16 @@ struct CV_GpuMatchTemplateTest: CvTest F(cout << "gpu_block: " << clock() - t << endl;) if (!check(dst_gold, Mat(dst), 5 * h * w * 1e-5f)) return; + gen(image, n, m, CV_32F); + gen(templ, h, w, CV_32F); + F(t = clock();) + matchTemplate(image, templ, dst_gold, CV_TM_SQDIFF); + F(cout << "cpu:" << clock() - t << endl;) + F(t = clock();) + gpu::matchTemplate(gpu::GpuMat(image), gpu::GpuMat(templ), dst, CV_TM_SQDIFF); + F(cout << "gpu_block: " << clock() - t << endl;) + if 
(!check(dst_gold, Mat(dst), 0.25f * h * w * 1e-5f)) return; + gen(image, n, m, CV_32F); gen(templ, h, w, CV_32F); F(t = clock();) @@ -136,48 +146,48 @@ struct CV_GpuMatchTemplateTest: CvTest return true; } - void match_template_naive_SQDIFF(const Mat& a, const Mat& b, Mat& c) - { - c.create(a.rows - b.rows + 1, a.cols - b.cols + 1, CV_32F); - for (int i = 0; i < c.rows; ++i) - { - for (int j = 0; j < c.cols; ++j) - { - float delta; - float sum = 0.f; - for (int y = 0; y < b.rows; ++y) - { - const unsigned char* arow = a.ptr(i + y); - const unsigned char* brow = b.ptr(y); - for (int x = 0; x < b.cols; ++x) - { - delta = (float)(arow[j + x] - brow[x]); - sum += delta * delta; - } - } - c.at(i, j) = sum; - } - } - } - - void match_template_naive_CCORR(const Mat& a, const Mat& b, Mat& c) - { - c.create(a.rows - b.rows + 1, a.cols - b.cols + 1, CV_32F); - for (int i = 0; i < c.rows; ++i) - { - for (int j = 0; j < c.cols; ++j) - { - float sum = 0.f; - for (int y = 0; y < b.rows; ++y) - { - const float* arow = a.ptr(i + y); - const float* brow = b.ptr(y); - for (int x = 0; x < b.cols; ++x) - sum += arow[j + x] * brow[x]; - } - c.at(i, j) = sum; - } - } - } + //void match_template_naive_SQDIFF(const Mat& a, const Mat& b, Mat& c) + //{ + // c.create(a.rows - b.rows + 1, a.cols - b.cols + 1, CV_32F); + // for (int i = 0; i < c.rows; ++i) + // { + // for (int j = 0; j < c.cols; ++j) + // { + // float delta; + // float sum = 0.f; + // for (int y = 0; y < b.rows; ++y) + // { + // const unsigned char* arow = a.ptr(i + y); + // const unsigned char* brow = b.ptr(y); + // for (int x = 0; x < b.cols; ++x) + // { + // delta = (float)(arow[j + x] - brow[x]); + // sum += delta * delta; + // } + // } + // c.at(i, j) = sum; + // } + // } + //} + + //void match_template_naive_CCORR(const Mat& a, const Mat& b, Mat& c) + //{ + // c.create(a.rows - b.rows + 1, a.cols - b.cols + 1, CV_32F); + // for (int i = 0; i < c.rows; ++i) + // { + // for (int j = 0; j < c.cols; ++j) + // { + // float 
sum = 0.f; + // for (int y = 0; y < b.rows; ++y) + // { + // const float* arow = a.ptr(i + y); + // const float* brow = b.ptr(y); + // for (int x = 0; x < b.cols; ++x) + // sum += arow[j + x] * brow[x]; + // } + // c.at(i, j) = sum; + // } + // } + //} } match_template_test;