diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu index ece304d939..72053ad3b7 100644 --- a/modules/gpu/src/cuda/imgproc.cu +++ b/modules/gpu/src/cuda/imgproc.cu @@ -373,49 +373,17 @@ namespace cv { namespace gpu { namespace device reprojectImageTo3D_caller(disp, xyzw, q, stream); } - //////////////////////////////////////// Extract Cov Data //////////////////////////////////////////////// - - __global__ void extractCovData_kernel(const int cols, const int rows, const PtrStepf Dx, - const PtrStepf Dy, PtrStepf dst) - { - const int x = blockIdx.x * blockDim.x + threadIdx.x; - const int y = blockIdx.y * blockDim.y + threadIdx.y; - - if (x < cols && y < rows) - { - float dx = Dx.ptr(y)[x]; - float dy = Dy.ptr(y)[x]; - - dst.ptr(y)[x] = dx * dx; - dst.ptr(y + rows)[x] = dx * dy; - dst.ptr(y + (rows << 1))[x] = dy * dy; - } - } - - void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst, cudaStream_t stream) - { - dim3 threads(32, 8); - dim3 grid(divUp(Dx.cols, threads.x), divUp(Dx.rows, threads.y)); - - extractCovData_kernel<<>>(Dx.cols, Dx.rows, Dx, Dy, dst); - cudaSafeCall( cudaGetLastError() ); - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } - /////////////////////////////////////////// Corner Harris ///////////////////////////////////////////////// - texture harrisDxTex; - texture harrisDyTex; + texture harrisDxTex(0, cudaFilterModePoint, cudaAddressModeClamp); + texture harrisDyTex(0, cudaFilterModePoint, cudaAddressModeClamp); - __global__ void cornerHarris_kernel(const int cols, const int rows, const int block_size, const float k, - PtrStepb dst) + __global__ void cornerHarris_kernel(const int block_size, const float k, DevMem2Df dst) { - const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; - if (x < cols && y < rows) + if (x < dst.cols && y < dst.rows) { float a = 0.f; float b = 0.f; @@ -432,24 +400,24 @@ namespace cv { namespace gpu { namespace device { float dx = tex2D(harrisDxTex, j, i); float dy = tex2D(harrisDyTex, j, i); + a += dx * dx; b += dx * dy; c += dy * dy; } } - ((float*)dst.ptr(y))[x] = a * c - b * b - k * (a + c) * (a + c); + dst(y, x) = a * c - b * b - k * (a + c) * (a + c); } } template - __global__ void cornerHarris_kernel(const int cols, const int rows, const int block_size, const float k, - PtrStepb dst, BR border_row, BC border_col) + __global__ void cornerHarris_kernel(const int block_size, const float k, DevMem2Df dst, const BR border_row, const BC border_col) { - const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; - if (x < cols && y < rows) + if (x < dst.cols && y < dst.rows) { float a = 0.f; float b = 0.f; @@ -462,50 +430,45 @@ namespace cv { namespace gpu { namespace device for (int i = ibegin; i < iend; ++i) { - int y = border_col.idx_row(i); + const int y = border_col.idx_row(i); + for (int j = jbegin; j < jend; ++j) { - int x = border_row.idx_col(j); + const int x = border_row.idx_col(j); + float dx = tex2D(harrisDxTex, x, y); float dy = tex2D(harrisDyTex, x, y); + a += dx * dx; b += dx * dy; c += dy * dy; } } - ((float*)dst.ptr(y))[x] = a * c - b * b - k * (a + c) * (a + c); + dst(y, x) = a * c - b * b - k * (a + c) * (a + c); } } - void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, - int border_type, cudaStream_t stream) + void cornerHarris_gpu(int block_size, float k, DevMem2Df Dx, DevMem2Df Dy, DevMem2Df dst, int border_type, cudaStream_t stream) { - const int rows = Dx.rows; - const int cols = Dx.cols; + dim3 block(32, 8); + dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y)); - dim3 threads(32, 8); - dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y)); - - cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - cudaBindTexture2D(0, harrisDxTex, Dx.data, desc, Dx.cols, Dx.rows, Dx.step); - cudaBindTexture2D(0, harrisDyTex, Dy.data, desc, Dy.cols, Dy.rows, Dy.step); - harrisDxTex.filterMode = cudaFilterModePoint; - harrisDyTex.filterMode = cudaFilterModePoint; + bindTexture(&harrisDxTex, Dx); + bindTexture(&harrisDyTex, Dy); switch (border_type) { case BORDER_REFLECT101_GPU: - cornerHarris_kernel<<>>( - cols, rows, block_size, k, dst, BrdRowReflect101(cols), BrdColReflect101(rows)); + cornerHarris_kernel<<>>(block_size, k, dst, BrdRowReflect101(Dx.cols), BrdColReflect101(Dx.rows)); break; - case BORDER_REPLICATE_GPU: - harrisDxTex.addressMode[0] = cudaAddressModeClamp; - harrisDxTex.addressMode[1] = cudaAddressModeClamp; - harrisDyTex.addressMode[0] = cudaAddressModeClamp; - harrisDyTex.addressMode[1] = cudaAddressModeClamp; - cornerHarris_kernel<<>>(cols, rows, block_size, k, dst); + case BORDER_REFLECT_GPU: + cornerHarris_kernel<<>>(block_size, k, dst, BrdRowReflect(Dx.cols), BrdColReflect(Dx.rows)); + break; + + case BORDER_REPLICATE_GPU: + cornerHarris_kernel<<>>(block_size, k, dst); break; } @@ -513,23 +476,19 @@ namespace cv { namespace gpu { namespace device if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); - - //cudaSafeCall(cudaUnbindTexture(harrisDxTex)); - //cudaSafeCall(cudaUnbindTexture(harrisDyTex)); } /////////////////////////////////////////// Corner Min Eigen Val ///////////////////////////////////////////////// - texture minEigenValDxTex; - texture minEigenValDyTex; + texture minEigenValDxTex(0, cudaFilterModePoint, cudaAddressModeClamp); + texture minEigenValDyTex(0, cudaFilterModePoint, cudaAddressModeClamp); - __global__ void cornerMinEigenVal_kernel(const int cols, const int rows, const int block_size, - PtrStepb dst) + __global__ void cornerMinEigenVal_kernel(const int block_size, DevMem2Df dst) { - const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; - if (x < cols && y < rows) + if (x < dst.cols && y < dst.rows) { float a = 0.f; float b = 0.f; @@ -546,6 +505,7 @@ namespace cv { namespace gpu { namespace device { float dx = tex2D(minEigenValDxTex, j, i); float dy = tex2D(minEigenValDyTex, j, i); + a += dx * dx; b += dx * dy; c += dy * dy; @@ -554,19 +514,19 @@ namespace cv { namespace gpu { namespace device a *= 0.5f; c *= 0.5f; - ((float*)dst.ptr(y))[x] = (a + c) - sqrtf((a - c) * (a - c) + b * b); + + dst(y, x) = (a + c) - sqrtf((a - c) * (a - c) + b * b); } } template - __global__ void cornerMinEigenVal_kernel(const int cols, const int rows, const int block_size, - PtrStepb dst, BR border_row, BC border_col) + __global__ void cornerMinEigenVal_kernel(const int block_size, DevMem2Df dst, const BR border_row, const BC border_col) { - const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; - if (x < cols && y < rows) + if (x < dst.cols && y < dst.rows) { float a = 0.f; float b = 0.f; @@ -580,11 +540,14 @@ namespace cv { namespace gpu { namespace device for (int i = ibegin; i < iend; ++i) { int y = border_col.idx_row(i); + for (int j = jbegin; j < jend; ++j) { int x = border_row.idx_col(j); + float dx = tex2D(minEigenValDxTex, x, y); float dy = tex2D(minEigenValDyTex, x, y); + a += dx * dx; b += dx * dy; c += dy * dy; @@ -593,38 +556,31 @@ namespace cv { namespace gpu { namespace device a *= 0.5f; c *= 0.5f; - ((float*)dst.ptr(y))[x] = (a + c) - sqrtf((a - c) * (a - c) + b * b); + + dst(y, x) = (a + c) - sqrtf((a - c) * (a - c) + b * b); } } - void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, - int border_type, cudaStream_t stream) + void cornerMinEigenVal_gpu(int block_size, DevMem2Df Dx, DevMem2Df Dy, DevMem2Df dst, int border_type, cudaStream_t stream) { - const int rows = Dx.rows; - const int cols = Dx.cols; - - dim3 threads(32, 8); - dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y)); - - cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - cudaBindTexture2D(0, minEigenValDxTex, Dx.data, desc, Dx.cols, Dx.rows, Dx.step); - cudaBindTexture2D(0, minEigenValDyTex, Dy.data, desc, Dy.cols, Dy.rows, Dy.step); - minEigenValDxTex.filterMode = cudaFilterModePoint; - minEigenValDyTex.filterMode = cudaFilterModePoint; + dim3 block(32, 8); + dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y)); + + bindTexture(&minEigenValDxTex, Dx); + bindTexture(&minEigenValDyTex, Dy); switch (border_type) { case BORDER_REFLECT101_GPU: - cornerMinEigenVal_kernel<<>>( - cols, rows, block_size, dst, BrdRowReflect101(cols), BrdColReflect101(rows)); + cornerMinEigenVal_kernel<<>>(block_size, dst, BrdRowReflect101(Dx.cols), BrdColReflect101(Dx.rows)); break; - case BORDER_REPLICATE_GPU: - minEigenValDxTex.addressMode[0] = cudaAddressModeClamp; - minEigenValDxTex.addressMode[1] = cudaAddressModeClamp; - minEigenValDyTex.addressMode[0] = cudaAddressModeClamp; - minEigenValDyTex.addressMode[1] = cudaAddressModeClamp; - cornerMinEigenVal_kernel<<>>(cols, rows, block_size, dst); + case BORDER_REFLECT_GPU: + cornerMinEigenVal_kernel<<>>(block_size, dst, BrdRowReflect(Dx.cols), BrdColReflect(Dx.rows)); + break; + + case BORDER_REPLICATE_GPU: + cornerMinEigenVal_kernel<<>>(block_size, dst); break; } @@ -632,9 +588,6 @@ namespace cv { namespace gpu { namespace device if (stream == 0) cudaSafeCall(cudaDeviceSynchronize()); - - //cudaSafeCall(cudaUnbindTexture(minEigenValDxTex)); - //cudaSafeCall(cudaUnbindTexture(minEigenValDyTex)); } ////////////////////////////// Column Sum ////////////////////////////////////// diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp index e4410eced4..e5ea90c6b9 100644 --- a/modules/gpu/src/imgproc.cpp +++ b/modules/gpu/src/imgproc.cpp @@ -1344,22 +1344,23 @@ namespace cv { namespace gpu { namespace device { namespace imgproc { - void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst, cudaStream_t stream); - void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream); - void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream); + void cornerHarris_gpu(int block_size, float k, DevMem2Df Dx, DevMem2Df Dy, DevMem2Df dst, int border_type, cudaStream_t stream); + void cornerMinEigenVal_gpu(int block_size, DevMem2Df Dx, DevMem2Df Dy, DevMem2Df dst, int border_type, cudaStream_t stream); } }}} namespace { - template void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream) - { - double scale = (double)(1 << ((ksize > 0 ? ksize : 3) - 1)) * blockSize; + { + double scale = static_cast(1 << ((ksize > 0 ? ksize : 3) - 1)) * blockSize; + if (ksize < 0) scale *= 2.; + if (src.depth() == CV_8U) scale *= 255.; + scale = 1./scale; Dx.create(src.size(), CV_32F); @@ -1376,23 +1377,7 @@ namespace Scharr(src, Dy, CV_32F, 0, 1, buf, scale, borderType, -1, stream); } } - - void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream) - { - switch (src.type()) - { - case CV_8U: - extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream); - break; - case CV_32F: - extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream); - break; - default: - CV_Error(CV_StsBadArg, "extractCovData: unsupported type of the source matrix"); - } - } - -} // Anonymous namespace +} bool cv::gpu::tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType) { @@ -1433,17 +1418,18 @@ void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& D void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k, int borderType, Stream& stream) { - using namespace ::cv::gpu::device::imgproc; + using namespace cv::gpu::device::imgproc; - CV_Assert(borderType == cv::BORDER_REFLECT101 || - borderType == cv::BORDER_REPLICATE); + CV_Assert(borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT); int gpuBorderType; CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType)); extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream); + dst.create(src.size(), CV_32F); - cornerHarris_caller(blockSize, (float)k, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream)); + + cornerHarris_gpu(blockSize, static_cast(k), Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream)); } void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType) @@ -1462,15 +1448,16 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuM { using namespace ::cv::gpu::device::imgproc; - CV_Assert(borderType == cv::BORDER_REFLECT101 || - borderType == cv::BORDER_REPLICATE); + CV_Assert(borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT); int gpuBorderType; CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType)); - extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream); + extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream); + dst.create(src.size(), CV_32F); - cornerMinEigenVal_caller(blockSize, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream)); + + cornerMinEigenVal_gpu(blockSize, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream)); } ////////////////////////////////////////////////////////////////////////////// diff --git a/modules/gpu/test/test_imgproc.cpp b/modules/gpu/test/test_imgproc.cpp index e78a69ac24..04a4cc6f5e 100644 --- a/modules/gpu/test/test_imgproc.cpp +++ b/modules/gpu/test/test_imgproc.cpp @@ -2774,13 +2774,13 @@ TEST_P(CornerHarris, Accuracy) dev_dst.download(dst); ); - EXPECT_MAT_NEAR(dst_gold, dst, 1e-3); + EXPECT_MAT_NEAR(dst_gold, dst, 0.02); } INSTANTIATE_TEST_CASE_P(ImgProc, CornerHarris, Combine( ALL_DEVICES, Values(CV_8UC1, CV_32FC1), - Values((int) cv::BORDER_REFLECT101, (int) cv::BORDER_REPLICATE))); + Values((int) cv::BORDER_REFLECT101, (int) cv::BORDER_REPLICATE, (int) cv::BORDER_REFLECT))); /////////////////////////////////////////////////////////////////////////////////////////////////////// // cornerMinEigen @@ -2829,13 +2829,13 @@ TEST_P(CornerMinEigen, Accuracy) dev_dst.download(dst); ); - EXPECT_MAT_NEAR(dst_gold, dst, 1e-2); + EXPECT_MAT_NEAR(dst_gold, dst, 0.02); } INSTANTIATE_TEST_CASE_P(ImgProc, CornerMinEigen, Combine( ALL_DEVICES, Values(CV_8UC1, CV_32FC1), - Values((int) cv::BORDER_REFLECT101, (int) cv::BORDER_REPLICATE))); + Values((int) cv::BORDER_REFLECT101, (int) cv::BORDER_REPLICATE, (int) cv::BORDER_REFLECT))); //////////////////////////////////////////////////////////////////////// // ColumnSum