From 98c8ecf82926884757b1bf380a18dddbe797fc83 Mon Sep 17 00:00:00 2001
From: Andrey Morozov
Date: Thu, 22 Jul 2010 12:42:42 +0000
Subject: [PATCH] modified setTo() kernels: channel count is now a kernel argument, added double type support, improved code

---
 modules/gpu/src/cuda/cuda_shared.hpp      |   4 +-
 modules/gpu/src/cuda/matrix_operations.cu | 232 +++++++++++-----------
 modules/gpu/src/matrix_operations.cpp     |   8 +-
 tests/gpu/src/operator_set_to.cpp         | 116 ++---------
 4 files changed, 134 insertions(+), 226 deletions(-)

diff --git a/modules/gpu/src/cuda/cuda_shared.hpp b/modules/gpu/src/cuda/cuda_shared.hpp
index 469314d170..aa9497bc94 100644
--- a/modules/gpu/src/cuda/cuda_shared.hpp
+++ b/modules/gpu/src/cuda/cuda_shared.hpp
@@ -61,8 +61,8 @@ namespace cv
 {
         static inline int divUp(int a, int b) { return (a % b == 0) ? a/b : a/b + 1; }
 
-        extern "C" void set_to_without_mask (const DevMem2D& mat, const double * scalar, int depth, int channels);
-        extern "C" void set_to_with_mask    (const DevMem2D& mat, const double * scalar, const DevMem2D& mask, int depth, int channels);
+        extern "C" void set_to_without_mask (const DevMem2D& mat, int depth, const double * scalar, int channels);
+        extern "C" void set_to_with_mask    (const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels);
 
         extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta);
 }

diff --git a/modules/gpu/src/cuda/matrix_operations.cu b/modules/gpu/src/cuda/matrix_operations.cu
index 6f9d0c49ef..56ae283764 100644
--- a/modules/gpu/src/cuda/matrix_operations.cu
+++ b/modules/gpu/src/cuda/matrix_operations.cu
@@ -49,16 +49,16 @@
 using namespace cv::gpu;
 using namespace cv::gpu::impl;
 
-__constant__ __align__(16) float scalar_d[4];
+__constant__ __align__(16) double scalar_d[4];
 
 namespace mat_operators
 {
     //////////////////////////////////////////////////////////
     // SetTo
     //////////////////////////////////////////////////////////
-    
-    template <typename T, int channels>
-    __global__ void kernel_set_to_without_mask(T * mat, int cols, int rows, int step)
+
+    template <typename T>
+    __global__ void kernel_set_to_without_mask(T * mat, int cols, int rows, int step, int channels)
     {
         size_t x = blockIdx.x * blockDim.x + threadIdx.x;
         size_t y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -70,21 +70,21 @@ namespace mat_operators
         }
     }
 
-    template <typename T, int channels>
-    __global__ void kernel_set_to_with_mask(T * mat, const unsigned char * mask, int cols, int rows, int step, int step_mask)
+    template <typename T>
+    __global__ void kernel_set_to_with_mask(T * mat, const unsigned char * mask, int cols, int rows, int step, int channels, int step_mask)
    {
         size_t x = blockIdx.x * blockDim.x + threadIdx.x;
         size_t y = blockIdx.y * blockDim.y + threadIdx.y;
 
-        if (mask[y * step_mask + x] != 0)
-            if ((x < cols * channels ) && (y < rows))
+        if ((x < cols * channels ) && (y < rows))
+            if (mask[y * step_mask + x / channels] != 0)
             {
                 size_t idx = y * (step / sizeof(T)) + x;
                 mat[idx] = scalar_d[ x % channels ];
             }
     }
-    
+
     //////////////////////////////////////////////////////////
     // ConvertTo
     //////////////////////////////////////////////////////////
@@ -109,7 +109,7 @@ namespace mat_operators
             return dim3(divUp(width, block.x), divUp(height, block.y));
         }
     };
-    
+
     template <typename T, typename DT>
     struct Converter<T, DT, 1, 1>
     {
@@ -128,7 +128,7 @@ namespace mat_operators
 
                     const T* src1b = (const T*) &src4b.x;
                     DT* dst1b = (DT*) &dst4b.x;
-                    
+
                     dst1b[0] = (DT)__double2int_rn(alpha * src1b[0] + beta);
                     dst1b[1] = (DT)__double2int_rn(alpha * src1b[1] + beta);
                     dst1b[2] = (DT)__double2int_rn(alpha * src1b[2] + beta);
@@ -154,7 +154,7 @@ namespace mat_operators
             return dim3(divUp(width, block.x << 2), divUp(height, block.y));
         }
     };/**/
-    
+
     template <typename T, typename DT>
     struct Converter<T, DT, 1, 2>
     {
@@ -190,7 +190,7 @@ namespace mat_operators
             return dim3(divUp(width, block.x << 1), divUp(height, block.y));
         }
     };/**/
-    
+
     template <typename T, typename DT>
     struct Converter<T, DT, 2, 1>
     {
@@ -203,7 +203,7 @@ namespace mat_operators
             const T* src = (const T*)(srcmat + src_step * y);
             DT* dst = (DT*)(dstmat + dst_step * y);
             if ((x << 2) + 3 < width)
-            {                
+            {
                 ushort4 src4s = ((const ushort4*)src)[x];
                 uchar4 dst4b;
@@ -232,7 +232,7 @@ namespace mat_operators
             return dim3(divUp(width, block.x << 2), divUp(height, block.y));
         }
     };/**/
-    
+
     template <typename T, typename DT>
     struct Converter<T, DT, 2, 2>
     {
@@ -268,7 +268,7 @@ namespace mat_operators
             return dim3(divUp(width, block.x << 1), divUp(height, block.y));
         }
     };/**/
-    
+
     template <typename T, typename DT>
     struct Converter<T, DT, 4, 1>
     {
@@ -289,7 +289,7 @@ namespace mat_operators
             return dim3(divUp(width, block.x), divUp(height, block.y));
         }
     };
-    
+
     template <typename T, typename DT>
     struct Converter<T, DT, 4, 4>
     {
@@ -309,116 +309,116 @@ namespace mat_operators
         {
             return dim3(divUp(width, block.x), divUp(height, block.y));
         }
-    };    
-    
-    template <typename T, typename DT>
+    };
+
+    template <typename T, typename DT>
     __global__ static void kernel_convert_to(uchar* srcmat, size_t src_step, uchar* dstmat, size_t dst_step, size_t width, size_t height, double alpha, double beta)
     {
         Converter<T, DT, sizeof(T), sizeof(DT)>::convert(srcmat, src_step, dstmat, dst_step, width, height, alpha, beta);
     }
-
-} // namespace mat_operators
-
-//////////////////////////////////////////////////////////////
-// SetTo
-//////////////////////////////////////////////////////////////
-
-extern "C" void cv::gpu::impl::set_to_without_mask(const DevMem2D& mat, const double * scalar, int elemSize1, int channels)
-{
-    float data[4];
-    data[0] = static_cast<float>(scalar[0]);
-    data[1] = static_cast<float>(scalar[1]);
-    data[2] = static_cast<float>(scalar[2]);
-    data[3] = static_cast<float>(scalar[3]);
-    cudaSafeCall( cudaMemcpyToSymbol(scalar_d, &data, sizeof(data)));
-
-    dim3 threadsPerBlock(16, 16, 1);
-    dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
-
-    if (channels == 1)
-    {
-        if (elemSize1 == 1) ::mat_operators::kernel_set_to_without_mask<unsigned char, 1><<<numBlocks, threadsPerBlock>>>(mat.ptr, mat.cols, mat.rows, mat.step);
-        if (elemSize1 == 2) ::mat_operators::kernel_set_to_without_mask<unsigned short, 1><<<numBlocks, threadsPerBlock>>>((unsigned short *)mat.ptr, mat.cols, mat.rows, mat.step);
-        if (elemSize1 == 4) ::mat_operators::kernel_set_to_without_mask<float, 1><<<numBlocks, threadsPerBlock>>>((float *)mat.ptr, mat.cols, mat.rows, mat.step);
-    }
-    if (channels == 2)
-    {
-        if (elemSize1 == 1) ::mat_operators::kernel_set_to_without_mask<unsigned char, 2><<<numBlocks, threadsPerBlock>>>(mat.ptr, mat.cols, mat.rows, mat.step);
-        if (elemSize1 == 2) ::mat_operators::kernel_set_to_without_mask<unsigned short, 2><<<numBlocks, threadsPerBlock>>>((unsigned short *)mat.ptr, mat.cols, mat.rows, mat.step);
-        if (elemSize1 == 4) ::mat_operators::kernel_set_to_without_mask<float, 2><<<numBlocks, threadsPerBlock>>>((float *)mat.ptr, mat.cols, mat.rows, mat.step);
-    }
-    if (channels == 3)
-    {
-        if (elemSize1 == 1) ::mat_operators::kernel_set_to_without_mask<unsigned char, 3><<<numBlocks, threadsPerBlock>>>(mat.ptr, mat.cols, mat.rows, mat.step);
-        if (elemSize1 == 2) ::mat_operators::kernel_set_to_without_mask<unsigned short, 3><<<numBlocks, threadsPerBlock>>>((unsigned short *)mat.ptr, mat.cols, mat.rows, mat.step);
-        if (elemSize1 == 4) ::mat_operators::kernel_set_to_without_mask<float, 3><<<numBlocks, threadsPerBlock>>>((float *)mat.ptr, mat.cols, mat.rows, mat.step);
-    }
-    if (channels == 4)
-    {
-        if (elemSize1 == 1) ::mat_operators::kernel_set_to_without_mask<unsigned char, 4><<<numBlocks, threadsPerBlock>>>(mat.ptr, mat.cols, mat.rows, mat.step);
-        if (elemSize1 == 2) ::mat_operators::kernel_set_to_without_mask<unsigned short, 4><<<numBlocks, threadsPerBlock>>>((unsigned short *)mat.ptr, mat.cols, mat.rows, mat.step);
-        if (elemSize1 == 4) ::mat_operators::kernel_set_to_without_mask<float, 4><<<numBlocks, threadsPerBlock>>>((float *)mat.ptr, mat.cols, mat.rows, mat.step);
-    }
-    cudaSafeCall ( cudaThreadSynchronize() );
-}
+} // namespace mat_operators
 
-extern "C" void cv::gpu::impl::set_to_with_mask(const DevMem2D& mat, const double * scalar, const DevMem2D& mask, int elemSize1, int channels)
+namespace cv
 {
-    float data[4];
-    data[0] = static_cast<float>(scalar[0]);
-    data[1] = static_cast<float>(scalar[1]);
-    data[2] = static_cast<float>(scalar[2]);
-    data[3] = static_cast<float>(scalar[3]);
-    cudaSafeCall( cudaMemcpyToSymbol(scalar_d, &data, sizeof(data)));
-
-    dim3 threadsPerBlock(16, 16, 1);
-    dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
-
-    if (channels == 1)
-    {
-        if (elemSize1 == 1) ::mat_operators::kernel_set_to_with_mask<unsigned char, 1><<<numBlocks, threadsPerBlock>>>(mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, mask.step);
-        if (elemSize1 == 2) ::mat_operators::kernel_set_to_with_mask<unsigned short, 1><<<numBlocks, threadsPerBlock>>>((unsigned short *)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, mask.step);
-        if (elemSize1 == 4) ::mat_operators::kernel_set_to_with_mask<float, 1><<<numBlocks, threadsPerBlock>>>((float *)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, mask.step);
-    }
-    if (channels == 2)
-    {
-        if (elemSize1 == 1) ::mat_operators::kernel_set_to_with_mask<unsigned char, 2><<<numBlocks, threadsPerBlock>>>(mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, mask.step);
-        if (elemSize1 == 2) ::mat_operators::kernel_set_to_with_mask<unsigned short, 2><<<numBlocks, threadsPerBlock>>>((unsigned short *)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, mask.step);
-        if (elemSize1 == 4) ::mat_operators::kernel_set_to_with_mask<float, 2><<<numBlocks, threadsPerBlock>>>((float *)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, mask.step);
-    }
-    if (channels == 3)
-    {
-        if (elemSize1 == 1) ::mat_operators::kernel_set_to_with_mask<unsigned char, 3><<<numBlocks, threadsPerBlock>>>(mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, mask.step);
-        if (elemSize1 == 2) ::mat_operators::kernel_set_to_with_mask<unsigned short, 3><<<numBlocks, threadsPerBlock>>>((unsigned short *)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, mask.step);
-        if (elemSize1 == 4) ::mat_operators::kernel_set_to_with_mask<float, 3><<<numBlocks, threadsPerBlock>>>((float *)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, mask.step);
-    }
-    if (channels == 4)
-    {
-        if (elemSize1 == 1) ::mat_operators::kernel_set_to_with_mask<unsigned char, 4><<<numBlocks, threadsPerBlock>>>(mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, mask.step);
-        if (elemSize1 == 2) ::mat_operators::kernel_set_to_with_mask<unsigned short, 4><<<numBlocks, threadsPerBlock>>>((unsigned short *)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, mask.step);
-        if (elemSize1 == 4) ::mat_operators::kernel_set_to_with_mask<float, 4><<<numBlocks, threadsPerBlock>>>((float *)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, mask.step);
-    }
-
-    cudaSafeCall ( cudaThreadSynchronize() );
-}
+    namespace gpu
+    {
+        namespace impl
+        {
+
+            //////////////////////////////////////////////////////////////
+            // SetTo
+            //////////////////////////////////////////////////////////////
+
+            typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels);
+            typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels);
+
+            template <typename T>
+            void set_to_with_mask_run(const DevMem2D& mat, const DevMem2D& mask, int channels)
+            {
+                dim3 threadsPerBlock(32, 8, 1);
+                dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
+                ::mat_operators::kernel_set_to_with_mask<T><<<numBlocks, threadsPerBlock>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
+                cudaSafeCall ( cudaThreadSynchronize() );
+            }
+
+            template <typename T>
+            void set_to_without_mask_run(const DevMem2D& mat, int channels)
+            {
+                dim3 threadsPerBlock(32, 8, 1);
+                dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
+                ::mat_operators::kernel_set_to_without_mask<T><<<numBlocks, threadsPerBlock>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
+                cudaSafeCall ( cudaThreadSynchronize() );
+            }
+
+            extern "C" void set_to_without_mask(const DevMem2D& mat, int depth, const double * scalar, int channels)
+            {
+                double data[4];
+                data[0] = scalar[0];
+                data[1] = scalar[1];
+                data[2] = scalar[2];
+                data[3] = scalar[3];
+                cudaSafeCall( cudaMemcpyToSymbol(scalar_d, &data, sizeof(data)));
+
+                static SetToFunc_without_mask tab[8] =
+                {
+                    set_to_without_mask_run<unsigned char>,
+                    set_to_without_mask_run<char>,
+                    set_to_without_mask_run<unsigned short>,
+                    set_to_without_mask_run<short>,
+                    set_to_without_mask_run<int>,
+                    set_to_without_mask_run<float>,
+                    set_to_without_mask_run<double>,
+                    0
+                };
+
+                SetToFunc_without_mask func = tab[depth];
+
+                if (func == 0) error("Operation \'SetTo\' is not supported on your GPU model", __FILE__, __LINE__);
+
+                func(mat, channels);
+            }
+
+
+            extern "C" void set_to_with_mask(const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels)
+            {
+                double data[4];
+                data[0] = scalar[0];
+                data[1] = scalar[1];
+                data[2] = scalar[2];
+                data[3] = scalar[3];
+                cudaSafeCall( cudaMemcpyToSymbol(scalar_d, &data, sizeof(data)));
+
+                static SetToFunc_with_mask tab[8] =
+                {
+                    set_to_with_mask_run<unsigned char>,
+                    set_to_with_mask_run<char>,
+                    set_to_with_mask_run<unsigned short>,
+                    set_to_with_mask_run<short>,
+                    set_to_with_mask_run<int>,
+                    set_to_with_mask_run<float>,
+                    set_to_with_mask_run<double>,
+                    0
+                };
+
+                SetToFunc_with_mask func = tab[depth];
+
+                if (func == 0) error("Operation \'SetTo\' is not supported on your GPU model", __FILE__, __LINE__);
+
+                func(mat, mask, channels);
+            }
+
+            //////////////////////////////////////////////////////////////
+            // ConvertTo
+            //////////////////////////////////////////////////////////////
 
-//////////////////////////////////////////////////////////////
-// ConvertTo
-//////////////////////////////////////////////////////////////
-namespace cv
-{
-    namespace gpu
-    {
-        namespace impl
-        {
             typedef void (*CvtFunc)(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta);
 
 //#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 130)
 
-            template <typename T, typename DT>
+            template <typename T, typename DT>
             void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta)
             {
                 dim3 block(32, 8);
@@ -462,7 +462,7 @@ namespace cv
                 func(src, dst, width, height, alpha, beta);
             }
         }
-        
-        
-    }
+
+
+    }
 }

diff --git a/modules/gpu/src/matrix_operations.cpp b/modules/gpu/src/matrix_operations.cpp
index 867efdacd3..a29859b894 100644
--- a/modules/gpu/src/matrix_operations.cpp
+++ b/modules/gpu/src/matrix_operations.cpp
@@ -126,14 +126,14 @@ void cv::gpu::GpuMat::convertTo( GpuMat& dst, int rtype, double alpha, double be
     const GpuMat* psrc = this;
     if( sdepth != ddepth && psrc == &dst )
         psrc = &(temp = *this);
-    
+
     dst.create( size(), rtype );
     impl::convert_to(*psrc, sdepth, dst, ddepth, psrc->cols * psrc->channels(), psrc->rows, alpha, beta);
 }
 
 GpuMat& GpuMat::operator = (const Scalar& s)
 {
-    cv::gpu::impl::set_to_without_mask(*this, s.val, this->elemSize1(), this->channels());
+    cv::gpu::impl::set_to_without_mask( *this, this->depth(), s.val, this->channels());
     return *this;
 }
 
@@ -145,11 +145,11 @@ GpuMat& GpuMat::setTo(const Scalar& s, const GpuMat& mask)
 
     if (mask.empty())
     {
-        cv::gpu::impl::set_to_without_mask(*this, s.val, this->elemSize1(), this->channels());
+        cv::gpu::impl::set_to_without_mask( *this, this->depth(), s.val, this->channels());
     }
     else
     {
-        cv::gpu::impl::set_to_with_mask(*this, s.val, mask, this->elemSize1(), this->channels());
+        cv::gpu::impl::set_to_with_mask( *this, this->depth(), s.val, mask, this->channels());
     }
 
     return *this;

diff --git a/tests/gpu/src/operator_set_to.cpp b/tests/gpu/src/operator_set_to.cpp
index d071004a3c..6abd63f1a7 100644
--- a/tests/gpu/src/operator_set_to.cpp
+++ b/tests/gpu/src/operator_set_to.cpp
@@ -24,19 +24,6 @@ class CV_GpuMatOpSetTo : public CvTest
 
         bool compare_matrix(cv::Mat & cpumat, gpu::GpuMat & gpumat);
 
-        bool test_cv_8u_c1();
-        bool test_cv_8u_c2();
-        bool test_cv_8u_c3();
-        bool test_cv_8u_c4();
-
-        bool test_cv_16u_c4();
-
-        bool test_cv_32f_c1();
-        bool test_cv_32f_c2();
-        bool test_cv_32f_c3();
-        bool test_cv_32f_c4();
-
-
     private:
         int rows;
         int cols;
@@ -45,13 +32,13 @@ class CV_GpuMatOpSetTo : public CvTest
 
 CV_GpuMatOpSetTo::CV_GpuMatOpSetTo(): CvTest( "GpuMatOperatorSetTo", "setTo" )
 {
-    rows = 129;
-    cols = 127;
+    rows = 256;
+    cols = 124;
 
-    s.val[0] = 128.0;
-    s.val[1] = 128.0;
-    s.val[2] = 128.0;
-    s.val[3] = 128.0;
+    s.val[0] = 127.0;
+    s.val[1] = 127.0;
+    s.val[2] = 127.0;
+    s.val[3] = 127.0;
 
     //#define PRINT_MATRIX
 }
@@ -99,95 +86,16 @@ bool CV_GpuMatOpSetTo::compare_matrix(cv::Mat & cpumat, gpu::GpuMat & gpumat)
     }
 }
 
-
-bool CV_GpuMatOpSetTo::test_cv_8u_c1()
-{
-    Mat cpumat(rows, cols, CV_8U, Scalar::all(0));
-    GpuMat gpumat(cpumat);
-
-    return compare_matrix(cpumat, gpumat);
-}
-
-bool CV_GpuMatOpSetTo::test_cv_8u_c2()
-{
-    Mat cpumat(rows, cols, CV_8UC2, Scalar::all(0));
-    GpuMat gpumat(cpumat);
-
-    return compare_matrix(cpumat, gpumat);
-}
-
-bool CV_GpuMatOpSetTo::test_cv_8u_c3()
-{
-    Mat cpumat(rows, cols, CV_8UC3, Scalar::all(0));
-    GpuMat gpumat(cpumat);
-
-    return compare_matrix(cpumat, gpumat);
-}
-
-bool CV_GpuMatOpSetTo::test_cv_8u_c4()
-{
-    Mat cpumat(rows, cols, CV_8UC4, Scalar::all(0));
-    GpuMat gpumat(cpumat);
-
-    return compare_matrix(cpumat, gpumat);
-}
-
-bool CV_GpuMatOpSetTo::test_cv_16u_c4()
-{
-    Mat cpumat(rows, cols, CV_16UC4, Scalar::all(0));
-    GpuMat gpumat(cpumat);
-
-    return compare_matrix(cpumat, gpumat);
-}
-
-
-bool CV_GpuMatOpSetTo::test_cv_32f_c1()
-{
-    Mat cpumat(rows, cols, CV_32F, Scalar::all(0));
-    GpuMat gpumat(cpumat);
-
-    return compare_matrix(cpumat, gpumat);
-}
-
-bool CV_GpuMatOpSetTo::test_cv_32f_c2()
-{
-    Mat cpumat(rows, cols, CV_32FC2, Scalar::all(0));
-    GpuMat gpumat(cpumat);
-
-    return compare_matrix(cpumat, gpumat);
-}
-
-bool CV_GpuMatOpSetTo::test_cv_32f_c3()
-{
-    Mat cpumat(rows, cols, CV_32FC3, Scalar::all(0));
-    GpuMat gpumat(cpumat);
-
-    return compare_matrix(cpumat, gpumat);
-}
-
-bool CV_GpuMatOpSetTo::test_cv_32f_c4()
-{
-    Mat cpumat(rows, cols, CV_32FC4, Scalar::all(0));
-    GpuMat gpumat(cpumat);
-
-    return compare_matrix(cpumat, gpumat);
-}
-
 void CV_GpuMatOpSetTo::run( int /* start_from */)
 {
     bool is_test_good = true;
 
-    is_test_good &= test_cv_8u_c1();
-    is_test_good &= test_cv_8u_c2();
-    is_test_good &= test_cv_8u_c3();
-    is_test_good &= test_cv_8u_c4();
-
-    is_test_good &= test_cv_16u_c4();
-
-    is_test_good &= test_cv_32f_c1();
-    is_test_good &= test_cv_32f_c2();
-    is_test_good &= test_cv_32f_c3();
-    is_test_good &= test_cv_32f_c4();
+    for (int i = 0; i < 7; i++)
+    {
+        Mat cpumat(rows, cols, i, Scalar::all(0));
+        GpuMat gpumat(cpumat);
+        is_test_good &= compare_matrix(cpumat, gpumat);
+    }
 
     if (is_test_good == true)
         ts->set_failed_test_info(CvTS::OK);
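
---
Reviewer note: the core of this patch is (1) moving the channel count from a
template parameter to a run-time kernel argument, and (2) replacing the
channels/elemSize1 if-ladders with a depth-indexed table of template
instantiations, which is also what makes the new double path cheap to add.
The sketch below condenses that dispatch pattern into a self-contained .cu
file for reference. It is an illustration under assumptions, not the OpenCV
code: the names (set_scalar, set_run, kernel_set) are hypothetical, error
handling is omitted, and it uses cudaDeviceSynchronize() in place of the
now-deprecated cudaThreadSynchronize().

    #include <cuda_runtime.h>

    __constant__ double scalar_d[4];          // per-channel values, as in the patch

    // One thread per channel element; 'step' is the row pitch in bytes.
    template <typename T>
    __global__ void kernel_set(T* mat, int cols, int rows, size_t step, int channels)
    {
        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
        size_t y = blockIdx.y * blockDim.y + threadIdx.y;
        if (x < (size_t)cols * channels && y < (size_t)rows)
        {
            T* row = (T*)((char*)mat + y * step);  // byte-pitch row addressing
            row[x] = (T)scalar_d[x % channels];    // the element's channel picks the value
        }
    }

    // One host-side runner per element type; the template instantiation does
    // the work the old elemSize1 if-ladder used to do.
    template <typename T>
    void set_run(void* ptr, int cols, int rows, size_t step, int channels)
    {
        dim3 block(32, 8);
        dim3 grid((cols * channels + block.x - 1) / block.x,
                  (rows + block.y - 1) / block.y);
        kernel_set<T><<<grid, block>>>((T*)ptr, cols, rows, step, channels);
        cudaDeviceSynchronize();
    }

    typedef void (*SetFunc)(void*, int, int, size_t, int);

    // 'depth' follows the OpenCV order CV_8U..CV_64F; slot 7 is unsupported.
    void set_scalar(void* ptr, int cols, int rows, size_t step,
                    int depth, int channels, const double scalar[4])
    {
        static const SetFunc tab[8] =
        {
            set_run<unsigned char>, set_run<char>,
            set_run<unsigned short>, set_run<short>,
            set_run<int>, set_run<float>, set_run<double>, 0
        };
        cudaMemcpyToSymbol(scalar_d, scalar, 4 * sizeof(double));
        if (tab[depth]) tab[depth](ptr, cols, rows, step, channels);
    }

One table lookup replaces twelve launch sites, and supporting a new depth
means adding one table entry, which is exactly how double support enters the
patch above.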