diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp
index a55bc50acf..e1ac3d2fa8 100644
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -262,17 +262,39 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst)
 ////////////////////////////////////////////////////////////////////////
 // transpose
 
+namespace cv { namespace gpu { namespace mathfunc
+{
+    template <typename T>
+    void transpose_gpu(const DevMem2D& src, const DevMem2D& dst);
+}}}
+
 void cv::gpu::transpose(const GpuMat& src, GpuMat& dst)
 {
-    CV_Assert(src.type() == CV_8UC1);
+    using namespace cv::gpu::mathfunc;
+    typedef void (*func_t)(const DevMem2D& src, const DevMem2D& dst);
+    static const func_t funcs[] =
+    {
+        transpose_gpu<uchar4>, transpose_gpu<char4>, transpose_gpu<ushort2>, transpose_gpu<short2>,
+        transpose_gpu<int>, transpose_gpu<float>
+    };
+
+    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_8SC4
+        || src.type() == CV_16UC2 || src.type() == CV_16SC2 || src.type() == CV_32SC1 || src.type() == CV_32FC1);
 
     dst.create( src.cols, src.rows, src.type() );
 
-    NppiSize sz;
-    sz.width = src.cols;
-    sz.height = src.rows;
+    if (src.type() == CV_8UC1)
+    {
+        NppiSize sz;
+        sz.width = src.cols;
+        sz.height = src.rows;
 
-    nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz) );
+        nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz) );
+    }
+    else
+    {
+        funcs[src.depth()](src, dst);
+    }
 }
 
 ////////////////////////////////////////////////////////////////////////
diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu
index 72775dd194..1bf811a59d 100644
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -1250,4 +1250,49 @@ namespace cv { namespace gpu { namespace mathfunc
 } // namespace countnonzero
 
 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+// transpose
+
+    template <typename T>
+    __global__ void transpose(const DevMem2D_<T> src, PtrStep_<T> dst)
+    {
+        __shared__ T s_mem[16 * 17];
+
+        int x = blockIdx.x * blockDim.x + threadIdx.x;
+        int y = blockIdx.y * blockDim.y + threadIdx.y;
+        int smem_idx = threadIdx.y * blockDim.x + threadIdx.x + threadIdx.y;
+
+        if (y < src.rows && x < src.cols)
+        {
+            s_mem[smem_idx] = src.ptr(y)[x];
+        }
+        __syncthreads();
+
+        smem_idx = threadIdx.x * blockDim.x + threadIdx.y + threadIdx.x;
+
+        x = blockIdx.y * blockDim.x + threadIdx.x;
+        y = blockIdx.x * blockDim.y + threadIdx.y;
+
+        if (y < src.cols && x < src.rows)
+        {
+            dst.ptr(y)[x] = s_mem[smem_idx];
+        }
+    }
+
+    template <typename T>
+    void transpose_gpu(const DevMem2D& src, const DevMem2D& dst)
+    {
+        dim3 threads(16, 16, 1);
+        dim3 grid(divUp(src.cols, 16), divUp(src.rows, 16), 1);
+
+        transpose<T><<<grid, threads>>>((DevMem2D_<T>)src, (DevMem2D_<T>)dst);
+        cudaSafeCall( cudaThreadSynchronize() );
+    }
+
+    template void transpose_gpu<uchar4>(const DevMem2D& src, const DevMem2D& dst);
+    template void transpose_gpu<char4>(const DevMem2D& src, const DevMem2D& dst);
+    template void transpose_gpu<ushort2>(const DevMem2D& src, const DevMem2D& dst);
+    template void transpose_gpu<short2>(const DevMem2D& src, const DevMem2D& dst);
+    template void transpose_gpu<int>(const DevMem2D& src, const DevMem2D& dst);
+    template void transpose_gpu<float>(const DevMem2D& src, const DevMem2D& dst);
 }}}
diff --git a/tests/gpu/src/arithm.cpp b/tests/gpu/src/arithm.cpp
index 31722bacb8..29943c7812 100644
--- a/tests/gpu/src/arithm.cpp
+++ b/tests/gpu/src/arithm.cpp
@@ -257,7 +257,7 @@ struct CV_GpuNppImageTransposeTest : public CV_GpuArithmTest
 
     int test( const Mat& mat1, const Mat& )
     {
-        if (mat1.type() != CV_8UC1)
+        if (mat1.type() != CV_8UC1 && mat1.type() != CV_8UC4 && mat1.type() != CV_32FC1)
         {
             ts->printf(CvTS::LOG, "\tUnsupported type\t");
             return CvTS::OK;
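
A minimal usage sketch for review purposes (not part of the patch): it assumes an OpenCV 2.x build with the gpu module and a CUDA-capable device, uploads a non-square CV_32FC1 matrix so the new CUDA kernel (rather than the NPP 8UC1 path) is exercised, and checks the result against the CPU cv::transpose. The matrix size and the `gold` name are illustrative only.

#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>
#include <iostream>

int main()
{
    // Non-square random input so a wrong transpose cannot accidentally match.
    cv::Mat src(113, 77, CV_32FC1);
    cv::randu(src, cv::Scalar::all(0), cv::Scalar::all(1));

    // GpuMat(const Mat&) uploads to the device; gpu::transpose dispatches
    // through funcs[src.depth()] for 32FC1.
    cv::gpu::GpuMat d_src(src), d_dst;
    cv::gpu::transpose(d_src, d_dst);

    cv::Mat result, gold;
    d_dst.download(result);
    cv::transpose(src, gold); // CPU reference

    // A transpose only moves elements, so the results should match exactly.
    std::cout << "max abs diff: " << cv::norm(result, gold, cv::NORM_INF) << std::endl;
    return 0;
}

Design note: the 16 * 17 shared-memory buffer in the kernel is the standard padding trick; adding threadIdx.y (resp. threadIdx.x) to the index gives each row of the 16x16 tile a stride of 17 words, so the column-wise reads after __syncthreads() avoid shared-memory bank conflicts.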