From 57f7678db10ddbbb3bdbad73117e1353595ed0d4 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <no@email>
Date: Wed, 1 Dec 2010 07:00:50 +0000
Subject: [PATCH] added gpu transpose for CV_8UC4, CV_8SC4, CV_16SC2, CV_16UC2,
 CV_32SC1 and CV_32FC1 types

---
 modules/gpu/src/arithm.cpp       | 32 +++++++++++++++++++----
 modules/gpu/src/cuda/mathfunc.cu | 45 ++++++++++++++++++++++++++++++++
 tests/gpu/src/arithm.cpp         |  2 +-
 3 files changed, 73 insertions(+), 6 deletions(-)
diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp
index a55bc50acf..e1ac3d2fa8 100644
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -262,17 +262,39 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst)
 ////////////////////////////////////////////////////////////////////////
 // transpose
 
+namespace cv { namespace gpu { namespace mathfunc
+{
+    template <typename T>
+    void transpose_gpu(const DevMem2D& src, const DevMem2D& dst);
+}}}
+
 void cv::gpu::transpose(const GpuMat& src, GpuMat& dst)
 {
-    CV_Assert(src.type() == CV_8UC1);
+    using namespace cv::gpu::mathfunc;
+    typedef void (*func_t)(const DevMem2D& src, const DevMem2D& dst);
+    static const func_t funcs[] = 
+    {
+        transpose_gpu<uchar4>, transpose_gpu<char4>, transpose_gpu<ushort2>, transpose_gpu<short2>,
+        transpose_gpu<int>, transpose_gpu<float>
+    };
+
+    CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_8SC4 
+        || src.type() == CV_16UC2 || src.type() == CV_16SC2 || src.type() == CV_32SC1 || src.type() == CV_32FC1);
 
     dst.create( src.cols, src.rows, src.type() );
 
-    NppiSize sz;
-    sz.width  = src.cols;
-    sz.height = src.rows;
+    if (src.type() == CV_8UC1)
+    {
+        NppiSize sz;
+        sz.width  = src.cols;
+        sz.height = src.rows;
 
-    nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz) );
+        nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz) );
+    }
+    else
+    {
+        funcs[src.depth()](src, dst);
+    }
 }
 
 ////////////////////////////////////////////////////////////////////////
diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu
index 72775dd194..1bf811a59d 100644
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -1250,4 +1250,49 @@ namespace cv { namespace gpu { namespace mathfunc
 
     } // namespace countnonzero
 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////
+// transpose
+
+    template <typename T>
+    __global__ void transpose(const DevMem2D_<T> src, PtrStep_<T> dst)
+    {
+    	__shared__ T s_mem[16 * 17];
+
+    	int x = blockIdx.x * blockDim.x + threadIdx.x;
+    	int y = blockIdx.y * blockDim.y + threadIdx.y;
+	    int smem_idx = threadIdx.y * blockDim.x + threadIdx.x + threadIdx.y;
+
+	    if (y < src.rows && x < src.cols)
+	    {
+            s_mem[smem_idx] = src.ptr(y)[x];
+	    }
+	    __syncthreads();
+
+	    smem_idx = threadIdx.x * blockDim.x + threadIdx.y + threadIdx.x;
+
+	    x = blockIdx.y * blockDim.x + threadIdx.x;
+	    y = blockIdx.x * blockDim.y + threadIdx.y;
+
+	    if (y < src.cols && x < src.rows)
+	    {
+		    dst.ptr(y)[x] = s_mem[smem_idx];
+	    }
+    }
+
+    template <typename T>
+    void transpose_gpu(const DevMem2D& src, const DevMem2D& dst)
+    {
+	    dim3 threads(16, 16, 1);
+	    dim3 grid(divUp(src.cols, 16), divUp(src.rows, 16), 1);
+
+	    transpose<T><<<grid, threads>>>((DevMem2D_<T>)src, (DevMem2D_<T>)dst);
+        cudaSafeCall( cudaThreadSynchronize() );
+    }
+
+    template void transpose_gpu<uchar4 >(const DevMem2D& src, const DevMem2D& dst);
+    template void transpose_gpu<char4  >(const DevMem2D& src, const DevMem2D& dst);
+    template void transpose_gpu<ushort2>(const DevMem2D& src, const DevMem2D& dst);
+    template void transpose_gpu<short2 >(const DevMem2D& src, const DevMem2D& dst);
+    template void transpose_gpu<int    >(const DevMem2D& src, const DevMem2D& dst);
+    template void transpose_gpu<float  >(const DevMem2D& src, const DevMem2D& dst);
 }}}
diff --git a/tests/gpu/src/arithm.cpp b/tests/gpu/src/arithm.cpp
index 31722bacb8..29943c7812 100644
--- a/tests/gpu/src/arithm.cpp
+++ b/tests/gpu/src/arithm.cpp
@@ -257,7 +257,7 @@ struct CV_GpuNppImageTransposeTest : public CV_GpuArithmTest
 
     int test( const Mat& mat1, const Mat& )
     {
-        if (mat1.type() != CV_8UC1)
+        if (mat1.type() != CV_8UC1 && mat1.type() != CV_8UC4 && mat1.type() != CV_32FC1)
         {
             ts->printf(CvTS::LOG, "\tUnsupported type\t");
             return CvTS::OK;