implemented cv::gpu::merge and cv::gpu::split functions

15 years ago · b2cdb7fa39
parent 5a804717a7
commit b2cdb7fa39
4 changed files with 902 additions and 0 deletions
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@ -408,6 +408,30 @@ namespace cv
        //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
        CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, int interpolation = INTER_LINEAR);

+        //! makes multi-channel array out of several single-channel arrays
+        CV_EXPORTS void merge(const GpuMat* src, size_t n, GpuMat& dst);
+
+        //! makes multi-channel array out of several single-channel arrays
+        CV_EXPORTS void merge(const vector<GpuMat>& src, GpuMat& dst);
+
+        //! makes multi-channel array out of several single-channel arrays (async version)
+        CV_EXPORTS void merge(const GpuMat* src, size_t n, GpuMat& dst, const Stream& stream);
+
+        //! makes multi-channel array out of several single-channel arrays (async version)
+        CV_EXPORTS void merge(const vector<GpuMat>& src, GpuMat& dst, const Stream& stream);
+
+        //! copies each plane of a multi-channel array to a dedicated array
+        CV_EXPORTS void split(const GpuMat& src, GpuMat* dst);
+
+        //! copies each plane of a multi-channel array to a dedicated array
+        CV_EXPORTS void split(const GpuMat& src, vector<GpuMat>& dst);
+
+        //! copies each plane of a multi-channel array to a dedicated array (async version)
+        CV_EXPORTS void split(const GpuMat& src, GpuMat* dst, const Stream& stream);
+
+        //! copies each plane of a multi-channel array to a dedicated array (async version)
+        CV_EXPORTS void split(const GpuMat& src, vector<GpuMat>& dst, const Stream& stream);
+
        ////////////////////////////// Image processing //////////////////////////////

        // DST[x,y] = SRC[xmap[x,y],ymap[x,y]] with bilinear interpolation.
--- a/modules/gpu/src/cuda/split_merge.cu
+++ b/modules/gpu/src/cuda/split_merge.cu
@ -0,0 +1,452 @@
+#include "opencv2/gpu/devmem2d.hpp"
+#include "cuda_shared.hpp"
+
+namespace cv { namespace gpu { namespace split_merge {
+
+    template <typename T, size_t elem_size = sizeof(T)>
+    struct TypeTraits 
+    {
+        typedef T type;
+        typedef T type2;
+        typedef T type3;
+        typedef T type4;
+    };
+
+    template <typename T>
+    struct TypeTraits<T, 1>
+    {
+        typedef char type;
+        typedef char2 type2;
+        typedef char3 type3;
+        typedef char4 type4;
+    };
+
+    template <typename T>
+    struct TypeTraits<T, 2>
+    {
+        typedef short type;
+        typedef short2 type2;
+        typedef short3 type3;
+        typedef short4 type4;
+    };
+
+    template <typename T>
+    struct TypeTraits<T, 4> 
+    {
+        typedef int type;
+        typedef int2 type2;
+        typedef int3 type3;
+        typedef int4 type4;
+    };
+
+    template <typename T>
+    struct TypeTraits<T, 8> 
+    {
+        typedef double type;
+        typedef double2 type2;
+        //typedef double3 type3;
+        //typedef double4 type3;
+    };
+
+    typedef void (*MergeFunction)(const DevMem2D* src, DevMem2D& dst, const cudaStream_t& stream);
+    typedef void (*SplitFunction)(const DevMem2D& src, DevMem2D* dst, const cudaStream_t& stream);
+
+    //------------------------------------------------------------
+    // Merge
+
+    template <typename T> 
+    static void mergeC2_(const DevMem2D* src, DevMem2D& dst, const cudaStream_t& stream)
+    {
+        dim3 blockDim(32, 8);
+        dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
+        mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(
+                src[0].ptr, src[0].step, 
+                src[1].ptr, src[1].step,
+                dst.rows, dst.cols, dst.ptr, dst.step);
+        if (stream == 0)
+            cudaSafeCall(cudaThreadSynchronize());
+    }
+
+
+    template <typename T> 
+    static void mergeC3_(const DevMem2D* src, DevMem2D& dst, const cudaStream_t& stream)
+    {
+        dim3 blockDim(32, 8);
+        dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
+        mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(
+                src[0].ptr, src[0].step, 
+                src[1].ptr, src[1].step,
+                src[2].ptr, src[2].step,
+                dst.rows, dst.cols, dst.ptr, dst.step);
+        if (stream == 0)
+            cudaSafeCall(cudaThreadSynchronize());
+    }
+
+
+    template <typename T> 
+    static void mergeC4_(const DevMem2D* src, DevMem2D& dst, const cudaStream_t& stream)
+    {
+        dim3 blockDim(32, 8);
+        dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
+        mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(
+                src[0].ptr, src[0].step, 
+                src[1].ptr, src[1].step,
+                src[2].ptr, src[2].step,
+                src[3].ptr, src[3].step,
+                dst.rows, dst.cols, dst.ptr, dst.step);
+        if (stream == 0)
+            cudaSafeCall(cudaThreadSynchronize());
+    }
+
+
+    extern "C" void merge_caller(const DevMem2D* src, DevMem2D& dst, 
+                                 int total_channels, int elem_size, 
+                                 const cudaStream_t& stream) 
+    {
+        static MergeFunction merge_func_tbl[] = 
+        {
+            mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,
+            mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,
+            mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,
+        };
+
+        int merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);
+        MergeFunction merge_func = merge_func_tbl[merge_func_id];
+
+        if (merge_func == 0)
+            cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);
+
+        merge_func(src, dst, stream);
+    }
+
+
+    template <typename T>
+    __global__ void mergeC2_(const uchar* src0, size_t src0_step, 
+                             const uchar* src1, size_t src1_step, 
+                             int rows, int cols, uchar* dst, size_t dst_step)
+    {
+        typedef typename TypeTraits<T>::type2 dst_type;
+
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        const T* src0_y = (const T*)(src0 + y * src0_step);
+        const T* src1_y = (const T*)(src1 + y * src1_step);
+        dst_type* dst_y = (dst_type*)(dst + y * dst_step);
+
+        if (x < cols && y < rows) 
+        {                        
+            dst_type dst_elem;
+            dst_elem.x = src0_y[x];
+            dst_elem.y = src1_y[x];
+            dst_y[x] = dst_elem;
+        }
+    }
+
+
+    template <typename T>
+    __global__ void mergeC3_(const uchar* src0, size_t src0_step, 
+                             const uchar* src1, size_t src1_step, 
+                             const uchar* src2, size_t src2_step, 
+                             int rows, int cols, uchar* dst, size_t dst_step)
+    {
+        typedef typename TypeTraits<T>::type3 dst_type;
+
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        const T* src0_y = (const T*)(src0 + y * src0_step);
+        const T* src1_y = (const T*)(src1 + y * src1_step);
+        const T* src2_y = (const T*)(src2 + y * src2_step);
+        dst_type* dst_y = (dst_type*)(dst + y * dst_step);
+
+        if (x < cols && y < rows) 
+        {                        
+            dst_type dst_elem;
+            dst_elem.x = src0_y[x];
+            dst_elem.y = src1_y[x];
+            dst_elem.z = src2_y[x];
+            dst_y[x] = dst_elem;
+        }
+    }
+
+
+    template <>
+    __global__ void mergeC3_<double>(const uchar* src0, size_t src0_step, 
+                             const uchar* src1, size_t src1_step, 
+                             const uchar* src2, size_t src2_step, 
+                             int rows, int cols, uchar* dst, size_t dst_step)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        const double* src0_y = (const double*)(src0 + y * src0_step);
+        const double* src1_y = (const double*)(src1 + y * src1_step);
+        const double* src2_y = (const double*)(src2 + y * src2_step);
+        double* dst_y = (double*)(dst + y * dst_step);
+
+        if (x < cols && y < rows) 
+        {                        
+            dst_y[3 * x] = src0_y[x];
+            dst_y[3 * x + 1] = src1_y[x];
+            dst_y[3 * x + 2] = src2_y[x];
+        }
+    }
+
+
+    template <typename T>
+    __global__ void mergeC4_(const uchar* src0, size_t src0_step, 
+                             const uchar* src1, size_t src1_step, 
+                             const uchar* src2, size_t src2_step, 
+                             const uchar* src3, size_t src3_step, 
+                             int rows, int cols, uchar* dst, size_t dst_step)
+    {
+        typedef typename TypeTraits<T>::type4 dst_type;
+
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        const T* src0_y = (const T*)(src0 + y * src0_step);
+        const T* src1_y = (const T*)(src1 + y * src1_step);
+        const T* src2_y = (const T*)(src2 + y * src2_step);
+        const T* src3_y = (const T*)(src3 + y * src3_step);
+        dst_type* dst_y = (dst_type*)(dst + y * dst_step);
+
+        if (x < cols && y < rows) 
+        {                        
+            dst_type dst_elem;
+            dst_elem.x = src0_y[x];
+            dst_elem.y = src1_y[x];
+            dst_elem.z = src2_y[x];
+            dst_elem.w = src3_y[x];
+            dst_y[x] = dst_elem;
+        }
+    }
+
+
+    template <>
+    __global__ void mergeC4_<double>(const uchar* src0, size_t src0_step, 
+                             const uchar* src1, size_t src1_step, 
+                             const uchar* src2, size_t src2_step, 
+                             const uchar* src3, size_t src3_step, 
+                             int rows, int cols, uchar* dst, size_t dst_step)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        const double* src0_y = (const double*)(src0 + y * src0_step);
+        const double* src1_y = (const double*)(src1 + y * src1_step);
+        const double* src2_y = (const double*)(src2 + y * src2_step);
+        const double* src3_y = (const double*)(src3 + y * src3_step);
+        double2* dst_y = (double2*)(dst + y * dst_step);
+
+        if (x < cols && y < rows) 
+        {                        
+            dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);
+            dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);
+        }
+    }
+
+    //------------------------------------------------------------
+    // Split
+
+
+    template <typename T> 
+    static void splitC2_(const DevMem2D& src, DevMem2D* dst, const cudaStream_t& stream)
+    {
+        dim3 blockDim(32, 8);
+        dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
+        splitC2_<T><<<gridDim, blockDim, 0, stream>>>(
+                src.ptr, src.step, src.rows, src.cols,
+                dst[0].ptr, dst[0].step, 
+                dst[1].ptr, dst[1].step);
+        if (stream == 0)
+            cudaSafeCall(cudaThreadSynchronize());
+    }
+
+
+    template <typename T> 
+    static void splitC3_(const DevMem2D& src, DevMem2D* dst, const cudaStream_t& stream)
+    {
+        dim3 blockDim(32, 8);
+        dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
+        splitC3_<T><<<gridDim, blockDim, 0, stream>>>(
+                src.ptr, src.step, src.rows, src.cols,
+                dst[0].ptr, dst[0].step, 
+                dst[1].ptr, dst[1].step,
+                dst[2].ptr, dst[2].step);         
+        if (stream == 0)
+            cudaSafeCall(cudaThreadSynchronize());
+    }
+
+
+    template <typename T> 
+    static void splitC4_(const DevMem2D& src, DevMem2D* dst, const cudaStream_t& stream)
+    {
+        dim3 blockDim(32, 8);
+        dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
+        splitC4_<T><<<gridDim, blockDim, 0, stream>>>(
+                 src.ptr, src.step, src.rows, src.cols,
+                 dst[0].ptr, dst[0].step, 
+                 dst[1].ptr, dst[1].step,
+                 dst[2].ptr, dst[2].step,
+                 dst[3].ptr, dst[3].step);
+        if (stream == 0)
+            cudaSafeCall(cudaThreadSynchronize());
+    }
+
+
+    extern "C" void split_caller(const DevMem2D& src, DevMem2D* dst, 
+                                 int num_channels, int elem_size1, 
+                                 const cudaStream_t& stream) 
+    {
+        static SplitFunction split_func_tbl[] = 
+        {
+            splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,
+            splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,
+            splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,
+        };
+
+        int split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);
+        SplitFunction split_func = split_func_tbl[split_func_id];
+
+        if (split_func == 0)
+            cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);
+
+        split_func(src, dst, stream);
+    }
+
+
+    template <typename T>
+    __global__ void splitC2_(const uchar* src, size_t src_step, 
+                            int rows, int cols,
+                            uchar* dst0, size_t dst0_step,
+                            uchar* dst1, size_t dst1_step)
+    {
+        typedef typename TypeTraits<T>::type2 src_type;
+
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        const src_type* src_y = (const src_type*)(src + y * src_step);
+        T* dst0_y = (T*)(dst0 + y * dst0_step);
+        T* dst1_y = (T*)(dst1 + y * dst1_step);
+
+        if (x < cols && y < rows) 
+        {
+            src_type src_elem = src_y[x];
+            dst0_y[x] = src_elem.x;
+            dst1_y[x] = src_elem.y;
+        }
+    }
+
+
+    template <typename T>
+    __global__ void splitC3_(const uchar* src, size_t src_step, 
+                            int rows, int cols,
+                            uchar* dst0, size_t dst0_step,
+                            uchar* dst1, size_t dst1_step,
+                            uchar* dst2, size_t dst2_step)
+    {
+        typedef typename TypeTraits<T>::type3 src_type;
+
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        const src_type* src_y = (const src_type*)(src + y * src_step);
+        T* dst0_y = (T*)(dst0 + y * dst0_step);
+        T* dst1_y = (T*)(dst1 + y * dst1_step);
+        T* dst2_y = (T*)(dst2 + y * dst2_step);
+
+        if (x < cols && y < rows) 
+        {
+            src_type src_elem = src_y[x];
+            dst0_y[x] = src_elem.x;
+            dst1_y[x] = src_elem.y;
+            dst2_y[x] = src_elem.z;
+        }
+    }
+
+
+    template <>
+    __global__ void splitC3_<double>(
+            const uchar* src, size_t src_step, int rows, int cols,
+            uchar* dst0, size_t dst0_step,
+            uchar* dst1, size_t dst1_step,
+            uchar* dst2, size_t dst2_step)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        const double* src_y = (const double*)(src + y * src_step);
+        double* dst0_y = (double*)(dst0 + y * dst0_step);
+        double* dst1_y = (double*)(dst1 + y * dst1_step);
+        double* dst2_y = (double*)(dst2 + y * dst2_step);
+
+        if (x < cols && y < rows) 
+        {
+            dst0_y[x] = src_y[3 * x];
+            dst1_y[x] = src_y[3 * x + 1];
+            dst2_y[x] = src_y[3 * x + 2];
+        }
+    }
+
+
+    template <typename T>
+    __global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,
+                            uchar* dst0, size_t dst0_step,
+                            uchar* dst1, size_t dst1_step,
+                            uchar* dst2, size_t dst2_step,
+                            uchar* dst3, size_t dst3_step)
+    {
+        typedef typename TypeTraits<T>::type4 src_type;
+
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        const src_type* src_y = (const src_type*)(src + y * src_step);
+        T* dst0_y = (T*)(dst0 + y * dst0_step);
+        T* dst1_y = (T*)(dst1 + y * dst1_step);
+        T* dst2_y = (T*)(dst2 + y * dst2_step);
+        T* dst3_y = (T*)(dst3 + y * dst3_step);
+
+        if (x < cols && y < rows) 
+        {
+            src_type src_elem = src_y[x];
+            dst0_y[x] = src_elem.x;
+            dst1_y[x] = src_elem.y;
+            dst2_y[x] = src_elem.z;
+            dst3_y[x] = src_elem.w;
+        }
+    }
+
+
+    template <>
+    __global__ void splitC4_<double>(
+            const uchar* src, size_t src_step, int rows, int cols,
+            uchar* dst0, size_t dst0_step,
+            uchar* dst1, size_t dst1_step,
+            uchar* dst2, size_t dst2_step,
+            uchar* dst3, size_t dst3_step)
+    {
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        const double2* src_y = (const double2*)(src + y * src_step);
+        double* dst0_y = (double*)(dst0 + y * dst0_step);
+        double* dst1_y = (double*)(dst1 + y * dst1_step);
+        double* dst2_y = (double*)(dst2 + y * dst2_step);
+        double* dst3_y = (double*)(dst3 + y * dst3_step);
+
+        if (x < cols && y < rows) 
+        {
+            double2 src_elem1 = src_y[2 * x];
+            double2 src_elem2 = src_y[2 * x + 1];
+            dst0_y[x] = src_elem1.x;
+            dst1_y[x] = src_elem1.y;
+            dst2_y[x] = src_elem2.x;
+            dst3_y[x] = src_elem2.y;
+        }
+    }
+
+}}} // namespace cv::gpu::split_merge
--- a/modules/gpu/src/split_merge.cpp
+++ b/modules/gpu/src/split_merge.cpp
@ -0,0 +1,151 @@
+#include "precomp.hpp"
+#include <vector>
+
+using namespace std;
+
+#if !defined (HAVE_CUDA)
+
+void cv::gpu::merge(const GpuMat* /*src*/, size_t /*count*/, GpuMat& /*dst*/) { throw_nogpu(); }
+void cv::gpu::merge(const vector<GpuMat>& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
+void cv::gpu::merge(const GpuMat* /*src*/, size_t /*count*/, GpuMat& /*dst*/, const Stream& /*stream*/) { throw_nogpu(); }
+void cv::gpu::merge(const vector<GpuMat>& /*src*/, GpuMat& /*dst*/, const Stream& /*stream*/) { throw_nogpu(); }
+void cv::gpu::split(const GpuMat& /*src*/, GpuMat* /*dst*/) { throw_nogpu(); }
+void cv::gpu::split(const GpuMat& /*src*/, vector<GpuMat>& /*dst*/) { throw_nogpu(); }
+void cv::gpu::split(const GpuMat& /*src*/, GpuMat* /*dst*/, const Stream& /*stream*/) { throw_nogpu(); }
+void cv::gpu::split(const GpuMat& /*src*/, vector<GpuMat>& /*dst*/, const Stream& /*stream*/) { throw_nogpu(); }
+
+#else /* !defined (HAVE_CUDA) */
+
+namespace cv { namespace gpu { namespace split_merge 
+{    
+    extern "C" void merge_caller(const DevMem2D* src, DevMem2D& dst, 
+                                 int total_channels, int elem_size, 
+                                 const cudaStream_t& stream);
+
+    extern "C" void split_caller(const DevMem2D& src, DevMem2D* dst, 
+                                 int num_channels, int elem_size1, 
+                                 const cudaStream_t& stream);
+
+    void merge(const GpuMat* src, size_t n, GpuMat& dst, const cudaStream_t& stream) 
+    {
+        CV_Assert(src);
+        CV_Assert(n > 0);
+
+        int depth = src[0].depth();
+        Size size = src[0].size();
+
+        bool single_channel_only = true;
+        int total_channels = 0;
+
+        for (size_t i = 0; i < n; ++i)
+        {
+            CV_Assert(src[i].size() == size);
+            CV_Assert(src[i].depth() == depth);
+            single_channel_only = single_channel_only && src[i].channels() == 1;
+            total_channels += src[i].channels();
+        }
+
+        CV_Assert(single_channel_only);
+        CV_Assert(total_channels <= 4);
+
+        if (total_channels == 1)  
+            src[0].copyTo(dst);
+        else 
+        {
+            dst.create(size, CV_MAKETYPE(depth, total_channels));
+
+            DevMem2D src_as_devmem[4];
+            for(size_t i = 0; i < n; ++i)
+                src_as_devmem[i] = src[i];
+
+            split_merge::merge_caller(src_as_devmem, (DevMem2D)dst, 
+                                      total_channels, CV_ELEM_SIZE(depth), 
+                                      stream);
+        }   
+    }
+
+
+    void split(const GpuMat& src, GpuMat* dst, const cudaStream_t& stream) 
+    {
+        CV_Assert(dst);
+
+        int depth = src.depth();
+        int num_channels = src.channels();
+        Size size = src.size();
+
+        if (num_channels == 1)
+        {
+            src.copyTo(dst[0]);
+            return;
+        }
+
+        for (int i = 0; i < num_channels; ++i)
+            dst[i].create(src.size(), depth);
+
+        CV_Assert(num_channels <= 4);
+
+        DevMem2D dst_as_devmem[4];
+        for (int i = 0; i < num_channels; ++i)
+            dst_as_devmem[i] = dst[i];
+
+        split_merge::split_caller((DevMem2D)src, dst_as_devmem, 
+                                  num_channels, src.elemSize1(), 
+                                  stream);
+    }
+
+
+}}}
+
+
+void cv::gpu::merge(const GpuMat* src, size_t n, GpuMat& dst) 
+{ 
+    split_merge::merge(src, n, dst, 0);
+}
+
+
+void cv::gpu::merge(const vector<GpuMat>& src, GpuMat& dst) 
+{
+    split_merge::merge(&src[0], src.size(), dst, 0);
+}
+
+
+void cv::gpu::merge(const GpuMat* src, size_t n, GpuMat& dst, const Stream& stream) 
+{ 
+    split_merge::merge(src, n, dst, StreamAccessor::getStream(stream));
+}
+
+
+void cv::gpu::merge(const vector<GpuMat>& src, GpuMat& dst, const Stream& stream) 
+{
+    split_merge::merge(&src[0], src.size(), dst, StreamAccessor::getStream(stream));
+}
+
+
+void cv::gpu::split(const GpuMat& src, GpuMat* dst) 
+{
+    split_merge::split(src, dst, 0);
+}
+
+
+void cv::gpu::split(const GpuMat& src, vector<GpuMat>& dst) 
+{
+    dst.resize(src.channels());
+    if(src.channels() > 0)
+        split_merge::split(src, &dst[0], 0);
+}
+
+
+void cv::gpu::split(const GpuMat& src, GpuMat* dst, const Stream& stream) 
+{
+    split_merge::split(src, dst, StreamAccessor::getStream(stream));
+}
+
+
+void cv::gpu::split(const GpuMat& src, vector<GpuMat>& dst, const Stream& stream) 
+{
+    dst.resize(src.channels());
+    if(src.channels() > 0)
+        split_merge::split(src, &dst[0], StreamAccessor::getStream(stream));
+}
+
+#endif /* !defined (HAVE_CUDA) */
--- a/tests/gpu/src/split_merge.cpp
+++ b/tests/gpu/src/split_merge.cpp
@ -0,0 +1,275 @@
+#include "gputest.hpp"
+#include <opencv2/opencv.hpp>
+#include <opencv2/gpu/gpu.hpp>
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+using namespace std;
+using namespace cv;
+
+struct CV_MergeTest : public CvTest
+{
+    CV_MergeTest() : CvTest("GPU-Merge", "merge") {}
+    void can_merge(size_t rows, size_t cols);
+    void can_merge_submatrixes(size_t rows, size_t cols);
+    void run(int);
+} merge_test;
+
+
+void CV_MergeTest::can_merge(size_t rows, size_t cols)
+{
+    for (size_t num_channels = 1; num_channels <= 4; ++num_channels)
+        for (size_t depth = CV_8U; depth <= CV_64F; ++depth)
+        {
+            vector<Mat> src;
+            for (size_t i = 0; i < num_channels; ++i)
+                src.push_back(Mat(rows, cols, depth, Scalar::all(static_cast<double>(i))));
+            
+            Mat dst(rows, cols, CV_MAKETYPE(depth, num_channels));   
+
+            cv::merge(src, dst);   
+
+            vector<gpu::GpuMat> dev_src;
+            for (size_t i = 0; i < num_channels; ++i)
+                dev_src.push_back(gpu::GpuMat(src[i]));
+
+            gpu::GpuMat dev_dst(rows, cols, CV_MAKETYPE(depth, num_channels));
+            cv::gpu::merge(dev_src, dev_dst); 
+
+            Mat host_dst = dev_dst;
+
+            double err = norm(dst, host_dst, NORM_INF);
+
+            if (err > 1e-3)
+            {
+                //ts->printf(CvTS::CONSOLE, "\nNorm: %f\n", err);
+                //ts->printf(CvTS::CONSOLE, "Depth: %d\n", depth);
+                //ts->printf(CvTS::CONSOLE, "Rows: %d\n", rows);
+                //ts->printf(CvTS::CONSOLE, "Cols: %d\n", cols);
+                //ts->printf(CvTS::CONSOLE, "NumChannels: %d\n", num_channels);
+                ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
+                return;
+            }
+        }
+}
+
+
+void CV_MergeTest::can_merge_submatrixes(size_t rows, size_t cols)
+{
+    for (size_t num_channels = 1; num_channels <= 4; ++num_channels)
+        for (size_t depth = CV_8U; depth <= CV_64F; ++depth)
+        {
+            vector<Mat> src;
+            for (size_t i = 0; i < num_channels; ++i) 
+            {
+                Mat m(rows * 2, cols * 2, depth, Scalar::all(static_cast<double>(i)));
+                src.push_back(m(Range(rows / 2, rows / 2 + rows), Range(cols / 2, cols / 2 + cols)));
+            }
+
+            Mat dst(rows, cols, CV_MAKETYPE(depth, num_channels));   
+
+            cv::merge(src, dst);   
+
+            vector<gpu::GpuMat> dev_src;
+            for (size_t i = 0; i < num_channels; ++i)
+                dev_src.push_back(gpu::GpuMat(src[i]));
+
+            gpu::GpuMat dev_dst(rows, cols, CV_MAKETYPE(depth, num_channels));
+            cv::gpu::merge(dev_src, dev_dst);
+
+            Mat host_dst = dev_dst;
+
+            double err = norm(dst, host_dst, NORM_INF);
+
+            if (err > 1e-3)
+            {
+                //ts->printf(CvTS::CONSOLE, "\nNorm: %f\n", err);
+                //ts->printf(CvTS::CONSOLE, "Depth: %d\n", depth);
+                //ts->printf(CvTS::CONSOLE, "Rows: %d\n", rows);
+                //ts->printf(CvTS::CONSOLE, "Cols: %d\n", cols);
+                //ts->printf(CvTS::CONSOLE, "NumChannels: %d\n", num_channels);
+                ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
+                return;
+            }
+        }
+}
+
+
+void CV_MergeTest::run(int) 
+{
+    try
+    {
+        can_merge(1, 1);
+        can_merge(1, 7);
+        can_merge(53, 7);
+        can_merge_submatrixes(1, 1);
+        can_merge_submatrixes(1, 7);
+        can_merge_submatrixes(53, 7);
+    }
+    catch(const cv::Exception& e)
+    {
+        if (!check_and_treat_gpu_exception(e, ts))
+            throw;        
+    }    
+}
+
+
+struct CV_SplitTest : public CvTest
+{
+    CV_SplitTest() : CvTest("GPU-Split", "split") {}
+    void can_split(size_t rows, size_t cols);    
+    void can_split_submatrix(size_t rows, size_t cols);
+    void run(int);
+} split_test;
+
+
+void CV_SplitTest::can_split(size_t rows, size_t cols)
+{
+    for (size_t num_channels = 1; num_channels <= 4; ++num_channels)
+        for (size_t depth = CV_8U; depth <= CV_64F; ++depth)
+        {
+            Mat src(rows, cols, CV_MAKETYPE(depth, num_channels), Scalar(1.0, 2.0, 3.0, 4.0));   
+            vector<Mat> dst;
+            cv::split(src, dst);   
+
+            gpu::GpuMat dev_src(src);
+            vector<gpu::GpuMat> dev_dst;
+            cv::gpu::split(dev_src, dev_dst);
+
+            if (dev_dst.size() != dst.size())
+            {
+                ts->printf(CvTS::CONSOLE, "Bad output sizes");
+                ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
+            }
+
+            for (size_t i = 0; i < num_channels; ++i)
+            {
+                Mat host_dst = dev_dst[i];
+                double err = norm(dst[i], host_dst, NORM_INF);
+
+                if (err > 1e-3)
+                {
+                    //ts->printf(CvTS::CONSOLE, "\nNorm: %f\n", err);
+                    //ts->printf(CvTS::CONSOLE, "Depth: %d\n", depth);
+                    //ts->printf(CvTS::CONSOLE, "Rows: %d\n", rows);
+                    //ts->printf(CvTS::CONSOLE, "Cols: %d\n", cols);
+                    //ts->printf(CvTS::CONSOLE, "NumChannels: %d\n", num_channels);
+                    ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
+                    return;
+                }
+            }
+        }
+}
+
+
+
+void CV_SplitTest::can_split_submatrix(size_t rows, size_t cols)
+{
+    for (size_t num_channels = 1; num_channels <= 4; ++num_channels)
+        for (size_t depth = CV_8U; depth <= CV_64F; ++depth)
+        {
+            Mat src_data(rows * 2, cols * 2, CV_MAKETYPE(depth, num_channels), Scalar(1.0, 2.0, 3.0, 4.0));   
+            Mat src(src_data(Range(rows / 2, rows / 2 + rows), Range(cols / 2, cols / 2 + cols)));
+            vector<Mat> dst;
+            cv::split(src, dst);   
+
+            gpu::GpuMat dev_src(src);
+            vector<gpu::GpuMat> dev_dst;
+            cv::gpu::split(dev_src, dev_dst);
+
+            if (dev_dst.size() != dst.size())
+            {
+                ts->printf(CvTS::CONSOLE, "Bad output sizes");
+                ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
+            }
+
+            for (size_t i = 0; i < num_channels; ++i)
+            {
+                Mat host_dst = dev_dst[i];
+                double err = norm(dst[i], host_dst, NORM_INF);
+
+                if (err > 1e-3)
+                {
+                    //ts->printf(CvTS::CONSOLE, "\nNorm: %f\n", err);
+                    //ts->printf(CvTS::CONSOLE, "Depth: %d\n", depth);
+                    //ts->printf(CvTS::CONSOLE, "Rows: %d\n", rows);
+                    //ts->printf(CvTS::CONSOLE, "Cols: %d\n", cols);
+                    //ts->printf(CvTS::CONSOLE, "NumChannels: %d\n", num_channels);
+                    ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
+                    return;
+                }
+            }
+        }
+}
+
+
+void CV_SplitTest::run(int)
+{
+    try 
+    {
+        can_split(1, 1);
+        can_split(1, 7);
+        can_split(7, 53);
+        can_split_submatrix(1, 1);
+        can_split_submatrix(1, 7);
+        can_split_submatrix(7, 53);
+    }
+    catch(const cv::Exception& e)
+    {
+        if (!check_and_treat_gpu_exception(e, ts))
+            throw;        
+    }    
+}
+
+
+struct CV_SplitMergeTest : public CvTest
+{
+    CV_SplitMergeTest() : CvTest("GPU-SplitMerge", "split merge") {}
+    void can_split_merge(size_t rows, size_t cols);    
+    void run(int);
+} split_merge_test;
+
+
+void CV_SplitMergeTest::can_split_merge(size_t rows, size_t cols) {
+    for (size_t num_channels = 1; num_channels <= 4; ++num_channels)
+        for (size_t depth = CV_8U; depth <= CV_64F; ++depth)
+        {
+            Mat orig(rows, cols, CV_MAKETYPE(depth, num_channels), Scalar(1.0, 2.0, 3.0, 4.0));   
+            gpu::GpuMat dev_orig(orig);
+            vector<gpu::GpuMat> dev_vec;
+            cv::gpu::split(dev_orig, dev_vec);
+
+            gpu::GpuMat dev_final(rows, cols, CV_MAKETYPE(depth, num_channels));
+            cv::gpu::merge(dev_vec, dev_final);
+
+            double err = cv::norm((Mat)dev_orig, (Mat)dev_final, NORM_INF);
+            if (err > 1e-3)
+            {
+                //ts->printf(CvTS::CONSOLE, "\nNorm: %f\n", err);
+                //ts->printf(CvTS::CONSOLE, "Depth: %d\n", depth);
+                //ts->printf(CvTS::CONSOLE, "Rows: %d\n", rows);
+                //ts->printf(CvTS::CONSOLE, "Cols: %d\n", cols);
+                //ts->printf(CvTS::CONSOLE, "NumChannels: %d\n", num_channels);
+                ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
+                return;
+            }
+        }
+}
+
+
+void CV_SplitMergeTest::run(int) 
+{
+    try 
+    {
+        can_split_merge(1, 1);
+        can_split_merge(1, 7);
+        can_split_merge(7, 53);
+    }
+    catch(const cv::Exception& e)
+    {
+        if (!check_and_treat_gpu_exception(e, ts))
+            throw;        
+    }    
+}