Merge remote-tracking branch 'origin/2.4' into merge-2.4

Conflicts: modules/gpuwarping/src/cuda/resize.cu modules/gpuwarping/src/resize.cpp modules/gpuwarping/test/test_resize.cpp modules/ocl/perf/main.cpp modules/ocl/perf/perf_calib3d.cpp modules/ocl/perf/perf_canny.cpp modules/ocl/perf/perf_color.cpp modules/ocl/perf/perf_haar.cpp modules/ocl/perf/perf_match_template.cpp modules/ocl/perf/perf_precomp.cpp modules/ocl/perf/perf_precomp.hpp
12 years ago · 2d6f35d6ed
parent dd25f416a6 525b6eca2e
commit 2d6f35d6ed
45 changed files with 2704 additions and 4102 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -460,6 +460,8 @@ include(cmake/OpenCVGenAndroidMK.cmake)
 # Generate OpenCVConfig.cmake and OpenCVConfig-version.cmake for cmake projects
 include(cmake/OpenCVGenConfig.cmake)

+# Generate Info.plist for the iOS framework
+include(cmake/OpenCVGenInfoPlist.cmake)

 # ----------------------------------------------------------------------------
 # Summary:
--- a/cmake/OpenCVCRTLinkage.cmake
+++ b/cmake/OpenCVCRTLinkage.cmake
@ -26,8 +26,8 @@ else()
  set(HAVE_MSVC2012 TRUE)
 endif()

-TRY_COMPILE(HAVE_WINRT_SDK
-  "${OPENCV_BINARY_DIR}/CMakeFiles/CMakeTmp"
+try_compile(HAVE_WINRT_SDK
+  "${OpenCV_BINARY_DIR}"
  "${OpenCV_SOURCE_DIR}/cmake/checks/winrttest.cpp")

 if (ENABLE_WINRT_MODE AND HAVE_WINRT_SDK AND HAVE_MSVC2012 AND HAVE_MSPDK)
--- a/cmake/OpenCVCompilerOptions.cmake
+++ b/cmake/OpenCVCompilerOptions.cmake
@ -233,6 +233,10 @@ if(MSVC)
      set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /fp:fast") # !! important - be on the same wave with x64 compilers
    endif()
  endif()
+
+  if(OPENCV_WARNINGS_ARE_ERRORS)
+    set(OPENCV_EXTRA_FLAGS "${OPENCV_EXTRA_FLAGS} /WX")
+  endif()
 endif()

 # Extra link libs if the user selects building static libs:
--- a/cmake/OpenCVFindLibsGUI.cmake
+++ b/cmake/OpenCVFindLibsGUI.cmake
@ -5,12 +5,11 @@
 #--- Win32 UI ---
 ocv_clear_vars(HAVE_WIN32UI)
 if(WITH_WIN32UI)
-  TRY_COMPILE(HAVE_WIN32UI
-    "${OPENCV_BINARY_DIR}/CMakeFiles/CMakeTmp"
+  try_compile(HAVE_WIN32UI
+    "${OpenCV_BINARY_DIR}"
    "${OpenCV_SOURCE_DIR}/cmake/checks/win32uitest.cpp"
-    CMAKE_FLAGS "\"user32.lib\" \"gdi32.lib\""
-    OUTPUT_VARIABLE OUTPUT)
-endif(WITH_WIN32UI)
+    CMAKE_FLAGS "-DLINK_LIBRARIES:STRING=user32;gdi32")
+endif()

 # --- QT4 ---
 ocv_clear_vars(HAVE_QT HAVE_QT5)
--- a/cmake/OpenCVFindLibsGrfmt.cmake
+++ b/cmake/OpenCVFindLibsGrfmt.cmake
@ -149,7 +149,7 @@ if(WITH_JASPER)
 endif()

 # --- libpng (optional, should be searched after zlib) ---
-if(WITH_PNG AND NOT IOS)
+if(WITH_PNG)
  if(BUILD_PNG)
    ocv_clear_vars(PNG_FOUND)
  else()
--- a/cmake/OpenCVFindLibsVideo.cmake
+++ b/cmake/OpenCVFindLibsVideo.cmake
@ -3,13 +3,12 @@
 # ----------------------------------------------------------------------------

 ocv_clear_vars(HAVE_VFW)
-if (WITH_VFW)
-  TRY_COMPILE(HAVE_VFW
-    "${OPENCV_BINARY_DIR}/CMakeFiles/CMakeTmp"
+if(WITH_VFW)
+  # Sets HAVE_VFW when vfwtest.cpp compiles and links against vfw32.
+  try_compile(HAVE_VFW
+    "${OpenCV_BINARY_DIR}"
    "${OpenCV_SOURCE_DIR}/cmake/checks/vfwtest.cpp"
-    CMAKE_FLAGS "-DLINK_LIBRARIES:STRING=vfw32"
-    OUTPUT_VARIABLE OUTPUT)
- endif(WITH_VFW)
+    CMAKE_FLAGS "-DLINK_LIBRARIES:STRING=vfw32")
+endif()

 # --- GStreamer ---
 ocv_clear_vars(HAVE_GSTREAMER)
@ -271,8 +270,10 @@ if(WITH_AVFOUNDATION)
 endif()

 # --- QuickTime ---
-if(WITH_QUICKTIME)
-  set(HAVE_QUICKTIME YES)
-elseif(APPLE)
-  set(HAVE_QTKIT YES)
+if (NOT IOS)
+  if(WITH_QUICKTIME)
+    set(HAVE_QUICKTIME YES)
+  elseif(APPLE)
+    set(HAVE_QTKIT YES)
+  endif()
 endif()
--- a/cmake/OpenCVGenInfoPlist.cmake
+++ b/cmake/OpenCVGenInfoPlist.cmake
@ -0,0 +1,4 @@
+if(IOS)
+  configure_file("${OpenCV_SOURCE_DIR}/platforms/ios/Info.plist.in"
+                 "${CMAKE_BINARY_DIR}/ios/Info.plist")
+endif()
--- a/cmake/OpenCVUtils.cmake
+++ b/cmake/OpenCVUtils.cmake
@ -77,7 +77,7 @@ MACRO(ocv_check_compiler_flag LANG FLAG RESULT)
    if(_fname)
      MESSAGE(STATUS "Performing Test ${RESULT}")
      TRY_COMPILE(${RESULT}
-        ${CMAKE_BINARY_DIR}
+        "${CMAKE_BINARY_DIR}"
        "${_fname}"
        COMPILE_DEFINITIONS "${FLAG}"
        OUTPUT_VARIABLE OUTPUT)
--- a/modules/contrib/src/inputoutput.cpp
+++ b/modules/contrib/src/inputoutput.cpp
@ -136,7 +136,7 @@ namespace cv
                        char* fname;
                    #ifdef HAVE_WINRT
                        char fname_tmp[MAX_PATH];
-                        size_t copied = wcstombs(fname, FindFileData.cFileName, MAX_PATH);
+                        size_t copied = wcstombs(fname_tmp, FindFileData.cFileName, MAX_PATH);
                        CV_Assert((copied != MAX_PATH) && (copied != (size_t)-1));
                        fname = fname_tmp;
                    #else
--- a/modules/core/doc/basic_structures.rst
+++ b/modules/core/doc/basic_structures.rst
@ -1369,7 +1369,7 @@ The method makes a new header for the specified row span of the matrix. Similarl

 Mat::colRange
 -------------
-Creates a matrix header for the specified row span.
+Creates a matrix header for the specified column span.

 .. ocv:function:: Mat Mat::colRange(int startcol, int endcol) const

--- a/modules/core/src/parallel.cpp
+++ b/modules/core/src/parallel.cpp
@ -144,9 +144,9 @@ namespace
        {
            cv::Range r;
            r.start = (int)(wholeRange.start +
-                            ((size_t)sr.start*(wholeRange.end - wholeRange.start) + nstripes/2)/nstripes);
+                            ((uint64)sr.start*(wholeRange.end - wholeRange.start) + nstripes/2)/nstripes);
            r.end = sr.end >= nstripes ? wholeRange.end : (int)(wholeRange.start +
-                            ((size_t)sr.end*(wholeRange.end - wholeRange.start) + nstripes/2)/nstripes);
+                            ((uint64)sr.end*(wholeRange.end - wholeRange.start) + nstripes/2)/nstripes);
            (*body)(r);
        }
        cv::Range stripeRange() const { return cv::Range(0, nstripes); }
--- a/modules/gpuoptflow/src/tvl1flow.cpp
+++ b/modules/gpuoptflow/src/tvl1flow.cpp
@ -222,7 +222,8 @@ void cv::gpu::OpticalFlowDual_TVL1_GPU::procOneScale(const GpuMat& I0, const Gpu
        {
            estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, u1, u2, diff, l_t, static_cast<float>(theta));

-            error = gpu::sum(diff, norm_buf)[0];
+            if (epsilon > 0)
+                error = gpu::sum(diff, norm_buf)[0];

            estimateDualVariables(u1, u2, p11, p12, p21, p22, taut);
        }
--- a/modules/gpuwarping/src/cuda/resize.cu
+++ b/modules/gpuwarping/src/cuda/resize.cu
@ -49,254 +49,434 @@
 #include "opencv2/core/cuda/vec_math.hpp"
 #include "opencv2/core/cuda/saturate_cast.hpp"
 #include "opencv2/core/cuda/filters.hpp"
-#include "opencv2/core/cuda/scan.hpp"

 namespace cv { namespace gpu { namespace cudev
 {
-    namespace imgproc
+    // kernels
+
+    template <typename T> __global__ void resize_nearest(const PtrStep<T> src, PtrStepSz<T> dst, const float fy, const float fx)
    {
-        template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, PtrStepSz<T> dst)
+        const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
+        const int dst_y = blockDim.y * blockIdx.y + threadIdx.y;
+
+        if (dst_x < dst.cols && dst_y < dst.rows)
        {
-            const int x = blockDim.x * blockIdx.x + threadIdx.x;
-            const int y = blockDim.y * blockIdx.y + threadIdx.y;
+            const float src_x = dst_x * fx;
+            const float src_y = dst_y * fy;

-            if (x < dst.cols && y < dst.rows)
-            {
-                const float xcoo = x * fx;
-                const float ycoo = y * fy;
+            dst(dst_y, dst_x) = src(__float2int_rz(src_y), __float2int_rz(src_x));
+        }
+    }

-                dst(y, x) = saturate_cast<T>(src(ycoo, xcoo));
-            }
+    template <typename T> __global__ void resize_linear(const PtrStepSz<T> src, PtrStepSz<T> dst, const float fy, const float fx)
+    {
+        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
+
+        const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
+        const int dst_y = blockDim.y * blockIdx.y + threadIdx.y;
+
+        if (dst_x < dst.cols && dst_y < dst.rows)
+        {
+            const float src_x = dst_x * fx;
+            const float src_y = dst_y * fy;
+
+            work_type out = VecTraits<work_type>::all(0);
+
+            const int x1 = __float2int_rd(src_x);
+            const int y1 = __float2int_rd(src_y);
+            const int x2 = x1 + 1;
+            const int y2 = y1 + 1;
+            const int x2_read = ::min(x2, src.cols - 1);
+            const int y2_read = ::min(y2, src.rows - 1);
+
+            T src_reg = src(y1, x1);
+            out = out + src_reg * ((x2 - src_x) * (y2 - src_y));
+
+            src_reg = src(y1, x2_read);
+            out = out + src_reg * ((src_x - x1) * (y2 - src_y));
+
+            src_reg = src(y2_read, x1);
+            out = out + src_reg * ((x2 - src_x) * (src_y - y1));
+
+            src_reg = src(y2_read, x2_read);
+            out = out + src_reg * ((src_x - x1) * (src_y - y1));
+
+            dst(dst_y, dst_x) = saturate_cast<T>(out);
        }
+    }
+
+    template <class Ptr2D, typename T> __global__ void resize(const Ptr2D src, PtrStepSz<T> dst, const float fy, const float fx)
+    {
+        const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
+        const int dst_y = blockDim.y * blockIdx.y + threadIdx.y;

-        template <typename Ptr2D, typename T> __global__ void resize_area(const Ptr2D src, float fx, float fy, PtrStepSz<T> dst)
+        if (dst_x < dst.cols && dst_y < dst.rows)
        {
-            const int x = blockDim.x * blockIdx.x + threadIdx.x;
-            const int y = blockDim.y * blockIdx.y + threadIdx.y;
+            const float src_x = dst_x * fx;
+            const float src_y = dst_y * fy;

-            if (x < dst.cols && y < dst.rows)
-            {
-                dst(y, x) = saturate_cast<T>(src(y, x));
-            }
+            dst(dst_y, dst_x) = src(src_y, src_x);
        }
+    }

-        template <template <typename> class Filter, typename T> struct ResizeDispatcherStream
+    template <typename Ptr2D, typename T> __global__ void resize_area(const Ptr2D src, PtrStepSz<T> dst)
+    {
+        const int x = blockDim.x * blockIdx.x + threadIdx.x;
+        const int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+        if (x < dst.cols && y < dst.rows)
        {
-            static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
-            {
-                dim3 block(32, 8);
-                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+            dst(y, x) = src(y, x);
+        }
+    }

-                BrdReplicate<T> brd(src.rows, src.cols);
-                BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
-                Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc, fx, fy);
+    // textures

-                resize<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
-                cudaSafeCall( cudaGetLastError() );
-            }
+    template <typename T> struct TextureAccessor;
+
+    #define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \
+        texture<type, cudaTextureType2D, cudaReadModeElementType> tex_resize_##type (0, cudaFilterModePoint, cudaAddressModeClamp); \
+        template <> struct TextureAccessor<type> \
+        { \
+            typedef type elem_type; \
+            typedef int index_type; \
+            int xoff; \
+            int yoff; \
+            __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
+            { \
+                return tex2D(tex_resize_##type, x + xoff, y + yoff); \
+            } \
+            __host__ static void bind(const PtrStepSz<type>& mat) \
+            { \
+                bindTexture(&tex_resize_##type, mat); \
+            } \
        };

-        template <typename T> struct ResizeDispatcherStream<AreaFilter, T>
+    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
+    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)
+
+    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)
+    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)
+
+    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)
+    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)
+
+    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)
+    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)
+
+    #undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX
+
+    template <typename T>
+    TextureAccessor<T> texAccessor(const PtrStepSz<T>& mat, int yoff, int xoff)
+    {
+        TextureAccessor<T>::bind(mat);
+
+        TextureAccessor<T> t;
+        t.xoff = xoff;
+        t.yoff = yoff;
+
+        return t;
+    }
+
+    // callers for nearest interpolation
+
+    template <typename T>
+    void call_resize_nearest_glob(const PtrStepSz<T>& src, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+        resize_nearest<<<grid, block, 0, stream>>>(src, dst, fy, fx);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    template <typename T>
+    void call_resize_nearest_tex(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+        resize<<<grid, block>>>(texAccessor(srcWhole, yoff, xoff), dst, fy, fx);
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    // callers for linear interpolation
+
+    // Launches the resize_linear kernel with one thread per destination pixel.
+    // The kernel is enqueued on `stream`; when `stream` is the default stream (0)
+    // the device is synchronized before returning.
+    template <typename T>
+    void call_resize_linear_glob(const PtrStepSz<T>& src, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+        // Pass the stream explicitly: a non-zero stream skips the synchronize below,
+        // so the launch must be ordered on the caller's stream, not the default one.
+        resize_linear<<<grid, block, 0, stream>>>(src, dst, fy, fx);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    template <typename T>
+    void call_resize_linear_tex(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+        if (srcWhole.data == src.data)
        {
-            static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
-            {
-                dim3 block(32, 8);
-                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+            TextureAccessor<T> texSrc = texAccessor(src, 0, 0);
+            LinearFilter< TextureAccessor<T> > filteredSrc(texSrc);

-                BrdConstant<T> brd(src.rows, src.cols);
-                BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd);
-                AreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy);
-                resize_area<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
-                cudaSafeCall( cudaGetLastError() );
-                if (stream == 0)
-                    cudaSafeCall( cudaDeviceSynchronize() );
-            }
-        };
+            resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
+        }
+        else
+        {
+            TextureAccessor<T> texSrc = texAccessor(srcWhole, yoff, xoff);
+
+            BrdReplicate<T> brd(src.rows, src.cols);
+            BorderReader<TextureAccessor<T>, BrdReplicate<T> > brdSrc(texSrc, brd);
+            LinearFilter< BorderReader<TextureAccessor<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
+
+            resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
+        }
+
+        cudaSafeCall( cudaGetLastError() );
+
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    // callers for cubic interpolation
+
+    template <typename T>
+    void call_resize_cubic_glob(const PtrStepSz<T>& src, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+        BrdReplicate<T> brd(src.rows, src.cols);
+        BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
+        CubicFilter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
+
+        resize<<<grid, block, 0, stream>>>(filteredSrc, dst, fy, fx);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }

-        template <typename T> struct ResizeDispatcherStream<IntegerAreaFilter, T>
+    template <typename T>
+    void call_resize_cubic_tex(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx)
+    {
+        const dim3 block(32, 8);
+        const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+        if (srcWhole.data == src.data)
        {
-            static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
-            {
-                dim3 block(32, 8);
-                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-                BrdConstant<T> brd(src.rows, src.cols);
-                BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd);
-                IntegerAreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy);
-                resize_area<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
-                cudaSafeCall( cudaGetLastError() );
-                if (stream == 0)
-                    cudaSafeCall( cudaDeviceSynchronize() );
-            }
-        };
+            TextureAccessor<T> texSrc = texAccessor(src, 0, 0);
+            CubicFilter< TextureAccessor<T> > filteredSrc(texSrc);

-        template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream
+            resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
+        }
+        else
        {
-            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst)
-            {
-                (void)srcWhole;
-                (void)xoff;
-                (void)yoff;
+            TextureAccessor<T> texSrc = texAccessor(srcWhole, yoff, xoff);

-                dim3 block(32, 8);
-                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+            BrdReplicate<T> brd(src.rows, src.cols);
+            BorderReader<TextureAccessor<T>, BrdReplicate<T> > brdSrc(texSrc, brd);
+            CubicFilter< BorderReader<TextureAccessor<T>, BrdReplicate<T> > > filteredSrc(brdSrc);

-                BrdReplicate<T> brd(src.rows, src.cols);
-                BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
-                Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
+            resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
+        }

-                resize<<<grid, block>>>(filteredSrc, fx, fy, dst);
-                cudaSafeCall( cudaGetLastError() );
+        cudaSafeCall( cudaGetLastError() );

-                cudaSafeCall( cudaDeviceSynchronize() );
-            }
-        };
+        cudaSafeCall( cudaDeviceSynchronize() );
+    }

-        #define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \
-            texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
-            struct tex_resize_ ## type ## _reader \
-            { \
-                typedef type elem_type; \
-                typedef int index_type; \
-                const int xoff; \
-                const int yoff; \
-                __host__ tex_resize_ ## type ## _reader(int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
-                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
-                { \
-                    return tex2D(tex_resize_ ## type, x + xoff, y + yoff); \
-                } \
-            }; \
-            template <template <typename> class Filter> struct ResizeDispatcherNonStream<Filter, type > \
-            { \
-                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz< type > dst) \
-                { \
-                    dim3 block(32, 8); \
-                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-                    bindTexture(&tex_resize_ ## type, srcWhole); \
-                    tex_resize_ ## type ## _reader texSrc(xoff, yoff); \
-                    if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
-                    { \
-                        Filter<tex_resize_ ## type ## _reader> filteredSrc(texSrc); \
-                        resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
-                    } \
-                    else \
-                    { \
-                        BrdReplicate< type > brd(src.rows, src.cols); \
-                        BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > brdSrc(texSrc, brd); \
-                        Filter< BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > > filteredSrc(brdSrc); \
-                        resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
-                    } \
-                    cudaSafeCall( cudaGetLastError() ); \
-                    cudaSafeCall( cudaDeviceSynchronize() ); \
-                } \
-            };
-
-        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
-        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)
-
-        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)
-        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)
-
-        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)
-        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)
-
-        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)
-        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)
-
-        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)
-        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)
-
-        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)
-        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)
-
-        #undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX
-
-        template <template <typename> class Filter, typename T> struct ResizeDispatcher
+    // ResizeNearestDispatcher
+
+    template <typename T> struct ResizeNearestDispatcher
+    {
+        static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream)
        {
-            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
+            call_resize_nearest_glob(src, dst, fy, fx, stream);
+        }
+    };
+
+    template <typename T> struct SelectImplForNearest
+    {
+        static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream)
+        {
+            if (stream)
+                call_resize_nearest_glob(src, dst, fy, fx, stream);
+            else
            {
-                if (stream == 0)
-                    ResizeDispatcherNonStream<Filter, T>::call(src, srcWhole, xoff, yoff, fx, fy, dst);
+                if (fx > 1 || fy > 1)
+                    call_resize_nearest_glob(src, dst, fy, fx, 0);
                else
-                    ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);
+                    call_resize_nearest_tex(src, srcWhole, yoff, xoff, dst, fy, fx);
            }
-        };
+        }
+    };
+
+    template <> struct ResizeNearestDispatcher<uchar> : SelectImplForNearest<uchar> {};
+    template <> struct ResizeNearestDispatcher<uchar4> : SelectImplForNearest<uchar4> {};
+
+    template <> struct ResizeNearestDispatcher<ushort> : SelectImplForNearest<ushort> {};
+    template <> struct ResizeNearestDispatcher<ushort4> : SelectImplForNearest<ushort4> {};
+
+    template <> struct ResizeNearestDispatcher<short> : SelectImplForNearest<short> {};
+    template <> struct ResizeNearestDispatcher<short4> : SelectImplForNearest<short4> {};

-        template <typename T> struct ResizeDispatcher<AreaFilter, T>
+    template <> struct ResizeNearestDispatcher<float> : SelectImplForNearest<float> {};
+    template <> struct ResizeNearestDispatcher<float4> : SelectImplForNearest<float4> {};
+
+    // ResizeLinearDispatcher
+
+    template <typename T> struct ResizeLinearDispatcher
+    {
+        static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream)
        {
-            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
+            call_resize_linear_glob(src, dst, fy, fx, stream);
+        }
+    };
+
+    template <typename T> struct SelectImplForLinear
+    {
+        static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream)
+        {
+            if (stream)
+                call_resize_linear_glob(src, dst, fy, fx, stream);
+            else
            {
-                (void)srcWhole;
-                (void)xoff;
-                (void)yoff;
-                int iscale_x = (int)round(fx);
-                int iscale_y = (int)round(fy);
-
-                if( std::abs(fx - iscale_x) < FLT_MIN && std::abs(fy - iscale_y) < FLT_MIN)
-                    ResizeDispatcherStream<IntegerAreaFilter, T>::call(src, fx, fy, dst, stream);
+                if (fx > 1 || fy > 1)
+                    call_resize_linear_glob(src, dst, fy, fx, 0);
                else
-                    ResizeDispatcherStream<AreaFilter, T>::call(src, fx, fy, dst, stream);
+                    call_resize_linear_tex(src, srcWhole, yoff, xoff, dst, fy, fx);
            }
-        };
+        }
+    };

-        template <typename T> void resize_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy,
-            PtrStepSzb dst, int interpolation, cudaStream_t stream)
+    template <> struct ResizeLinearDispatcher<uchar> : SelectImplForLinear<uchar> {};
+    template <> struct ResizeLinearDispatcher<uchar4> : SelectImplForLinear<uchar4> {};
+
+    template <> struct ResizeLinearDispatcher<ushort> : SelectImplForLinear<ushort> {};
+    template <> struct ResizeLinearDispatcher<ushort4> : SelectImplForLinear<ushort4> {};
+
+    template <> struct ResizeLinearDispatcher<short> : SelectImplForLinear<short> {};
+    template <> struct ResizeLinearDispatcher<short4> : SelectImplForLinear<short4> {};
+
+    template <> struct ResizeLinearDispatcher<float> : SelectImplForLinear<float> {};
+    template <> struct ResizeLinearDispatcher<float4> : SelectImplForLinear<float4> {};
+
+    // ResizeCubicDispatcher
+
+    template <typename T> struct ResizeCubicDispatcher
+    {
+        static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream)
        {
-            typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream);
+            call_resize_cubic_glob(src, dst, fy, fx, stream);
+        }
+    };

-            static const caller_t callers[4] =
-            {
-                ResizeDispatcher<PointFilter, T>::call,
-                ResizeDispatcher<LinearFilter, T>::call,
-                ResizeDispatcher<CubicFilter, T>::call,
-                ResizeDispatcher<AreaFilter, T>::call
-            };
-            // chenge to linear if area interpolation upscaling
-            if (interpolation == 3 && (fx <= 1.f || fy <= 1.f))
-                interpolation = 1;
-
-            callers[interpolation](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, fx, fy,
-                static_cast< PtrStepSz<T> >(dst), stream);
+    template <typename T> struct SelectImplForCubic
+    {
+        static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream)
+        {
+            if (stream)
+                call_resize_cubic_glob(src, dst, fy, fx, stream);
+            else
+                call_resize_cubic_tex(src, srcWhole, yoff, xoff, dst, fy, fx);
        }
+    };

-        template void resize_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+    template <> struct ResizeCubicDispatcher<uchar> : SelectImplForCubic<uchar> {};
+    template <> struct ResizeCubicDispatcher<uchar4> : SelectImplForCubic<uchar4> {};

-        //template void resize_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+    template <> struct ResizeCubicDispatcher<ushort> : SelectImplForCubic<ushort> {};
+    template <> struct ResizeCubicDispatcher<ushort4> : SelectImplForCubic<ushort4> {};

-        template void resize_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+    template <> struct ResizeCubicDispatcher<short> : SelectImplForCubic<short> {};
+    template <> struct ResizeCubicDispatcher<short4> : SelectImplForCubic<short4> {};

-        template void resize_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+    template <> struct ResizeCubicDispatcher<float> : SelectImplForCubic<float> {};
+    template <> struct ResizeCubicDispatcher<float4> : SelectImplForCubic<float4> {};

-        //template void resize_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+    // ResizeAreaDispatcher

-        template void resize_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
+    template <typename T> struct ResizeAreaDispatcher // launches the resize_area kernel with an integer or a general area filter
+    {
+        static void call(const PtrStepSz<T>& src, const PtrStepSz<T>&, int, int, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream) // 2nd-4th parameters are unnamed and unused here
+        {
+            const int iscale_x = (int) round(fx); // nearest whole number to each scale factor
+            const int iscale_y = (int) round(fy);

-        template<typename T> struct scan_traits{};
+            const dim3 block(32, 8); // 32x8 threads per block
+            const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); // enough blocks to cover all of dst

-        template<> struct scan_traits<uchar>
+            if (std::abs(fx - iscale_x) < FLT_MIN && std::abs(fy - iscale_y) < FLT_MIN) // both factors are whole numbers (to within FLT_MIN)
+            {
+                BrdConstant<T> brd(src.rows, src.cols);
+                BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd);
+                IntegerAreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy);
+                // integer-factor case: src is read through BrdConstant and IntegerAreaFilter
+                resize_area<<<grid, block, 0, stream>>>(filteredSrc, dst);
+            }
+            else
+            {
+                BrdConstant<T> brd(src.rows, src.cols);
+                BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd);
+                AreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy);
+                // fractional-factor case: same reader chain, but with the general AreaFilter
+                resize_area<<<grid, block, 0, stream>>>(filteredSrc, dst);
+            }
+            // surface any error raised by the kernel launch
+            cudaSafeCall( cudaGetLastError() );
+            // on the default stream (0), block until the kernel has finished
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    };
+
+    // resize
+
+    template <typename T> void resize(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream)
+    {
+        typedef void (*func_t)(const PtrStepSz<T>& src, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx, cudaStream_t stream);
+        static const func_t funcs[4] = // indexed directly by `interpolation`; no range check is done here
        {
-            typedef float scan_line_type;
+            ResizeNearestDispatcher<T>::call, // 0: nearest
+            ResizeLinearDispatcher<T>::call, // 1: linear
+            ResizeCubicDispatcher<T>::call, // 2: cubic
+            ResizeAreaDispatcher<T>::call // 3: area
        };

-    } // namespace imgproc
-}}} // namespace cv { namespace gpu { namespace cudev
+        // area (3) is only used when shrinking: if either factor is <= 1 (treated as upscaling) fall back to linear (1)
+        if (interpolation == 3 && (fx <= 1.f || fy <= 1.f))
+            interpolation = 1;
+        // convert the byte-typed views to PtrStepSz<T> and run the selected dispatcher
+        funcs[interpolation](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), yoff, xoff, static_cast< PtrStepSz<T> >(dst), fy, fx, stream);
+    }
+    // explicit instantiations: 1-, 3- and 4-channel variants of uchar, ushort, short and float
+    template void resize<uchar >(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+    template void resize<uchar3>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+    template void resize<uchar4>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+
+    template void resize<ushort >(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+    template void resize<ushort3>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+    template void resize<ushort4>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+
+    template void resize<short >(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+    template void resize<short3>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+    template void resize<short4>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);

+    template void resize<float >(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+    template void resize<float3>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+    template void resize<float4>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+}}}

 #endif /* CUDA_DISABLER */
--- a/modules/gpuwarping/src/resize.cpp
+++ b/modules/gpuwarping/src/resize.cpp
@ -50,18 +50,25 @@ void cv::gpu::resize(InputArray, OutputArray, Size, double, double, int, Stream&

 namespace cv { namespace gpu { namespace cudev
 {
-    namespace imgproc
-    {
-        template <typename T>
-        void resize_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy,
-                        PtrStepSzb dst, int interpolation, cudaStream_t stream);
-    }
+    template <typename T>
+    void resize(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
 }}}

-void cv::gpu::resize(InputArray _src, OutputArray _dst, Size dsize, double fx, double fy, int interpolation, Stream& _stream)
+void cv::gpu::resize(InputArray _src, OutputArray _dst, Size dsize, double fx, double fy, int interpolation, Stream& stream)
 {
    GpuMat src = _src.getGpuMat();

+    typedef void (*func_t)(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+    static const func_t funcs[6][4] = // looked up as [depth][channels - 1]; a 0 entry marks a type with no instantiation
+    {
+        {cudev::resize<uchar>      , 0 /*cudev::resize<uchar2>*/ , cudev::resize<uchar3>     , cudev::resize<uchar4>     },
+        {0 /*cudev::resize<schar>*/, 0 /*cudev::resize<char2>*/  , 0 /*cudev::resize<char3>*/, 0 /*cudev::resize<char4>*/},
+        {cudev::resize<ushort>     , 0 /*cudev::resize<ushort2>*/, cudev::resize<ushort3>    , cudev::resize<ushort4>    },
+        {cudev::resize<short>      , 0 /*cudev::resize<short2>*/ , cudev::resize<short3>     , cudev::resize<short4>     },
+        {0 /*cudev::resize<int>*/  , 0 /*cudev::resize<int2>*/   , 0 /*cudev::resize<int3>*/ , 0 /*cudev::resize<int4>*/ },
+        {cudev::resize<float>      , 0 /*cudev::resize<float2>*/ , cudev::resize<float3>     , cudev::resize<float4>     }
+    };
+
    CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
    CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_AREA );
    CV_Assert( !(dsize == Size()) || (fx > 0 && fy > 0) );
@ -81,72 +88,21 @@ void cv::gpu::resize(InputArray _src, OutputArray _dst, Size dsize, double fx, d

    if (dsize == src.size())
    {
-        src.copyTo(dst, _stream);
+        src.copyTo(dst, stream);
        return;
    }

-    cudaStream_t stream = StreamAccessor::getStream(_stream);
+    const func_t func = funcs[src.depth()][src.channels() - 1]; // null for depth/channel combinations the table leaves empty
+
+    if (!func) // unsupported type: raise an OpenCV error instead of calling through a null pointer
+        CV_Error(Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");

    Size wholeSize;
    Point ofs;
    src.locateROI(wholeSize, ofs);
+    PtrStepSzb wholeSrc(wholeSize.height, wholeSize.width, src.datastart, src.step); // view built from the size reported by locateROI and src.datastart

-    bool useNpp = (src.type() == CV_8UC1 || src.type() == CV_8UC4);
-    useNpp = useNpp && (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR);
-
-    if (useNpp)
-    {
-        typedef NppStatus (*func_t)(const Npp8u * pSrc, NppiSize oSrcSize, int nSrcStep, NppiRect oSrcROI, Npp8u * pDst, int nDstStep, NppiSize dstROISize,
-                                    double xFactor, double yFactor, int eInterpolation);
-
-        const func_t funcs[4] = { nppiResize_8u_C1R, 0, 0, nppiResize_8u_C4R };
-
-        static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC, 0, NPPI_INTER_LANCZOS};
-
-        NppiSize srcsz;
-        srcsz.width  = wholeSize.width;
-        srcsz.height = wholeSize.height;
-
-        NppiRect srcrect;
-        srcrect.x = ofs.x;
-        srcrect.y = ofs.y;
-        srcrect.width  = src.cols;
-        srcrect.height = src.rows;
-
-        NppiSize dstsz;
-        dstsz.width  = dst.cols;
-        dstsz.height = dst.rows;
-
-        NppStreamHandler h(stream);
-
-        nppSafeCall( funcs[src.channels() - 1](src.datastart, srcsz, static_cast<int>(src.step), srcrect,
-                dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, fx, fy, npp_inter[interpolation]) );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-    else
-    {
-        using namespace ::cv::gpu::cudev::imgproc;
-
-        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-
-        static const func_t funcs[6][4] =
-        {
-            {resize_gpu<uchar>      , 0 /*resize_gpu<uchar2>*/ , resize_gpu<uchar3>     , resize_gpu<uchar4>     },
-            {0 /*resize_gpu<schar>*/, 0 /*resize_gpu<char2>*/  , 0 /*resize_gpu<char3>*/, 0 /*resize_gpu<char4>*/},
-            {resize_gpu<ushort>     , 0 /*resize_gpu<ushort2>*/, resize_gpu<ushort3>    , resize_gpu<ushort4>    },
-            {resize_gpu<short>      , 0 /*resize_gpu<short2>*/ , resize_gpu<short3>     , resize_gpu<short4>     },
-            {0 /*resize_gpu<int>*/  , 0 /*resize_gpu<int2>*/   , 0 /*resize_gpu<int3>*/ , 0 /*resize_gpu<int4>*/ },
-            {resize_gpu<float>      , 0 /*resize_gpu<float2>*/ , resize_gpu<float3>     , resize_gpu<float4>     }
-        };
-
-        const func_t func = funcs[src.depth()][src.channels() - 1];
-        CV_Assert(func != 0);
-
-        func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y,
-            static_cast<float>(1.0 / fx), static_cast<float>(1.0 / fy), dst, interpolation, stream);
-    }
+    func(src, wholeSrc, ofs.y, ofs.x, dst, static_cast<float>(1.0 / fy), static_cast<float>(1.0 / fx), interpolation, StreamAccessor::getStream(stream)); // y comes before x; scale factors are passed inverted (1 / f)
 }

 #endif // HAVE_CUDA
--- a/modules/gpuwarping/test/test_resize.cpp
+++ b/modules/gpuwarping/test/test_resize.cpp
@ -155,7 +155,7 @@ GPU_TEST_P(Resize, Accuracy)
 INSTANTIATE_TEST_CASE_P(GPU_Warping, Resize, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC3), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
    testing::Values(0.3, 0.5, 1.5, 2.0),
    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
    WHOLE_SUBMAT));
@ -201,50 +201,9 @@ GPU_TEST_P(ResizeSameAsHost, Accuracy)
 INSTANTIATE_TEST_CASE_P(GPU_Warping, ResizeSameAsHost, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC3), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
    testing::Values(0.3, 0.5),
-    testing::Values(Interpolation(cv::INTER_AREA), Interpolation(cv::INTER_NEAREST)),  //, Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)
+    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_AREA)),
    WHOLE_SUBMAT));

-///////////////////////////////////////////////////////////////////
-// Test NPP
-
-PARAM_TEST_CASE(ResizeNPP, cv::gpu::DeviceInfo, MatType, double, Interpolation)
-{
-    cv::gpu::DeviceInfo devInfo;
-    double coeff;
-    int interpolation;
-    int type;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        type = GET_PARAM(1);
-        coeff = GET_PARAM(2);
-        interpolation = GET_PARAM(3);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(ResizeNPP, Accuracy)
-{
-    cv::Mat src = readImageType("stereobp/aloe-L.png", type);
-    ASSERT_FALSE(src.empty());
-
-    cv::gpu::GpuMat dst;
-    cv::gpu::resize(loadMat(src), dst, cv::Size(), coeff, coeff, interpolation);
-
-    cv::Mat dst_gold;
-    resizeGold(src, dst_gold, coeff, coeff, interpolation);
-
-    EXPECT_MAT_SIMILAR(dst_gold, dst, 1e-1);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Warping, ResizeNPP, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)),
-    testing::Values(0.3, 0.5, 1.5, 2.0),
-    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR))));
-
 #endif // HAVE_CUDA
--- a/modules/highgui/src/grfmt_png.cpp
+++ b/modules/highgui/src/grfmt_png.cpp
@ -171,7 +171,9 @@ bool  PngDecoder::readHeader()
                if( !m_buf.empty() || m_f )
                {
                    png_uint_32 wdth, hght;
-                    int bit_depth, color_type;
+                    int bit_depth, color_type, num_trans=0; // num_trans starts at 0 so the "> 0" test below is well defined
+                    png_bytep trans; // output arguments for png_get_tRNS; only num_trans is inspected afterwards
+                    png_color_16p trans_values;

                    png_read_info( png_ptr, info_ptr );

@ -187,15 +189,22 @@ bool  PngDecoder::readHeader()
                    {
                        switch(color_type)
                        {
-                           case PNG_COLOR_TYPE_RGB:
-                           case PNG_COLOR_TYPE_PALETTE:
-                               m_type = CV_8UC3;
-                               break;
-                          case PNG_COLOR_TYPE_RGB_ALPHA:
-                               m_type = CV_8UC4;
-                               break;
-                          default:
-                               m_type = CV_8UC1;
+                            case PNG_COLOR_TYPE_RGB:
+                                m_type = CV_8UC3;
+                                break;
+                            case PNG_COLOR_TYPE_PALETTE:
+                                png_get_tRNS( png_ptr, info_ptr, &trans, &num_trans, &trans_values);
+                                // A palette with transparency entries (num_trans > 0) is reported as 4-channel, otherwise as 3-channel
+                                if ( num_trans > 0 )
+                                    m_type = CV_8UC4;
+                                else
+                                    m_type = CV_8UC3;
+                                break;
+                            case PNG_COLOR_TYPE_RGB_ALPHA:
+                                m_type = CV_8UC4;
+                                break;
+                            default: // every other colour type is reported as single-channel
+                                m_type = CV_8UC1;
                        }
                        if( bit_depth == 16 )
                            m_type = CV_MAKETYPE(CV_16U, CV_MAT_CN(m_type));
--- a/modules/highgui/test/test_grfmt.cpp
+++ b/modules/highgui/test/test_grfmt.cpp
@ -284,6 +284,98 @@ TEST(Highgui_ImreadVSCvtColor, regression)
    EXPECT_LT(actual_avg_diff, MAX_MEAN_DIFF);
    EXPECT_LT(actual_maxval, MAX_ABS_DIFF);
 }
+
+// Regression test for OpenCV issue 3075: channel count and pixel values of palette PNGs with and without alpha
+class CV_GrfmtReadPNGColorPaletteWithAlphaTest : public cvtest::BaseTest
+{
+public:
+    void run(int)
+    {
+        try
+        {
+            // First test: palette PNG with alpha, imread flag -1 -> 4 channels expected
+            Mat img = imread(string(ts->get_data_path()) + "readwrite/color_palette_alpha.png",-1);
+            if (img.empty()) { ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_TEST_DATA); return; } // nothing to inspect without the image
+
+            ASSERT_TRUE(img.channels() == 4);
+
+            unsigned char* img_data = (unsigned char*)img.data;
+
+            // First pixel must be opaque red in BGRA order
+            ASSERT_TRUE(img_data[0] == 0x00);
+            ASSERT_TRUE(img_data[1] == 0x00);
+            ASSERT_TRUE(img_data[2] == 0xFF);
+            ASSERT_TRUE(img_data[3] == 0xFF);
+
+            // Second pixel must be opaque red in BGRA order
+            ASSERT_TRUE(img_data[4] == 0x00);
+            ASSERT_TRUE(img_data[5] == 0x00);
+            ASSERT_TRUE(img_data[6] == 0xFF);
+            ASSERT_TRUE(img_data[7] == 0xFF);
+
+            // Second test: palette PNG without alpha, imread flag -1 -> 3 channels expected
+            img = imread(string(ts->get_data_path()) + "readwrite/color_palette_no_alpha.png",-1);
+            if (img.empty()) { ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_TEST_DATA); return; } // nothing to inspect without the image
+
+            ASSERT_TRUE(img.channels() == 3);
+
+            img_data = (unsigned char*)img.data;
+
+            // First pixel must be red in BGR order
+            ASSERT_TRUE(img_data[0] == 0x00);
+            ASSERT_TRUE(img_data[1] == 0x00);
+            ASSERT_TRUE(img_data[2] == 0xFF);
+
+            // Second pixel must be red in BGR order
+            ASSERT_TRUE(img_data[3] == 0x00);
+            ASSERT_TRUE(img_data[4] == 0x00);
+            ASSERT_TRUE(img_data[5] == 0xFF);
+
+            // Third test: palette PNG with alpha, imread flag 1 -> 3 channels expected
+            img = imread(string(ts->get_data_path()) + "readwrite/color_palette_alpha.png",1);
+            if (img.empty()) { ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_TEST_DATA); return; } // nothing to inspect without the image
+
+            ASSERT_TRUE(img.channels() == 3);
+
+            img_data = (unsigned char*)img.data;
+
+            // First pixel must be red in BGR order
+            ASSERT_TRUE(img_data[0] == 0x00);
+            ASSERT_TRUE(img_data[1] == 0x00);
+            ASSERT_TRUE(img_data[2] == 0xFF);
+
+            // Second pixel must be red in BGR order
+            ASSERT_TRUE(img_data[3] == 0x00);
+            ASSERT_TRUE(img_data[4] == 0x00);
+            ASSERT_TRUE(img_data[5] == 0xFF);
+
+            // Fourth test: palette PNG without alpha, imread flag 1 -> 3 channels expected
+            img = imread(string(ts->get_data_path()) + "readwrite/color_palette_no_alpha.png",1);
+            if (img.empty()) { ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_TEST_DATA); return; } // nothing to inspect without the image
+
+            ASSERT_TRUE(img.channels() == 3);
+
+            img_data = (unsigned char*)img.data;
+
+            // First pixel must be red in BGR order
+            ASSERT_TRUE(img_data[0] == 0x00);
+            ASSERT_TRUE(img_data[1] == 0x00);
+            ASSERT_TRUE(img_data[2] == 0xFF);
+
+            // Second pixel must be red in BGR order
+            ASSERT_TRUE(img_data[3] == 0x00);
+            ASSERT_TRUE(img_data[4] == 0x00);
+            ASSERT_TRUE(img_data[5] == 0xFF);
+        }
+        catch(...)
+        {
+            ts->set_failed_test_info(cvtest::TS::FAIL_EXCEPTION);
+        }
+        ts->set_failed_test_info(cvtest::TS::OK);
+    }
+};
+
+TEST(Highgui_Image, read_png_color_palette_with_alpha) { CV_GrfmtReadPNGColorPaletteWithAlphaTest test; test.safe_run(); }
 #endif

 #ifdef HAVE_JPEG
--- a/modules/ocl/perf/main.cpp
+++ b/modules/ocl/perf/main.cpp
@ -42,131 +42,54 @@

 #include "perf_precomp.hpp"

-static int cvErrorCallback(int /*status*/, const char * /*func_name*/,
-    const char *err_msg, const char * /*file_name*/,
-    int /*line*/, void * /*userdata*/)
+const char * impls[] =
 {
-    TestSystem::instance().printError(err_msg);
-    return 0;
-}
-
-int main(int argc, const char *argv[])
+    IMPL_OCL,
+    IMPL_PLAIN,
+#ifdef HAVE_OPENCV_GPU
+    IMPL_GPU
+#endif
+};
+
+int main(int argc, char ** argv)
 {
-    const char *keys =
-        "{ h help    | false | print help message }"
-        "{ f filter  |       | filter for test }"
-        "{ w workdir |       | set working directory }"
-        "{ l list    | false | show all tests }"
-        "{ d device  | 0     | device id }"
-        "{ c cpu_ocl | false | use cpu as ocl device}"
-        "{ i iters   | 10    | iteration count }"
-        "{ m warmup  | 1     | gpu warm up iteration count}"
-        "{ t xtop    | 1.1	 | xfactor top boundary}"
-        "{ b xbottom | 0.9	 | xfactor bottom boundary}"
-        "{ v verify  | false | only run gpu once to verify if problems occur}";
+    const char * keys =
+        "{ h help     | false              | print help message }"
+        "{ t type     | gpu                | set device type:cpu or gpu}"
+        "{ p platform | 0                  | set platform id }"
+        "{ d device   | 0                  | set device id }";

-    redirectError(cvErrorCallback);
    CommandLineParser cmd(argc, argv, keys);
    if (cmd.has("help"))
    {
-        cout << "Avaible options:" << endl;
+        cout << "Available options besides google test option:" << endl;
        cmd.printMessage();
        return 0;
    }

-    // get ocl devices
-    bool use_cpu = cmd.get<bool>("c");
-    vector<ocl::Info> oclinfo;
-    int num_devices = 0;
-    if(use_cpu)
-        num_devices = getDevice(oclinfo, ocl::CVCL_DEVICE_TYPE_CPU);
-    else
-        num_devices = getDevice(oclinfo);
-    if (num_devices < 1)
-    {
-        cerr << "no device found\n";
-        return -1;
-    }
-
-    // show device info
-    int devidx = 0;
-    for (size_t i = 0; i < oclinfo.size(); i++)
-    {
-        for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++)
-        {
-            cout << "device " << devidx++ << ": " << oclinfo[i].DeviceName[j] << endl;
-        }
-    }
-
+    string type = cmd.get<string>("type");
+    unsigned int pid = cmd.get<unsigned int>("platform");
    int device = cmd.get<int>("device");
-    if (device < 0 || device >= num_devices)
-    {
-        cerr << "Invalid device ID" << endl;
-        return -1;
-    }
-
-    // set this to overwrite binary cache every time the test starts
-    ocl::setBinaryDiskCache(ocl::CACHE_UPDATE);
-
-    if (cmd.get<bool>("verify"))
-    {
-        TestSystem::instance().setNumIters(1);
-        TestSystem::instance().setGPUWarmupIters(0);
-        TestSystem::instance().setCPUIters(0);
-    }
-
-    devidx = 0;
-    for (size_t i = 0; i < oclinfo.size(); i++)
-    {
-        for (size_t j = 0; j < oclinfo[i].DeviceName.size(); j++, devidx++)
-        {
-            if (device == devidx)
-            {
-                ocl::setDevice(oclinfo[i], (int)j);
-                TestSystem::instance().setRecordName(oclinfo[i].DeviceName[j]);
-                cout << "use " << devidx << ": " <<oclinfo[i].DeviceName[j] << endl;
-                goto END_DEV;
-            }
-        }
-    }
-
-END_DEV:
-
-    string filter = cmd.get<string>("filter");
-    string workdir = cmd.get<string>("workdir");
-    bool list = cmd.has("list");
-    int iters = cmd.get<int>("iters");
-    int wu_iters = cmd.get<int>("warmup");
-    double x_top = cmd.get<double>("xtop");
-    double x_bottom = cmd.get<double>("xbottom");

-    TestSystem::instance().setTopThreshold(x_top);
-    TestSystem::instance().setBottomThreshold(x_bottom);
+    int flag = type == "cpu" ? cv::ocl::CVCL_DEVICE_TYPE_CPU :
+                               cv::ocl::CVCL_DEVICE_TYPE_GPU;

-    if (!filter.empty())
+    std::vector<cv::ocl::Info> oclinfo; // one entry per platform, filled by getDevice
+    int devnums = cv::ocl::getDevice(oclinfo, flag);
+    if (devnums < 1 || pid >= oclinfo.size()) // validate the platform first: oclinfo[pid] is dereferenced below
     {
-        TestSystem::instance().setTestFilter(filter);
-    }
-
-    if (!workdir.empty())
-    {
-        if (workdir[workdir.size() - 1] != '/' && workdir[workdir.size() - 1] != '\\')
-        {
-            workdir += '/';
-        }
-
-        TestSystem::instance().setWorkingDir(workdir);
+        std::cout << "platform invalid\n";
+        return -1;
     }
 
-    if (list)
+    if (device < 0 || device >= (int)oclinfo[pid].DeviceName.size()) // device is an index within the chosen platform, not across all platforms
     {
-        TestSystem::instance().setListMode(true);
+        std::cout << "device invalid\n";
+        return -1;
     }
 
-    TestSystem::instance().setNumIters(iters);
-    TestSystem::instance().setGPUWarmupIters(wu_iters);
-
-    TestSystem::instance().run();
+    cv::ocl::setDevice(oclinfo[pid], device); // both indices were range-checked above
+    cv::ocl::setBinaryDiskCache(cv::ocl::CACHE_UPDATE);

-    return 0;
+    CV_PERF_TEST_MAIN_INTERNALS(ocl, impls)
 }
--- a/modules/ocl/perf/perf_arithm.cpp
+++ b/modules/ocl/perf/perf_arithm.cpp
--- a/modules/ocl/perf/perf_blend.cpp
+++ b/modules/ocl/perf/perf_blend.cpp
@ -45,9 +45,15 @@
 //M*/

 #include "perf_precomp.hpp"
+
+using namespace perf;
+
 ///////////// blend ////////////////////////
+
 template <typename T>
-void blendLinearGold(const cv::Mat &img1, const cv::Mat &img2, const cv::Mat &weights1, const cv::Mat &weights2, cv::Mat &result_gold)
+static void blendLinearGold(const cv::Mat &img1, const cv::Mat &img2, // CPU reference: per-element weighted average of img1 and img2
+                            const cv::Mat &weights1, const cv::Mat &weights2, // weights are read as float rows, one value per pixel
+                            cv::Mat &result_gold) // (re)allocated with img1's size and type
 {
    result_gold.create(img1.size(), img1.type());

@ -63,60 +69,46 @@ void blendLinearGold(const cv::Mat &img1, const cv::Mat &img2, const cv::Mat &we

        for (int x = 0; x < img1.cols * cn; ++x)
        {
-            float w1 = weights1_row[x / cn];
-            float w2 = weights2_row[x / cn];
-            result_gold_row[x] = static_cast<T>((img1_row[x] * w1 + img2_row[x] * w2) / (w1 + w2 + 1e-5f));
+            const int px = x / cn; // x walks every channel element; the weights hold one value per pixel
+            const float w1 = weights1_row[px];
+            const float w2 = weights2_row[px];
+            result_gold_row[x] = static_cast<T>((img1_row[x] * w1 // 1e-5f keeps the division defined when both weights are 0
+                                                 + img2_row[x] * w2) / (w1 + w2 + 1e-5f));
        }
    }
 }
-PERFTEST(blend)
+
+typedef TestBaseWithParam<Size> blendLinearFixture; // fixture parameterised by the matrix size
+
+PERF_TEST_P(blendLinearFixture, blendLinear, OCL_TYPICAL_MAT_SIZES)
 {
-    Mat src1, src2, weights1, weights2, dst, ocl_dst;
-    ocl::oclMat d_src1, d_src2, d_weights1, d_weights2, d_dst;
+    const Size srcSize = GetParam();
+    const int type = CV_8UC1; // only single-channel 8-bit images are benchmarked
+
+    Mat src1(srcSize, type), src2(srcSize, CV_8UC1), dst;
+    Mat weights1(srcSize, CV_32FC1), weights2(srcSize, CV_32FC1);

-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
+    declare.in(src1, src2, WARMUP_RNG); // registers the inputs; WARMUP_RNG presumably randomises them - confirm against the perf framework
+    randu(weights1, 0.0f, 1.0f); // weights are drawn from the range [0, 1)
+    randu(weights2, 0.0f, 1.0f);

-    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    if (RUN_OCL_IMPL) // OpenCL path
    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] << " and CV_32FC1";
-
-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(src2, size, size, all_type[j], 0, 256);
-            gen(weights1, size, size, CV_32FC1, 0, 1);
-            gen(weights2, size, size, CV_32FC1, 0, 1);
-
-            blendLinearGold<uchar>(src1, src2, weights1, weights2, dst);
-
-            CPU_ON;
-            blendLinearGold<uchar>(src1, src2, weights1, weights2, dst);
-            CPU_OFF;
-
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            d_weights1.upload(weights1);
-            d_weights2.upload(weights2);
-
-            WARMUP_ON;
-            ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            d_src2.upload(src2);
-            d_weights1.upload(weights1);
-            d_weights2.upload(weights2);
-            ocl::blendLinear(d_src1, d_src2, d_weights1, d_weights2, d_dst);
-            d_dst.download(ocl_dst);
-            GPU_FULL_OFF;
-
-            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 1.f);
-        }
+        ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst;
+        ocl::oclMat oclWeights1(weights1), oclWeights2(weights2);
+        // only the blendLinear call sits inside TEST_CYCLE; oclMat construction and the download are outside it
+        TEST_CYCLE() cv::ocl::blendLinear(oclSrc1, oclSrc2, oclWeights1, oclWeights2, oclDst);
+
+        oclDst.download(dst);
+
+        SANITY_CHECK(dst);
+    }
+    else if (RUN_PLAIN_IMPL) // CPU reference path
+    {
+        TEST_CYCLE() blendLinearGold<uchar>(src1, src2, weights1, weights2, dst);
+
+        SANITY_CHECK(dst);
    }
+    else
+        OCL_PERF_ELSE
 }
--- a/modules/ocl/perf/perf_brute_force_matcher.cpp
+++ b/modules/ocl/perf/perf_brute_force_matcher.cpp
@ -45,123 +45,119 @@
 //M*/
 #include "perf_precomp.hpp"

-//////////////////// BruteForceMatch /////////////////
-PERFTEST(BruteForceMatcher)
-{
-    Mat trainIdx_cpu;
-    Mat distance_cpu;
-    Mat allDist_cpu;
-    Mat nMatches_cpu;
-
-    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
-    {
-        // Init CPU matcher
-        int desc_len = 64;
-
-        BFMatcher matcher(NORM_L2);
-
-        Mat query;
-        gen(query, size, desc_len, CV_32F, 0, 1);
-
-        Mat train;
-        gen(train, size, desc_len, CV_32F, 0, 1);
-        // Output
-        vector< vector<DMatch> > matches(2);
-        vector< vector<DMatch> > d_matches(2);
-        // Init GPU matcher
-        ocl::BruteForceMatcher_OCL_base d_matcher(ocl::BruteForceMatcher_OCL_base::L2Dist);
+using namespace perf;

-        ocl::oclMat d_query(query);
-        ocl::oclMat d_train(train);
+#define OCL_BFMATCHER_TYPICAL_MAT_SIZES ::testing::Values(cv::Size(128, 500), cv::Size(128, 1000), cv::Size(128, 2000))

-        ocl::oclMat d_trainIdx, d_distance, d_allDist, d_nMatches;
-
-        SUBTEST << size << "; match";
-
-        matcher.match(query, train, matches[0]);
+//////////////////// BruteForceMatch /////////////////

-        CPU_ON;
-        matcher.match(query, train, matches[0]);
-        CPU_OFF;
+typedef TestBaseWithParam<Size> BruteForceMatcherFixture;

-        WARMUP_ON;
-        d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
-        WARMUP_OFF;
+PERF_TEST_P(BruteForceMatcherFixture, DISABLED_match,
+            OCL_BFMATCHER_TYPICAL_MAT_SIZES) // TODO too big difference between implementations
+{
+    const Size srcSize = GetParam();

-        GPU_ON;
-        d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
-        GPU_OFF;
+    vector<DMatch> matches;
+    Mat query(srcSize, CV_32F), train(srcSize, CV_32F);
+    declare.in(query, train).time(srcSize.height == 2000 ? 8 : 4 );
+    randu(query, 0.0f, 1.0f);
+    randu(train, 0.0f, 1.0f);

-        GPU_FULL_ON;
-        d_query.upload(query);
-        d_train.upload(train);
-        d_matcher.match(d_query, d_train, d_matches[0]);
-        GPU_FULL_OFF;
+    if (RUN_PLAIN_IMPL)
+    {
+        BFMatcher matcher(NORM_L2);
+        TEST_CYCLE() matcher.match(query, train, matches);

-        int diff = abs((int)d_matches[0].size() - (int)matches[0].size());
-        if(diff == 0)
-            TestSystem::instance().setAccurate(1, 0);
-        else
-            TestSystem::instance().setAccurate(0, diff);
+        SANITY_CHECK_MATCHES(matches);
+    }
+    else if (RUN_OCL_IMPL)
+    {
+        ocl::BruteForceMatcher_OCL_base oclMatcher(ocl::BruteForceMatcher_OCL_base::L2Dist);
+        ocl::oclMat oclQuery(query), oclTrain(train);

-        SUBTEST << size << "; knnMatch";
+        TEST_CYCLE() oclMatcher.match(oclQuery, oclTrain, matches);

-        matcher.knnMatch(query, train, matches, 2);
+        SANITY_CHECK_MATCHES(matches);
+    }
+    else
+        OCL_PERF_ELSE
+}

-        CPU_ON;
-        matcher.knnMatch(query, train, matches, 2);
-        CPU_OFF;
+PERF_TEST_P(BruteForceMatcherFixture, DISABLED_knnMatch,
+            OCL_BFMATCHER_TYPICAL_MAT_SIZES)  // TODO too many outliers
+{
+    const Size srcSize = GetParam();

-        WARMUP_ON;
-        d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
-        WARMUP_OFF;
+    vector<vector<DMatch> > matches(2);
+    Mat query(srcSize, CV_32F), train(srcSize, CV_32F);
+    randu(query, 0.0f, 1.0f);
+    randu(train, 0.0f, 1.0f);

-        GPU_ON;
-        d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
-        GPU_OFF;
+    declare.in(query, train);
+    if (srcSize.height == 2000)
+        declare.time(8);

-        GPU_FULL_ON;
-        d_query.upload(query);
-        d_train.upload(train);
-        d_matcher.knnMatch(d_query, d_train, d_matches, 2);
-        GPU_FULL_OFF;
+    if (RUN_PLAIN_IMPL)
+    {
+        BFMatcher matcher (NORM_L2);
+        TEST_CYCLE() matcher.knnMatch(query, train, matches, 2);

-        diff = abs((int)d_matches[0].size() - (int)matches[0].size());
-        if(diff == 0)
-            TestSystem::instance().setAccurate(1, 0);
-        else
-            TestSystem::instance().setAccurate(0, diff);
+        std::vector<DMatch> & matches0 = matches[0], & matches1 = matches[1];
+        SANITY_CHECK_MATCHES(matches0);
+        SANITY_CHECK_MATCHES(matches1);
+    }
+    else if (RUN_OCL_IMPL)
+    {
+        ocl::BruteForceMatcher_OCL_base oclMatcher(ocl::BruteForceMatcher_OCL_base::L2Dist);
+        ocl::oclMat oclQuery(query), oclTrain(train);

-        SUBTEST << size << "; radiusMatch";
+        TEST_CYCLE() oclMatcher.knnMatch(oclQuery, oclTrain, matches, 2);

-        float max_distance = 2.0f;
+        std::vector<DMatch> & matches0 = matches[0], & matches1 = matches[1];
+        SANITY_CHECK_MATCHES(matches0);
+        SANITY_CHECK_MATCHES(matches1);
+    }
+    else
+        OCL_PERF_ELSE
+}

-        matcher.radiusMatch(query, train, matches, max_distance);
+PERF_TEST_P(BruteForceMatcherFixture, DISABLED_radiusMatch,
+            OCL_BFMATCHER_TYPICAL_MAT_SIZES) // TODO too many outliers
+{
+    const Size srcSize = GetParam();

-        CPU_ON;
-        matcher.radiusMatch(query, train, matches, max_distance);
-        CPU_OFF;
+    const float max_distance = 2.0f;
+    vector<vector<DMatch> > matches(2);
+    Mat query(srcSize, CV_32F), train(srcSize, CV_32F);
+    declare.in(query, train);
+    Mat trainIdx, distance, allDist;

-        d_trainIdx.release();
+    randu(query, 0.0f, 1.0f);
+    randu(train, 0.0f, 1.0f);

-        WARMUP_ON;
-        d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
-        WARMUP_OFF;
+    if (RUN_PLAIN_IMPL)
+    {
+        BFMatcher matcher (NORM_L2);
+        TEST_CYCLE() matcher.radiusMatch(query, train, matches, max_distance); // pass 'train' explicitly, as the OCL branch does

-        GPU_ON;
-        d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
-        GPU_OFF;
+        std::vector<DMatch> & matches0 = matches[0], & matches1 = matches[1];
+        SANITY_CHECK_MATCHES(matches0);
+        SANITY_CHECK_MATCHES(matches1);
+    }
+    else if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclQuery(query), oclTrain(train);
+        ocl::BruteForceMatcher_OCL_base oclMatcher(ocl::BruteForceMatcher_OCL_base::L2Dist);

-        GPU_FULL_ON;
-        d_query.upload(query);
-        d_train.upload(train);
-        d_matcher.radiusMatch(d_query, d_train, d_matches, max_distance);
-        GPU_FULL_OFF;
+        TEST_CYCLE() oclMatcher.radiusMatch(oclQuery, oclTrain, matches, max_distance);

-        diff = abs((int)d_matches[0].size() - (int)matches[0].size());
-        if(diff == 0)
-            TestSystem::instance().setAccurate(1, 0);
-        else
-            TestSystem::instance().setAccurate(0, diff);
+        std::vector<DMatch> & matches0 = matches[0], & matches1 = matches[1];
+        SANITY_CHECK_MATCHES(matches0);
+        SANITY_CHECK_MATCHES(matches1);
     }
+    else
+        OCL_PERF_ELSE
 }
+
+#undef OCL_BFMATCHER_TYPICAL_MAT_SIZES
--- a/modules/ocl/perf/perf_calib3d.cpp
+++ b/modules/ocl/perf/perf_calib3d.cpp
@ -45,48 +45,44 @@
 //M*/

 #include "perf_precomp.hpp"
+
 ///////////// StereoMatchBM ////////////////////////
-PERFTEST(StereoMatchBM)
-{
-    Mat left_image = imread(abspath("aloeL.jpg"), cv::IMREAD_GRAYSCALE);
-    Mat right_image = imread(abspath("aloeR.jpg"), cv::IMREAD_GRAYSCALE);
-    Mat disp,dst;
-    ocl::oclMat d_left, d_right,d_disp;
-    int n_disp= 128;
-    int winSize =19;

-    SUBTEST << left_image.cols << 'x' << left_image.rows << "; aloeL.jpg ;"<< right_image.cols << 'x' << right_image.rows << "; aloeR.jpg ";
+PERF_TEST(StereoMatchBMFixture, DISABLED_StereoMatchBM) // TODO doesn't work properly
+{
+    Mat left_image = imread(getDataPath("gpu/stereobm/aloe-L.png"), cv::IMREAD_GRAYSCALE);
+    Mat right_image = imread(getDataPath("gpu/stereobm/aloe-R.png"), cv::IMREAD_GRAYSCALE);

-    Ptr<StereoBM> bm = createStereoBM(n_disp, winSize);
-    bm->compute(left_image, right_image, dst);
+    ASSERT_TRUE(!left_image.empty()) << "no input image";
+    ASSERT_TRUE(!right_image.empty()) << "no input image";
+    ASSERT_TRUE(right_image.size() == left_image.size());
+    ASSERT_TRUE(right_image.type() == left_image.type());

-    CPU_ON;
-    bm->compute(left_image, right_image, dst);
-    CPU_OFF;
+    const int n_disp = 128, winSize = 19;
+    Mat disp(left_image.size(), CV_16SC1);

-    d_left.upload(left_image);
-    d_right.upload(right_image);
+    declare.in(left_image, right_image).out(disp);

-    ocl::StereoBM_OCL d_bm(0, n_disp, winSize);
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclLeft(left_image), oclRight(right_image),
+                oclDisp(left_image.size(), CV_16SC1);
+        ocl::StereoBM_OCL oclBM(0, n_disp, winSize);

-    WARMUP_ON;
-    d_bm(d_left, d_right, d_disp);
-    WARMUP_OFF;
+        TEST_CYCLE() oclBM(oclLeft, oclRight, oclDisp);

-    cv::Mat ocl_mat;
-    d_disp.download(ocl_mat);
-    ocl_mat.convertTo(ocl_mat, dst.type());
+        oclDisp.download(disp);

-    GPU_ON;
-    d_bm(d_left, d_right, d_disp);
-    GPU_OFF;
+        SANITY_CHECK(disp);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        Ptr<StereoBM> bm = createStereoBM(n_disp, winSize);

-    GPU_FULL_ON;
-    d_left.upload(left_image);
-    d_right.upload(right_image);
-    d_bm(d_left, d_right, d_disp);
-    d_disp.download(disp);
-    GPU_FULL_OFF;
+        TEST_CYCLE() bm->compute(left_image, right_image, disp);

-    TestSystem::instance().setAccurate(-1, 0.);
+        SANITY_CHECK(disp);
+    }
+    else
+        OCL_PERF_ELSE
 }
--- a/modules/ocl/perf/perf_canny.cpp
+++ b/modules/ocl/perf/perf_canny.cpp
@ -45,41 +45,33 @@
 //M*/
 #include "perf_precomp.hpp"

-///////////// Canny ////////////////////////
-PERFTEST(Canny)
-{
-    Mat img = imread(abspath("aloeL.jpg"), IMREAD_GRAYSCALE);
-
-    if (img.empty())
-    {
-        throw runtime_error("can't open aloeL.jpg");
-    }
-
-    SUBTEST << img.cols << 'x' << img.rows << "; aloeL.jpg" << "; edges" << "; CV_8UC1";
+using namespace perf;

-    Mat edges(img.size(), CV_8UC1), ocl_edges;
+///////////// Canny ////////////////////////

-    CPU_ON;
-    Canny(img, edges, 50.0, 100.0);
-    CPU_OFF;
+PERF_TEST(CannyFixture, DISABLED_Canny) // TODO difference between implementations
+{
+    Mat img = imread(getDataPath("gpu/stereobm/aloe-L.png"), cv::IMREAD_GRAYSCALE),
+            edges(img.size(), CV_8UC1);
+    ASSERT_TRUE(!img.empty()) << "can't open aloe-L.png";

-    ocl::oclMat d_img(img);
-    ocl::oclMat d_edges;
-    ocl::CannyBuf d_buf;
+    declare.in(img).out(edges);

-    WARMUP_ON;
-    ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
-    WARMUP_OFF;
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclImg(img), oclEdges(img.size(), CV_8UC1);

-    GPU_ON;
-    ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
-    GPU_OFF;
+        TEST_CYCLE() ocl::Canny(oclImg, oclEdges, 50.0, 100.0);
+        oclEdges.download(edges);

-    GPU_FULL_ON;
-    d_img.upload(img);
-    ocl::Canny(d_img, d_buf, d_edges, 50.0, 100.0);
-    d_edges.download(ocl_edges);
-    GPU_FULL_OFF;
+        SANITY_CHECK(edges);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        TEST_CYCLE() Canny(img, edges, 50.0, 100.0);

-    TestSystem::instance().ExceptedMatSimilar(edges, ocl_edges, 2e-2);
+        SANITY_CHECK(edges);
+    }
+    else
+        OCL_PERF_ELSE
 }
--- a/modules/ocl/perf/perf_color.cpp
+++ b/modules/ocl/perf/perf_color.cpp
@ -45,49 +45,34 @@
 //M*/
 #include "perf_precomp.hpp"

-///////////// cvtColor////////////////////////
-PERFTEST(cvtColor)
-{
-    Mat src, dst, ocl_dst;
-    ocl::oclMat d_src, d_dst;
-
-    int all_type[] = {CV_8UC4};
-    std::string type_name[] = {"CV_8UC4"};
-
-    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            gen(src, size, size, all_type[j], 0, 256);
-            SUBTEST << size << "x" << size << "; " << type_name[j] << " ; CV_RGBA2GRAY";
-
-            cvtColor(src, dst, COLOR_RGBA2GRAY, 4);
-
-            CPU_ON;
-            cvtColor(src, dst, COLOR_RGBA2GRAY, 4);
-            CPU_OFF;
+using namespace perf;

-            d_src.upload(src);
+///////////// cvtColor////////////////////////

-            WARMUP_ON;
-            ocl::cvtColor(d_src, d_dst, COLOR_RGBA2GRAY, 4);
-            WARMUP_OFF;
+typedef TestBaseWithParam<Size> cvtColorFixture;

-            GPU_ON;
-            ocl::cvtColor(d_src, d_dst, COLOR_RGBA2GRAY, 4);
-            GPU_OFF;
+PERF_TEST_P(cvtColorFixture, cvtColor, OCL_TYPICAL_MAT_SIZES)
+{
+    const Size srcSize = GetParam();

-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::cvtColor(d_src, d_dst, COLOR_RGBA2GRAY, 4);
-            d_dst.download(ocl_dst);
-            GPU_FULL_OFF;
+    Mat src(srcSize, CV_8UC4), dst(srcSize, CV_8UC1); // RGBA2GRAY yields a single-channel result
+    declare.in(src, WARMUP_RNG).out(dst);

-            TestSystem::instance().ExceptedMatSimilar(dst, ocl_dst, 1e-5);
-        }
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclSrc(src), oclDst(src.size(), CV_8UC1); // pre-sized so the timed cycle does not reallocate

+        TEST_CYCLE() ocl::cvtColor(oclSrc, oclDst, COLOR_RGBA2GRAY, 4);
+        oclDst.download(dst);

+        SANITY_CHECK(dst);
     }
+    else if (RUN_PLAIN_IMPL)
+    {
+        TEST_CYCLE() cv::cvtColor(src, dst, COLOR_RGBA2GRAY, 4);

-
+        SANITY_CHECK(dst);
+    }
+    else
+        OCL_PERF_ELSE
 }
--- a/modules/ocl/perf/perf_fft.cpp
+++ b/modules/ocl/perf/perf_fft.cpp
@ -45,47 +45,39 @@
 //M*/
 #include "perf_precomp.hpp"

-///////////// dft ////////////////////////
-PERFTEST(dft)
-{
-    Mat src, dst, ocl_dst;
-    ocl::oclMat d_src, d_dst;
-
-    int all_type[] = {CV_32FC2};
-    std::string type_name[] = {"CV_32FC2"};
+using namespace perf;

-    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] << " ; complex-to-complex";
+///////////// dft ////////////////////////

-            gen(src, size, size, all_type[j], Scalar::all(0), Scalar::all(1));
+typedef TestBaseWithParam<Size> dftFixture;

-            dft(src, dst);
+PERF_TEST_P(dftFixture, DISABLED_dft, OCL_TYPICAL_MAT_SIZES) // TODO not implemented
+{
+    const Size srcSize = GetParam();

-            CPU_ON;
-            dft(src, dst);
-            CPU_OFF;
+    Mat src(srcSize, CV_32FC2), dst;
+    randu(src, 0.0f, 1.0f);
+    declare.in(src);

-            d_src.upload(src);
+    if (srcSize == OCL_SIZE_4000)
+        declare.time(7.4);

-            WARMUP_ON;
-            ocl::dft(d_src, d_dst, Size(size, size));
-            WARMUP_OFF;
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclSrc(src), oclDst;

-            GPU_ON;
-            ocl::dft(d_src, d_dst, Size(size, size));
-            GPU_OFF;
+        TEST_CYCLE() cv::ocl::dft(oclSrc, oclDst);

-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::dft(d_src, d_dst, Size(size, size));
-            d_dst.download(ocl_dst);
-            GPU_FULL_OFF;
+        oclDst.download(dst);

-            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, src.size().area() * 1e-4);
-        }
+        SANITY_CHECK(dst);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        TEST_CYCLE() cv::dft(src, dst);

+        SANITY_CHECK(dst);
    }
+    else
+        OCL_PERF_ELSE
 }
--- a/modules/ocl/perf/perf_filters.cpp
+++ b/modules/ocl/perf/perf_filters.cpp
@ -45,333 +45,279 @@
 //M*/
 #include "perf_precomp.hpp"

-///////////// Blur////////////////////////
-PERFTEST(Blur)
-{
-    Mat src1, dst, ocl_dst;
-    ocl::oclMat d_src1, d_dst;
+using namespace perf;
+using std::tr1::get;
+using std::tr1::tuple;

-    Size ksize = Size(3, 3);
-    int bordertype = BORDER_CONSTANT;
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
+///////////// Blur////////////////////////

-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
+typedef Size_MatType BlurFixture;

-            blur(src1, dst, ksize, Point(-1, -1), bordertype);
+PERF_TEST_P(BlurFixture, Blur,
+            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
+                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params), ksize(3, 3);
+    const int type = get<1>(params), bordertype = BORDER_CONSTANT;

-            CPU_ON;
-            blur(src1, dst, ksize, Point(-1, -1), bordertype);
-            CPU_OFF;
+    Mat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);

-            d_src1.upload(src1);
+    if (srcSize == OCL_SIZE_4000 && type == CV_8UC4)
+        declare.time(5);

-            WARMUP_ON;
-            ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
-            WARMUP_OFF;
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclSrc(src), oclDst(srcSize, type);

-            GPU_ON;
-            ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
-            GPU_OFF;
+        TEST_CYCLE() cv::ocl::blur(oclSrc, oclDst, ksize, Point(-1, -1), bordertype);

-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            ocl::blur(d_src1, d_dst, ksize, Point(-1, -1), bordertype);
-            d_dst.download(ocl_dst);
-            GPU_FULL_OFF;
+        oclDst.download(dst);

-            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1.0);
-        }
+        SANITY_CHECK(dst, 1 + DBL_EPSILON);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        TEST_CYCLE() cv::blur(src, dst, ksize, Point(-1, -1), bordertype);

+        SANITY_CHECK(dst, 1 + DBL_EPSILON);
    }
+    else
+        OCL_PERF_ELSE
 }
-///////////// Laplacian////////////////////////
-PERFTEST(Laplacian)
-{
-    Mat src1, dst, ocl_dst;
-    ocl::oclMat d_src1, d_dst;
-
-    int ksize = 3;
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};

-    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
+///////////// Laplacian////////////////////////

-            gen(src1, size, size, all_type[j], 0, 256);
-            gen(dst, size, size, all_type[j], 0, 256);
+typedef Size_MatType LaplacianFixture;

-            Laplacian(src1, dst, -1, ksize, 1);
+PERF_TEST_P(LaplacianFixture, Laplacian,
+            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
+                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), ksize = 3;

-            CPU_ON;
-            Laplacian(src1, dst, -1, ksize, 1);
-            CPU_OFF;
+    Mat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);

-            d_src1.upload(src1);
+    if (srcSize == OCL_SIZE_4000 && type == CV_8UC4)
+        declare.time(6);

-            WARMUP_ON;
-            ocl::Laplacian(d_src1, d_dst, -1, ksize, 1);
-            WARMUP_OFF;
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclSrc(src), oclDst(srcSize, type);

-            GPU_ON;
-            ocl::Laplacian(d_src1, d_dst, -1, ksize, 1);
-            GPU_OFF;
+        TEST_CYCLE() cv::ocl::Laplacian(oclSrc, oclDst, -1, ksize, 1);

-            GPU_FULL_ON;
-            d_src1.upload(src1);
-            ocl::Laplacian(d_src1, d_dst, -1, ksize, 1);
-            d_dst.download(ocl_dst);
-            GPU_FULL_OFF;
+        oclDst.download(dst);

-            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
-        }
+        SANITY_CHECK(dst);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        TEST_CYCLE() cv::Laplacian(src, dst, -1, ksize, 1);

+        SANITY_CHECK(dst);
    }
+    else
+        OCL_PERF_ELSE
 }

 ///////////// Erode ////////////////////
-PERFTEST(Erode)
-{
-    Mat src, dst, ker, ocl_dst;
-    ocl::oclMat d_src, d_dst;
-
-    int all_type[] = {CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4", "CV_32FC1", "CV_32FC4"};

-    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], Scalar::all(0), Scalar::all(256));
-            ker = getStructuringElement(MORPH_RECT, Size(3, 3));
+typedef Size_MatType ErodeFixture;

-            erode(src, dst, ker);
+PERF_TEST_P(ErodeFixture, Erode,
+            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
+                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), ksize = 3;
+    const Mat ker = getStructuringElement(MORPH_RECT, Size(ksize, ksize));

-            CPU_ON;
-            erode(src, dst, ker);
-            CPU_OFF;
+    Mat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst).in(ker);

-            d_src.upload(src);
+    if (srcSize == OCL_SIZE_4000 && type == CV_8UC4)
+        declare.time(5);

-            WARMUP_ON;
-            ocl::erode(d_src, d_dst, ker);
-            WARMUP_OFF;
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclSrc(src), oclDst(srcSize, type), oclKer(ker);

-            GPU_ON;
-            ocl::erode(d_src, d_dst, ker);
-            GPU_OFF;
+        TEST_CYCLE() cv::ocl::erode(oclSrc, oclDst, oclKer);

-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::erode(d_src, d_dst, ker);
-            d_dst.download(ocl_dst);
-            GPU_FULL_OFF;
+        oclDst.download(dst);

-            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
-        }
+        SANITY_CHECK(dst);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        TEST_CYCLE() cv::erode(src, dst, ker);

+        SANITY_CHECK(dst);
    }
+    else
+        OCL_PERF_ELSE
 }

 ///////////// Sobel ////////////////////////
-PERFTEST(Sobel)
-{
-    Mat src, dst, ocl_dst;
-    ocl::oclMat d_src, d_dst;
-
-    int dx = 1;
-    int dy = 1;
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};

-    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 256);
+typedef Size_MatType SobelFixture;

-            Sobel(src, dst, -1, dx, dy);
+PERF_TEST_P(SobelFixture, Sobel,
+            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
+                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), dx = 1, dy = 1;

-            CPU_ON;
-            Sobel(src, dst, -1, dx, dy);
-            CPU_OFF;
+    Mat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);

-            d_src.upload(src);
+    if ((srcSize == OCL_SIZE_2000 && type == CV_8UC4) ||
+            (srcSize == OCL_SIZE_4000 && type == CV_8UC1))
+        declare.time(5.5);
+    else if (srcSize == OCL_SIZE_4000 && type == CV_8UC4)
+        declare.time(20);

-            WARMUP_ON;
-            ocl::Sobel(d_src, d_dst, -1, dx, dy);
-            WARMUP_OFF;
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclSrc(src), oclDst(srcSize, type);

-            GPU_ON;
-            ocl::Sobel(d_src, d_dst, -1, dx, dy);
-            GPU_OFF;
+        TEST_CYCLE() cv::ocl::Sobel(oclSrc, oclDst, -1, dx, dy);

-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::Sobel(d_src, d_dst, -1, dx, dy);
-            d_dst.download(ocl_dst);
-            GPU_FULL_OFF;
+        oclDst.download(dst);

-            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1);
-        }
+        SANITY_CHECK(dst);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        TEST_CYCLE() cv::Sobel(src, dst, -1, dx, dy);

+        SANITY_CHECK(dst);
    }
+    else
+        OCL_PERF_ELSE
 }
-///////////// Scharr ////////////////////////
-PERFTEST(Scharr)
-{
-    Mat src, dst, ocl_dst;
-    ocl::oclMat d_src, d_dst;

-    int dx = 1;
-    int dy = 0;
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
+///////////// Scharr ////////////////////////

-            gen(src, size, size, all_type[j], 0, 256);
+typedef Size_MatType ScharrFixture;

-            Scharr(src, dst, -1, dx, dy);
+PERF_TEST_P(ScharrFixture, Scharr,
+            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
+                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), dx = 1, dy = 0;

-            CPU_ON;
-            Scharr(src, dst, -1, dx, dy);
-            CPU_OFF;
+    Mat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);

-            d_src.upload(src);
+    if ((srcSize == OCL_SIZE_2000 && type == CV_8UC4) ||
+            (srcSize == OCL_SIZE_4000 && type == CV_8UC1))
+        declare.time(5.5);
+    else if (srcSize == OCL_SIZE_4000 && type == CV_8UC4)
+        declare.time(21);

-            WARMUP_ON;
-            ocl::Scharr(d_src, d_dst, -1, dx, dy);
-            WARMUP_OFF;
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclSrc(src), oclDst(srcSize, type);

-            GPU_ON;
-            ocl::Scharr(d_src, d_dst, -1, dx, dy);
-            GPU_OFF;
+        TEST_CYCLE() cv::ocl::Scharr(oclSrc, oclDst, -1, dx, dy);

-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::Scharr(d_src, d_dst, -1, dx, dy);
-            d_dst.download(ocl_dst);
-            GPU_FULL_OFF;
+        oclDst.download(dst);

-            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1);
-        }
+        SANITY_CHECK(dst);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        TEST_CYCLE() cv::Scharr(src, dst, -1, dx, dy);

+        SANITY_CHECK(dst);
    }
+    else
+        OCL_PERF_ELSE
 }

 ///////////// GaussianBlur ////////////////////////
-PERFTEST(GaussianBlur)
-{
-    Mat src, dst, ocl_dst;
-    int all_type[] = {CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4", "CV_32FC1", "CV_32FC4"};
-    const int ksize = 7;

-    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-
-            gen(src, size, size, all_type[j], 0, 256);
+typedef Size_MatType GaussianBlurFixture;

-            GaussianBlur(src, dst, Size(ksize, ksize), 0);
+PERF_TEST_P(GaussianBlurFixture, GaussianBlur,
+            ::testing::Combine(::testing::Values(OCL_SIZE_1000, OCL_SIZE_2000),
+                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), ksize = 7;

-            CPU_ON;
-            GaussianBlur(src, dst, Size(ksize, ksize), 0);
-            CPU_OFF;
+    Mat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);

-            ocl::oclMat d_src(src);
-            ocl::oclMat d_dst;
+    const double eps = src.depth() == CV_8U ? 1 + DBL_EPSILON : 3e-4;

-            WARMUP_ON;
-            ocl::GaussianBlur(d_src, d_dst, Size(ksize, ksize), 0);
-            WARMUP_OFF;
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclSrc(src), oclDst(srcSize, type);

-            GPU_ON;
-            ocl::GaussianBlur(d_src, d_dst, Size(ksize, ksize), 0);
-            GPU_OFF;
+        TEST_CYCLE() cv::ocl::GaussianBlur(oclSrc, oclDst, Size(ksize, ksize), 0);

-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::GaussianBlur(d_src, d_dst, Size(ksize, ksize), 0);
-            d_dst.download(ocl_dst);
-            GPU_FULL_OFF;
+        oclDst.download(dst);

-            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1.0);
-        }
+        SANITY_CHECK(dst, eps);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        TEST_CYCLE() cv::GaussianBlur(src, dst, Size(ksize, ksize), 0);

+        SANITY_CHECK(dst, eps);
    }
+    else
+        OCL_PERF_ELSE
 }

 ///////////// filter2D////////////////////////
-PERFTEST(filter2D)
-{
-    Mat src;
-
-    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
-    {
-        int all_type[] = {CV_8UC1, CV_8UC4};
-        std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            gen(src, size, size, all_type[j], 0, 256);
-
-            const int ksize = 3;
-
-            SUBTEST << "ksize = " << ksize << "; " << size << 'x' << size << "; " << type_name[j] ;

-            Mat kernel;
-            gen(kernel, ksize, ksize, CV_32SC1, -3.0, 3.0);
+typedef Size_MatType filter2DFixture;

-            Mat dst, ocl_dst;
-
-            cv::filter2D(src, dst, -1, kernel);
-
-            CPU_ON;
-            cv::filter2D(src, dst, -1, kernel);
-            CPU_OFF;
-
-            ocl::oclMat d_src(src), d_dst;
+PERF_TEST_P(filter2DFixture, filter2D,
+            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
+                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params), ksize = 3;

-            WARMUP_ON;
-            ocl::filter2D(d_src, d_dst, -1, kernel);
-            WARMUP_OFF;
+    Mat src(srcSize, type), dst(srcSize, type), kernel(ksize, ksize, CV_32SC1);
+    declare.in(src, WARMUP_RNG).in(kernel).out(dst);
+    randu(kernel, -3.0, 3.0);

-            GPU_ON;
-            ocl::filter2D(d_src, d_dst, -1, kernel);
-            GPU_OFF;
+    if (srcSize == OCL_SIZE_4000 && type == CV_8UC4)
+        declare.time(8);

-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::filter2D(d_src, d_dst, -1, kernel);
-            d_dst.download(ocl_dst);
-            GPU_FULL_OFF;
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclSrc(src), oclDst(srcSize, type), oclKernel(kernel);

-            TestSystem::instance().ExpectedMatNear(ocl_dst, dst, 1e-5);
+        TEST_CYCLE() cv::ocl::filter2D(oclSrc, oclDst, -1, oclKernel);

-        }
+        oclDst.download(dst);

+        SANITY_CHECK(dst);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        TEST_CYCLE() cv::filter2D(src, dst, -1, kernel);

+        SANITY_CHECK(dst);
    }
+    else
+        OCL_PERF_ELSE
 }
--- a/modules/ocl/perf/perf_gemm.cpp
+++ b/modules/ocl/perf/perf_gemm.cpp
@ -45,46 +45,40 @@
 //M*/
 #include "perf_precomp.hpp"

-///////////// gemm ////////////////////////
-PERFTEST(gemm)
-{
-    Mat src1, src2, src3, dst, ocl_dst;
-    ocl::oclMat d_src1, d_src2, d_src3, d_dst;
+using namespace perf;

-    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
-    {
-        SUBTEST << size << 'x' << size;
+///////////// gemm ////////////////////////

-        gen(src1, size, size, CV_32FC1, Scalar::all(-10), Scalar::all(10));
-        gen(src2, size, size, CV_32FC1, Scalar::all(-10), Scalar::all(10));
-        gen(src3, size, size, CV_32FC1, Scalar::all(-10), Scalar::all(10));
+typedef TestBaseWithParam<Size> gemmFixture;

-        gemm(src1, src2, 1.0, src3, 1.0, dst);
+PERF_TEST_P(gemmFixture, DISABLED_gemm, OCL_TYPICAL_MAT_SIZES) // TODO not implemented
+{
+    const Size srcSize = GetParam();

-        CPU_ON;
-        gemm(src1, src2, 1.0, src3, 1.0, dst);
-        CPU_OFF;
+    Mat src1(srcSize, CV_32FC1), src2(srcSize, CV_32FC1),
+            src3(srcSize, CV_32FC1), dst(srcSize, CV_32FC1);
+    declare.in(src1, src2, src3).out(dst);
+    randu(src1, -10.0f, 10.0f);
+    randu(src2, -10.0f, 10.0f);
+    randu(src3, -10.0f, 10.0f);

-        d_src1.upload(src1);
-        d_src2.upload(src2);
-        d_src3.upload(src3);
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclSrc1(src1), oclSrc2(src2),
+                oclSrc3(src3), oclDst(srcSize, CV_32FC1);

-        WARMUP_ON;
-        ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
-        WARMUP_OFF;
+        TEST_CYCLE() cv::ocl::gemm(oclSrc1, oclSrc2, 1.0, oclSrc3, 1.0, oclDst);

-        GPU_ON;
-        ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
-        GPU_OFF;
+        oclDst.download(dst);

-        GPU_FULL_ON;
-        d_src1.upload(src1);
-        d_src2.upload(src2);
-        d_src3.upload(src3);
-        ocl::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
-        d_dst.download(ocl_dst);
-        GPU_FULL_OFF;
+        SANITY_CHECK(dst);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        TEST_CYCLE() cv::gemm(src1, src2, 1.0, src3, 1.0, dst);

-        TestSystem::instance().ExpectedMatNear(ocl_dst, dst, src1.cols * src1.rows * 1e-4);
+        SANITY_CHECK(dst);
    }
+    else
+        OCL_PERF_ELSE
 }
--- a/modules/ocl/perf/perf_gftt.cpp
+++ b/modules/ocl/perf/perf_gftt.cpp
@ -46,56 +46,50 @@

 #include "perf_precomp.hpp"

-///////////// GoodFeaturesToTrack ////////////////////////
-PERFTEST(GoodFeaturesToTrack)
-{
-    using namespace cv;
-
-    int maxCorners = 2000;
-    double qualityLevel = 0.01;
+using namespace perf;
+using std::tr1::tuple;
+using std::tr1::get;

-    std::string images[] = { "rubberwhale1.png", "aloeL.jpg" };
-
-    std::vector<cv::Point2f> pts_gold, pts_ocl;
-
-    for(size_t imgIdx = 0; imgIdx < (sizeof(images)/sizeof(std::string)); ++imgIdx)
-    {
-        Mat frame = imread(abspath(images[imgIdx]), IMREAD_GRAYSCALE);
-        CV_Assert(!frame.empty());
+///////////// GoodFeaturesToTrack ////////////////////////

-        for(float minDistance = 0; minDistance < 4; minDistance += 3.0)
-        {
-            SUBTEST << "image = " << images[imgIdx] << "; ";
-            SUBTEST << "minDistance = " << minDistance << "; ";
+typedef tuple<string, double> GoodFeaturesToTrackParams;
+typedef TestBaseWithParam<GoodFeaturesToTrackParams> GoodFeaturesToTrackFixture;

-            cv::goodFeaturesToTrack(frame, pts_gold, maxCorners, qualityLevel, minDistance);
+PERF_TEST_P(GoodFeaturesToTrackFixture, GoodFeaturesToTrack,
+            ::testing::Combine(::testing::Values(string("gpu/opticalflow/rubberwhale1.png"),
+                                                 string("gpu/stereobm/aloe-L.png")),
+                               ::testing::Range(0.0, 4.0, 3.0)))
+{

-            CPU_ON;
-            cv::goodFeaturesToTrack(frame, pts_gold, maxCorners, qualityLevel, minDistance);
-            CPU_OFF;
+    const GoodFeaturesToTrackParams param = GetParam();
+    const string fileName = getDataPath(get<0>(param));
+    const int maxCorners = 2000;
+    const double qualityLevel = 0.01, minDistance = get<1>(param);

-            cv::ocl::GoodFeaturesToTrackDetector_OCL detector(maxCorners, qualityLevel, minDistance);
+    Mat frame = imread(fileName, IMREAD_GRAYSCALE);
+    ASSERT_TRUE(!frame.empty()) << "no input image";

-            ocl::oclMat frame_ocl(frame), pts_oclmat;
+    vector<Point2f> pts_gold;
+    declare.in(frame);

-            WARMUP_ON;
-            detector(frame_ocl, pts_oclmat);
-            WARMUP_OFF;
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclFrame(frame), pts_oclmat;
+        ocl::GoodFeaturesToTrackDetector_OCL detector(maxCorners, qualityLevel, minDistance);

-            detector.downloadPoints(pts_oclmat, pts_ocl);
+        TEST_CYCLE() detector(oclFrame, pts_oclmat);

-            double diff = abs(static_cast<float>(pts_gold.size() - pts_ocl.size()));
-            TestSystem::instance().setAccurate(diff == 0.0, diff);
+        detector.downloadPoints(pts_oclmat, pts_gold);

-            GPU_ON;
-            detector(frame_ocl, pts_oclmat);
-            GPU_OFF;
+        SANITY_CHECK(pts_gold);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        TEST_CYCLE() cv::goodFeaturesToTrack(frame, pts_gold,
+                                             maxCorners, qualityLevel, minDistance);

-            GPU_FULL_ON;
-            frame_ocl.upload(frame);
-            detector(frame_ocl, pts_oclmat);
-            detector.downloadPoints(pts_oclmat, pts_ocl);
-            GPU_FULL_OFF;
-        }
+        SANITY_CHECK(pts_gold);
    }
+    else
+        OCL_PERF_ELSE
 }
--- a/modules/ocl/perf/perf_haar.cpp
+++ b/modules/ocl/perf/perf_haar.cpp
@ -45,63 +45,44 @@
 //M*/
 #include "perf_precomp.hpp"

-///////////// Haar ////////////////////////
-
-PERFTEST(Haar)
-{
-    Mat img = imread(abspath("basketball1.png"), IMREAD_GRAYSCALE);
+#include "opencv2/objdetect/objdetect_c.h"

-    if (img.empty())
-    {
-        throw runtime_error("can't open basketball1.png");
-    }
+using namespace perf;

-    CascadeClassifier faceCascadeCPU;
-
-    if (!faceCascadeCPU.load(abspath("haarcascade_frontalface_alt.xml")))
-    {
-        throw runtime_error("can't load haarcascade_frontalface_alt.xml");
-    }
+///////////// Haar ////////////////////////

+PERF_TEST(HaarFixture, Haar)
+{
    vector<Rect> faces;

-    SUBTEST << img.cols << "x" << img.rows << "; scale image";
-    CPU_ON;
-    faceCascadeCPU.detectMultiScale(img, faces,
-                                    1.1, 2, 0 | CASCADE_SCALE_IMAGE, Size(30, 30));
-    CPU_OFF;
+    Mat img = imread(getDataPath("gpu/haarcascade/basketball1.png"), IMREAD_GRAYSCALE);
+    ASSERT_TRUE(!img.empty()) << "can't open basketball1.png";
+    declare.in(img);

+    if (RUN_PLAIN_IMPL)
+    {
+        CascadeClassifier faceCascade;
+        ASSERT_TRUE(faceCascade.load(getDataPath("gpu/haarcascade/haarcascade_frontalface_alt.xml")))
+                << "can't load haarcascade_frontalface_alt.xml";

-    vector<Rect> oclfaces;
-    ocl::OclCascadeClassifier faceCascade;
+        TEST_CYCLE() faceCascade.detectMultiScale(img, faces,
+                                                     1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));

-    if (!faceCascade.load(abspath("haarcascade_frontalface_alt.xml")))
-    {
-        throw runtime_error("can't load haarcascade_frontalface_alt.xml");
+        SANITY_CHECK(faces, 4 + 1e-4);
    }
+    else if (RUN_OCL_IMPL)
+    {
+        ocl::OclCascadeClassifier faceCascade;
+        ocl::oclMat oclImg(img);

-    ocl::oclMat d_img(img);
+        ASSERT_TRUE(faceCascade.load(getDataPath("gpu/haarcascade/haarcascade_frontalface_alt.xml")))
+                << "can't load haarcascade_frontalface_alt.xml";

-    WARMUP_ON;
-    faceCascade.detectMultiScale(d_img, oclfaces,
-                                 1.1, 2, 0 | CASCADE_SCALE_IMAGE, Size(30, 30));
-    WARMUP_OFF;
+        TEST_CYCLE() faceCascade.detectMultiScale(oclImg, faces,
+                                     1.1, 2, 0 | CV_HAAR_SCALE_IMAGE, Size(30, 30));

-    if(faces.size() == oclfaces.size())
-        TestSystem::instance().setAccurate(1, 0);
+        SANITY_CHECK(faces, 4 + 1e-4);
+    }
    else
-        TestSystem::instance().setAccurate(0, abs((int)faces.size() - (int)oclfaces.size()));
-
-    faces.clear();
-
-    GPU_ON;
-    faceCascade.detectMultiScale(d_img, oclfaces,
-                                 1.1, 2, 0 | CASCADE_SCALE_IMAGE, Size(30, 30));
-    GPU_OFF;
-
-    GPU_FULL_ON;
-    d_img.upload(img);
-    faceCascade.detectMultiScale(d_img, oclfaces,
-                                 1.1, 2, 0 | CASCADE_SCALE_IMAGE, Size(30, 30));
-    GPU_FULL_OFF;
+        OCL_PERF_ELSE
 }
--- a/modules/ocl/perf/perf_hog.cpp
+++ b/modules/ocl/perf/perf_hog.cpp
@ -45,50 +45,37 @@
 //M*/
 #include "perf_precomp.hpp"

+using namespace perf;
+
 ///////////// HOG////////////////////////

-PERFTEST(HOG)
+PERF_TEST(HOGFixture, HOG)
 {
-    Mat src = imread(abspath("road.png"), cv::IMREAD_GRAYSCALE);
-
-    if (src.empty())
-    {
-        throw runtime_error("can't open road.png");
-    }
-
-    cv::HOGDescriptor hog;
-    hog.setSVMDetector(hog.getDefaultPeopleDetector());
-    std::vector<cv::Rect> found_locations;
-    std::vector<cv::Rect> d_found_locations;
+    Mat src = imread(getDataPath("gpu/hog/road.png"), cv::IMREAD_GRAYSCALE);
+    ASSERT_TRUE(!src.empty()) << "can't open input image road.png";

-    SUBTEST << src.cols << 'x' << src.rows << "; road.png";
+    vector<cv::Rect> found_locations;
+    declare.in(src).time(5);

-    hog.detectMultiScale(src, found_locations);
+    if (RUN_PLAIN_IMPL)
+    {
+        HOGDescriptor hog;
+        hog.setSVMDetector(hog.getDefaultPeopleDetector());

-    CPU_ON;
-    hog.detectMultiScale(src, found_locations);
-    CPU_OFF;
+        TEST_CYCLE() hog.detectMultiScale(src, found_locations);

-    cv::ocl::HOGDescriptor ocl_hog;
-    ocl_hog.setSVMDetector(ocl_hog.getDefaultPeopleDetector());
-    ocl::oclMat d_src;
-    d_src.upload(src);
+        SANITY_CHECK(found_locations, 1 + DBL_EPSILON);
+    }
+    else if (RUN_OCL_IMPL)
+    {
+        ocl::HOGDescriptor ocl_hog;
+        ocl_hog.setSVMDetector(ocl_hog.getDefaultPeopleDetector());
+        ocl::oclMat oclSrc(src);

-    WARMUP_ON;
-    ocl_hog.detectMultiScale(d_src, d_found_locations);
-    WARMUP_OFF;
+        TEST_CYCLE() ocl_hog.detectMultiScale(oclSrc, found_locations);

-    if(d_found_locations.size() == found_locations.size())
-        TestSystem::instance().setAccurate(1, 0);
+        SANITY_CHECK(found_locations, 1 + DBL_EPSILON);
+    }
    else
-        TestSystem::instance().setAccurate(0, abs((int)found_locations.size() - (int)d_found_locations.size()));
-
-    GPU_ON;
-    ocl_hog.detectMultiScale(d_src, found_locations);
-    GPU_OFF;
-
-    GPU_FULL_ON;
-    d_src.upload(src);
-    ocl_hog.detectMultiScale(d_src, found_locations);
-    GPU_FULL_OFF;
+        OCL_PERF_ELSE
 }
--- a/modules/ocl/perf/perf_imgproc.cpp
+++ b/modules/ocl/perf/perf_imgproc.cpp
--- a/modules/ocl/perf/perf_match_template.cpp
+++ b/modules/ocl/perf/perf_match_template.cpp
@ -45,101 +45,77 @@
 //M*/
 #include "perf_precomp.hpp"

-/////////// matchTemplate ////////////////////////
-//void InitMatchTemplate()
-//{
-//	Mat src; gen(src, 500, 500, CV_32F, 0, 1);
-//	Mat templ; gen(templ, 500, 500, CV_32F, 0, 1);
-//	ocl::oclMat d_src(src), d_templ(templ), d_dst;
-//	ocl::matchTemplate(d_src, d_templ, d_dst, CV_TM_CCORR);
-//}
-PERFTEST(matchTemplate)
-{
-    //InitMatchTemplate();
-    Mat src, templ, dst, ocl_dst;
-    int templ_size = 5;
-
-    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
-    {
-        int all_type[] = {CV_32FC1, CV_32FC4};
-        std::string type_name[] = {"CV_32FC1", "CV_32FC4"};
-
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            for(templ_size = 5; templ_size <= 5; templ_size *= 5)
-            {
-                gen(src, size, size, all_type[j], 0, 1);
-
-                SUBTEST << src.cols << 'x' << src.rows << "; " << type_name[j] << "; templ " << templ_size << 'x' << templ_size << "; CCORR";
+using namespace perf;
+using std::tr1::tuple;
+using std::tr1::get;

-                gen(templ, templ_size, templ_size, all_type[j], 0, 1);
-
-                matchTemplate(src, templ, dst, TM_CCORR);
-
-                CPU_ON;
-                matchTemplate(src, templ, dst, TM_CCORR);
-                CPU_OFF;
-
-                ocl::oclMat d_src(src), d_templ(templ), d_dst;
-
-                WARMUP_ON;
-                ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR);
-                WARMUP_OFF;
+/////////// matchTemplate ////////////////////////

-                GPU_ON;
-                ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR);
-                GPU_OFF;
+typedef Size_MatType CV_TM_CCORRFixture;

-                GPU_FULL_ON;
-                d_src.upload(src);
-                d_templ.upload(templ);
-                ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR);
-                d_dst.download(ocl_dst);
-                GPU_FULL_OFF;
+PERF_TEST_P(CV_TM_CCORRFixture, matchTemplate,
+            ::testing::Combine(::testing::Values(OCL_SIZE_1000, OCL_SIZE_2000),
+                               OCL_PERF_ENUM(CV_32FC1, CV_32FC4)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params), templSize(5, 5);
+    const int type = get<1>(params);
+
+    Mat src(srcSize, type), templ(templSize, type);
+    const Size dstSize(src.cols - templ.cols + 1, src.rows - templ.rows + 1);
+    Mat dst(dstSize, CV_32F);
+    randu(src, 0.0f, 1.0f);
+    randu(templ, 0.0f, 1.0f);
+    declare.time(srcSize == OCL_SIZE_2000 ? 20 : 6).in(src, templ).out(dst);
+
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclSrc(src), oclTempl(templ), oclDst(dstSize, CV_32F);

-                TestSystem::instance().ExpectedMatNear(dst, ocl_dst, templ.rows * templ.cols * 1e-1);
-            }
-        }
+        TEST_CYCLE() cv::ocl::matchTemplate(oclSrc, oclTempl, oclDst, TM_CCORR);

-        int all_type_8U[] = {CV_8UC1};
-        std::string type_name_8U[] = {"CV_8UC1"};
+        oclDst.download(dst);

-        for (size_t j = 0; j < sizeof(all_type_8U) / sizeof(int); j++)
-        {
-            for(templ_size = 5; templ_size <= 5; templ_size *= 5)
-            {
-                SUBTEST << src.cols << 'x' << src.rows << "; " << type_name_8U[j] << "; templ " << templ_size << 'x' << templ_size << "; CCORR_NORMED";
+        SANITY_CHECK(dst, 1e-4);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        TEST_CYCLE() cv::matchTemplate(src, templ, dst, TM_CCORR);

-                gen(src, size, size, all_type_8U[j], 0, 255);
+        SANITY_CHECK(dst, 1e-4);
+    }
+    else
+        OCL_PERF_ELSE
+}

-                gen(templ, templ_size, templ_size, all_type_8U[j], 0, 255);
+typedef TestBaseWithParam<Size> CV_TM_CCORR_NORMEDFixture;

-                matchTemplate(src, templ, dst, TM_CCORR_NORMED);
+PERF_TEST_P(CV_TM_CCORR_NORMEDFixture, matchTemplate, OCL_TYPICAL_MAT_SIZES)
+{
+    const Size srcSize = GetParam(), templSize(5, 5);

-                CPU_ON;
-                matchTemplate(src, templ, dst, TM_CCORR_NORMED);
-                CPU_OFF;
+    Mat src(srcSize, CV_8UC1), templ(templSize, CV_8UC1), dst;
+    const Size dstSize(src.cols - templ.cols + 1, src.rows - templ.rows + 1);
+    dst.create(dstSize, CV_8UC1);
+    declare.in(src, templ, WARMUP_RNG).out(dst)
+            .time(srcSize == OCL_SIZE_2000 ? 10 : srcSize == OCL_SIZE_4000 ? 23 : 2);

-                ocl::oclMat d_src(src);
-                ocl::oclMat d_templ(templ), d_dst;
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclSrc(src), oclTempl(templ), oclDst(dstSize, CV_8UC1);

-                WARMUP_ON;
-                ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR_NORMED);
-                WARMUP_OFF;
+        TEST_CYCLE() cv::ocl::matchTemplate(oclSrc, oclTempl, oclDst, TM_CCORR_NORMED);

-                GPU_ON;
-                ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR_NORMED);
-                GPU_OFF;
+        oclDst.download(dst);

-                GPU_FULL_ON;
-                d_src.upload(src);
-                d_templ.upload(templ);
-                ocl::matchTemplate(d_src, d_templ, d_dst, TM_CCORR_NORMED);
-                d_dst.download(ocl_dst);
-                GPU_FULL_OFF;
+        SANITY_CHECK(dst, 2e-2);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        TEST_CYCLE() cv::matchTemplate(src, templ, dst, TM_CCORR_NORMED);

-                TestSystem::instance().ExpectedMatNear(dst, ocl_dst, templ.rows * templ.cols * 1e-1);
-            }
-        }
+        SANITY_CHECK(dst, 2e-2);
    }
+    else
+        OCL_PERF_ELSE
 }
--- a/modules/ocl/perf/perf_matrix_operation.cpp
+++ b/modules/ocl/perf/perf_matrix_operation.cpp
@ -45,142 +45,113 @@
 //M*/
 #include "perf_precomp.hpp"

-///////////// ConvertTo////////////////////////
-PERFTEST(ConvertTo)
-{
-    Mat src, dst, ocl_dst;
-    ocl::oclMat d_src, d_dst;
-
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] << " to 32FC1";
+using namespace perf;
+using std::tr1::tuple;
+using std::tr1::get;

-            gen(src, size, size, all_type[j], 0, 256);
-            //gen(dst, size, size, all_type[j], 0, 256);
-
-            //d_dst.upload(dst);
-
-            src.convertTo(dst, CV_32FC1);
+///////////// ConvertTo////////////////////////

-            CPU_ON;
-            src.convertTo(dst, CV_32FC1);
-            CPU_OFF;
+typedef Size_MatType ConvertToFixture;

-            d_src.upload(src);
+PERF_TEST_P(ConvertToFixture, ConvertTo,
+            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
+                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);

-            WARMUP_ON;
-            d_src.convertTo(d_dst, CV_32FC1);
-            WARMUP_OFF;
+    Mat src(srcSize, type), dst;
+    const int dstType = CV_MAKE_TYPE(CV_32F, src.channels());
+    dst.create(srcSize, dstType);
+    declare.in(src, WARMUP_RNG).out(dst);

-            GPU_ON;
-            d_src.convertTo(d_dst, CV_32FC1);
-            GPU_OFF;
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclSrc(src), oclDst(srcSize, dstType);

-            GPU_FULL_ON;
-            d_src.upload(src);
-            d_src.convertTo(d_dst, CV_32FC1);
-            d_dst.download(ocl_dst);
-            GPU_FULL_OFF;
+        TEST_CYCLE() oclSrc.convertTo(oclDst, dstType);

-            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 0.0);
-        }
+        oclDst.download(dst);

+        SANITY_CHECK(dst);
    }
-}
-///////////// copyTo////////////////////////
-PERFTEST(copyTo)
-{
-    Mat src, dst, ocl_dst;
-    ocl::oclMat d_src, d_dst;
-
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    else if (RUN_PLAIN_IMPL)
    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
+        TEST_CYCLE() src.convertTo(dst, dstType);

-            gen(src, size, size, all_type[j], 0, 256);
-            //gen(dst, size, size, all_type[j], 0, 256);
-
-            //d_dst.upload(dst);
+        SANITY_CHECK(dst);
+    }
+    else
+        OCL_PERF_ELSE
+}

-            src.copyTo(dst);
+///////////// copyTo////////////////////////

-            CPU_ON;
-            src.copyTo(dst);
-            CPU_OFF;
+typedef Size_MatType copyToFixture;

-            d_src.upload(src);
+PERF_TEST_P(copyToFixture, copyTo,
+            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
+                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);

-            WARMUP_ON;
-            d_src.copyTo(d_dst);
-            WARMUP_OFF;
+    Mat src(srcSize, type), dst(srcSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);

-            GPU_ON;
-            d_src.copyTo(d_dst);
-            GPU_OFF;
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclSrc(src), oclDst(srcSize, type);

-            GPU_FULL_ON;
-            d_src.upload(src);
-            d_src.copyTo(d_dst);
-            d_dst.download(ocl_dst);
-            GPU_FULL_OFF;
+        TEST_CYCLE() oclSrc.copyTo(oclDst);

-            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 0.0);
-        }
+        oclDst.download(dst);

+        SANITY_CHECK(dst);
    }
-}
-///////////// setTo////////////////////////
-PERFTEST(setTo)
-{
-    Mat src, ocl_src;
-    Scalar val(1, 2, 3, 4);
-    ocl::oclMat d_src;
-
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    else if (RUN_PLAIN_IMPL)
    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
+        TEST_CYCLE() src.copyTo(dst);

-            gen(src, size, size, all_type[j], 0, 256);
+        SANITY_CHECK(dst);
+    }
+    else
+        OCL_PERF_ELSE
+}

-            src.setTo(val);
+///////////// setTo////////////////////////

-            CPU_ON;
-            src.setTo(val);
-            CPU_OFF;
+typedef Size_MatType setToFixture;

-            d_src.upload(src);
+PERF_TEST_P(setToFixture, setTo,
+            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
+                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);
+    const Scalar val(1, 2, 3, 4);

-            WARMUP_ON;
-            d_src.setTo(val);
-            WARMUP_OFF;
+    Mat src(srcSize, type);
+    declare.in(src);

-            d_src.download(ocl_src);
-            TestSystem::instance().ExpectedMatNear(src, ocl_src, 1.0);
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclSrc(srcSize, type);

-            GPU_ON;;
-            d_src.setTo(val);
-            GPU_OFF;
+        TEST_CYCLE() oclSrc.setTo(val);
+        oclSrc.download(src);

-            GPU_FULL_ON;
-            d_src.upload(src);
-            d_src.setTo(val);
-            GPU_FULL_OFF;
-        }
+        SANITY_CHECK(src);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        TEST_CYCLE() src.setTo(val);

+        SANITY_CHECK(src);
    }
+    else
+        OCL_PERF_ELSE
 }
--- a/modules/ocl/perf/perf_moments.cpp
+++ b/modules/ocl/perf/perf_moments.cpp
@ -43,50 +43,47 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-#include "perf_precomp.hpp"
-///////////// Moments ////////////////////////
-PERFTEST(Moments)
-{
-    Mat src;
-    bool binaryImage = 0;

-    int all_type[] = {CV_8UC1, CV_16SC1, CV_32FC1, CV_64FC1};
-    std::string type_name[] = {"CV_8UC1", "CV_16SC1", "CV_32FC1", "CV_64FC1"};
-
-    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j];
+#include "perf_precomp.hpp"

-            gen(src, size, size, all_type[j], 0, 256);
+using namespace perf;
+using std::tr1::tuple;
+using std::tr1::get;

-            cv::Moments CvMom = moments(src, binaryImage);
+///////////// Moments ////////////////////////

-            CPU_ON;
-            moments(src, binaryImage);
-            CPU_OFF;
+typedef Size_MatType MomentsFixture;

-            cv::Moments oclMom;
-            WARMUP_ON;
-            oclMom = ocl::ocl_moments(src, binaryImage);
-            WARMUP_OFF;
+PERF_TEST_P(MomentsFixture, DISABLED_Moments,
+            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
+                               OCL_PERF_ENUM(CV_8UC1, CV_16SC1, CV_32FC1, CV_64FC1)))  // TODO does not work properly (see below)
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);

-            Mat gpu_dst, cpu_dst;
-            HuMoments(CvMom, cpu_dst);
-            HuMoments(oclMom, gpu_dst);
+    Mat src(srcSize, type), dst(7, 1, CV_64F);
+    const bool binaryImage = false;
+    cv::Moments mom;

-            GPU_ON;
-            ocl::ocl_moments(src, binaryImage);
-            GPU_OFF;
+    declare.in(src, WARMUP_RNG).out(dst);

-            GPU_FULL_ON;
-            ocl::ocl_moments(src, binaryImage);
-            GPU_FULL_OFF;
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclSrc(src);

-            TestSystem::instance().ExpectedMatNear(gpu_dst, cpu_dst, .5);
+        TEST_CYCLE() mom = cv::ocl::ocl_moments(oclSrc, binaryImage); // TODO Use oclSrc
+        cv::HuMoments(mom, dst);

-        }
+        SANITY_CHECK(dst);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        TEST_CYCLE() mom = cv::moments(src, binaryImage);
+        cv::HuMoments(mom, dst);

+        SANITY_CHECK(dst);
    }
+    else
+        OCL_PERF_ELSE
 }
--- a/modules/ocl/perf/perf_norm.cpp
+++ b/modules/ocl/perf/perf_norm.cpp
@ -45,43 +45,39 @@
 //M*/
 #include "perf_precomp.hpp"

-///////////// norm////////////////////////
-PERFTEST(norm)
-{
-    Mat src1, src2, ocl_src1;
-    ocl::oclMat d_src1, d_src2;
+using namespace perf;
+using std::tr1::tuple;
+using std::tr1::get;

-    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
-    {
-        SUBTEST << size << 'x' << size << "; CV_8UC1; NORM_INF";
-
-        gen(src1, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
-        gen(src2, size, size, CV_8UC1, Scalar::all(0), Scalar::all(1));
+///////////// norm////////////////////////

-        norm(src1, src2, NORM_INF);
+typedef TestBaseWithParam<Size> normFixture;

-        CPU_ON;
-        norm(src1, src2, NORM_INF);
-        CPU_OFF;
+PERF_TEST_P(normFixture, DISABLED_norm, OCL_TYPICAL_MAT_SIZES) // TODO doesn't work properly
+{
+    const Size srcSize = GetParam();
+    const std::string impl = getSelectedImpl();
+    double value = 0.0;

-        d_src1.upload(src1);
-        d_src2.upload(src2);
+    Mat src1(srcSize, CV_8UC1), src2(srcSize, CV_8UC1);
+    declare.in(src1, src2);
+    randu(src1, 0, 1);
+    randu(src2, 0, 1);

-        WARMUP_ON;
-        ocl::norm(d_src1, d_src2, NORM_INF);
-        WARMUP_OFF;
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclSrc1(src1), oclSrc2(src2);

-        d_src1.download(ocl_src1);
-        TestSystem::instance().ExpectedMatNear(src1, ocl_src1, .5);
+        TEST_CYCLE() value = cv::ocl::norm(oclSrc1, oclSrc2, NORM_INF);

-        GPU_ON;
-        ocl::norm(d_src1, d_src2, NORM_INF);
-        GPU_OFF;
+        SANITY_CHECK(value);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        TEST_CYCLE() value = cv::norm(src1, src2, NORM_INF);

-        GPU_FULL_ON;
-        d_src1.upload(src1);
-        d_src2.upload(src2);
-        ocl::norm(d_src1, d_src2, NORM_INF);
-        GPU_FULL_OFF;
+        SANITY_CHECK(value);
    }
+    else
+        OCL_PERF_ELSE
 }
--- a/modules/ocl/perf/perf_opticalflow.cpp
+++ b/modules/ocl/perf/perf_opticalflow.cpp
@ -46,311 +46,228 @@
 #include "perf_precomp.hpp"

 ///////////// PyrLKOpticalFlow ////////////////////////
-PERFTEST(PyrLKOpticalFlow)
-{
-    std::string images1[] = {"rubberwhale1.png", "aloeL.jpg"};
-    std::string images2[] = {"rubberwhale2.png", "aloeR.jpg"};

-    for (size_t i = 0; i < sizeof(images1) / sizeof(std::string); i++)
-    {
-        Mat frame0 = imread(abspath(images1[i]), i == 0 ? IMREAD_COLOR : IMREAD_GRAYSCALE);
+using namespace perf;
+using std::tr1::get;
+using std::tr1::tuple;
+using std::tr1::make_tuple;

-        if (frame0.empty())
-        {
-            std::string errstr = "can't open " + images1[i];
-            throw runtime_error(errstr);
-        }
+template <typename T>
+static vector<T> & MatToVector(const ocl::oclMat & oclSrc, vector<T> & instance)
+{
+    Mat src;
+    oclSrc.download(src);

-        Mat frame1 = imread(abspath(images2[i]), i == 0 ? IMREAD_COLOR : IMREAD_GRAYSCALE);
+    for (int i = 0; i < src.cols; ++i)
+        instance.push_back(src.at<T>(0, i));

-        if (frame1.empty())
-        {
-            std::string errstr = "can't open " + images2[i];
-            throw runtime_error(errstr);
-        }
+    return instance;
+}

-        Mat gray_frame;
+CV_ENUM(LoadMode, IMREAD_GRAYSCALE, IMREAD_COLOR)
+
+typedef tuple<int, tuple<string, string, LoadMode> > PyrLKOpticalFlowParamType;
+typedef TestBaseWithParam<PyrLKOpticalFlowParamType> PyrLKOpticalFlowFixture;
+
+PERF_TEST_P(PyrLKOpticalFlowFixture,
+            DISABLED_PyrLKOpticalFlow,
+            ::testing::Combine(
+                ::testing::Values(1000, 2000, 4000),
+                ::testing::Values(
+                    make_tuple<string, string, LoadMode>
+                    (
+                        string("gpu/opticalflow/rubberwhale1.png"),
+                        string("gpu/opticalflow/rubberwhale2.png"),
+                        LoadMode(IMREAD_COLOR)
+                        )
+                    , make_tuple<string, string, LoadMode>
+                    (
+                        string("gpu/stereobm/aloe-L.png"),
+                        string("gpu/stereobm/aloe-R.png"),
+                        LoadMode(IMREAD_GRAYSCALE)
+                        )
+                    )
+                )
+            ) // TODO to big difference between implementations
+{
+    PyrLKOpticalFlowParamType params = GetParam();
+    tuple<string, string, LoadMode> fileParam = get<1>(params);
+    const int pointsCount = get<0>(params);
+    const int openMode = static_cast<int>(get<2>(fileParam));
+    const string fileName0 = get<0>(fileParam), fileName1 = get<1>(fileParam);
+    Mat frame0 = imread(getDataPath(fileName0), openMode);
+    Mat frame1 = imread(getDataPath(fileName1), openMode);
+
+    ASSERT_FALSE(frame0.empty()) << "can't load " << fileName0;
+    ASSERT_FALSE(frame1.empty()) << "can't load " << fileName1;
+
+    Mat grayFrame;
+    if (openMode == IMREAD_COLOR)
+        cvtColor(frame0, grayFrame, COLOR_BGR2GRAY);
+    else
+        grayFrame = frame0;
+
+    vector<Point2f> pts, nextPts;
+    vector<unsigned char> status;
+    vector<float> err;
+    goodFeaturesToTrack(grayFrame, pts, pointsCount, 0.01, 0.0);
+
+    if (RUN_PLAIN_IMPL)
+    {
+        TEST_CYCLE()
+                cv::calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);

-        if (i == 0)
-        {
-            cvtColor(frame0, gray_frame, COLOR_BGR2GRAY);
-        }
+        SANITY_CHECK(nextPts);
+        SANITY_CHECK(status);
+        SANITY_CHECK(err);
+    }
+    else if (RUN_OCL_IMPL)
+    {
+        ocl::PyrLKOpticalFlow oclPyrLK;
+        ocl::oclMat oclFrame0(frame0), oclFrame1(frame1);
+        ocl::oclMat oclPts(1, static_cast<int>(pts.size()), CV_32FC2, (void *)&pts[0]);
+        ocl::oclMat oclNextPts, oclStatus, oclErr;

-        for (int points = Min_Size; points <= Max_Size; points *= Multiple)
-        {
-            if (i == 0)
-                SUBTEST << frame0.cols << "x" << frame0.rows << "; color; " << points << " points";
-            else
-                SUBTEST << frame0.cols << "x" << frame0.rows << "; gray; " << points << " points";
-            Mat ocl_nextPts;
-            Mat ocl_status;
-
-            vector<Point2f> pts;
-            goodFeaturesToTrack(i == 0 ? gray_frame : frame0, pts, points, 0.01, 0.0);
-
-            vector<Point2f> nextPts;
-            vector<unsigned char> status;
-
-            vector<float> err;
-
-            calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
-
-            CPU_ON;
-            calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
-            CPU_OFF;
-
-            ocl::PyrLKOpticalFlow d_pyrLK;
-
-            ocl::oclMat d_frame0(frame0);
-            ocl::oclMat d_frame1(frame1);
-
-            ocl::oclMat d_pts;
-            Mat pts_mat(1, (int)pts.size(), CV_32FC2, (void *)&pts[0]);
-            d_pts.upload(pts_mat);
-
-            ocl::oclMat d_nextPts;
-            ocl::oclMat d_status;
-            ocl::oclMat d_err;
-
-            WARMUP_ON;
-            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
-            WARMUP_OFF;
-
-            GPU_ON;
-            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            d_frame0.upload(frame0);
-            d_frame1.upload(frame1);
-            d_pts.upload(pts_mat);
-            d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
-
-            if (!d_nextPts.empty())
-                d_nextPts.download(ocl_nextPts);
-
-            if (!d_status.empty())
-                d_status.download(ocl_status);
-            GPU_FULL_OFF;
-
-            size_t mismatch = 0;
-            for (int i = 0; i < (int)nextPts.size(); ++i)
-            {
-                if(status[i] != ocl_status.at<unsigned char>(0, i))
-                {
-                    mismatch++;
-                    continue;
-                }
-                if(status[i])
-                {
-                    Point2f gpu_rst = ocl_nextPts.at<Point2f>(0, i);
-                    Point2f cpu_rst = nextPts[i];
-                    if(fabs(gpu_rst.x - cpu_rst.x) >= 1. || fabs(gpu_rst.y - cpu_rst.y) >= 1.)
-                        mismatch++;
-                }
-            }
-            double ratio = (double)mismatch / (double)nextPts.size();
-            if(ratio < .02)
-                TestSystem::instance().setAccurate(1, ratio);
-            else
-                TestSystem::instance().setAccurate(0, ratio);
-        }
+        TEST_CYCLE()
+                oclPyrLK.sparse(oclFrame0, oclFrame1, oclPts, oclNextPts, oclStatus, &oclErr);
+
+        MatToVector(oclNextPts, nextPts);
+        MatToVector(oclStatus, status);
+        MatToVector(oclErr, err);

+        SANITY_CHECK(nextPts);
+        SANITY_CHECK(status);
+        SANITY_CHECK(err);
    }
+    else
+        OCL_PERF_ELSE
 }

-
-PERFTEST(tvl1flow)
+PERF_TEST(tvl1flowFixture, tvl1flow)
 {
-    cv::Mat frame0 = imread("rubberwhale1.png", cv::IMREAD_GRAYSCALE);
-    assert(!frame0.empty());
+    Mat frame0 = imread(getDataPath("gpu/opticalflow/rubberwhale1.png"), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty()) << "can't load rubberwhale1.png";

-    cv::Mat frame1 = imread("rubberwhale2.png", cv::IMREAD_GRAYSCALE);
-    assert(!frame1.empty());
+    Mat frame1 = imread(getDataPath("gpu/opticalflow/rubberwhale2.png"), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty()) << "can't load rubberwhale2.png";

-    cv::ocl::OpticalFlowDual_TVL1_OCL d_alg;
-    cv::ocl::oclMat d_flowx(frame0.size(), CV_32FC1);
-    cv::ocl::oclMat d_flowy(frame1.size(), CV_32FC1);
+    const Size srcSize = frame0.size();
+    const double eps = 1.2;
+    Mat flow(srcSize, CV_32FC2), flow1(srcSize, CV_32FC1), flow2(srcSize, CV_32FC1);
+    declare.in(frame0, frame1).out(flow1, flow2).time(159);

-    cv::Ptr<cv::DenseOpticalFlow> alg = cv::createOptFlow_DualTVL1();
-    cv::Mat flow;
+    if (RUN_PLAIN_IMPL)
+    {
+        Ptr<DenseOpticalFlow> alg = createOptFlow_DualTVL1();

+        TEST_CYCLE() alg->calc(frame0, frame1, flow);

-    SUBTEST << frame0.cols << 'x' << frame0.rows << "; rubberwhale1.png; "<<frame1.cols<<'x'<<frame1.rows<<"; rubberwhale2.png";
+        alg->collectGarbage();
+        Mat flows[2] = { flow1, flow2 };
+        split(flow, flows);

-    alg->calc(frame0, frame1, flow);
+        SANITY_CHECK(flow1, eps);
+        SANITY_CHECK(flow2, eps);
+    }
+    else if (RUN_OCL_IMPL)
+    {
+        ocl::OpticalFlowDual_TVL1_OCL oclAlg;
+        ocl::oclMat oclFrame0(frame0), oclFrame1(frame1), oclFlow1(srcSize, CV_32FC1),
+                oclFlow2(srcSize, CV_32FC1);

-    CPU_ON;
-    alg->calc(frame0, frame1, flow);
-    CPU_OFF;
+        TEST_CYCLE() oclAlg(oclFrame0, oclFrame1, oclFlow1, oclFlow2);

-    cv::Mat gold[2];
-    cv::split(flow, gold);
+        oclAlg.collectGarbage();

-    cv::ocl::oclMat d0(frame0.size(), CV_32FC1);
-    d0.upload(frame0);
-    cv::ocl::oclMat d1(frame1.size(), CV_32FC1);
-    d1.upload(frame1);
+        oclFlow1.download(flow1);
+        oclFlow2.download(flow2);

-    WARMUP_ON;
-    d_alg(d0, d1, d_flowx, d_flowy);
-    WARMUP_OFF;
-    /*
-        double diff1 = 0.0, diff2 = 0.0;
-        if(ExceptedMatSimilar(gold[0], cv::Mat(d_flowx), 3e-3, diff1) == 1
-            &&ExceptedMatSimilar(gold[1], cv::Mat(d_flowy), 3e-3, diff2) == 1)
-            TestSystem::instance().setAccurate(1);
-        else
-            TestSystem::instance().setAccurate(0);
+        SANITY_CHECK(flow1, eps);
+        SANITY_CHECK(flow2, eps);
+    }
+    else
+        OCL_PERF_ELSE
+}

-        TestSystem::instance().setDiff(diff1);
-        TestSystem::instance().setDiff(diff2);
-    */
+///////////// FarnebackOpticalFlow ////////////////////////

+CV_ENUM(farneFlagType, 0, OPTFLOW_FARNEBACK_GAUSSIAN)

-    GPU_ON;
-    d_alg(d0, d1, d_flowx, d_flowy);
-    d_alg.collectGarbage();
-    GPU_OFF;
+typedef tuple<tuple<int, double>, farneFlagType, bool> FarnebackOpticalFlowParams;
+typedef TestBaseWithParam<FarnebackOpticalFlowParams> FarnebackOpticalFlowFixture;

+PERF_TEST_P(FarnebackOpticalFlowFixture, FarnebackOpticalFlow,
+            ::testing::Combine(
+                ::testing::Values(make_tuple<int, double>(5, 1.1),
+                                  make_tuple<int, double>(7, 1.5)),
+                farneFlagType::all(),
+                ::testing::Bool()))
+{
+    Mat frame0 = imread(getDataPath("gpu/opticalflow/rubberwhale1.png"), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty()) << "can't load rubberwhale1.png";

-    cv::Mat flowx, flowy;
+    Mat frame1 = imread(getDataPath("gpu/opticalflow/rubberwhale2.png"), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty()) << "can't load rubberwhale2.png";

-    GPU_FULL_ON;
-    d0.upload(frame0);
-    d1.upload(frame1);
-    d_alg(d0, d1, d_flowx, d_flowy);
-    d_alg.collectGarbage();
-    d_flowx.download(flowx);
-    d_flowy.download(flowy);
-    GPU_FULL_OFF;
+    const Size srcSize = frame0.size();

-    TestSystem::instance().ExceptedMatSimilar(gold[0], flowx, 3e-3);
-    TestSystem::instance().ExceptedMatSimilar(gold[1], flowy, 3e-3);
-}
+    const FarnebackOpticalFlowParams params = GetParam();
+    const tuple<int, double> polyParams = get<0>(params);
+    const int polyN = get<0>(polyParams), flags = get<1>(params);
+    const double polySigma = get<1>(polyParams), pyrScale = 0.5;
+    const bool useInitFlow = get<2>(params);
+    const double eps = 1.5;

-///////////// FarnebackOpticalFlow ////////////////////////
-PERFTEST(FarnebackOpticalFlow)
-{
-    cv::Mat frame0 = imread("rubberwhale1.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame0.empty());
+    Mat flowx(srcSize, CV_32FC1), flowy(srcSize, CV_32FC1), flow(srcSize, CV_32FC2);
+    declare.in(frame0, frame1).out(flowx, flowy);

-    cv::Mat frame1 = imread("rubberwhale2.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame1.empty());
+    ocl::FarnebackOpticalFlow farn;
+    farn.pyrScale = pyrScale;
+    farn.polyN = polyN;
+    farn.polySigma = polySigma;
+    farn.flags = flags;

-    cv::ocl::oclMat d_frame0(frame0), d_frame1(frame1);
+    if (RUN_PLAIN_IMPL)
+    {
+        if (useInitFlow)
+        {
+            calcOpticalFlowFarneback(
+                        frame0, frame1, flow, farn.pyrScale, farn.numLevels, farn.winSize,
+                        farn.numIters, farn.polyN, farn.polySigma, farn.flags);
+            farn.flags |= OPTFLOW_USE_INITIAL_FLOW;
+        }

-    int polyNs[2] = { 5, 7 };
-    double polySigmas[2] = { 1.1, 1.5 };
-    int farneFlags[2] = { 0, cv::OPTFLOW_FARNEBACK_GAUSSIAN };
-    bool UseInitFlows[2] = { false, true };
-    double pyrScale = 0.5;
+        TEST_CYCLE()
+                calcOpticalFlowFarneback(
+                    frame0, frame1, flow, farn.pyrScale, farn.numLevels, farn.winSize,
+                    farn.numIters, farn.polyN, farn.polySigma, farn.flags);

-    string farneFlagStrs[2] = { "BoxFilter", "GaussianBlur" };
-    string useInitFlowStrs[2] = { "", "UseInitFlow" };
+        Mat flowxy[2] = { flowx, flowy };
+        split(flow, flowxy);

-    for ( int i = 0; i < 2; ++i)
+        SANITY_CHECK(flowx, eps);
+        SANITY_CHECK(flowy, eps);
+    }
+    else if (RUN_OCL_IMPL)
    {
-        int polyN = polyNs[i];
-        double polySigma = polySigmas[i];
+        ocl::oclMat oclFrame0(frame0), oclFrame1(frame1),
+                oclFlowx(srcSize, CV_32FC1), oclFlowy(srcSize, CV_32FC1);

-        for ( int j = 0; j < 2; ++j)
+        if (useInitFlow)
        {
-            int flags = farneFlags[j];
-
-            for ( int k = 0; k < 2; ++k)
-            {
-                bool useInitFlow = UseInitFlows[k];
-                SUBTEST << "polyN(" << polyN << "); " << farneFlagStrs[j] << "; " << useInitFlowStrs[k];
-
-                cv::ocl::FarnebackOpticalFlow farn;
-                farn.pyrScale = pyrScale;
-                farn.polyN = polyN;
-                farn.polySigma = polySigma;
-                farn.flags = flags;
-
-                cv::ocl::oclMat d_flowx, d_flowy;
-                cv::Mat flow, flowBuf, flowxBuf, flowyBuf;
-
-                WARMUP_ON;
-                farn(d_frame0, d_frame1, d_flowx, d_flowy);
-
-                if (useInitFlow)
-                {
-                    cv::Mat flowxy[] = {cv::Mat(d_flowx), cv::Mat(d_flowy)};
-                    cv::merge(flowxy, 2, flow);
-                    flow.copyTo(flowBuf);
-                    flowxy[0].copyTo(flowxBuf);
-                    flowxy[1].copyTo(flowyBuf);
-
-                    farn.flags |= cv::OPTFLOW_USE_INITIAL_FLOW;
-                    farn(d_frame0, d_frame1, d_flowx, d_flowy);
-                }
-                WARMUP_OFF;
-
-                cv::calcOpticalFlowFarneback(
-                    frame0, frame1, flow, farn.pyrScale, farn.numLevels, farn.winSize,
-                    farn.numIters, farn.polyN, farn.polySigma, farn.flags);
-
-                std::vector<cv::Mat> flowxy;
-                cv::split(flow, flowxy);
+            farn(oclFrame0, oclFrame1, oclFlowx, oclFlowy);
+            farn.flags |= OPTFLOW_USE_INITIAL_FLOW;
+        }

-                Mat md_flowx = cv::Mat(d_flowx);
-                Mat md_flowy = cv::Mat(d_flowy);
-                TestSystem::instance().ExceptedMatSimilar(flowxy[0], md_flowx, 0.1);
-                TestSystem::instance().ExceptedMatSimilar(flowxy[1], md_flowy, 0.1);
+        TEST_CYCLE()
+                farn(oclFrame0, oclFrame1, oclFlowx, oclFlowy);

-                if (useInitFlow)
-                {
-                    cv::Mat flowx, flowy;
-                    farn.flags = (flags | cv::OPTFLOW_USE_INITIAL_FLOW);
+        oclFlowx.download(flowx);
+        oclFlowy.download(flowy);

-                    CPU_ON;
-                    cv::calcOpticalFlowFarneback(
-                        frame0, frame1, flowBuf, farn.pyrScale, farn.numLevels, farn.winSize,
-                        farn.numIters, farn.polyN, farn.polySigma, farn.flags);
-                    CPU_OFF;
-
-                    GPU_ON;
-                    farn(d_frame0, d_frame1, d_flowx, d_flowy);
-                    GPU_OFF;
-
-                    GPU_FULL_ON;
-                    d_frame0.upload(frame0);
-                    d_frame1.upload(frame1);
-                    d_flowx.upload(flowxBuf);
-                    d_flowy.upload(flowyBuf);
-                    farn(d_frame0, d_frame1, d_flowx, d_flowy);
-                    d_flowx.download(flowx);
-                    d_flowy.download(flowy);
-                    GPU_FULL_OFF;
-                }
-                else
-                {
-                    cv::Mat flow, flowx, flowy;
-                    cv::ocl::oclMat d_flowx, d_flowy;
-
-                    farn.flags = flags;
-
-                    CPU_ON;
-                    cv::calcOpticalFlowFarneback(
-                        frame0, frame1, flow, farn.pyrScale, farn.numLevels, farn.winSize,
-                        farn.numIters, farn.polyN, farn.polySigma, farn.flags);
-                    CPU_OFF;
-
-                    GPU_ON;
-                    farn(d_frame0, d_frame1, d_flowx, d_flowy);
-                    GPU_OFF;
-
-                    GPU_FULL_ON;
-                    d_frame0.upload(frame0);
-                    d_frame1.upload(frame1);
-                    farn(d_frame0, d_frame1, d_flowx, d_flowy);
-                    d_flowx.download(flowx);
-                    d_flowy.download(flowy);
-                    GPU_FULL_OFF;
-                }
-            }
-        }
+        SANITY_CHECK(flowx, eps);
+        SANITY_CHECK(flowy, eps);
    }
+    else
+        OCL_PERF_ELSE
 }
--- a/modules/ocl/perf/perf_precomp.cpp
+++ b/modules/ocl/perf/perf_precomp.cpp
@ -41,443 +41,3 @@
 //M*/

 #include "perf_precomp.hpp"
-#if GTEST_OS_WINDOWS
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-# include <windows.h>
-#endif
-
-// This program test most of the functions in ocl module and generate data metrix of x-factor in .csv files
-// All images needed in this test are in samples/gpu folder.
-// For haar template, haarcascade_frontalface_alt.xml shouold be in working directory
-void TestSystem::run()
-{
-    if (is_list_mode_)
-    {
-        for (vector<Runnable *>::iterator it = tests_.begin(); it != tests_.end(); ++it)
-        {
-            cout << (*it)->name() << endl;
-        }
-
-        return;
-    }
-
-    // Run test initializers
-    for (vector<Runnable *>::iterator it = inits_.begin(); it != inits_.end(); ++it)
-    {
-        if ((*it)->name().find(test_filter_, 0) != string::npos)
-        {
-            (*it)->run();
-        }
-    }
-
-    printHeading();
-    writeHeading();
-
-    // Run tests
-    for (vector<Runnable *>::iterator it = tests_.begin(); it != tests_.end(); ++it)
-    {
-        try
-        {
-            if ((*it)->name().find(test_filter_, 0) != string::npos)
-            {
-                cout << endl << (*it)->name() << ":\n";
-
-                setCurrentTest((*it)->name());
-                //fprintf(record_,"%s\n",(*it)->name().c_str());
-
-                (*it)->run();
-                finishCurrentSubtest();
-            }
-        }
-        catch (const Exception &)
-        {
-            // Message is printed via callback
-            resetCurrentSubtest();
-        }
-        catch (const runtime_error &e)
-        {
-            printError(e.what());
-            resetCurrentSubtest();
-        }
-    }
-
-    printSummary();
-    writeSummary();
-}
-
-
-void TestSystem::finishCurrentSubtest()
-{
-    if (cur_subtest_is_empty_)
-        // There is no need to print subtest statistics
-    {
-        return;
-    }
-
-    double cpu_time = cpu_elapsed_ / getTickFrequency() * 1000.0;
-    double gpu_time = gpu_elapsed_ / getTickFrequency() * 1000.0;
-    double gpu_full_time = gpu_full_elapsed_ / getTickFrequency() * 1000.0;
-
-    double speedup = static_cast<double>(cpu_elapsed_) / std::max(1.0, gpu_elapsed_);
-    speedup_total_ += speedup;
-
-    double fullspeedup = static_cast<double>(cpu_elapsed_) / std::max(1.0, gpu_full_elapsed_);
-    speedup_full_total_ += fullspeedup;
-
-    if (speedup > top_)
-    {
-        speedup_faster_count_++;
-    }
-    else if (speedup < bottom_)
-    {
-        speedup_slower_count_++;
-    }
-    else
-    {
-        speedup_equal_count_++;
-    }
-
-    if (fullspeedup > top_)
-    {
-        speedup_full_faster_count_++;
-    }
-    else if (fullspeedup < bottom_)
-    {
-        speedup_full_slower_count_++;
-    }
-    else
-    {
-        speedup_full_equal_count_++;
-    }
-
-    // compute min, max and
-    std::sort(gpu_times_.begin(), gpu_times_.end());
-    double gpu_min = gpu_times_.front() / getTickFrequency() * 1000.0;
-    double gpu_max = gpu_times_.back() / getTickFrequency() * 1000.0;
-    double deviation = 0;
-
-    if (gpu_times_.size() > 1)
-    {
-        double sum = 0;
-
-        for (size_t i = 0; i < gpu_times_.size(); i++)
-        {
-            int64 diff = gpu_times_[i] - static_cast<int64>(gpu_elapsed_);
-            double diff_time = diff * 1000 / getTickFrequency();
-            sum += diff_time * diff_time;
-        }
-
-        deviation = std::sqrt(sum / gpu_times_.size());
-    }
-
-    printMetrics(is_accurate_, cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup);
-    writeMetrics(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup, gpu_min, gpu_max, deviation);
-
-    num_subtests_called_++;
-    resetCurrentSubtest();
-}
-
-
-double TestSystem::meanTime(const vector<int64> &samples)
-{
-    double sum = accumulate(samples.begin(), samples.end(), 0.);
-    return sum / samples.size();
-}
-
-
-void TestSystem::printHeading()
-{
-    cout << endl;
-    cout<< setiosflags(ios_base::left);
-
-#if 0
-    cout<<TAB<<setw(7)<< "Accu." << setw(10) << "CPU (ms)" << setw(10) << "GPU, ms"
-        << setw(8) << "Speedup"<< setw(10)<<"GPUTotal" << setw(10) << "Total"
-        << "Description\n";
-    cout<<TAB<<setw(7)<<""<<setw(10)<<""<<setw(10)<<""<<setw(8)<<""<<setw(10)<<"(ms)"<<setw(10)<<"Speedup\n";
-#endif
-
-    cout<<TAB<< setw(10) << "CPU (ms)" << setw(10) << "GPU, ms"
-        << setw(8) << "Speedup"<< setw(10)<<"GPUTotal" << setw(10) << "Total"
-        << "Description\n";
-    cout<<TAB<<setw(10)<<""<<setw(10)<<""<<setw(8)<<""<<setw(10)<<"(ms)"<<setw(10)<<"Speedup\n";
-
-    cout << resetiosflags(ios_base::left);
-}
-
-void TestSystem::writeHeading()
-{
-    if (!record_)
-    {
-        recordname_ += "_OCL.csv";
-        record_ = fopen(recordname_.c_str(), "w");
-        if(record_ == NULL)
-        {
-            cout<<".csv file open failed.\n";
-            exit(0);
-        }
-    }
-
-    fprintf(record_, "NAME,DESCRIPTION,ACCURACY,DIFFERENCE,CPU (ms),GPU (ms),SPEEDUP,GPUTOTAL (ms),TOTALSPEEDUP,GPU Min (ms),GPU Max (ms), Standard deviation (ms)\n");
-
-    fflush(record_);
-}
-
-void TestSystem::printSummary()
-{
-    cout << setiosflags(ios_base::fixed);
-    cout << "\naverage GPU speedup: x"
-        << setprecision(3) << speedup_total_ / std::max(1, num_subtests_called_)
-        << endl;
-    cout << "\nGPU exceeded: "
-        << setprecision(3) << speedup_faster_count_
-        << "\nGPU passed: "
-        << setprecision(3) << speedup_equal_count_
-        << "\nGPU failed: "
-        << setprecision(3) << speedup_slower_count_
-        << endl;
-    cout << "\nGPU exceeded rate: "
-        << setprecision(3) << (float)speedup_faster_count_ / std::max(1, num_subtests_called_) * 100
-        << "%"
-        << "\nGPU passed rate: "
-        << setprecision(3) << (float)speedup_equal_count_ / std::max(1, num_subtests_called_) * 100
-        << "%"
-        << "\nGPU failed rate: "
-        << setprecision(3) << (float)speedup_slower_count_ / std::max(1, num_subtests_called_) * 100
-        << "%"
-        << endl;
-    cout << "\naverage GPUTOTAL speedup: x"
-        << setprecision(3) << speedup_full_total_ / std::max(1, num_subtests_called_)
-        << endl;
-    cout << "\nGPUTOTAL exceeded: "
-        << setprecision(3) << speedup_full_faster_count_
-        << "\nGPUTOTAL passed: "
-        << setprecision(3) << speedup_full_equal_count_
-        << "\nGPUTOTAL failed: "
-        << setprecision(3) << speedup_full_slower_count_
-        << endl;
-    cout << "\nGPUTOTAL exceeded rate: "
-        << setprecision(3) << (float)speedup_full_faster_count_ / std::max(1, num_subtests_called_) * 100
-        << "%"
-        << "\nGPUTOTAL passed rate: "
-        << setprecision(3) << (float)speedup_full_equal_count_ / std::max(1, num_subtests_called_) * 100
-        << "%"
-        << "\nGPUTOTAL failed rate: "
-        << setprecision(3) << (float)speedup_full_slower_count_ / std::max(1, num_subtests_called_) * 100
-        << "%"
-        << endl;
-    cout << resetiosflags(ios_base::fixed);
-}
-
-
-enum GTestColor {
-    COLOR_DEFAULT,
-    COLOR_RED,
-    COLOR_GREEN,
-    COLOR_YELLOW
-};
-#if GTEST_OS_WINDOWS&&!GTEST_OS_WINDOWS_MOBILE
-// Returns the character attribute for the given color.
-static WORD GetColorAttribute(GTestColor color) {
-    switch (color) {
-    case COLOR_RED:    return FOREGROUND_RED;
-    case COLOR_GREEN:  return FOREGROUND_GREEN;
-    case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN;
-    default:           return 0;
-    }
-}
-#else
-static const char* GetAnsiColorCode(GTestColor color) {
-    switch (color) {
-    case COLOR_RED:     return "1";
-    case COLOR_GREEN:   return "2";
-    case COLOR_YELLOW:  return "3";
-    default:            return NULL;
-    };
-}
-#endif
-
-static void printMetricsUti(double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup, std::stringstream& stream, std::stringstream& cur_subtest_description)
-{
-    //cout <<TAB<< setw(7) << stream.str();
-    cout <<TAB;
-
-    stream.str("");
-    stream << cpu_time;
-    cout << setw(10) << stream.str();
-
-    stream.str("");
-    stream << gpu_time;
-    cout << setw(10) << stream.str();
-
-    stream.str("");
-    stream << "x" << setprecision(3) << speedup;
-    cout << setw(8) << stream.str();
-
-    stream.str("");
-    stream << gpu_full_time;
-    cout << setw(10) << stream.str();
-
-    stream.str("");
-    stream << "x" << setprecision(3) << fullspeedup;
-    cout << setw(10) << stream.str();
-
-    cout << cur_subtest_description.str();
-    cout << resetiosflags(ios_base::left) << endl;
-}
-
-void TestSystem::printMetrics(int is_accurate, double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup)
-{
-    cout << setiosflags(ios_base::left);
-    stringstream stream;
-
-    std::stringstream &cur_subtest_description = getCurSubtestDescription();
-
-#if GTEST_OS_WINDOWS&&!GTEST_OS_WINDOWS_MOBILE
-
-    WORD color;
-    const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
-    // Gets the current text color.
-    CONSOLE_SCREEN_BUFFER_INFO buffer_info;
-    GetConsoleScreenBufferInfo(stdout_handle, &buffer_info);
-    const WORD old_color_attrs = buffer_info.wAttributes;
-    // We need to flush the stream buffers into the console before each
-    // SetConsoleTextAttribute call lest it affect the text that is already
-    // printed but has not yet reached the console.
-    fflush(stdout);
-
-    if(is_accurate == 1||is_accurate == -1)
-    {
-        color = old_color_attrs;
-        printMetricsUti(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup, stream, cur_subtest_description);
-
-    }else
-    {
-        color = GetColorAttribute(COLOR_RED);
-        SetConsoleTextAttribute(stdout_handle,
-            color| FOREGROUND_INTENSITY);
-
-        printMetricsUti(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup, stream, cur_subtest_description);
-        fflush(stdout);
-        // Restores the text color.
-        SetConsoleTextAttribute(stdout_handle, old_color_attrs);
-    }
-#else
-    GTestColor color = COLOR_RED;
-    if(is_accurate == 1|| is_accurate == -1)
-    {
-        printMetricsUti(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup, stream, cur_subtest_description);
-
-    }else
-    {
-        printf("\033[0;3%sm", GetAnsiColorCode(color));
-        printMetricsUti(cpu_time, gpu_time, gpu_full_time, speedup, fullspeedup, stream, cur_subtest_description);
-        printf("\033[m");  // Resets the terminal to default.
-    }
-#endif
-}
-
-void TestSystem::writeMetrics(double cpu_time, double gpu_time, double gpu_full_time, double speedup, double fullspeedup, double gpu_min, double gpu_max, double std_dev)
-{
-    if (!record_)
-    {
-        recordname_ += ".csv";
-        record_ = fopen(recordname_.c_str(), "w");
-    }
-
-    string _is_accurate_;
-
-    if(is_accurate_ == 1)
-        _is_accurate_ = "Pass";
-    else if(is_accurate_ == 0)
-        _is_accurate_ = "Fail";
-    else if(is_accurate_ == -1)
-        _is_accurate_ = " ";
-    else
-    {
-        std::cout<<"is_accurate errer: "<<is_accurate_<<"\n";
-        exit(-1);
-    }
-
-    fprintf(record_, "%s,%s,%s,%.2f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f\n",
-        itname_changed_ ? itname_.c_str() : "",
-        cur_subtest_description_.str().c_str(),
-        _is_accurate_.c_str(),
-        accurate_diff_,
-        cpu_time, gpu_time, speedup, gpu_full_time, fullspeedup,
-        gpu_min, gpu_max, std_dev);
-
-    if (itname_changed_)
-    {
-        itname_changed_ = false;
-    }
-
-    fflush(record_);
-}
-
-void TestSystem::writeSummary()
-{
-    if (!record_)
-    {
-        recordname_ += ".csv";
-        record_ = fopen(recordname_.c_str(), "w");
-    }
-
-    fprintf(record_, "\nAverage GPU speedup: %.3f\n"
-        "exceeded: %d (%.3f%%)\n"
-        "passed: %d (%.3f%%)\n"
-        "failed: %d (%.3f%%)\n"
-        "\nAverage GPUTOTAL speedup: %.3f\n"
-        "exceeded: %d (%.3f%%)\n"
-        "passed: %d (%.3f%%)\n"
-        "failed: %d (%.3f%%)\n",
-        speedup_total_ / std::max(1, num_subtests_called_),
-        speedup_faster_count_, (float)speedup_faster_count_ / std::max(1, num_subtests_called_) * 100,
-        speedup_equal_count_, (float)speedup_equal_count_ / std::max(1, num_subtests_called_) * 100,
-        speedup_slower_count_, (float)speedup_slower_count_ / std::max(1, num_subtests_called_) * 100,
-        speedup_full_total_ / std::max(1, num_subtests_called_),
-        speedup_full_faster_count_, (float)speedup_full_faster_count_ / std::max(1, num_subtests_called_) * 100,
-        speedup_full_equal_count_, (float)speedup_full_equal_count_ / std::max(1, num_subtests_called_) * 100,
-        speedup_full_slower_count_, (float)speedup_full_slower_count_ / std::max(1, num_subtests_called_) * 100
-        );
-    fflush(record_);
-}
-
-void TestSystem::printError(const std::string &msg)
-{
-    if(msg != "CL_INVALID_BUFFER_SIZE")
-    {
-        cout << TAB << "[error: " << msg << "] " << cur_subtest_description_.str() << endl;
-    }
-}
-
-void gen(Mat &mat, int rows, int cols, int type, Scalar low, Scalar high)
-{
-    mat.create(rows, cols, type);
-    RNG rng(0);
-    rng.fill(mat, RNG::UNIFORM, low, high);
-}
-
-string abspath(const string &relpath)
-{
-    return TestSystem::instance().workingDir() + relpath;
-}
-
-double checkNorm(const Mat &m)
-{
-    return norm(m, NORM_INF);
-}
-
-double checkNorm(const Mat &m1, const Mat &m2)
-{
-    return norm(m1, m2, NORM_INF);
-}
-
-double checkSimilarity(const Mat &m1, const Mat &m2)
-{
-    Mat diff;
-    matchTemplate(m1, m2, diff, TM_CCORR_NORMED);
-    return std::abs(diff.at<float>(0, 0) - 1.f);
-}
--- a/modules/ocl/perf/perf_precomp.hpp
+++ b/modules/ocl/perf/perf_precomp.hpp
@ -40,6 +40,14 @@
 //
 //M*/

+#ifdef __GNUC__
+#  pragma GCC diagnostic ignored "-Wmissing-declarations"
+#  if defined __clang__ || defined __APPLE__
+#    pragma GCC diagnostic ignored "-Wmissing-prototypes"
+#    pragma GCC diagnostic ignored "-Wextra"
+#  endif
+#endif
+
 #ifndef __OPENCV_PERF_PRECOMP_HPP__
 #define __OPENCV_PERF_PRECOMP_HPP__

@ -58,7 +66,9 @@
 #include <cstdio>
 #include <vector>
 #include <numeric>
+
 #include "opencv2/core.hpp"
+#include "opencv2/core/utility.hpp"
 #include "opencv2/imgproc.hpp"
 #include "opencv2/highgui.hpp"
 #include "opencv2/calib3d.hpp"
@ -67,456 +77,38 @@
 #include "opencv2/features2d.hpp"
 #include "opencv2/ocl.hpp"
 #include "opencv2/ts.hpp"
-#include "opencv2/ts/ts_perf.hpp"
-#include "opencv2/ts/ts_gtest.h"
-
-#include "opencv2/core/utility.hpp"
-
-#define Min_Size 1000
-#define Max_Size 4000
-#define Multiple 2
-#define TAB "    "

 using namespace std;
 using namespace cv;

-void gen(Mat &mat, int rows, int cols, int type, Scalar low, Scalar high);
-void gen(Mat &mat, int rows, int cols, int type, int low, int high, int n);
-
-string abspath(const string &relpath);
-
-typedef struct
-{
-    short x;
-    short y;
-} COOR;
-COOR do_meanShift(int x0, int y0, uchar *sptr, uchar *dptr, int sstep,
-                  cv::Size size, int sp, int sr, int maxIter, float eps, int *tab);
-void meanShiftProc_(const Mat &src_roi, Mat &dst_roi, Mat &dstCoor_roi,
-                    int sp, int sr, cv::TermCriteria crit);
-
-
-template<class T1, class T2>
-int ExpectedEQ(T1 expected, T2 actual)
-{
-    if(expected == actual)
-        return 1;
-
-    return 0;
-}
-
-template<class T1>
-int EeceptDoubleEQ(T1 expected, T1 actual)
-{
-    testing::internal::Double lhs(expected);
-    testing::internal::Double rhs(actual);
-
-    if (lhs.AlmostEquals(rhs))
-    {
-        return 1;
-    }
-
-    return 0;
-}
-
-template<class T>
-int AssertEQ(T expected, T actual)
-{
-    if(expected == actual)
-    {
-        return 1;
-    }
-    return 0;
-}
-
-int ExceptDoubleNear(double val1, double val2, double abs_error);
-bool match_rect(cv::Rect r1, cv::Rect r2, int threshold);
-
-double checkNorm(const cv::Mat &m);
-double checkNorm(const cv::Mat &m1, const cv::Mat &m2);
-double checkSimilarity(const cv::Mat &m1, const cv::Mat &m2);
-
-int ExpectedMatNear(cv::Mat dst, cv::Mat cpu_dst, double eps);
-int ExceptedMatSimilar(cv::Mat dst, cv::Mat cpu_dst, double eps);
-
-class Runnable
-{
-public:
-    explicit Runnable(const std::string &runname): name_(runname) {}
-    virtual ~Runnable() {}
-
-    const std::string &name() const
-    {
-        return name_;
-    }
-
-    virtual void run() = 0;
-
-private:
-    std::string name_;
-};
-
-class TestSystem
-{
-public:
-    static TestSystem &instance()
-    {
-        static TestSystem me;
-        return me;
-    }
-
-    void setWorkingDir(const std::string &val)
-    {
-        working_dir_ = val;
-    }
-    const std::string &workingDir() const
-    {
-        return working_dir_;
-    }
-
-    void setTestFilter(const std::string &val)
-    {
-        test_filter_ = val;
-    }
-    const std::string &testFilter() const
-    {
-        return test_filter_;
-    }
-
-    void setNumIters(int num_iters)
-    {
-        num_iters_ = num_iters;
-    }
-    void setGPUWarmupIters(int num_iters)
-    {
-        gpu_warmup_iters_ = num_iters;
-    }
-    void setCPUIters(int num_iters)
-    {
-        cpu_num_iters_ = num_iters;
-    }
-
-    void setTopThreshold(double top)
-    {
-        top_ = top;
-    }
-    void setBottomThreshold(double bottom)
-    {
-        bottom_ = bottom;
-    }
-
-    void addInit(Runnable *init)
-    {
-        inits_.push_back(init);
-    }
-    void addTest(Runnable *test)
-    {
-        tests_.push_back(test);
-    }
-    void run();
-
-    // It's public because OpenCV callback uses it
-    void printError(const std::string &msg);
-
-    std::stringstream &startNewSubtest()
-    {
-        finishCurrentSubtest();
-        return cur_subtest_description_;
-    }
-
-    bool stop() const
-    {
-        return cur_iter_idx_ >= num_iters_;
-    }
+#define OCL_SIZE_1000 Size(1000, 1000)
+#define OCL_SIZE_2000 Size(2000, 2000)
+#define OCL_SIZE_4000 Size(4000, 4000)

-    bool cpu_stop() const
-    {
-        return cur_iter_idx_ >= cpu_num_iters_;
-    }
+#define OCL_TYPICAL_MAT_SIZES ::testing::Values(OCL_SIZE_1000, OCL_SIZE_2000, OCL_SIZE_4000)

-    int get_cur_iter_idx()
-    {
-        return cur_iter_idx_;
-    }
+#define OCL_PERF_ENUM(type, ...) ::testing::Values(type, ## __VA_ARGS__ )

-    int get_cpu_num_iters()
-    {
-        return cpu_num_iters_;
-    }
+#define IMPL_OCL "ocl"
+#define IMPL_GPU "gpu"
+#define IMPL_PLAIN "plain"

-    bool warmupStop()
-    {
-        return cur_warmup_idx_++ >= gpu_warmup_iters_;
-    }
+#define RUN_OCL_IMPL (IMPL_OCL == getSelectedImpl())
+#define RUN_PLAIN_IMPL (IMPL_PLAIN == getSelectedImpl())

-    void warmupComplete()
-    {
-        cur_warmup_idx_ = 0;
-    }
-
-    void cpuOn()
-    {
-        cpu_started_ = cv::getTickCount();
-    }
-    void cpuOff()
-    {
-        int64 delta = cv::getTickCount() - cpu_started_;
-        cpu_times_.push_back(delta);
-        ++cur_iter_idx_;
-    }
-    void cpuComplete()
-    {
-        cpu_elapsed_ += meanTime(cpu_times_);
-        cur_subtest_is_empty_ = false;
-        cur_iter_idx_ = 0;
-    }
-
-    void gpuOn()
-    {
-        gpu_started_ = cv::getTickCount();
-    }
-    void gpuOff()
-    {
-        int64 delta = cv::getTickCount() - gpu_started_;
-        gpu_times_.push_back(delta);
-        ++cur_iter_idx_;
-    }
-    void gpuComplete()
-    {
-        gpu_elapsed_ += meanTime(gpu_times_);
-        cur_subtest_is_empty_ = false;
-        cur_iter_idx_ = 0;
-    }
-
-    void gpufullOn()
-    {
-        gpu_full_started_ = cv::getTickCount();
-    }
-    void gpufullOff()
-    {
-        int64 delta = cv::getTickCount() - gpu_full_started_;
-        gpu_full_times_.push_back(delta);
-        ++cur_iter_idx_;
-    }
-    void gpufullComplete()
-    {
-        gpu_full_elapsed_ += meanTime(gpu_full_times_);
-        cur_subtest_is_empty_ = false;
-        cur_iter_idx_ = 0;
-    }
-
-    bool isListMode() const
-    {
-        return is_list_mode_;
-    }
-    void setListMode(bool value)
-    {
-        is_list_mode_ = value;
-    }
-
-    void setRecordName(const std::string &name)
-    {
-        recordname_ = name;
-    }
-
-    void setCurrentTest(const std::string &name)
-    {
-        itname_ = name;
-        itname_changed_ = true;
-    }
-
-    void setAccurate(int accurate, double diff)
-    {
-        is_accurate_ = accurate;
-        accurate_diff_ = diff;
-    }
-
-    void ExpectMatsNear(vector<Mat>& dst, vector<Mat>& cpu_dst, vector<double>& eps)
-    {
-        assert(dst.size() == cpu_dst.size());
-        assert(cpu_dst.size() == eps.size());
-        is_accurate_ = 1;
-        for(size_t i=0; i<dst.size(); i++)
-        {
-            double cur_diff = checkNorm(dst[i], cpu_dst[i]);
-            accurate_diff_ = max(accurate_diff_, cur_diff);
-            if(cur_diff > eps[i])
-                is_accurate_ = 0;
-        }
-    }
-
-    void ExpectedMatNear(cv::Mat& dst, cv::Mat& cpu_dst, double eps)
-    {
-        assert(dst.type() == cpu_dst.type());
-        assert(dst.size() == cpu_dst.size());
-        accurate_diff_ = checkNorm(dst, cpu_dst);
-        if(accurate_diff_ <= eps)
-            is_accurate_ = 1;
-        else
-            is_accurate_ = 0;
-    }
-
-    void ExceptedMatSimilar(cv::Mat& dst, cv::Mat& cpu_dst, double eps)
-    {
-        assert(dst.type() == cpu_dst.type());
-        assert(dst.size() == cpu_dst.size());
-        accurate_diff_ = checkSimilarity(cpu_dst, dst);
-        if(accurate_diff_ <= eps)
-            is_accurate_ = 1;
-        else
-            is_accurate_ = 0;
-    }
-
-    std::stringstream &getCurSubtestDescription()
-    {
-        return cur_subtest_description_;
-    }
-
-private:
-    TestSystem():
-        cur_subtest_is_empty_(true), cpu_elapsed_(0),
-        gpu_elapsed_(0), gpu_full_elapsed_(0), speedup_total_(0.0),
-        num_subtests_called_(0),
-        speedup_faster_count_(0), speedup_slower_count_(0), speedup_equal_count_(0),
-        speedup_full_faster_count_(0), speedup_full_slower_count_(0), speedup_full_equal_count_(0), is_list_mode_(false),
-        num_iters_(10), cpu_num_iters_(2),
-        gpu_warmup_iters_(1), cur_iter_idx_(0), cur_warmup_idx_(0),
-        record_(0), recordname_("performance"), itname_changed_(true),
-        is_accurate_(-1), accurate_diff_(0.)
-    {
-        cpu_times_.reserve(num_iters_);
-        gpu_times_.reserve(num_iters_);
-        gpu_full_times_.reserve(num_iters_);
-    }
-
-    void finishCurrentSubtest();
-    void resetCurrentSubtest()
-    {
-        cpu_elapsed_ = 0;
-        gpu_elapsed_ = 0;
-        gpu_full_elapsed_ = 0;
-        cur_subtest_description_.str("");
-        cur_subtest_is_empty_ = true;
-        cur_iter_idx_ = 0;
-        cur_warmup_idx_ = 0;
-        cpu_times_.clear();
-        gpu_times_.clear();
-        gpu_full_times_.clear();
-        is_accurate_ = -1;
-        accurate_diff_ = 0.;
-    }
-
-    double meanTime(const std::vector<int64> &samples);
-
-    void printHeading();
-    void printSummary();
-    void printMetrics(int is_accurate, double cpu_time, double gpu_time = 0.0f, double gpu_full_time = 0.0f, double speedup = 0.0f, double fullspeedup = 0.0f);
-
-    void writeHeading();
-    void writeSummary();
-    void writeMetrics(double cpu_time, double gpu_time = 0.0f, double gpu_full_time = 0.0f,
-                      double speedup = 0.0f, double fullspeedup = 0.0f,
-                      double gpu_min = 0.0f, double gpu_max = 0.0f, double std_dev = 0.0f);
-
-    std::string working_dir_;
-    std::string test_filter_;
-
-    std::vector<Runnable *> inits_;
-    std::vector<Runnable *> tests_;
-
-    std::stringstream cur_subtest_description_;
-    bool cur_subtest_is_empty_;
-
-    int64 cpu_started_;
-    int64 gpu_started_;
-    int64 gpu_full_started_;
-    double cpu_elapsed_;
-    double gpu_elapsed_;
-    double gpu_full_elapsed_;
-
-    double speedup_total_;
-    double speedup_full_total_;
-    int num_subtests_called_;
-
-    int speedup_faster_count_;
-    int speedup_slower_count_;
-    int speedup_equal_count_;
-
-    int speedup_full_faster_count_;
-    int speedup_full_slower_count_;
-    int speedup_full_equal_count_;
-
-    bool is_list_mode_;
-
-    double top_;
-    double bottom_;
-
-    int num_iters_;
-    int cpu_num_iters_;     //there's no need to set cpu running same times with gpu
-    int gpu_warmup_iters_;  //gpu warm up times, default is 1
-    int cur_iter_idx_;
-    int cur_warmup_idx_;    //current gpu warm up times
-    std::vector<int64> cpu_times_;
-    std::vector<int64> gpu_times_;
-    std::vector<int64> gpu_full_times_;
-
-    FILE *record_;
-    std::string recordname_;
-    std::string itname_;
-    bool itname_changed_;
-
-    int is_accurate_;
-    double accurate_diff_;
-};
-
-
-#define GLOBAL_INIT(name) \
-struct name##_init: Runnable { \
-    name##_init(): Runnable(#name) { \
-    TestSystem::instance().addInit(this); \
-} \
-    void run(); \
-} name##_init_instance; \
-    void name##_init::run()
-
-
-#define PERFTEST(name) \
-struct name##_test: Runnable { \
-    name##_test(): Runnable(#name) { \
-    TestSystem::instance().addTest(this); \
-} \
-    void run(); \
-} name##_test_instance; \
-    void name##_test::run()
-
-#define SUBTEST TestSystem::instance().startNewSubtest()
-
-#define CPU_ON \
-    while (!TestSystem::instance().cpu_stop()) { \
-    TestSystem::instance().cpuOn()
-#define CPU_OFF \
-    TestSystem::instance().cpuOff(); \
-    } TestSystem::instance().cpuComplete()
-
-#define GPU_ON \
-    while (!TestSystem::instance().stop()) { \
-    TestSystem::instance().gpuOn()
-#define GPU_OFF \
-    ocl::finish(); \
-    TestSystem::instance().gpuOff(); \
-    } TestSystem::instance().gpuComplete()
-
-#define GPU_FULL_ON \
-    while (!TestSystem::instance().stop()) { \
-    TestSystem::instance().gpufullOn()
-#define GPU_FULL_OFF \
-    TestSystem::instance().gpufullOff(); \
-    } TestSystem::instance().gpufullComplete()
+#ifdef HAVE_OPENCV_GPU
+# define RUN_GPU_IMPL (IMPL_GPU == getSelectedImpl())
+#endif

-#define WARMUP_ON \
-    while (!TestSystem::instance().warmupStop()) {
-#define WARMUP_OFF \
-        ocl::finish(); \
-    } TestSystem::instance().warmupComplete()
+#ifdef HAVE_OPENCV_GPU
+#define OCL_PERF_ELSE               \
+        if (RUN_GPU_IMPL)          \
+            CV_TEST_FAIL_NO_IMPL(); \
+        else                        \
+            CV_TEST_FAIL_NO_IMPL();
+#else
+#define OCL_PERF_ELSE               \
+            CV_TEST_FAIL_NO_IMPL();
+#endif

 #endif
--- a/modules/ocl/perf/perf_pyramid.cpp
+++ b/modules/ocl/perf/perf_pyramid.cpp
@ -45,88 +45,80 @@
 //M*/
 #include "perf_precomp.hpp"

-///////////// pyrDown //////////////////////
-PERFTEST(pyrDown)
-{
-    Mat src, dst, ocl_dst;
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
+using namespace perf;
+using std::tr1::tuple;
+using std::tr1::get;

-    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
+///////////// pyrDown //////////////////////

-            gen(src, size, size, all_type[j], 0, 256);
+typedef Size_MatType pyrDownFixture;

-            pyrDown(src, dst);
+PERF_TEST_P(pyrDownFixture, pyrDown,
+            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
+                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);

-            CPU_ON;
-            pyrDown(src, dst);
-            CPU_OFF;
+    Mat src(srcSize, type), dst;
+    Size dstSize((srcSize.width + 1) >> 1, (srcSize.height + 1) >> 1); // Size is (width, height); each side halved, rounding up
+    dst.create(dstSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);

-            ocl::oclMat d_src(src);
-            ocl::oclMat d_dst;
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclSrc(src), oclDst(dstSize, type);

-            WARMUP_ON;
-            ocl::pyrDown(d_src, d_dst);
-            WARMUP_OFF;
+        TEST_CYCLE() ocl::pyrDown(oclSrc, oclDst);

-            GPU_ON;
-            ocl::pyrDown(d_src, d_dst);
-            GPU_OFF;
+        oclDst.download(dst); // copy the device result back so it can be sanity-checked

-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::pyrDown(d_src, d_dst);
-            d_dst.download(ocl_dst);
-            GPU_FULL_OFF;
+        SANITY_CHECK(dst);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        TEST_CYCLE() pyrDown(src, dst);

-            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, dst.depth() == CV_32F ? 1e-4f : 1.0f);
-        }
+        SANITY_CHECK(dst);
     }
+    else
+        OCL_PERF_ELSE
 }

 ///////////// pyrUp ////////////////////////
-PERFTEST(pyrUp)
-{
-    Mat src, dst, ocl_dst;
-    int all_type[] = {CV_8UC1, CV_8UC4};
-    std::string type_name[] = {"CV_8UC1", "CV_8UC4"};
-
-    for (int size = 500; size <= 2000; size *= 2)
-    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;

-            gen(src, size, size, all_type[j], 0, 256);
+typedef Size_MatType pyrUpFixture;

-            pyrUp(src, dst);
+PERF_TEST_P(pyrUpFixture, pyrUp,
+            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
+                               OCL_PERF_ENUM(CV_8UC1, CV_8UC4)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int type = get<1>(params);

-            CPU_ON;
-            pyrUp(src, dst);
-            CPU_OFF;
+    Mat src(srcSize, type), dst;
+    Size dstSize(srcSize.width << 1, srcSize.height << 1); // Size is (width, height); each side doubled
+    dst.create(dstSize, type);
+    declare.in(src, WARMUP_RNG).out(dst);

-            ocl::oclMat d_src(src);
-            ocl::oclMat d_dst;
+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclSrc(src), oclDst(dstSize, type);

-            WARMUP_ON;
-            ocl::pyrUp(d_src, d_dst);
-            WARMUP_OFF;
+        TEST_CYCLE() ocl::pyrUp(oclSrc, oclDst); // time pyrUp, not pyrDown

-            GPU_ON;
-            ocl::pyrUp(d_src, d_dst);
-            GPU_OFF;
+        oclDst.download(dst); // copy the device result back so it can be sanity-checked

-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::pyrUp(d_src, d_dst);
-            d_dst.download(ocl_dst);
-            GPU_FULL_OFF;
+        SANITY_CHECK(dst);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        TEST_CYCLE() pyrUp(src, dst); // time pyrUp, not pyrDown

-            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, (src.depth() == CV_32F ? 1e-4f : 1.0));
-        }
+        SANITY_CHECK(dst);
     }
+    else
+        OCL_PERF_ELSE
 }
--- a/modules/ocl/perf/perf_split_merge.cpp
+++ b/modules/ocl/perf/perf_split_merge.cpp
@ -45,110 +45,97 @@
 //M*/
 #include "perf_precomp.hpp"

+using namespace perf;
+using std::tr1::tuple;
+using std::tr1::get;
+
 ///////////// Merge////////////////////////
-PERFTEST(Merge)
-{
-    Mat dst, ocl_dst;
-    ocl::oclMat d_dst;

-    int channels = 4;
-    int all_type[] = {CV_8UC1, CV_32FC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
+typedef Size_MatType MergeFixture;

-    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+PERF_TEST_P(MergeFixture, Merge,
+            ::testing::Combine(::testing::Values(OCL_SIZE_1000, OCL_SIZE_2000),
+                               OCL_PERF_ENUM(CV_8U, CV_32F)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int depth = get<1>(params), channels = 3;
+
+    const int dstType = CV_MAKE_TYPE(depth, channels);
+    Mat dst(srcSize, dstType);
+    vector<Mat> src(channels);
+    for (vector<Mat>::iterator i = src.begin(), end = src.end(); i != end; ++i)
    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j] ;
-            Size size1 = Size(size, size);
-            std::vector<Mat> src(channels);
-
-            for (int i = 0; i < channels; ++i)
-            {
-                src[i] = Mat(size1, all_type[j], cv::Scalar::all(i));
-            }
-
-            merge(src, dst);
-
-            CPU_ON;
-            merge(src, dst);
-            CPU_OFF;
-
-            std::vector<ocl::oclMat> d_src(channels);
-
-            for (int i = 0; i < channels; ++i)
-            {
-                d_src[i] = ocl::oclMat(size1, all_type[j], cv::Scalar::all(i));
-            }
-
-            WARMUP_ON;
-            ocl::merge(d_src, d_dst);
-            WARMUP_OFF;
-
-            GPU_ON;
-            ocl::merge(d_src, d_dst);
-            GPU_OFF;
-
-            GPU_FULL_ON;
-            for (int i = 0; i < channels; ++i)
-            {
-                d_src[i] = ocl::oclMat(size1, all_type[j], cv::Scalar::all(i));
-            }
-            ocl::merge(d_src, d_dst);
-            d_dst.download(ocl_dst);
-            GPU_FULL_OFF;
-
-            TestSystem::instance().ExpectedMatNear(dst, ocl_dst, 0.0);
-        }
-
+        i->create(srcSize, CV_MAKE_TYPE(depth, 1));
+        declare.in(*i, WARMUP_RNG);
    }
-}
+    declare.out(dst);

-///////////// Split////////////////////////
-PERFTEST(Split)
-{
-    //int channels = 4;
-    int all_type[] = {CV_8UC1, CV_32FC1};
-    std::string type_name[] = {"CV_8UC1", "CV_32FC1"};
-
-    for (int size = Min_Size; size <= Max_Size; size *= Multiple)
+    if (RUN_OCL_IMPL)
    {
-        for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
-        {
-            SUBTEST << size << 'x' << size << "; " << type_name[j];
-            Size size1 = Size(size, size);
-
-            Mat src(size1, CV_MAKE_TYPE(all_type[j], 4), cv::Scalar(1, 2, 3, 4));
+        ocl::oclMat oclDst(srcSize, dstType);
+        vector<ocl::oclMat> oclSrc(src.size());
+        for (vector<ocl::oclMat>::size_type i = 0, end = src.size(); i < end; ++i)
+            oclSrc[i] = src[i];

-            std::vector<cv::Mat> dst, ocl_dst(4);
+        TEST_CYCLE() cv::ocl::merge(oclSrc, oclDst);

-            split(src, dst);
+        oclDst.download(dst);

-            CPU_ON;
-            split(src, dst);
-            CPU_OFF;
+        SANITY_CHECK(dst);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        TEST_CYCLE() cv::merge(src, dst);

-            ocl::oclMat d_src(size1, CV_MAKE_TYPE(all_type[j], 4), cv::Scalar(1, 2, 3, 4));
-            std::vector<cv::ocl::oclMat> d_dst;
+        SANITY_CHECK(dst);
+    }
+    else
+        OCL_PERF_ELSE
+}

-            WARMUP_ON;
-            ocl::split(d_src, d_dst);
-            WARMUP_OFF;
+///////////// Split////////////////////////

-            GPU_ON;
-            ocl::split(d_src, d_dst);
-            GPU_OFF;
+typedef Size_MatType SplitFixture;

-            GPU_FULL_ON;
-            d_src.upload(src);
-            ocl::split(d_src, d_dst);
-            for(size_t i = 0; i < dst.size(); i++)
-                d_dst[i].download(ocl_dst[i]);
-            GPU_FULL_OFF;
+PERF_TEST_P(SplitFixture, Split,
+            ::testing::Combine(OCL_TYPICAL_MAT_SIZES,
+                               OCL_PERF_ENUM(CV_8U, CV_32F)))
+{
+    const Size_MatType_t params = GetParam();
+    const Size srcSize = get<0>(params);
+    const int depth = get<1>(params), channels = 3;

-            vector<double> eps(4, 0.);
-            TestSystem::instance().ExpectMatsNear(dst, ocl_dst, eps);
-        }
+    Mat src(srcSize, CV_MAKE_TYPE(depth, channels));
+    declare.in(src, WARMUP_RNG);

+    if (RUN_OCL_IMPL)
+    {
+        ocl::oclMat oclSrc(src);
+        vector<ocl::oclMat> oclDst(channels); // allocate per plane: vector(n, prototype) copies would share one buffer
+        for (int c = 0; c < channels; ++c) oclDst[c].create(srcSize, CV_MAKE_TYPE(depth, 1));
+        TEST_CYCLE() cv::ocl::split(oclSrc, oclDst);
+
+        ASSERT_EQ(3, channels);
+        Mat dst0, dst1, dst2;
+        oclDst[0].download(dst0);
+        oclDst[1].download(dst1);
+        oclDst[2].download(dst2);
+        SANITY_CHECK(dst0);
+        SANITY_CHECK(dst1);
+        SANITY_CHECK(dst2);
+    }
+    else if (RUN_PLAIN_IMPL)
+    {
+        vector<Mat> dst(channels); // allocate per plane: vector(n, prototype) copies would share one buffer
+        for (int c = 0; c < channels; ++c) dst[c].create(srcSize, CV_MAKE_TYPE(depth, 1));
+        TEST_CYCLE() cv::split(src, dst);
+        ASSERT_EQ(3, channels);
+        Mat & dst0 = dst[0], & dst1 = dst[1], & dst2 = dst[2];
+        SANITY_CHECK(dst0);
+        SANITY_CHECK(dst1);
+        SANITY_CHECK(dst2);
     }
+    else
+        OCL_PERF_ELSE
 }
--- a/modules/ocl/src/arithm.cpp
+++ b/modules/ocl/src/arithm.cpp
@ -2337,7 +2337,7 @@ void cv::ocl::pow(const oclMat &x, double p, oclMat &y)
        return;
    }

-    CV_Assert((x.type() == y.type() && x.size() == y.size() && x.depth() == CV_32F) || x.depth() == CV_64F);
+    CV_Assert(x.depth() == CV_32F || x.depth() == CV_64F);
    y.create(x.size(), x.type());
    String kernelName = "arithm_pow";

--- a/modules/ocl/test/test_norm.cpp
+++ b/modules/ocl/test/test_norm.cpp
@ -0,0 +1,63 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+
+typedef ::testing::TestWithParam<cv::Size> normFixture;
+
+TEST_P(normFixture, DISABLED_accuracy) // compares cv::ocl::norm against cv::norm; the DISABLED_ prefix keeps gtest from running it by default
+{
+    const cv::Size srcSize = GetParam(); // matrix size supplied by INSTANTIATE_TEST_CASE_P
+
+    cv::Mat src1(srcSize, CV_8UC1), src2(srcSize, CV_8UC1);
+    cv::randu(src1, 0, 2); // random single-channel 8-bit input
+    cv::randu(src2, 0, 2);
+
+    cv::ocl::oclMat oclSrc1(src1), oclSrc2(src2); // OpenCL-side matrices built from the host inputs
+
+    double value = cv::norm(src1, src2, cv::NORM_INF); // reference result from the CPU implementation
+    double oclValue = cv::ocl::norm(oclSrc1, oclSrc2, cv::NORM_INF);
+
+    ASSERT_EQ(value, oclValue); // exact equality is required between the two results
+}
+
+INSTANTIATE_TEST_CASE_P(oclNormTest, normFixture,
+                        ::testing::Values(cv::Size(500, 500), cv::Size(1000, 1000)));
--- a/platforms/ios/Info.plist.in
+++ b/platforms/ios/Info.plist.in
@ -7,9 +7,9 @@
    <key>CFBundleIdentifier</key>
    <string>org.opencv</string>
    <key>CFBundleVersion</key>
-    <string>${VERSION}</string>
+    <string>${OPENCV_LIBVERSION}</string>
    <key>CFBundleShortVersionString</key>
-    <string>${VERSION}</string>
+    <string>${OPENCV_LIBVERSION}</string>
    <key>CFBundleSignature</key>
    <string>????</string>
    <key>CFBundlePackageType</key>
--- a/platforms/ios/build_framework.py
+++ b/platforms/ios/build_framework.py
@ -71,19 +71,11 @@ def put_framework_together(srcroot, dstroot):
    os.makedirs(framework_dir)
    os.chdir(framework_dir)

-    # determine OpenCV version (without subminor part)
-    tdir0 = "../build/" + targetlist[0]
-    cfg = open(tdir0 + "/cvconfig.h", "rt")
-    for l in cfg.readlines():
-        if l.startswith("#define  VERSION"):
-            opencv_version = l[l.find("\"")+1:l.rfind(".")]
-            break
-    cfg.close()
-
    # form the directory tree
    dstdir = "Versions/A"
    os.makedirs(dstdir + "/Resources")

+    tdir0 = "../build/" + targetlist[0]
    # copy headers
    shutil.copytree(tdir0 + "/install/include/opencv2", dstdir + "/Headers")

@ -91,13 +83,8 @@ def put_framework_together(srcroot, dstroot):
    wlist = " ".join(["../build/" + t + "/lib/Release/libopencv_world.a" for t in targetlist])
    os.system("lipo -create " + wlist + " -o " + dstdir + "/opencv2")

-    # form Info.plist
-    srcfile = open(srcroot + "/platforms/ios/Info.plist.in", "rt")
-    dstfile = open(dstdir + "/Resources/Info.plist", "wt")
-    for l in srcfile.readlines():
-        dstfile.write(l.replace("${VERSION}", opencv_version))
-    srcfile.close()
-    dstfile.close()
+    # copy Info.plist
+    shutil.copyfile(tdir0 + "/ios/Info.plist", dstdir + "/Resources/Info.plist")

    # make symbolic links
    os.symlink("A", "Versions/Current")
--- a/samples/gpu/super_resolution.cpp
+++ b/samples/gpu/super_resolution.cpp
@ -212,7 +212,7 @@ int main(int argc, const char* argv[])
 #if defined(HAVE_OPENCV_OCL)
        cout << "Mode            : " << (useCuda ? "CUDA" : useOcl? "OpenCL" : "CPU") << endl;
 #else
-        cout << "Mode            : " << (useGpu ? "CUDA" : "CPU") << endl;
+        cout << "Mode            : " << (useCuda ? "CUDA" : "CPU") << endl;
 #endif
    }