Merge branch 'master' of code.opencv.org:opencv

13 years ago · 4fb15ae1f0
parent d9185ec21b bbf679267a
commit 4fb15ae1f0
121 changed files with 12554 additions and 8455 deletions
--- a/3rdparty/libtiff/CMakeLists.txt
+++ b/3rdparty/libtiff/CMakeLists.txt
@ -89,7 +89,7 @@ endif(WIN32)

 ocv_warnings_disable(CMAKE_C_FLAGS -Wno-unused-but-set-variable -Wmissing-prototypes -Wmissing-declarations -Wundef -Wunused -Wsign-compare
                                   -Wcast-align -Wshadow -Wno-maybe-uninitialized -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast)
-ocv_warnings_disable(CMAKE_CXX_FLAGS -Wmissing-declarations -Wunused-parameter /wd4100 /wd4244 /wd4706 /wd4127 /wd4701 /wd4018 /wd4267 /wd4306 /wd4305 /wd4312 /wd4311)
+ocv_warnings_disable(CMAKE_CXX_FLAGS -Wmissing-declarations -Wunused-parameter /wd4100 /wd4244 /wd4706 /wd4127 /wd4701 /wd4018 /wd4267 /wd4306 /wd4305 /wd4312 /wd4311 /wd4703)

 if(UNIX AND (CMAKE_COMPILER_IS_GNUCXX OR CV_ICC))
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -189,11 +189,11 @@ OCV_OPTION(ENABLE_PROFILING           "Enable profiling in the GCC compiler (Add
 OCV_OPTION(ENABLE_OMIT_FRAME_POINTER  "Enable -fomit-frame-pointer for GCC"                      ON   IF CMAKE_COMPILER_IS_GNUCXX )
 OCV_OPTION(ENABLE_POWERPC             "Enable PowerPC for GCC"                                   ON   IF (CMAKE_COMPILER_IS_GNUCXX AND CMAKE_SYSTEM_PROCESSOR MATCHES powerpc.*) )
 OCV_OPTION(ENABLE_FAST_MATH           "Enable -ffast-math (not recommended for GCC 4.6.x)"       OFF  IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSE                 "Enable SSE instructions"                                  ON   IF (MSVC OR CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSE2                "Enable SSE2 instructions"                                 ON   IF (MSVC OR CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSE3                "Enable SSE3 instructions"                                 OFF  IF (CV_ICC OR CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_SSE                 "Enable SSE instructions"                                  ON   IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_SSE2                "Enable SSE2 instructions"                                 ON   IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_SSE3                "Enable SSE3 instructions"                                 ON   IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_SSSE3               "Enable SSSE3 instructions"                                OFF  IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
-OCV_OPTION(ENABLE_SSE41               "Enable SSE4.1 instructions"                               OFF  IF (CV_ICC OR CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
+OCV_OPTION(ENABLE_SSE41               "Enable SSE4.1 instructions"                               OFF  IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_SSE42               "Enable SSE4.2 instructions"                               OFF  IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
 OCV_OPTION(ENABLE_NOISY_WARNINGS      "Show all warnings even if they are too noisy"             OFF )
 OCV_OPTION(OPENCV_WARNINGS_ARE_ERRORS "Treat warnings as errors"                                 OFF )
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@ -88,7 +88,11 @@ if(CUDA_FOUND)
    if(APPLE)
      set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fno-finite-math-only)
    endif()
-    string(REPLACE "-Wsign-promo" "" CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}")
+
+    # -Wunused-but-set-variable (available since GCC 4.6.0) floods the build of nvcc auto-generated files, so disable it
+    if(CMAKE_COMPILER_IS_GNUCXX AND NOT CMAKE_GCC_REGEX_VERSION VERSION_LESS "4.6.0")
+      ocv_warnings_disable(CMAKE_CXX_FLAGS -Wunused-but-set-variable)
+    endif()

    # we remove -ggdb3 flag as it leads to preprocessor errors when compiling CUDA files (CUDA 4.1)
    set(CMAKE_CXX_FLAGS_DEBUG_ ${CMAKE_CXX_FLAGS_DEBUG})
--- a/cmake/OpenCVGenConfig.cmake
+++ b/cmake/OpenCVGenConfig.cmake
@ -64,9 +64,14 @@ macro(ocv_generate_dependencies_map_configcmake suffix configuration)
      string(REGEX REPLACE "${CMAKE_SHARED_LIBRARY_SUFFIX}$" "${OPENCV_LINK_LIBRARY_SUFFIX}" __libname "${__libname}")
    endif()

+    string(REPLACE " " "\\ " __mod_deps "${${__ocv_lib}_MODULE_DEPS_${suffix}}")
+    string(REPLACE " " "\\ " __ext_deps "${${__ocv_lib}_EXTRA_DEPS_${suffix}}")
+    string(REPLACE "\"" "\\\"" __mod_deps "${__mod_deps}")
+    string(REPLACE "\"" "\\\"" __ext_deps "${__ext_deps}")
+
    set(OPENCV_DEPENDENCIES_MAP_${suffix} "${OPENCV_DEPENDENCIES_MAP_${suffix}}set(OpenCV_${__ocv_lib}_LIBNAME_${suffix} \"${__libname}\")\n")
-    set(OPENCV_DEPENDENCIES_MAP_${suffix} "${OPENCV_DEPENDENCIES_MAP_${suffix}}set(OpenCV_${__ocv_lib}_DEPS_${suffix} ${${__ocv_lib}_MODULE_DEPS_${suffix}})\n")
-    set(OPENCV_DEPENDENCIES_MAP_${suffix} "${OPENCV_DEPENDENCIES_MAP_${suffix}}set(OpenCV_${__ocv_lib}_EXTRA_DEPS_${suffix} ${${__ocv_lib}_EXTRA_DEPS_${suffix}})\n")
+    set(OPENCV_DEPENDENCIES_MAP_${suffix} "${OPENCV_DEPENDENCIES_MAP_${suffix}}set(OpenCV_${__ocv_lib}_DEPS_${suffix} ${__mod_deps})\n")
+    set(OPENCV_DEPENDENCIES_MAP_${suffix} "${OPENCV_DEPENDENCIES_MAP_${suffix}}set(OpenCV_${__ocv_lib}_EXTRA_DEPS_${suffix} ${__ext_deps})\n")

    list(APPEND OPENCV_PROCESSED_LIBS ${__ocv_lib})
    list(APPEND OPENCV_LIBS_TO_PROCESS ${${__ocv_lib}_MODULE_DEPS_${suffix}})
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@ -509,8 +509,6 @@ endmacro()
 macro(ocv_add_precompiled_headers the_target)
  if("${the_target}" MATCHES "^opencv_test_.*$")
    SET(pch_path "test/test_")
-  elseif("${the_target}" MATCHES "opencv_perf_gpu_cpu")
-    SET(pch_path "perf_cpu/perf_cpu_")
  elseif("${the_target}" MATCHES "^opencv_perf_.*$")
    SET(pch_path "perf/perf_")
  else()
--- a/data/lbpcascades/lbpcascade_profileface.xml
+++ b/data/lbpcascades/lbpcascade_profileface.xml
--- a/data/lbpcascades/lbpcascade_silverware.xml
+++ b/data/lbpcascades/lbpcascade_silverware.xml
--- a/ios/cmake/Modules/Platform/iOS.cmake
+++ b/ios/cmake/Modules/Platform/iOS.cmake
@ -42,6 +42,8 @@ set (CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}")
 set (CMAKE_C_FLAGS "")
 set (CMAKE_CXX_FLAGS "-headerpad_max_install_names -fvisibility=hidden -fvisibility-inlines-hidden")

+set (CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -fomit-frame-pointer -ffast-math")
+
 if (HAVE_FLAG_SEARCH_PATHS_FIRST)
 	set (CMAKE_C_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_C_LINK_FLAGS}")
 	set (CMAKE_CXX_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_CXX_LINK_FLAGS}")
--- a/modules/core/include/opencv2/core/core.hpp
+++ b/modules/core/include/opencv2/core/core.hpp
@ -440,7 +440,7 @@ template<typename _Tp, int m, int n> class CV_EXPORTS Matx
 {
 public:
    typedef _Tp value_type;
-    typedef Matx<_Tp, MIN(m, n), 1> diag_type;
+    typedef Matx<_Tp, (m < n ? m : n), 1> diag_type;
    typedef Matx<_Tp, m, n> mat_type;
    enum { depth = DataDepth<_Tp>::value, rows = m, cols = n, channels = rows*cols,
           type = CV_MAKETYPE(depth, channels) };
--- a/modules/core/include/opencv2/core/gpumat.hpp
+++ b/modules/core/include/opencv2/core/gpumat.hpp
@ -72,9 +72,11 @@ namespace cv { namespace gpu
        FEATURE_SET_COMPUTE_13 = 13,
        FEATURE_SET_COMPUTE_20 = 20,
        FEATURE_SET_COMPUTE_21 = 21,
+        FEATURE_SET_COMPUTE_30 = 30,
        GLOBAL_ATOMICS = FEATURE_SET_COMPUTE_11,
        SHARED_ATOMICS = FEATURE_SET_COMPUTE_12,
-        NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13
+        NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13,
+        WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30
    };

    // Gives information about what GPU archs this OpenCV GPU module was
--- a/modules/core/src/cuda/matrix_operations.cu
+++ b/modules/core/src/cuda/matrix_operations.cu
@ -44,7 +44,7 @@
 #include "opencv2/gpu/device/transform.hpp"
 #include "opencv2/gpu/device/functional.hpp"

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
    template <typename T> struct shift_and_sizeof;
    template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; };
@ -272,7 +272,7 @@ namespace cv { namespace gpu { namespace device
    template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
    {
    };
-        
+
    template<typename T, typename D>
    void cvt_(DevMem2Db src, DevMem2Db dst, double alpha, double beta, cudaStream_t stream)
    {
@ -282,6 +282,11 @@ namespace cv { namespace gpu { namespace device
        cv::gpu::device::transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, WithOutMask(), stream);
    }

+#if defined  __clang__
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wmissing-declarations"
+#endif
+
    void convert_gpu(DevMem2Db src, int sdepth, DevMem2Db dst, int ddepth, double alpha, double beta, cudaStream_t stream)
    {
        typedef void (*caller_t)(DevMem2Db src, DevMem2Db dst, double alpha, double beta, cudaStream_t stream);
@ -318,4 +323,8 @@ namespace cv { namespace gpu { namespace device

        func(src, dst, alpha, beta, stream);
    }
+
+#if defined __clang__
+# pragma clang diagnostic pop
+#endif
 }}} // namespace cv { namespace gpu { namespace device
--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@ -1199,10 +1199,6 @@ namespace

        void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const
        {
-            NppiSize sz;
-            sz.width  = m.cols;
-            sz.height = m.rows;
-
            if (mask.empty())
            {
                if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
--- a/modules/features2d/src/features2d_init.cpp
+++ b/modules/features2d/src/features2d_init.cpp
@ -59,7 +59,7 @@ CV_INIT_ALGORITHM(BriefDescriptorExtractor, "Feature2D.BRIEF",
 CV_INIT_ALGORITHM(FastFeatureDetector, "Feature2D.FAST",
                  obj.info()->addParam(obj, "threshold", obj.threshold);
                  obj.info()->addParam(obj, "nonmaxSuppression", obj.nonmaxSuppression);
-                  obj.info()->addParam(obj, "type", obj.type, FastFeatureDetector::TYPE_9_16));
+                  obj.info()->addParam(obj, "type", obj.type));

 ///////////////////////////////////////////////////////////////////////////////////////////////////////////

--- a/modules/features2d/test/test_fast.cpp
+++ b/modules/features2d/test/test_fast.cpp
@ -75,8 +75,8 @@ void CV_FastTest::run( int )

    vector<KeyPoint> keypoints1;
    vector<KeyPoint> keypoints2;
-    FAST(gray1, keypoints1, 30, type);
-    FAST(gray2, keypoints2, 30, type);
+    FAST(gray1, keypoints1, 30, true, type);
+    FAST(gray2, keypoints2, 30, true, type);

    for(size_t i = 0; i < keypoints1.size(); ++i)
    {
--- a/modules/features2d/test/test_nearestneighbors.cpp
+++ b/modules/features2d/test/test_nearestneighbors.cpp
@ -200,7 +200,7 @@ int CV_KDTreeTest_CPP::checkGetPoins( const Mat& data )

 int CV_KDTreeTest_CPP::checkFindBoxed()
 {
-    vector<float> min( dims, minValue), max(dims, maxValue);
+    vector<float> min( dims, static_cast<float>(minValue)), max(dims, static_cast<float>(maxValue));
    vector<int> indices;
    tr->findOrthoRange( min, max, indices );
    // TODO check indices
@ -214,8 +214,8 @@ int CV_KDTreeTest_CPP::findNeighbors( Mat& points, Mat& neighbors )
    const int emax = 20;
    Mat neighbors2( neighbors.size(), CV_32SC1 );
    int j;
-    vector<float> min(points.cols, minValue);
-    vector<float> max(points.cols, maxValue);
+    vector<float> min(points.cols, static_cast<float>(minValue));
+    vector<float> max(points.cols, static_cast<float>(maxValue));
    for( int pi = 0; pi < points.rows; pi++ )
    {
        // 1st way
--- a/modules/features2d/test/test_rotation_and_scale_invariance.cpp
+++ b/modules/features2d/test/test_rotation_and_scale_invariance.cpp
@ -54,7 +54,7 @@ static
 Mat generateHomography(float angle)
 {
    // angle - rotation around Oz in degrees
-    float angleRadian = angle * CV_PI / 180.;
+    float angleRadian = static_cast<float>(angle * CV_PI / 180);
    Mat H = Mat::eye(3, 3, CV_32FC1);
    H.at<float>(0,0) = H.at<float>(1,1) = std::cos(angleRadian);
    H.at<float>(0,1) = -std::sin(angleRadian);
@ -69,8 +69,8 @@ Mat rotateImage(const Mat& srcImage, float angle, Mat& dstImage, Mat& dstMask)
    // angle - rotation around Oz in degrees
    float diag = std::sqrt(static_cast<float>(srcImage.cols * srcImage.cols + srcImage.rows * srcImage.rows));
    Mat LUShift = Mat::eye(3, 3, CV_32FC1); // left up
-    LUShift.at<float>(0,2) = -srcImage.cols/2;
-    LUShift.at<float>(1,2) = -srcImage.rows/2;
+    LUShift.at<float>(0,2) = static_cast<float>(-srcImage.cols/2);
+    LUShift.at<float>(1,2) = static_cast<float>(-srcImage.rows/2);
    Mat RDShift = Mat::eye(3, 3, CV_32FC1); // right down
    RDShift.at<float>(0,2) = diag/2;
    RDShift.at<float>(1,2) = diag/2;
@ -114,7 +114,7 @@ void scaleKeyPoints(const vector<KeyPoint>& src, vector<KeyPoint>& dst, float sc
 static
 float calcCirclesIntersectArea(const Point2f& p0, float r0, const Point2f& p1, float r1)
 {
-    float c = norm(p0 - p1), sqr_c = c * c;
+    float c = static_cast<float>(norm(p0 - p1)), sqr_c = c * c;

    float sqr_r0 = r0 * r0;
    float sqr_r1 = r1 * r1;
@ -125,7 +125,7 @@ float calcCirclesIntersectArea(const Point2f& p0, float r0, const Point2f& p1, f
    float minR = std::min(r0, r1);
    float maxR = std::max(r0, r1);
    if(c + minR <= maxR)
-        return CV_PI * minR * minR;
+        return static_cast<float>(CV_PI * minR * minR);

    float cos_halfA0 = (sqr_r0 + sqr_c - sqr_r1) / (2 * r0 * c);
    float cos_halfA1 = (sqr_r1 + sqr_c - sqr_r0) / (2 * r1 * c);
@ -133,15 +133,15 @@ float calcCirclesIntersectArea(const Point2f& p0, float r0, const Point2f& p1, f
    float A0 = 2 * acos(cos_halfA0);
    float A1 = 2 * acos(cos_halfA1);

-    return  0.5 * sqr_r0 * (A0 - sin(A0)) +
-            0.5 * sqr_r1 * (A1 - sin(A1));
+    return  0.5f * sqr_r0 * (A0 - sin(A0)) +
+            0.5f * sqr_r1 * (A1 - sin(A1));
 }

 static
 float calcIntersectRatio(const Point2f& p0, float r0, const Point2f& p1, float r1)
 {
    float intersectArea = calcCirclesIntersectArea(p0, r0, p1, r1);
-    float unionArea = CV_PI * (r0 * r0 + r1 * r1) - intersectArea;
+    float unionArea = static_cast<float>(CV_PI) * (r0 * r0 + r1 * r1) - intersectArea;
    return intersectArea / unionArea;
 }

@ -160,7 +160,7 @@ void matchKeyPoints(const vector<KeyPoint>& keypoints0, const Mat& H,

    matches.clear();
    vector<uchar> usedMask(keypoints1.size(), 0);
-    for(size_t i0 = 0; i0 < keypoints0.size(); i0++)
+    for(int i0 = 0; i0 < static_cast<int>(keypoints0.size()); i0++)
    {
        int nearestPointIndex = -1;
        float maxIntersectRatio = 0.f;
@ -176,7 +176,7 @@ void matchKeyPoints(const vector<KeyPoint>& keypoints0, const Mat& H,
            if(intersectRatio > maxIntersectRatio)
            {
                maxIntersectRatio = intersectRatio;
-                nearestPointIndex = i1;
+                nearestPointIndex = static_cast<int>(i1);
            }
        }

@ -222,7 +222,7 @@ protected:
        const int maxAngle = 360, angleStep = 15;
        for(int angle = 0; angle < maxAngle; angle += angleStep)
        {
-            Mat H = rotateImage(image0, angle, image1, mask1);
+            Mat H = rotateImage(image0, static_cast<float>(angle), image1, mask1);

            vector<KeyPoint> keypoints1;
            featureDetector->detect(image1, keypoints1, mask1);
@ -339,10 +339,10 @@ protected:
        const int maxAngle = 360, angleStep = 15;
        for(int angle = 0; angle < maxAngle; angle += angleStep)
        {
-            Mat H = rotateImage(image0, angle, image1, mask1);
+            Mat H = rotateImage(image0, static_cast<float>(angle), image1, mask1);

            vector<KeyPoint> keypoints1;
-            rotateKeyPoints(keypoints0, H, angle, keypoints1);
+            rotateKeyPoints(keypoints0, H, static_cast<float>(angle), keypoints1);
            Mat descriptors1;
            descriptorExtractor->compute(image1, keypoints1, descriptors1);

@ -457,7 +457,7 @@ protected:
                keyPointMatchesCount++;

                // Check does this inlier have consistent sizes
-                const float maxSizeDiff = 0.8;//0.9f; // grad
+                const float maxSizeDiff = 0.8f;//0.9f; // grad
                float size0 = keypoints0[matches[m].trainIdx].size;
                float size1 = osiKeypoints1[matches[m].queryIdx].size;
                CV_Assert(size0 > 0 && size1 > 0);
@ -545,7 +545,7 @@ protected:
            resize(image0, image1, Size(), 1./scale, 1./scale);

            vector<KeyPoint> keypoints1;
-            scaleKeyPoints(keypoints0, keypoints1, 1./scale);
+            scaleKeyPoints(keypoints0, keypoints1, 1.0f/scale);
            Mat descriptors1;
            descriptorExtractor->compute(image1, keypoints1, descriptors1);

--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@ -111,43 +111,3 @@ ocv_add_accuracy_tests(FILES "Include" ${test_hdrs}
                       FILES "Src" ${test_srcs}
                       ${nvidia})
 ocv_add_perf_tests()
-
-
-
-set(perf_cpu_path "${CMAKE_CURRENT_SOURCE_DIR}/perf_cpu")
-if(BUILD_PERF_TESTS AND EXISTS "${perf_cpu_path}")
-    # opencv_highgui is required for imread/imwrite
-    set(perf_deps ${the_module} opencv_ts opencv_highgui opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_nonfree)
-    ocv_check_dependencies(${perf_deps})
-
-    if(OCV_DEPENDENCIES_FOUND)
-      set(the_target "opencv_perf_gpu_cpu")
-
-      ocv_module_include_directories(${perf_deps} "${perf_cpu_path}")
-
-      if(NOT OPENCV_PERF_${the_module}_CPU_SOURCES)
-        file(GLOB perf_srcs "${perf_cpu_path}/*.cpp")
-        file(GLOB perf_hdrs "${perf_cpu_path}/*.hpp" "${perf_cpu_path}/*.h")
-        source_group("Src" FILES ${perf_srcs})
-        source_group("Include" FILES ${perf_hdrs})
-        set(OPENCV_PERF_${the_module}_CPU_SOURCES ${perf_srcs} ${perf_hdrs})
-      endif()
-
-      add_executable(${the_target} ${OPENCV_PERF_${the_module}_CPU_SOURCES})
-      target_link_libraries(${the_target} ${OPENCV_MODULE_${the_module}_DEPS} ${perf_deps} ${OPENCV_LINKER_LIBS})
-
-      # Additional target properties
-      set_target_properties(${the_target} PROPERTIES
-        DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
-        RUNTIME_OUTPUT_DIRECTORY "${EXECUTABLE_OUTPUT_PATH}"
-      )
-
-      if(ENABLE_SOLUTION_FOLDERS)
-        set_target_properties(${the_target} PROPERTIES FOLDER "tests performance")
-      endif()
-
-      ocv_add_precompiled_headers(${the_target})
-    else(OCV_DEPENDENCIES_FOUND)
-      #TODO: warn about unsatisfied dependencies
-    endif(OCV_DEPENDENCIES_FOUND)
-  endif()
--- a/modules/gpu/doc/object_detection.rst
+++ b/modules/gpu/doc/object_detection.rst
@ -204,7 +204,7 @@ gpu::CascadeClassifier_GPU
 --------------------------
 .. ocv:class:: gpu::CascadeClassifier_GPU

-Cascade classifier class used for object detection. ::
+Cascade classifier class used for object detection. Supports HAAR and LBP cascades. ::

    class CV_EXPORTS CascadeClassifier_GPU
    {
@ -219,6 +219,7 @@ Cascade classifier class used for object detection. ::

            /* Returns number of detected objects */
            int detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor=1.2, int minNeighbors=4, Size minSize=Size());
+            int detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4);

            /* Finds only the largest object. Special mode if training is required.*/
            bool findLargestObject;
@ -233,11 +234,11 @@ Cascade classifier class used for object detection. ::

 gpu::CascadeClassifier_GPU::CascadeClassifier_GPU
 -----------------------------------------------------
-Loads the classifier from a file.
+Loads the classifier from a file. The cascade type is detected automatically from the constructor parameter.

 .. ocv:function:: gpu::CascadeClassifier_GPU::CascadeClassifier_GPU(const string& filename)

-    :param filename: Name of the file from which the classifier is loaded. Only the old ``haar`` classifier (trained by the ``haar`` training application) and NVIDIA's ``nvbin`` are supported.
+    :param filename: Name of the file from which the classifier is loaded. Only the old ``haar`` classifier (trained by the ``haar`` training application) and NVIDIA's ``nvbin`` are supported for HAAR, and only the new type of OpenCV XML cascade is supported for LBP.



@ -255,8 +256,7 @@ Loads the classifier from a file. The previous content is destroyed.

 .. ocv:function:: bool gpu::CascadeClassifier_GPU::load(const string& filename)

-    :param filename: Name of the file from which the classifier is loaded. Only the old ``haar`` classifier (trained by the ``haar`` training application) and NVIDIA's ``nvbin`` are supported.
-
+    :param filename: Name of the file from which the classifier is loaded. Only the old ``haar`` classifier (trained by the ``haar`` training application) and NVIDIA's ``nvbin`` are supported for HAAR, and only the new type of OpenCV XML cascade is supported for LBP.


 gpu::CascadeClassifier_GPU::release
@ -273,13 +273,17 @@ Detects objects of different sizes in the input image.

 .. ocv:function:: int gpu::CascadeClassifier_GPU::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor=1.2, int minNeighbors=4, Size minSize=Size())

+.. ocv:function:: int gpu::CascadeClassifier_GPU::detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4)
+
    :param image: Matrix of type  ``CV_8U``  containing an image where objects should be detected.

    :param objectsBuf: Buffer to store detected objects (rectangles). If it is empty, it is allocated with the default size. If not empty, the function searches not more than N objects, where ``N = sizeof(objectsBufer's data)/sizeof(cv::Rect)``.

-    :param scaleFactor: Value to specify how much the image size is reduced at each image scale.
+    :param maxObjectSize: Maximum possible object size. Objects larger than that are ignored. Used only by the second signature and supported only for LBP cascades.
+
+    :param scaleFactor:  Parameter specifying how much the image size is reduced at each image scale.

-    :param minNeighbors: Value to specify how many neighbours each candidate rectangle has to retain.
+    :param minNeighbors: Parameter specifying how many neighbors each candidate rectangle should have to retain it.

    :param minSize: Minimum possible object size. Objects smaller than that are ignored.

--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@ -820,6 +820,7 @@ private:
    int nLayers_;
 };

+//! HoughLines
 CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096);
 CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, GpuMat& buf, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096);
 CV_EXPORTS void HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf, float rho, float theta);
--- a/modules/gpu/misc/mark_nvidia.py
+++ b/modules/gpu/misc/mark_nvidia.py
@ -1,255 +1,234 @@
 import sys, re

 spaces = '[\s]*'
-symbols = '[\s\w\d,.=:|]*'
+symbols = '[\s\w\d,.:|]*'

 def pattern1(prefix, test):
-    return re.compile(spaces + 'perf::' + prefix + '/' + test + '::' + '\(' + symbols + '\)' + spaces)
+    return re.compile(spaces + prefix + '_' + test + '::' + symbols + '::' + '\(' + symbols + '\)' + spaces)

-def pattern2(prefix, test, cvtype):
-    return re.compile(spaces + 'perf::' + prefix + '/' + test + '::' + '\(' + symbols + cvtype + symbols + '\)' + spaces)
+def pattern2(prefix, test, param1):
+    return re.compile(spaces + prefix + '_' + test + '::' + symbols + '::' + '\(' + symbols + param1 + symbols + '\)' + spaces)

-def pattern3(prefix, test, cvtype, param1):
-    return re.compile(spaces + 'perf::' + prefix + '/' + test + '::' + '\(' + symbols + cvtype + symbols + param1 + symbols + '\)' + spaces)
+def pattern3(prefix, test, param1, param2):
+    return re.compile(spaces + prefix + '_' + test + '::' + symbols + '::' + '\(' + symbols + param1 + symbols + param2 + symbols + '\)' + spaces)

-def pattern4(prefix, test, cvtype, param1, param2):
-    return re.compile(spaces + 'perf::' + prefix + '/' + test + '::' + '\(' + symbols + cvtype + symbols + param1 + symbols + param2 + symbols + '\)' + spaces)
+def pattern4(prefix, test, param1, param2, param3):
+    return re.compile(spaces + prefix + '_' + test + '::' + symbols + '::' + '\(' + symbols + param1 + symbols + param2 + symbols + param3 + symbols + '\)' + spaces)
+
+def pattern5(prefix, test, param1, param2, param3, param5):
+    return re.compile(spaces + prefix + '_' + test + '::' + symbols + '::' + '\(' + symbols + param1 + symbols + param2 + symbols + param3 + symbols + param4 + symbols + '\)' + spaces)

 npp_patterns = [
    ##############################################################
    # Core
-    
-    # Core/Add_Mat (CV_8U | CV_16U | CV_32F)
-    pattern2('Core', 'Add_Mat', '8U'),
-    pattern2('Core', 'Add_Mat', '16U'),
-    pattern2('Core', 'Add_Mat', '32F'),
-    
-    # Core/Add_Scalar (CV_8U | CV_16U | CV_32F)
-    pattern2('Core', 'Add_Scalar', '8U'),
-    pattern2('Core', 'Add_Scalar', '16U'),
-    pattern2('Core', 'Add_Scalar', '32F'),
-    
-    # Core/Subtract_Mat (CV_8U | CV_16U | CV_32F)
-    pattern2('Core', 'Subtract_Mat', '8U'),
-    pattern2('Core', 'Subtract_Mat', '16U'),
-    pattern2('Core', 'Subtract_Mat', '32F'),
-    
-    # Core/Subtract_Scalar (CV_8U | CV_16U | CV_32F)
-    pattern2('Core', 'Subtract_Scalar', '8U'),
-    pattern2('Core', 'Subtract_Scalar', '16U'),
-    pattern2('Core', 'Subtract_Scalar', '32F'),
-    
-    # Core/Multiply_Mat (CV_8U | CV_16U | CV_32F)
-    pattern2('Core', 'Multiply_Mat', '8U'),
-    pattern2('Core', 'Multiply_Mat', '16U'),
-    pattern2('Core', 'Multiply_Mat', '32F'),
-    
-    # Core/Multiply_Scalar (CV_8U | CV_16U | CV_32F)
-    pattern2('Core', 'Multiply_Scalar', '8U'),
-    pattern2('Core', 'Multiply_Scalar', '16U'),
-    pattern2('Core', 'Multiply_Scalar', '32F'),
-    
-    # Core/Divide_Mat (CV_8U | CV_16U | CV_32F)
-    pattern2('Core', 'Divide_Mat', '8U'),
-    pattern2('Core', 'Divide_Mat', '16U'),
-    pattern2('Core', 'Divide_Mat', '32F'),
-    
-    # Core/Divide_Scalar (CV_8U | CV_16U | CV_32F)
-    pattern2('Core', 'Divide_Scalar', '8U'),
-    pattern2('Core', 'Divide_Scalar', '16U'),
-    pattern2('Core', 'Divide_Scalar', '32F'),
-    
-    # Core/AbsDiff_Mat (CV_8U | CV_16U | CV_32F)
-    pattern2('Core', 'AbsDiff_Mat', '8U'),
-    pattern2('Core', 'AbsDiff_Mat', '16U'),
-    pattern2('Core', 'AbsDiff_Mat', '32F'),
-    
-    # Core/AbsDiff_Scalar (CV_8U | CV_16U | CV_32F)
-    pattern2('Core', 'AbsDiff_Scalar', '8U'),
-    pattern2('Core', 'AbsDiff_Scalar', '16U'),
-    pattern2('Core', 'AbsDiff_Scalar', '32F'),
-
-    # Core/Abs
+
+    # Core_AddMat (CV_8U | CV_16U | CV_32F)
+    pattern2('Core', 'AddMat', '8U'),
+    pattern2('Core', 'AddMat', '16U'),
+    pattern2('Core', 'AddMat', '32F'),
+
+    # Core_AddScalar (CV_8U | CV_16U | CV_32F)
+    pattern2('Core', 'AddScalar', '8U'),
+    pattern2('Core', 'AddScalar', '16U'),
+    pattern2('Core', 'AddScalar', '32F'),
+
+    # Core_SubtractMat (CV_8U | CV_16U | CV_32F)
+    pattern2('Core', 'SubtractMat', '8U'),
+    pattern2('Core', 'SubtractMat', '16U'),
+    pattern2('Core', 'SubtractMat', '32F'),
+
+    # Core_SubtractScalar (CV_8U | CV_16U | CV_32F)
+    pattern2('Core', 'SubtractScalar', '8U'),
+    pattern2('Core', 'SubtractScalar', '16U'),
+    pattern2('Core', 'SubtractScalar', '32F'),
+
+    # Core_MultiplyMat (CV_8U | CV_16U | CV_32F)
+    pattern2('Core', 'MultiplyMat', '8U'),
+    pattern2('Core', 'MultiplyMat', '16U'),
+    pattern2('Core', 'MultiplyMat', '32F'),
+
+    # Core_MultiplyScalar (CV_8U | CV_16U | CV_32F)
+    pattern2('Core', 'MultiplyScalar', '8U'),
+    pattern2('Core', 'MultiplyScalar', '16U'),
+    pattern2('Core', 'MultiplyScalar', '32F'),
+
+    # Core_DivideMat (CV_8U | CV_16U | CV_32F)
+    pattern2('Core', 'DivideMat', '8U'),
+    pattern2('Core', 'DivideMat', '16U'),
+    pattern2('Core', 'DivideMat', '32F'),
+
+    # Core_DivideScalar (CV_8U | CV_16U | CV_32F)
+    pattern2('Core', 'DivideScalar', '8U'),
+    pattern2('Core', 'DivideScalar', '16U'),
+    pattern2('Core', 'DivideScalar', '32F'),
+
+    # Core_AbsDiffMat (CV_8U | CV_16U | CV_32F)
+    pattern2('Core', 'AbsDiffMat', '8U'),
+    pattern2('Core', 'AbsDiffMat', '16U'),
+    pattern2('Core', 'AbsDiffMat', '32F'),
+
+    # Core_AbsDiffScalar (CV_8U | CV_16U | CV_32F)
+    pattern2('Core', 'AbsDiffScalar', '8U'),
+    pattern2('Core', 'AbsDiffScalar', '16U'),
+    pattern2('Core', 'AbsDiffScalar', '32F'),
+
+    # Core_Abs
    pattern1('Core', 'Abs'),

-    # Core/Sqr
+    # Core_Sqr
    pattern1('Core', 'Sqr'),

-    # Core/Sqrt
+    # Core_Sqrt
    pattern1('Core', 'Sqrt'),

-    # Core/Log
+    # Core_Log
    pattern1('Core', 'Log'),

-    # Core/Exp
+    # Core_Exp
    pattern1('Core', 'Exp'),

-    # Core/Bitwise_And_Scalar
-    pattern1('Core', 'Bitwise_And_Scalar'),
+    # Core_BitwiseAndScalar
+    pattern1('Core', 'BitwiseAndScalar'),

-    # Core/Bitwise_Or_Scalar
-    pattern1('Core', 'Bitwise_Or_Scalar'),
+    # Core_BitwiseOrScalar
+    pattern1('Core', 'BitwiseOrScalar'),

-    # Core/Bitwise_Xor_Scalar
-    pattern1('Core', 'Bitwise_Xor_Scalar'),
+    # Core_BitwiseXorScalar
+    pattern1('Core', 'BitwiseXorScalar'),

-    # Core/RShift
+    # Core_RShift
    pattern1('Core', 'RShift'),

-    # Core/LShift
+    # Core_LShift
    pattern1('Core', 'LShift'),

-    # Core/Transpose
+    # Core_Transpose
    pattern1('Core', 'Transpose'),

-    # Core/Flip
+    # Core_Flip
    pattern1('Core', 'Flip'),

-    # Core/LUT_OneChannel
-    pattern1('Core', 'LUT_OneChannel'),
+    # Core_LutOneChannel
+    pattern1('Core', 'LutOneChannel'),

-    # Core/LUT_MultiChannel
-    pattern1('Core', 'LUT_MultiChannel'),
+    # Core_LutMultiChannel
+    pattern1('Core', 'LutMultiChannel'),

-    # Core/Magnitude_Complex
-    pattern1('Core', 'Magnitude_Complex'),
+    # Core_MagnitudeComplex
+    pattern1('Core', 'MagnitudeComplex'),

-    # Core/Magnitude_Sqr_Complex
-    pattern1('Core', 'Magnitude_Sqr_Complex'),
+    # Core_MagnitudeSqrComplex
+    pattern1('Core', 'MagnitudeSqrComplex'),

-    # Core/MeanStdDev
+    # Core_MeanStdDev
    pattern1('Core', 'MeanStdDev'),

-    # Core/NormDiff
+    # Core_NormDiff
    pattern1('Core', 'NormDiff'),
-    
+
    ##############################################################
    # Filters

-    # Filters/Blur
+    # Filters_Blur
    pattern1('Filters', 'Blur'),
-    
-    # Filters/Erode
+
+    # Filters_Erode
    pattern1('Filters', 'Erode'),
-    
-    # Filters/Dilate
+
+    # Filters_Dilate
    pattern1('Filters', 'Dilate'),
-    
-    # Filters/MorphologyEx
+
+    # Filters_MorphologyEx
    pattern1('Filters', 'MorphologyEx'),
-    
+
    ##############################################################
    # ImgProc
-    
-    # ImgProc/Resize (8UC1 | 8UC4, INTER_NEAREST | INTER_LINEAR)
-    pattern3('ImgProc', 'Resize', '8UC1', 'INTER_NEAREST'),
-    pattern3('ImgProc', 'Resize', '8UC4', 'INTER_NEAREST'),
-    pattern3('ImgProc', 'Resize', '8UC1', 'INTER_LINEAR'),
-    pattern3('ImgProc', 'Resize', '8UC4', 'INTER_LINEAR'),
-    
-    # ImgProc/Resize (8UC4, INTER_CUBIC)
-    pattern3('ImgProc', 'Resize', '8UC4', 'INTER_CUBIC'),
-    
-    # ImgProc/WarpAffine (8UC1 | 8UC3 | 8UC4 | 32FC1 | 32FC3 | 32FC4, INTER_NEAREST | INTER_LINEAR | INTER_CUBIC, BORDER_CONSTANT)
-    pattern4('ImgProc', 'WarpAffine', '8UC1', 'INTER_NEAREST', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '8UC1', 'INTER_LINEAR', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '8UC1', 'INTER_CUBIC', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '8UC3', 'INTER_NEAREST', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '8UC3', 'INTER_LINEAR', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '8UC3', 'INTER_CUBIC', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '8UC4', 'INTER_NEAREST', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '8UC4', 'INTER_LINEAR', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '8UC4', 'INTER_CUBIC', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '32FC1', 'INTER_NEAREST', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '32FC1', 'INTER_LINEAR', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '32FC1', 'INTER_CUBIC', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '32FC3', 'INTER_NEAREST', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '32FC3', 'INTER_LINEAR', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '32FC3', 'INTER_CUBIC', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '32FC4', 'INTER_NEAREST', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '32FC4', 'INTER_LINEAR', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpAffine', '32FC4', 'INTER_CUBIC', 'BORDER_CONSTANT'),
-    
-    # ImgProc/WarpPerspective (8UC1 | 8UC3 | 8UC4 | 32FC1 | 32FC3 | 32FC4, INTER_NEAREST | INTER_LINEAR | INTER_CUBIC, BORDER_CONSTANT)
-    pattern4('ImgProc', 'WarpPerspective', '8UC1', 'INTER_NEAREST', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '8UC1', 'INTER_LINEAR', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '8UC1', 'INTER_CUBIC', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '8UC3', 'INTER_NEAREST', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '8UC3', 'INTER_LINEAR', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '8UC3', 'INTER_CUBIC', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '8UC4', 'INTER_NEAREST', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '8UC4', 'INTER_LINEAR', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '8UC4', 'INTER_CUBIC', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '32FC1', 'INTER_NEAREST', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '32FC1', 'INTER_LINEAR', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '32FC1', 'INTER_CUBIC', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '32FC3', 'INTER_NEAREST', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '32FC3', 'INTER_LINEAR', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '32FC3', 'INTER_CUBIC', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '32FC4', 'INTER_NEAREST', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '32FC4', 'INTER_LINEAR', 'BORDER_CONSTANT'),
-    pattern4('ImgProc', 'WarpPerspective', '32FC4', 'INTER_CUBIC', 'BORDER_CONSTANT'),
-    
-    # ImgProc/CopyMakeBorder (8UC1 | 8UC4 | 32SC1 | 32FC1, BORDER_CONSTANT)
-    pattern3('ImgProc', 'CopyMakeBorder', '8UC1', 'BORDER_CONSTANT'),
-    pattern3('ImgProc', 'CopyMakeBorder', '8UC4', 'BORDER_CONSTANT'),
-    pattern3('ImgProc', 'CopyMakeBorder', '32SC1', 'BORDER_CONSTANT'),
-    pattern3('ImgProc', 'CopyMakeBorder', '32FC1', 'BORDER_CONSTANT'),
-    
-    # ImgProc/Threshold (32F, THRESH_TRUNC)
+
+    # ImgProc_Resize (8U, 1 | 4, INTER_NEAREST | INTER_LINEAR)
+    pattern4('ImgProc', 'Resize', '8U', '1', 'INTER_NEAREST'),
+    pattern4('ImgProc', 'Resize', '8U', '4', 'INTER_NEAREST'),
+    pattern4('ImgProc', 'Resize', '8U', '1', 'INTER_LINEAR'),
+    pattern4('ImgProc', 'Resize', '8U', '4', 'INTER_LINEAR'),
+
+    # ImgProc_Resize (8U, 4, INTER_CUBIC)
+    pattern4('ImgProc', 'Resize', '8U', '4', 'INTER_CUBIC'),
+
+    # ImgProc_WarpAffine (8U | 32F, INTER_NEAREST | INTER_LINEAR | INTER_CUBIC, BORDER_CONSTANT)
+    pattern4('ImgProc', 'WarpAffine', '8U' , 'INTER_NEAREST', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'WarpAffine', '8U' , 'INTER_LINEAR', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'WarpAffine', '8U' , 'INTER_CUBIC', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'WarpAffine', '32F', 'INTER_NEAREST', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'WarpAffine', '32F', 'INTER_LINEAR', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'WarpAffine', '32F', 'INTER_CUBIC', 'BORDER_CONSTANT'),
+
+    # ImgProc_WarpPerspective (8U | 32F, INTER_NEAREST | INTER_LINEAR | INTER_CUBIC, BORDER_CONSTANT)
+    pattern4('ImgProc', 'WarpPerspective', '8U' , 'INTER_NEAREST', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'WarpPerspective', '8U' , 'INTER_LINEAR', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'WarpPerspective', '8U' , 'INTER_CUBIC', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'WarpPerspective', '32F', 'INTER_NEAREST', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'WarpPerspective', '32F', 'INTER_LINEAR', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'WarpPerspective', '32F', 'INTER_CUBIC', 'BORDER_CONSTANT'),
+
+    # ImgProc_CopyMakeBorder (8UC1 | 8UC4 | 32SC1 | 32FC1, BORDER_CONSTANT)
+    pattern4('ImgProc', 'CopyMakeBorder', '8U' , '1', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'CopyMakeBorder', '8U' , '4', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'CopyMakeBorder', '32S', '1', 'BORDER_CONSTANT'),
+    pattern4('ImgProc', 'CopyMakeBorder', '32F', '1', 'BORDER_CONSTANT'),
+
+    # ImgProc_Threshold (32F, THRESH_TRUNC)
    pattern3('ImgProc', 'Threshold', '32F', 'THRESH_TRUNC'),

-    # ImgProc/Integral_Sqr
-    pattern1('ImgProc', 'Integral_Sqr'),
+    # ImgProc_IntegralSqr
+    pattern1('ImgProc', 'IntegralSqr'),

-    # ImgProc/HistEven_OneChannel
-    pattern1('ImgProc', 'HistEven_OneChannel'),
+    # ImgProc_HistEvenOneChannel
+    pattern1('ImgProc', 'HistEvenOneChannel'),

-    # ImgProc/HistEven_FourChannel
-    pattern1('ImgProc', 'HistEven_FourChannel'),
+    # ImgProc_HistEvenFourChannel
+    pattern1('ImgProc', 'HistEvenFourChannel'),

-    # ImgProc/Rotate
+    # ImgProc_Rotate
    pattern1('ImgProc', 'Rotate'),

-    # ImgProc/SwapChannels
+    # ImgProc_SwapChannels
    pattern1('ImgProc', 'SwapChannels'),

-    # ImgProc/AlphaComp
+    # ImgProc_AlphaComp
    pattern1('ImgProc', 'AlphaComp'),

-    # ImgProc/ImagePyramid_build
-    pattern1('ImgProc', 'ImagePyramid_build'),
+    # ImgProc_ImagePyramidBuild
+    pattern1('ImgProc', 'ImagePyramidBuild'),
+
+    # ImgProc_ImagePyramidGetLayer
+    pattern1('ImgProc', 'ImagePyramidGetLayer'),

-    # ImgProc/ImagePyramid_getLayer
-    pattern1('ImgProc', 'ImagePyramid_getLayer'),
-    
    ##############################################################
    # MatOp
-    
-    # MatOp/SetTo (8UC4 | 16UC1 | 16UC4 | 32FC1 | 32FC4)
-    pattern2('MatOp', 'SetTo', '8UC4'),
-    pattern2('MatOp', 'SetTo', '16UC1'),
-    pattern2('MatOp', 'SetTo', '16UC4'),
-    pattern2('MatOp', 'SetTo', '32FC1'),
-    pattern2('MatOp', 'SetTo', '32FC4'),
-    
-    # MatOp/SetToMasked (8UC4 | 16UC1 | 16UC4 | 32FC1 | 32FC4)
-    pattern2('MatOp', 'SetToMasked', '8UC4'),
-    pattern2('MatOp', 'SetToMasked', '16UC1'),
-    pattern2('MatOp', 'SetToMasked', '16UC4'),
-    pattern2('MatOp', 'SetToMasked', '32FC1'),
-    pattern2('MatOp', 'SetToMasked', '32FC4'),
-    
-    # MatOp/CopyToMasked (8UC1 | 8UC3 |8UC4 | 16UC1 | 16UC3 | 16UC4 | 32FC1 | 32FC3 | 32FC4)
-    pattern2('MatOp', 'CopyToMasked', '8UC1'),
-    pattern2('MatOp', 'CopyToMasked', '8UC3'),
-    pattern2('MatOp', 'CopyToMasked', '8UC4'),
-    pattern2('MatOp', 'CopyToMasked', '16UC1'),
-    pattern2('MatOp', 'CopyToMasked', '16UC3'),
-    pattern2('MatOp', 'CopyToMasked', '16UC4'),
-    pattern2('MatOp', 'CopyToMasked', '32FC1'),
-    pattern2('MatOp', 'CopyToMasked', '32FC3'),
-    pattern2('MatOp', 'CopyToMasked', '32FC4'),    
+
+    # MatOp_SetTo (8UC4 | 16UC1 | 16UC4 | 32FC1 | 32FC4)
+    pattern3('MatOp', 'SetTo', '8U' , '4'),
+    pattern3('MatOp', 'SetTo', '16U', '1'),
+    pattern3('MatOp', 'SetTo', '16U', '4'),
+    pattern3('MatOp', 'SetTo', '32F', '1'),
+    pattern3('MatOp', 'SetTo', '32F', '4'),
+
+    # MatOp_SetToMasked (8UC4 | 16UC1 | 16UC4 | 32FC1 | 32FC4)
+    pattern3('MatOp', 'SetToMasked', '8U' , '4'),
+    pattern3('MatOp', 'SetToMasked', '16U', '1'),
+    pattern3('MatOp', 'SetToMasked', '16U', '4'),
+    pattern3('MatOp', 'SetToMasked', '32F', '1'),
+    pattern3('MatOp', 'SetToMasked', '32F', '4'),
+
+    # MatOp_CopyToMasked (8UC1 | 8UC3 |8UC4 | 16UC1 | 16UC3 | 16UC4 | 32FC1 | 32FC3 | 32FC4)
+    pattern3('MatOp', 'CopyToMasked', '8U' , '1'),
+    pattern3('MatOp', 'CopyToMasked', '8U' , '3'),
+    pattern3('MatOp', 'CopyToMasked', '8U' , '4'),
+    pattern3('MatOp', 'CopyToMasked', '16U', '1'),
+    pattern3('MatOp', 'CopyToMasked', '16U', '3'),
+    pattern3('MatOp', 'CopyToMasked', '16U', '4'),
+    pattern3('MatOp', 'CopyToMasked', '32F', '1'),
+    pattern3('MatOp', 'CopyToMasked', '32F', '3'),
+    pattern3('MatOp', 'CopyToMasked', '32F', '4'),
 ]

 cublasPattern = pattern1('Core', 'GEMM')
@ -260,7 +239,7 @@ if __name__ == "__main__":
    inputFile = open(sys.argv[1], 'r')
    lines = inputFile.readlines()
    inputFile.close()
-    
+

    for i in range(len(lines)):
        if cublasPattern.match(lines[i]):
--- a/modules/gpu/perf/main.cpp
+++ b/modules/gpu/perf/main.cpp
@ -0,0 +1,125 @@
+#include "perf_precomp.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace cv::gpu;
+using namespace cvtest;
+using namespace testing;
+
+// Prints a one-line description of the host operating system and its
+// 32/64-bit flavour to stdout, followed by a blank line.
+//
+// The platform is selected at compile time from predefined macros:
+//   _WIN32 / _WIN64   -> Windows
+//   __linux__ / linux -> Linux   (_LP64 selects the 64-bit label)
+//   __APPLE__         -> Apple   (_LP64 selects the 64-bit label)
+// Any other platform reports "Unknown" instead of printing nothing.
+void printOsInfo()
+{
+#if defined _WIN32
+#   if defined _WIN64
+        cout << "OS: Windows x64 \n" << endl;
+#   else
+        cout << "OS: Windows x32 \n" << endl;
+#   endif
+// The bare `linux` macro is a legacy GNU extension that is not defined in
+// strict ISO mode (-std=c++NN); `__linux__` is always available, so test both.
+#elif defined __linux__ || defined linux
+#   if defined _LP64
+        cout << "OS: Linux x64 \n" << endl;
+#   else
+        cout << "OS: Linux x32 \n" << endl;
+#   endif
+#elif defined __APPLE__
+#   if defined _LP64
+        cout << "OS: Apple x64 \n" << endl;
+#   else
+        cout << "OS: Apple x32 \n" << endl;
+#   endif
+#else
+        // Keep the report header consistent on platforms not listed above.
+        cout << "OS: Unknown \n" << endl;
+#endif
+}
+
+// Prints a summary of the CUDA environment to stdout.
+//
+// Without HAVE_CUDA only a "built without CUDA support" notice is printed.
+// Otherwise the output lists the driver and runtime versions, the BIN/PTX
+// architectures the GPU module was compiled for, the number of CUDA-enabled
+// devices, and for every device its name, compute capability, multiprocessor
+// count and total/free memory. Devices that are not compatible with the
+// current GPU module build are flagged explicitly.
+void printCudaInfo()
+{
+#ifndef HAVE_CUDA
+    cout << "OpenCV was built without CUDA support \n" << endl;
+#else
+    // Initialised so that a failing cudaDriverGetVersion() call (its return
+    // code is not inspected here) prints 0 rather than an indeterminate value.
+    int driver = 0;
+    cudaDriverGetVersion(&driver);
+
+    cout << "CUDA Driver  version: " << driver << '\n';
+    cout << "CUDA Runtime version: " << CUDART_VERSION << '\n';
+
+    cout << endl;
+
+    cout << "GPU module was compiled for the following GPU archs:" << endl;
+    cout << "    BIN: " << CUDA_ARCH_BIN << '\n';
+    cout << "    PTX: " << CUDA_ARCH_PTX << '\n';
+
+    cout << endl;
+
+    int deviceCount = getCudaEnabledDeviceCount();
+    cout << "CUDA device count: " << deviceCount << '\n';
+
+    cout << endl;
+
+    for (int i = 0; i < deviceCount; ++i)
+    {
+        DeviceInfo info(i);
+
+        cout << "Device [" << i << "] \n";
+        cout << "\t Name: " << info.name() << '\n';
+        cout << "\t Compute capability: " << info.majorVersion() << '.' << info.minorVersion()<< '\n';
+        cout << "\t Multi Processor Count: " << info.multiProcessorCount() << '\n';
+        // Memory sizes are divided by 1024 twice (truncating after each step)
+        // and reported with the "Mb" label.
+        cout << "\t Total memory: " << static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0) << " Mb \n";
+        cout << "\t Free  memory: " << static_cast<int>(static_cast<int>(info.freeMemory() / 1024.0) / 1024.0) << " Mb \n";
+        if (!info.isCompatible())
+            cout << "\t !!! This device is NOT compatible with current GPU module build \n";
+
+        cout << endl;
+    }
+#endif
+}
+
+// Entry point of the GPU performance test runner.
+//
+// Recognised command-line keys:
+//   print_info_only - print the OS / CUDA report and exit with code 0
+//   device          - index of the CUDA device the tests run on (default 0)
+//   cpu             - run the CPU code path instead of the GPU one
+//
+// The OS / CUDA report is always printed first. When built without HAVE_CUDA
+// the CPU path is forced. Returns -1 if the requested device index is out of
+// range or the device is not compatible with this build; otherwise returns
+// the result of RUN_ALL_TESTS().
+int main(int argc, char** argv)
+{
+    const char* keys =
+        "{ print_info_only | print_info_only | false | Print information about system and exit }"
+        "{ device | device | 0 | Device on which tests will be executed }"
+        "{ cpu | cpu | false | Run tests on cpu }";
+
+    CommandLineParser cmd(argc, (const char**) argv, keys);
+
+    printOsInfo();
+    printCudaInfo();
+
+    if (cmd.get<bool>("print_info_only"))
+        return 0;
+
+    const int deviceId = cmd.get<int>("device");
+    bool useCpu = cmd.get<bool>("cpu");
+#ifndef HAVE_CUDA
+    // No CUDA support compiled in: only the CPU path can run.
+    useCpu = true;
+#endif
+
+    // Global switch read by the individual perf tests.
+    runOnGpu = !useCpu;
+
+    if (runOnGpu)
+    {
+        // The device count is queried only for non-negative indices.
+        const bool indexInRange = deviceId >= 0 && deviceId < getCudaEnabledDeviceCount();
+        if (!indexInRange)
+        {
+            cerr << "Incorrect device index - " << deviceId << endl;
+            return -1;
+        }
+
+        DeviceInfo info(deviceId);
+        if (!info.isCompatible())
+        {
+            cerr << "Device " << deviceId << " [" << info.name() << "] is NOT compatible with current GPU module build" << endl;
+            return -1;
+        }
+
+        setDevice(deviceId);
+
+        cout << "Run tests on device " << deviceId << " [" << info.name() << "] \n" << endl;
+    }
+    else
+    {
+        cout << "Run tests on CPU \n" << endl;
+    }
+
+    InitGoogleTest(&argc, argv);
+    perf::TestBase::Init(argc, argv);
+    return RUN_ALL_TESTS();
+}
--- a/modules/gpu/perf/perf_calib3d.cpp
+++ b/modules/gpu/perf/perf_calib3d.cpp
@ -1,219 +1,263 @@
 #include "perf_precomp.hpp"

-#ifdef HAVE_CUDA
+using namespace std;
+using namespace testing;
+
+namespace {

 //////////////////////////////////////////////////////////////////////
 // StereoBM

-GPU_PERF_TEST_1(StereoBM, cv::gpu::DeviceInfo)
+typedef pair<string, string> pair_string;
+DEF_PARAM_TEST_1(ImagePair, pair_string);
+
+PERF_TEST_P(ImagePair, Calib3D_StereoBM, Values(make_pair<string, string>("gpu/perf/aloe.jpg", "gpu/perf/aloeR.jpg")))
 {
-    cv::gpu::DeviceInfo devInfo = GetParam();
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(5.0);

-    cv::Mat img_l_host = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_l_host.empty());
+    const cv::Mat imgLeft = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(imgLeft.empty());

-    cv::Mat img_r_host = readImage("gpu/perf/aloeR.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_r_host.empty());
+    const cv::Mat imgRight = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(imgRight.empty());

-    cv::gpu::StereoBM_GPU bm(0, 256);
-    cv::gpu::GpuMat img_l(img_l_host);
-    cv::gpu::GpuMat img_r(img_r_host);
-    cv::gpu::GpuMat dst;
+    const int preset = 0;
+    const int ndisp = 256;

-    bm(img_l, img_r, dst);
+    if (runOnGpu)
+    {
+        cv::gpu::StereoBM_GPU d_bm(preset, ndisp);

-    declare.time(5.0);
+        cv::gpu::GpuMat d_imgLeft(imgLeft);
+        cv::gpu::GpuMat d_imgRight(imgRight);
+        cv::gpu::GpuMat d_dst;
+
+        d_bm(d_imgLeft, d_imgRight, d_dst);

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            d_bm(d_imgLeft, d_imgRight, d_dst);
+        }
+    }
+    else
    {
-        bm(img_l, img_r, dst);
+        cv::StereoBM bm(preset, ndisp);
+
+        cv::Mat dst;
+
+        bm(imgLeft, imgRight, dst);
+
+        TEST_CYCLE()
+        {
+            bm(imgLeft, imgRight, dst);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(Calib3D, StereoBM, ALL_DEVICES);
-
 //////////////////////////////////////////////////////////////////////
 // StereoBeliefPropagation

-GPU_PERF_TEST_1(StereoBeliefPropagation, cv::gpu::DeviceInfo)
+PERF_TEST_P(ImagePair, Calib3D_StereoBeliefPropagation, Values(make_pair<string, string>("gpu/stereobp/aloe-L.png", "gpu/stereobp/aloe-R.png")))
 {
-    cv::gpu::DeviceInfo devInfo = GetParam();
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(10.0);

-    cv::Mat img_l_host = readImage("gpu/stereobp/aloe-L.png");
-    ASSERT_FALSE(img_l_host.empty());
+    const cv::Mat imgLeft = readImage(GetParam().first);
+    ASSERT_FALSE(imgLeft.empty());

-    cv::Mat img_r_host = readImage("gpu/stereobp/aloe-R.png");
-    ASSERT_FALSE(img_r_host.empty());
+    const cv::Mat imgRight = readImage(GetParam().second);
+    ASSERT_FALSE(imgRight.empty());

-    cv::gpu::StereoBeliefPropagation bp(64);
-    cv::gpu::GpuMat img_l(img_l_host);
-    cv::gpu::GpuMat img_r(img_r_host);
-    cv::gpu::GpuMat dst;
+    const int ndisp = 64;

-    bp(img_l, img_r, dst);
+    if (runOnGpu)
+    {
+        cv::gpu::StereoBeliefPropagation d_bp(ndisp);

-    declare.time(10.0);
+        cv::gpu::GpuMat d_imgLeft(imgLeft);
+        cv::gpu::GpuMat d_imgRight(imgRight);
+        cv::gpu::GpuMat d_dst;

-    TEST_CYCLE()
+        d_bp(d_imgLeft, d_imgRight, d_dst);
+
+        TEST_CYCLE()
+        {
+            d_bp(d_imgLeft, d_imgRight, d_dst);
+        }
+    }
+    else
    {
-        bp(img_l, img_r, dst);
+        FAIL();
    }
 }

-INSTANTIATE_TEST_CASE_P(Calib3D, StereoBeliefPropagation, ALL_DEVICES);
-
 //////////////////////////////////////////////////////////////////////
 // StereoConstantSpaceBP

-GPU_PERF_TEST_1(StereoConstantSpaceBP, cv::gpu::DeviceInfo)
+PERF_TEST_P(ImagePair, Calib3D_StereoConstantSpaceBP, Values(make_pair<string, string>("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-R.png")))
 {
-    cv::gpu::DeviceInfo devInfo = GetParam();
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(10.0);

-    cv::Mat img_l_host = readImage("gpu/stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_l_host.empty());
+    const cv::Mat imgLeft = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(imgLeft.empty());

-    cv::Mat img_r_host = readImage("gpu/stereobm/aloe-R.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_r_host.empty());
+    const cv::Mat imgRight = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(imgRight.empty());

-    cv::gpu::StereoConstantSpaceBP csbp(128);
-    cv::gpu::GpuMat img_l(img_l_host);
-    cv::gpu::GpuMat img_r(img_r_host);
-    cv::gpu::GpuMat dst;
+    const int ndisp = 128;

-    csbp(img_l, img_r, dst);
+    if (runOnGpu)
+    {
+        cv::gpu::StereoConstantSpaceBP d_csbp(ndisp);

-    declare.time(10.0);
+        cv::gpu::GpuMat d_imgLeft(imgLeft);
+        cv::gpu::GpuMat d_imgRight(imgRight);
+        cv::gpu::GpuMat d_dst;
+
+        d_csbp(d_imgLeft, d_imgRight, d_dst);

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            d_csbp(d_imgLeft, d_imgRight, d_dst);
+        }
+    }
+    else
    {
-        csbp(img_l, img_r, dst);
+        FAIL();
    }
 }

-INSTANTIATE_TEST_CASE_P(Calib3D, StereoConstantSpaceBP, ALL_DEVICES);
-
 //////////////////////////////////////////////////////////////////////
 // DisparityBilateralFilter

-GPU_PERF_TEST_1(DisparityBilateralFilter, cv::gpu::DeviceInfo)
+PERF_TEST_P(ImagePair, Calib3D_DisparityBilateralFilter, Values(make_pair<string, string>("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-disp.png")))
 {
-    cv::gpu::DeviceInfo devInfo = GetParam();
-    cv::gpu::setDevice(devInfo.deviceID());
+    const cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(img.empty());
+
+    const cv::Mat disp = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(disp.empty());

-    cv::Mat img_host = readImage("gpu/stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_host.empty());
+    const int ndisp = 128;

-    cv::Mat disp_host = readImage("gpu/stereobm/aloe-disp.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(disp_host.empty());
+    if (runOnGpu)
+    {
+        cv::gpu::DisparityBilateralFilter d_filter(ndisp);

-    cv::gpu::DisparityBilateralFilter f(128);
-    cv::gpu::GpuMat img(img_host);
-    cv::gpu::GpuMat disp(disp_host);
-    cv::gpu::GpuMat dst;
+        cv::gpu::GpuMat d_img(img);
+        cv::gpu::GpuMat d_disp(disp);
+        cv::gpu::GpuMat d_dst;

-    f(disp, img, dst);
+        d_filter(d_disp, d_img, d_dst);

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            d_filter(d_disp, d_img, d_dst);
+        }
+    }
+    else
    {
-        f(disp, img, dst);
+        FAIL();
    }
 }

-INSTANTIATE_TEST_CASE_P(Calib3D, DisparityBilateralFilter, ALL_DEVICES);
-
 //////////////////////////////////////////////////////////////////////
 // TransformPoints

-IMPLEMENT_PARAM_CLASS(Count, int)
+DEF_PARAM_TEST_1(Count, int);

-GPU_PERF_TEST(TransformPoints, cv::gpu::DeviceInfo, Count)
+PERF_TEST_P(Count, Calib3D_TransformPoints, Values(5000, 10000, 20000))
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    const int count = GetParam();

-    int count = GET_PARAM(1);
+    cv::Mat src(1, count, CV_32FC3);
+    fillRandom(src, -100, 100);

-    cv::Mat src_host(1, count, CV_32FC3);
-    fill(src_host, -100, 100);
+    const cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1);
+    const cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);

-    cv::gpu::GpuMat src(src_host);
-    cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1);
-    cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);
-    cv::gpu::GpuMat dst;
+    if (runOnGpu)
+    {
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;

-    cv::gpu::transformPoints(src, rvec, tvec, dst);
+        cv::gpu::transformPoints(d_src, rvec, tvec, d_dst);

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            cv::gpu::transformPoints(d_src, rvec, tvec, d_dst);
+        }
+    }
+    else
    {
-        cv::gpu::transformPoints(src, rvec, tvec, dst);
+        FAIL();
    }
 }

-INSTANTIATE_TEST_CASE_P(Calib3D, TransformPoints, testing::Combine(
-    ALL_DEVICES,
-    testing::Values<Count>(5000, 10000, 20000)));
-
 //////////////////////////////////////////////////////////////////////
 // ProjectPoints

-GPU_PERF_TEST(ProjectPoints, cv::gpu::DeviceInfo, Count)
+PERF_TEST_P(Count, Calib3D_ProjectPoints, Values(5000, 10000, 20000))
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    const int count = GetParam();

-    int count = GET_PARAM(1);
+    cv::Mat src(1, count, CV_32FC3);
+    fillRandom(src, -100, 100);

-    cv::Mat src_host(1, count, CV_32FC3);
-    fill(src_host, -100, 100);
+    const cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1);
+    const cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);
+    const cv::Mat camera_mat = cv::Mat::ones(3, 3, CV_32FC1);

-    cv::gpu::GpuMat src(src_host);
-    cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1);
-    cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);
-    cv::Mat camera_mat = cv::Mat::ones(3, 3, CV_32FC1);
-    cv::gpu::GpuMat dst;
+    if (runOnGpu)
+    {
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;

-    cv::gpu::projectPoints(src, rvec, tvec, camera_mat, cv::Mat(), dst);
+        cv::gpu::projectPoints(d_src, rvec, tvec, camera_mat, cv::Mat(), d_dst);

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            cv::gpu::projectPoints(d_src, rvec, tvec, camera_mat, cv::Mat(), d_dst);
+        }
+    }
+    else
    {
-        cv::gpu::projectPoints(src, rvec, tvec, camera_mat, cv::Mat(), dst);
+        cv::Mat dst;
+
+        cv::projectPoints(src, rvec, tvec, camera_mat, cv::noArray(), dst);
+
+        TEST_CYCLE()
+        {
+            cv::projectPoints(src, rvec, tvec, camera_mat, cv::noArray(), dst);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(Calib3D, ProjectPoints, testing::Combine(
-    ALL_DEVICES,
-    testing::Values<Count>(5000, 10000, 20000)));
-
 //////////////////////////////////////////////////////////////////////
 // SolvePnPRansac

-GPU_PERF_TEST(SolvePnPRansac, cv::gpu::DeviceInfo, Count)
+PERF_TEST_P(Count, Calib3D_SolvePnPRansac, Values(5000, 10000, 20000))
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(10.0);

-    int count = GET_PARAM(1);
+    const int count = GetParam();

    cv::Mat object(1, count, CV_32FC3);
-    fill(object, -100, 100);
+    fillRandom(object, -100, 100);

    cv::Mat camera_mat(3, 3, CV_32FC1);
-    fill(camera_mat, 0.5, 1);
+    fillRandom(camera_mat, 0.5, 1);
    camera_mat.at<float>(0, 1) = 0.f;
    camera_mat.at<float>(1, 0) = 0.f;
    camera_mat.at<float>(2, 0) = 0.f;
    camera_mat.at<float>(2, 1) = 0.f;

-    cv::Mat dist_coef(1, 8, CV_32F, cv::Scalar::all(0));
+    const cv::Mat dist_coef(1, 8, CV_32F, cv::Scalar::all(0));

    std::vector<cv::Point2f> image_vec;
    cv::Mat rvec_gold(1, 3, CV_32FC1);
-    fill(rvec_gold, 0, 1);
+    fillRandom(rvec_gold, 0, 1);
    cv::Mat tvec_gold(1, 3, CV_32FC1);
-    fill(tvec_gold, 0, 1);
+    fillRandom(tvec_gold, 0, 1);
    cv::projectPoints(object, rvec_gold, tvec_gold, camera_mat, dist_coef, image_vec);

    cv::Mat image(1, count, CV_32FC2, &image_vec[0]);
@ -221,82 +265,92 @@ GPU_PERF_TEST(SolvePnPRansac, cv::gpu::DeviceInfo, Count)
    cv::Mat rvec;
    cv::Mat tvec;

-    cv::gpu::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
-
-    declare.time(3.0);
-
-    TEST_CYCLE()
+    if (runOnGpu)
    {
        cv::gpu::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
+
+        TEST_CYCLE()
+        {
+            cv::gpu::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
+        }
    }
-}
+    else
+    {
+        cv::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);

-INSTANTIATE_TEST_CASE_P(Calib3D, SolvePnPRansac, testing::Combine(
-    ALL_DEVICES,
-    testing::Values<Count>(5000, 10000, 20000)));
+        TEST_CYCLE()
+        {
+            cv::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
+        }
+    }
+}

 //////////////////////////////////////////////////////////////////////
 // ReprojectImageTo3D

-GPU_PERF_TEST(ReprojectImageTo3D, cv::gpu::DeviceInfo, cv::Size, MatDepth)
+PERF_TEST_P(Sz_Depth, Calib3D_ReprojectImageTo3D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16S)))
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);

-    cv::Size size = GET_PARAM(1);
-    int depth = GET_PARAM(2);
-
-    cv::Mat src_host(size, depth);
-    fill(src_host, 5.0, 30.0);
+    cv::Mat src(size, depth);
+    fillRandom(src, 5.0, 30.0);

    cv::Mat Q(4, 4, CV_32FC1);
-    fill(Q, 0.1, 1.0);
+    fillRandom(Q, 0.1, 1.0);

-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat dst;
+    if (runOnGpu)
+    {
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;

-    cv::gpu::reprojectImageTo3D(src, dst, Q);
+        cv::gpu::reprojectImageTo3D(d_src, d_dst, Q);

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            cv::gpu::reprojectImageTo3D(d_src, d_dst, Q);
+        }
+    }
+    else
    {
-        cv::gpu::reprojectImageTo3D(src, dst, Q);
+        cv::Mat dst;
+
+        cv::reprojectImageTo3D(src, dst, Q);
+
+        TEST_CYCLE()
+        {
+            cv::reprojectImageTo3D(src, dst, Q);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(Calib3D, ReprojectImageTo3D, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values<MatDepth>(CV_8U, CV_16S)));
-
 //////////////////////////////////////////////////////////////////////
 // DrawColorDisp

-GPU_PERF_TEST(DrawColorDisp, cv::gpu::DeviceInfo, cv::Size, MatDepth)
+PERF_TEST_P(Sz_Depth, Calib3D_DrawColorDisp, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16S)))
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
-
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
+    const cv::Size size = GET_PARAM(0);
+    const int type = GET_PARAM(1);

-    cv::Mat src_host(size, type);
-    fill(src_host, 0, 255);
+    cv::Mat src(size, type);
+    fillRandom(src, 0, 255);

-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat dst;
+    if (runOnGpu)
+    {
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;

-    cv::gpu::drawColorDisp(src, dst, 255);
+        cv::gpu::drawColorDisp(d_src, d_dst, 255);

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            cv::gpu::drawColorDisp(d_src, d_dst, 255);
+        }
+    }
+    else
    {
-        cv::gpu::drawColorDisp(src, dst, 255);
+        FAIL();
    }
 }

-INSTANTIATE_TEST_CASE_P(Calib3D, DrawColorDisp, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatDepth(CV_8U), MatDepth(CV_16S))));
-
-#endif
-
+} // namespace
--- a/modules/gpu/perf/perf_core.cpp
+++ b/modules/gpu/perf/perf_core.cpp
--- a/modules/gpu/perf/perf_features2d.cpp
+++ b/modules/gpu/perf/perf_features2d.cpp
@ -1,209 +1,278 @@
 #include "perf_precomp.hpp"

-#ifdef HAVE_CUDA
+using namespace std;
+using namespace testing;
+
+namespace {

 //////////////////////////////////////////////////////////////////////
 // SURF

-GPU_PERF_TEST_1(SURF, cv::gpu::DeviceInfo)
-{
-    cv::gpu::DeviceInfo devInfo = GetParam();
-    cv::gpu::setDevice(devInfo.deviceID());
+DEF_PARAM_TEST_1(Image, string);

-    cv::Mat img_host = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_host.empty());
+PERF_TEST_P(Image, Features2D_SURF, Values<string>("gpu/perf/aloe.jpg")) // benchmarks SURF on one grayscale image: SURF_GPU if runOnGpu, cv::SURF otherwise
+{
+    declare.time(50.0); // time budget handed to the perf framework (see OpenCV ts docs for units)

-    cv::gpu::SURF_GPU surf;
+    cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(img.empty()); // stop here if no image data was loaded

-    cv::gpu::GpuMat img(img_host);
-    cv::gpu::GpuMat keypoints, descriptors;
+    if (runOnGpu)
+    {
+        cv::gpu::SURF_GPU d_surf;

-    surf(img, cv::gpu::GpuMat(), keypoints, descriptors);
+        cv::gpu::GpuMat d_img(img);
+        cv::gpu::GpuMat d_keypoints, d_descriptors;

-    declare.time(2.0);
+        d_surf(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors); // one call outside TEST_CYCLE; empty GpuMat passed as the mask argument

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            d_surf(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
+        }
+    }
+    else
     {
-        surf(img, cv::gpu::GpuMat(), keypoints, descriptors);
+        cv::SURF surf;
+
+        std::vector<cv::KeyPoint> keypoints;
+        cv::Mat descriptors;
+
+        surf(img, cv::noArray(), keypoints, descriptors); // one call outside TEST_CYCLE, before the measured loop
+
+        TEST_CYCLE()
+        {
+            keypoints.clear(); // every iteration starts from an empty keypoint list
+            surf(img, cv::noArray(), keypoints, descriptors);
+        }
     }
 }

-INSTANTIATE_TEST_CASE_P(Features2D, SURF, ALL_DEVICES);
-
 //////////////////////////////////////////////////////////////////////
 // FAST

-GPU_PERF_TEST_1(FAST, cv::gpu::DeviceInfo)
+PERF_TEST_P(Image, Features2D_FAST, Values<string>("gpu/perf/aloe.jpg")) // benchmarks FAST with threshold 20: FAST_GPU if runOnGpu, cv::FAST otherwise
 {
-    cv::gpu::DeviceInfo devInfo = GetParam();
-    cv::gpu::setDevice(devInfo.deviceID());
+    cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(img.empty()); // stop here if no image data was loaded

-    cv::Mat img_host = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_host.empty());
-
-    cv::gpu::FAST_GPU fast(20);
+    if (runOnGpu)
+    {
+        cv::gpu::FAST_GPU d_fast(20);

-    cv::gpu::GpuMat img(img_host);
-    cv::gpu::GpuMat keypoints;
+        cv::gpu::GpuMat d_img(img);
+        cv::gpu::GpuMat d_keypoints;

-    fast(img, cv::gpu::GpuMat(), keypoints);
+        d_fast(d_img, cv::gpu::GpuMat(), d_keypoints); // one call outside TEST_CYCLE, before the measured loop

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            d_fast(d_img, cv::gpu::GpuMat(), d_keypoints);
+        }
+    }
+    else
     {
-        fast(img, cv::gpu::GpuMat(), keypoints);
+        std::vector<cv::KeyPoint> keypoints;
+
+        cv::FAST(img, keypoints, 20); // same threshold (20) as the GPU detector above
+
+        TEST_CYCLE()
+        {
+            keypoints.clear(); // every iteration starts from an empty keypoint list
+            cv::FAST(img, keypoints, 20);
+        }
     }
 }

-INSTANTIATE_TEST_CASE_P(Features2D, FAST, ALL_DEVICES);
-
 //////////////////////////////////////////////////////////////////////
 // ORB

-GPU_PERF_TEST_1(ORB, cv::gpu::DeviceInfo)
+PERF_TEST_P(Image, Features2D_ORB, Values<string>("gpu/perf/aloe.jpg")) // benchmarks ORB constructed with 4000: ORB_GPU if runOnGpu, cv::ORB otherwise
 {
-    cv::gpu::DeviceInfo devInfo = GetParam();
-    cv::gpu::setDevice(devInfo.deviceID());
+    cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(img.empty()); // stop here if no image data was loaded

-    cv::Mat img_host = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_host.empty());
+    if (runOnGpu)
+    {
+        cv::gpu::ORB_GPU d_orb(4000);

-    cv::gpu::ORB_GPU orb(4000);
+        cv::gpu::GpuMat d_img(img);
+        cv::gpu::GpuMat d_keypoints, d_descriptors;

-    cv::gpu::GpuMat img(img_host);
-    cv::gpu::GpuMat keypoints, descriptors;
+        d_orb(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors); // one call outside TEST_CYCLE (the removed version had none)

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            d_orb(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
+        }
+    }
+    else
     {
-        orb(img, cv::gpu::GpuMat(), keypoints, descriptors);
+        cv::ORB orb(4000);
+
+        std::vector<cv::KeyPoint> keypoints;
+        cv::Mat descriptors;
+
+        orb(img, cv::noArray(), keypoints, descriptors); // one call outside TEST_CYCLE, before the measured loop
+
+        TEST_CYCLE()
+        {
+            keypoints.clear(); // every iteration starts from an empty keypoint list
+            orb(img, cv::noArray(), keypoints, descriptors);
+        }
     }
 }

-INSTANTIATE_TEST_CASE_P(Features2D, ORB, ALL_DEVICES);
-
 //////////////////////////////////////////////////////////////////////
-// BruteForceMatcher_match
+// BFMatch

-IMPLEMENT_PARAM_CLASS(DescriptorSize, int)
+DEF_PARAM_TEST(DescSize_Norm, int, NormType);

-GPU_PERF_TEST(BruteForceMatcher_match, cv::gpu::DeviceInfo, DescriptorSize, NormType)
+PERF_TEST_P(DescSize_Norm, Features2D_BFMatch, Combine(Values(64, 128, 256), Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING)))) // benchmarks brute-force 1-NN matching of 3000 x 3000 random descriptors
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(20.0); // time budget handed to the perf framework

-    int desc_size = GET_PARAM(1);
-    int normType = GET_PARAM(2);
+    int desc_size = GET_PARAM(0);
+    int normType = GET_PARAM(1);

    int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;

-    cv::Mat query_host(3000, desc_size, type);
-    fill(query_host, 0.0, 10.0);
-
-    cv::Mat train_host(3000, desc_size, type);
-    fill(train_host, 0.0, 10.0);
+    cv::Mat query(3000, desc_size, type); // 3000 descriptors, one per row
+    fillRandom(query);

-    cv::gpu::BFMatcher_GPU matcher(normType);
+    cv::Mat train(3000, desc_size, type);
+    fillRandom(train);

-    cv::gpu::GpuMat query(query_host);
-    cv::gpu::GpuMat train(train_host);
-    cv::gpu::GpuMat trainIdx, distance;
+    if (runOnGpu)
+    {
+        cv::gpu::BFMatcher_GPU d_matcher(normType);

-    matcher.matchSingle(query, train, trainIdx, distance);
+        cv::gpu::GpuMat d_query(query);
+        cv::gpu::GpuMat d_train(train);
+        cv::gpu::GpuMat d_trainIdx, d_distance;

-    declare.time(3.0);
+        d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance); // one call outside TEST_CYCLE, before the measured loop

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
+        }
+    }
+    else
     {
-        matcher.matchSingle(query, train, trainIdx, distance);
+        cv::BFMatcher matcher(normType);
+
+        std::vector<cv::DMatch> matches;
+
+        matcher.match(query, train, matches); // one call outside TEST_CYCLE, before the measured loop
+
+        TEST_CYCLE()
+        {
+            matcher.match(query, train, matches);
+        }
     }
 }

-INSTANTIATE_TEST_CASE_P(Features2D, BruteForceMatcher_match, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(DescriptorSize(64), DescriptorSize(128), DescriptorSize(256)),
-    testing::Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))));
-
 //////////////////////////////////////////////////////////////////////
-// BruteForceMatcher_knnMatch
+// BFKnnMatch

-IMPLEMENT_PARAM_CLASS(K, int)
+DEF_PARAM_TEST(DescSize_K_Norm, int, int, NormType);

-GPU_PERF_TEST(BruteForceMatcher_knnMatch, cv::gpu::DeviceInfo, DescriptorSize, K, NormType)
+PERF_TEST_P(DescSize_K_Norm, Features2D_BFKnnMatch, Combine( // benchmarks brute-force k-NN matching of 3000 x 3000 random descriptors
+    Values(64, 128, 256),
+    Values(2, 3),
+    Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))))
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(30.0); // time budget handed to the perf framework

-    int desc_size = GET_PARAM(1);
-    int k = GET_PARAM(2);
-    int normType = GET_PARAM(3);
+    int desc_size = GET_PARAM(0);
+    int k = GET_PARAM(1);
+    int normType = GET_PARAM(2);

    int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;

-    cv::Mat query_host(3000, desc_size, type);
-    fill(query_host, 0.0, 10.0);
-
-    cv::Mat train_host(3000, desc_size, type);
-    fill(train_host, 0.0, 10.0);
+    cv::Mat query(3000, desc_size, type); // 3000 descriptors, one per row
+    fillRandom(query);

-    cv::gpu::BFMatcher_GPU matcher(normType);
+    cv::Mat train(3000, desc_size, type);
+    fillRandom(train);

-    cv::gpu::GpuMat query(query_host);
-    cv::gpu::GpuMat train(train_host);
-    cv::gpu::GpuMat trainIdx, distance, allDist;
+    if (runOnGpu)
+    {
+        cv::gpu::BFMatcher_GPU d_matcher(normType);

-    matcher.knnMatchSingle(query, train, trainIdx, distance, allDist, k);
+        cv::gpu::GpuMat d_query(query);
+        cv::gpu::GpuMat d_train(train);
+        cv::gpu::GpuMat d_trainIdx, d_distance, d_allDist;

-    declare.time(3.0);
+        d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, k); // one call outside TEST_CYCLE, before the measured loop

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, k);
+        }
+    }
+    else
     {
-        matcher.knnMatchSingle(query, train, trainIdx, distance, allDist, k);
+        cv::BFMatcher matcher(normType);
+
+        std::vector< std::vector<cv::DMatch> > matches;
+
+        matcher.knnMatch(query, train, matches, k); // one call outside TEST_CYCLE, before the measured loop
+
+        TEST_CYCLE()
+        {
+            matcher.knnMatch(query, train, matches, k);
+        }
     }
 }

-INSTANTIATE_TEST_CASE_P(Features2D, BruteForceMatcher_knnMatch, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(DescriptorSize(64), DescriptorSize(128), DescriptorSize(256)),
-    testing::Values(K(2), K(3)),
-    testing::Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))));
-
 //////////////////////////////////////////////////////////////////////
-// BruteForceMatcher_radiusMatch
+// BFRadiusMatch

-GPU_PERF_TEST(BruteForceMatcher_radiusMatch, cv::gpu::DeviceInfo, DescriptorSize, NormType)
+PERF_TEST_P(DescSize_Norm, Features2D_BFRadiusMatch, Combine(Values(64, 128, 256), Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING)))) // benchmarks brute-force radius matching (max distance 2.0) of 3000 x 3000 random descriptors
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(30.0); // time budget handed to the perf framework

-    int desc_size = GET_PARAM(1);
-    int normType = GET_PARAM(2);
+    int desc_size = GET_PARAM(0);
+    int normType = GET_PARAM(1);

    int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;

-    cv::Mat query_host(3000, desc_size, type);
-    fill(query_host, 0.0, 1.0);
-
-    cv::Mat train_host(3000, desc_size, type);
-    fill(train_host, 0.0, 1.0);
+    cv::Mat query(3000, desc_size, type);
+    fillRandom(query, 0.0, 1.0); // explicit 0..1 range, unlike the other matcher tests which use fillRandom's defaults

-    cv::gpu::BFMatcher_GPU matcher(normType);
+    cv::Mat train(3000, desc_size, type);
+    fillRandom(train, 0.0, 1.0);

-    cv::gpu::GpuMat query(query_host);
-    cv::gpu::GpuMat train(train_host);
-    cv::gpu::GpuMat trainIdx, nMatches, distance;
+    if (runOnGpu)
+    {
+        cv::gpu::BFMatcher_GPU d_matcher(normType);

-    matcher.radiusMatchSingle(query, train, trainIdx, distance, nMatches, 2.0);
+        cv::gpu::GpuMat d_query(query);
+        cv::gpu::GpuMat d_train(train);
+        cv::gpu::GpuMat d_trainIdx, d_nMatches, d_distance;

-    declare.time(3.0);
+        d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, 2.0); // one call outside TEST_CYCLE, before the measured loop

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, 2.0);
+        }
+    }
+    else
     {
-        matcher.radiusMatchSingle(query, train, trainIdx, distance, nMatches, 2.0);
+        cv::BFMatcher matcher(normType);
+
+        std::vector< std::vector<cv::DMatch> > matches;
+
+        matcher.radiusMatch(query, train, matches, 2.0); // one call outside TEST_CYCLE, before the measured loop
+
+        TEST_CYCLE()
+        {
+            matcher.radiusMatch(query, train, matches, 2.0);
+        }
     }
 }

-INSTANTIATE_TEST_CASE_P(Features2D, BruteForceMatcher_radiusMatch, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(DescriptorSize(64), DescriptorSize(128), DescriptorSize(256)),
-    testing::Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))));
-
-#endif
+} // namespace
--- a/modules/gpu/perf/perf_filters.cpp
+++ b/modules/gpu/perf/perf_filters.cpp
@ -1,308 +1,379 @@
 #include "perf_precomp.hpp"

-#ifdef HAVE_CUDA
+using namespace std;
+using namespace testing;
+
+namespace {

 //////////////////////////////////////////////////////////////////////
 // Blur

-IMPLEMENT_PARAM_CLASS(KernelSize, int)
+DEF_PARAM_TEST(Sz_Type_KernelSz, cv::Size, MatType, int);

-GPU_PERF_TEST(Blur, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
+PERF_TEST_P(Sz_Type_KernelSz, Filters_Blur, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4), Values(3, 5, 7))) // benchmarks blur with a square ksize x ksize kernel: cv::gpu::blur vs cv::blur
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(20.0); // time budget handed to the perf framework

-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int ksize = GET_PARAM(3);
+    cv::Size size = GET_PARAM(0);
+    int type = GET_PARAM(1);
+    int ksize = GET_PARAM(2);

-    cv::Mat src_host(size, type);
-    fill(src_host, 0.0, 255.0);
+    cv::Mat src(size, type);
+    fillRandom(src);

-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat dst;
+    if (runOnGpu)
+    {
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;

-    cv::gpu::blur(src, dst, cv::Size(ksize, ksize));
+        cv::gpu::blur(d_src, d_dst, cv::Size(ksize, ksize)); // one call outside TEST_CYCLE, before the measured loop

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            cv::gpu::blur(d_src, d_dst, cv::Size(ksize, ksize));
+        }
+    }
+    else
     {
-        cv::gpu::blur(src, dst, cv::Size(ksize, ksize));
+        cv::Mat dst;
+
+        cv::blur(src, dst, cv::Size(ksize, ksize)); // one call outside TEST_CYCLE, before the measured loop
+
+        TEST_CYCLE()
+        {
+            cv::blur(src, dst, cv::Size(ksize, ksize));
+        }
     }
 }

-INSTANTIATE_TEST_CASE_P(Filters, Blur, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)),
-    testing::Values(KernelSize(3), KernelSize(5), KernelSize(7))));
-
 //////////////////////////////////////////////////////////////////////
 // Sobel

-GPU_PERF_TEST(Sobel, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
+PERF_TEST_P(Sz_Type_KernelSz, Filters_Sobel, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1), Values(3, 5, 7, 9, 11, 13, 15))) // benchmarks Sobel with dx = dy = 1 and output depth -1: cv::gpu::Sobel vs cv::Sobel
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(20.0); // time budget handed to the perf framework

-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int ksize = GET_PARAM(3);
+    cv::Size size = GET_PARAM(0);
+    int type = GET_PARAM(1);
+    int ksize = GET_PARAM(2);

-    cv::Mat src_host(size, type);
-    fill(src_host, 0.0, 255.0);
+    cv::Mat src(size, type);
+    fillRandom(src);

-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat dst;
-    cv::gpu::GpuMat buf;
+    if (runOnGpu)
+    {
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;
+        cv::gpu::GpuMat d_buf; // extra buffer argument taken by the GPU overload; the same object is passed on every call

-    cv::gpu::Sobel(src, dst, -1, 1, 1, buf, ksize);
+        cv::gpu::Sobel(d_src, d_dst, -1, 1, 1, d_buf, ksize); // one call outside TEST_CYCLE, before the measured loop

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            cv::gpu::Sobel(d_src, d_dst, -1, 1, 1, d_buf, ksize);
+        }
+    }
+    else
     {
-        cv::gpu::Sobel(src, dst, -1, 1, 1, buf, ksize);
+        cv::Mat dst;
+
+        cv::Sobel(src, dst, -1, 1, 1, ksize); // one call outside TEST_CYCLE, before the measured loop
+
+        TEST_CYCLE()
+        {
+            cv::Sobel(src, dst, -1, 1, 1, ksize);
+        }
     }
 }

-INSTANTIATE_TEST_CASE_P(Filters, Sobel, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
-    testing::Values(KernelSize(3), KernelSize(5), KernelSize(7), KernelSize(9), KernelSize(11), KernelSize(13), KernelSize(15))));
-
 //////////////////////////////////////////////////////////////////////
 // Scharr

-GPU_PERF_TEST(Scharr, cv::gpu::DeviceInfo, cv::Size, MatType)
+PERF_TEST_P(Sz_Type, Filters_Scharr, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1))) // benchmarks Scharr with dx = 1, dy = 0 and output depth -1: cv::gpu::Scharr vs cv::Scharr
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(20.0); // time budget handed to the perf framework

-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
+    cv::Size size = GET_PARAM(0);
+    int type = GET_PARAM(1);

-    cv::Mat src_host(size, type);
-    fill(src_host, 0.0, 255.0);
+    cv::Mat src(size, type);
+    fillRandom(src);

-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat dst;
-    cv::gpu::GpuMat buf;
+    if (runOnGpu)
+    {
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;
+        cv::gpu::GpuMat d_buf; // extra buffer argument taken by the GPU overload; the same object is passed on every call

-    cv::gpu::Scharr(src, dst, -1, 1, 0, buf);
+        cv::gpu::Scharr(d_src, d_dst, -1, 1, 0, d_buf); // one call outside TEST_CYCLE, before the measured loop

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            cv::gpu::Scharr(d_src, d_dst, -1, 1, 0, d_buf);
+        }
+    }
+    else
     {
-        cv::gpu::Scharr(src, dst, -1, 1, 0, buf);
+        cv::Mat dst;
+
+        cv::Scharr(src, dst, -1, 1, 0); // one call outside TEST_CYCLE, before the measured loop
+
+        TEST_CYCLE()
+        {
+            cv::Scharr(src, dst, -1, 1, 0);
+        }
     }
 }

-INSTANTIATE_TEST_CASE_P(Filters, Scharr, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1))));
-
 //////////////////////////////////////////////////////////////////////
 // GaussianBlur

-GPU_PERF_TEST(GaussianBlur, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
+PERF_TEST_P(Sz_Type_KernelSz, Filters_GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1), Values(3, 5, 7, 9, 11, 13, 15))) // benchmarks Gaussian blur (square kernel, sigma argument 0.5): cv::gpu::GaussianBlur vs cv::GaussianBlur
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(20.0); // time budget handed to the perf framework

-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int ksize = GET_PARAM(3);
+    cv::Size size = GET_PARAM(0);
+    int type = GET_PARAM(1);
+    int ksize = GET_PARAM(2);

-    cv::Mat src_host(size, type);
-    fill(src_host, 0.0, 255.0);
+    cv::Mat src(size, type);
+    fillRandom(src);

-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat dst;
-    cv::gpu::GpuMat buf;
+    if (runOnGpu)
+    {
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;
+        cv::gpu::GpuMat d_buf; // extra buffer argument taken by the GPU overload; the same object is passed on every call

-    cv::gpu::GaussianBlur(src, dst, cv::Size(ksize, ksize), buf, 0.5);
+        cv::gpu::GaussianBlur(d_src, d_dst, cv::Size(ksize, ksize), d_buf, 0.5); // one call outside TEST_CYCLE, before the measured loop

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            cv::gpu::GaussianBlur(d_src, d_dst, cv::Size(ksize, ksize), d_buf, 0.5);
+        }
+    }
+    else
     {
-        cv::gpu::GaussianBlur(src, dst, cv::Size(ksize, ksize), buf, 0.5);
+        cv::Mat dst;
+
+        cv::GaussianBlur(src, dst, cv::Size(ksize, ksize), 0.5); // one call outside TEST_CYCLE, before the measured loop
+
+        TEST_CYCLE()
+        {
+            cv::GaussianBlur(src, dst, cv::Size(ksize, ksize), 0.5);
+        }
     }
 }

-INSTANTIATE_TEST_CASE_P(Filters, GaussianBlur, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
-    testing::Values(KernelSize(3), KernelSize(5), KernelSize(7), KernelSize(9), KernelSize(11), KernelSize(13), KernelSize(15))));
-
 //////////////////////////////////////////////////////////////////////
 // Laplacian

-GPU_PERF_TEST(Laplacian, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
+PERF_TEST_P(Sz_Type_KernelSz, Filters_Laplacian, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 3))) // benchmarks Laplacian with output depth -1: cv::gpu::Laplacian vs cv::Laplacian
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(20.0); // time budget handed to the perf framework

-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int ksize = GET_PARAM(3);
+    cv::Size size = GET_PARAM(0);
+    int type = GET_PARAM(1);
+    int ksize = GET_PARAM(2);

-    cv::Mat src_host(size, type);
-    fill(src_host, 0.0, 255.0);
+    cv::Mat src(size, type);
+    fillRandom(src);

-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat dst;
+    if (runOnGpu)
+    {
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;

-    cv::gpu::Laplacian(src, dst, -1, ksize);
+        cv::gpu::Laplacian(d_src, d_dst, -1, ksize); // one call outside TEST_CYCLE, before the measured loop

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            cv::gpu::Laplacian(d_src, d_dst, -1, ksize);
+        }
+    }
+    else
     {
-        cv::gpu::Laplacian(src, dst, -1, ksize);
+        cv::Mat dst;
+
+        cv::Laplacian(src, dst, -1, ksize); // one call outside TEST_CYCLE, before the measured loop
+
+        TEST_CYCLE()
+        {
+            cv::Laplacian(src, dst, -1, ksize);
+        }
     }
 }

-INSTANTIATE_TEST_CASE_P(Filters, Laplacian, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
-    testing::Values(KernelSize(1), KernelSize(3))));
-
 //////////////////////////////////////////////////////////////////////
 // Erode

-GPU_PERF_TEST(Erode, cv::gpu::DeviceInfo, cv::Size, MatType)
+PERF_TEST_P(Sz_Type, Filters_Erode, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4))) // benchmarks erosion with a 3x3 rectangular element: cv::gpu::erode vs cv::erode
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(20.0); // time budget handed to the perf framework

-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
+    cv::Size size = GET_PARAM(0);
+    int type = GET_PARAM(1);

-    cv::Mat src_host(size, type);
-    fill(src_host, 0.0, 255.0);
+    cv::Mat src(size, type);
+    fillRandom(src);

    cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));

-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat dst;
-    cv::gpu::GpuMat buf;
+    if (runOnGpu)
+    {
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;
+        cv::gpu::GpuMat d_buf; // extra buffer argument taken by the GPU overload; the same object is passed on every call

-    cv::gpu::erode(src, dst, ker, buf);
+        cv::gpu::erode(d_src, d_dst, ker, d_buf); // one call outside TEST_CYCLE; ker is the host-side cv::Mat in both branches

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            cv::gpu::erode(d_src, d_dst, ker, d_buf);
+        }
+    }
+    else
     {
-        cv::gpu::erode(src, dst, ker, buf);
+        cv::Mat dst;
+
+        cv::erode(src, dst, ker); // one call outside TEST_CYCLE, before the measured loop
+
+        TEST_CYCLE()
+        {
+            cv::erode(src, dst, ker);
+        }
     }
 }

-INSTANTIATE_TEST_CASE_P(Filters, Erode, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4))));
-
 //////////////////////////////////////////////////////////////////////
 // Dilate

-GPU_PERF_TEST(Dilate, cv::gpu::DeviceInfo, cv::Size, MatType)
+PERF_TEST_P(Sz_Type, Filters_Dilate, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4))) // benchmarks dilation with a 3x3 rectangular element: cv::gpu::dilate vs cv::dilate
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(20.0); // time budget handed to the perf framework

-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
+    cv::Size size = GET_PARAM(0);
+    int type = GET_PARAM(1);

-    cv::Mat src_host(size, type);
-    fill(src_host, 0.0, 255.0);
+    cv::Mat src(size, type);
+    fillRandom(src);

    cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));

-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat dst;
-    cv::gpu::GpuMat buf;
+    if (runOnGpu)
+    {
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;
+        cv::gpu::GpuMat d_buf; // extra buffer argument taken by the GPU overload; the same object is passed on every call

-    cv::gpu::dilate(src, dst, ker, buf);
+        cv::gpu::dilate(d_src, d_dst, ker, d_buf); // one call outside TEST_CYCLE; ker is the host-side cv::Mat in both branches

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            cv::gpu::dilate(d_src, d_dst, ker, d_buf);
+        }
+    }
+    else
     {
-        cv::gpu::dilate(src, dst, ker, buf);
+        cv::Mat dst;
+
+        cv::dilate(src, dst, ker); // one call outside TEST_CYCLE, before the measured loop
+
+        TEST_CYCLE()
+        {
+            cv::dilate(src, dst, ker);
+        }
     }
 }

-INSTANTIATE_TEST_CASE_P(Filters, Dilate, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4))));
-
 //////////////////////////////////////////////////////////////////////
 // MorphologyEx

 CV_ENUM(MorphOp, cv::MORPH_OPEN, cv::MORPH_CLOSE, cv::MORPH_GRADIENT, cv::MORPH_TOPHAT, cv::MORPH_BLACKHAT)
-#define ALL_MORPH_OPS testing::Values(MorphOp(cv::MORPH_OPEN), MorphOp(cv::MORPH_CLOSE), MorphOp(cv::MORPH_GRADIENT), MorphOp(cv::MORPH_TOPHAT), MorphOp(cv::MORPH_BLACKHAT))
+#define ALL_MORPH_OPS ValuesIn(MorphOp::all())

-GPU_PERF_TEST(MorphologyEx, cv::gpu::DeviceInfo, cv::Size, MatType, MorphOp)
+DEF_PARAM_TEST(Sz_Type_Op, cv::Size, MatType, MorphOp);
+
+PERF_TEST_P(Sz_Type_Op, Filters_MorphologyEx, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4), ALL_MORPH_OPS)) // benchmarks morphologyEx for every MorphOp with a 3x3 rectangular element
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(20.0); // time budget handed to the perf framework

-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int morphOp = GET_PARAM(3);
+    cv::Size size = GET_PARAM(0);
+    int type = GET_PARAM(1);
+    int morphOp = GET_PARAM(2);

-    cv::Mat src_host(size, type);
-    fill(src_host, 0.0, 255.0);
+    cv::Mat src(size, type);
+    fillRandom(src);

    cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));

-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat dst;
-    cv::gpu::GpuMat buf1;
-    cv::gpu::GpuMat buf2;
+    if (runOnGpu)
+    {
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;
+        cv::gpu::GpuMat d_buf1; // two extra buffer arguments taken by the GPU overload; the same objects are passed on every call
+        cv::gpu::GpuMat d_buf2;

-    cv::gpu::morphologyEx(src, dst, morphOp, ker, buf1, buf2);
+        cv::gpu::morphologyEx(d_src, d_dst, morphOp, ker, d_buf1, d_buf2); // one call outside TEST_CYCLE, before the measured loop

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            cv::gpu::morphologyEx(d_src, d_dst, morphOp, ker, d_buf1, d_buf2);
+        }
+    }
+    else
     {
-        cv::gpu::morphologyEx(src, dst, morphOp, ker, buf1, buf2);
+        cv::Mat dst;
+
+        cv::morphologyEx(src, dst, morphOp, ker); // one call outside TEST_CYCLE, before the measured loop
+
+        TEST_CYCLE()
+        {
+            cv::morphologyEx(src, dst, morphOp, ker);
+        }
     }
 }

-INSTANTIATE_TEST_CASE_P(Filters, MorphologyEx, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)),
-    ALL_MORPH_OPS));
-
 //////////////////////////////////////////////////////////////////////
 // Filter2D

-GPU_PERF_TEST(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
+PERF_TEST_P(Sz_Type_KernelSz, Filters_Filter2D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(3, 5, 7, 9, 11, 13, 15))) // benchmarks filter2D with a random ksize x ksize CV_32FC1 kernel: cv::gpu::filter2D vs cv::filter2D
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    declare.time(20.0); // time budget handed to the perf framework

-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int ksize = GET_PARAM(3);
+    cv::Size size = GET_PARAM(0);
+    int type = GET_PARAM(1);
+    int ksize = GET_PARAM(2);

-    cv::Mat src_host(size, type);
-    fill(src_host, 0.0, 255.0);
+    cv::Mat src(size, type);
+    fillRandom(src);

    cv::Mat kernel(ksize, ksize, CV_32FC1);
-    fill(kernel, 0.0, 1.0);
+    fillRandom(kernel, 0.0, 1.0); // kernel coefficients drawn with explicit bounds 0.0 and 1.0

-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat dst;
+    if (runOnGpu)
+    {
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;

-    cv::gpu::filter2D(src, dst, -1, kernel);
+        cv::gpu::filter2D(d_src, d_dst, -1, kernel); // one call outside TEST_CYCLE; kernel stays a host-side cv::Mat

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            cv::gpu::filter2D(d_src, d_dst, -1, kernel);
+        }
+    }
+    else
     {
-        cv::gpu::filter2D(src, dst, -1, kernel);
+        cv::Mat dst;
+
+        cv::filter2D(src, dst, -1, kernel); // one call outside TEST_CYCLE, before the measured loop
+
+        TEST_CYCLE()
+        {
+            cv::filter2D(src, dst, -1, kernel);
+        }
     }
 }

-INSTANTIATE_TEST_CASE_P(Filters, Filter2D, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
-    testing::Values(KernelSize(3), KernelSize(5), KernelSize(7), KernelSize(9), KernelSize(11), KernelSize(13), KernelSize(15))));
-
-#endif
+} // namespace
--- a/modules/gpu/perf/perf_imgproc.cpp
+++ b/modules/gpu/perf/perf_imgproc.cpp
--- a/modules/gpu/perf/perf_labeling.cpp
+++ b/modules/gpu/perf/perf_labeling.cpp
@ -1,75 +1,141 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-//                          License Agreement
-//               For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//  * Redistributions of source code must retain the above copyright notice,
-//    this list of conditions and the following disclaimer.
-//
-//  * Redistributions in binary form must reproduce the above copyright notice,
-//    this list of conditions and the following disclaimer in the documentation
-//    and/or other materials provided with the distribution.
-//
-//  * The name of the copyright holders may not be used to endorse or promote products
-//    derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//M*/
-
 #include "perf_precomp.hpp"

-#ifdef HAVE_CUDA
+using namespace std;
+using namespace testing;
+
+namespace {
+
+DEF_PARAM_TEST_1(Image, string);

-GPU_PERF_TEST(ConnectedComponents, cv::gpu::DeviceInfo, cv::Size)
+struct GreedyLabeling
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    // Pixel coordinate stored on the flood-fill work stack.
+    struct dot
+    {
+        int x;
+        int y;
+
+        static dot make(int i, int j)
+        {
+            dot d; d.x = i; d.y = j;
+            return d;
+        }
+    };
+
+    // Predicate: true when (a - b) lies in the closed interval [-_lo, _hi].
+    // The test is not symmetric in a and b unless _lo == _hi.
+    struct InInterval
+    {
+        InInterval(const int& _lo, const int& _hi) : lo(-_lo), hi(_hi) {}
+        const int lo, hi;
+
+        bool operator() (const unsigned char a, const unsigned char b) const
+        {
+            int d = a - b;
+            return lo <= d && d <= hi;
+        }
+
+    private:
+        // Declared, never defined: the const members cannot be assigned.
+        InInterval& operator=(const InInterval&);
+    };
+
+    // Stores img, creates a label image of the same size initialised to -1
+    // and allocates one work-stack slot per pixel.
+    GreedyLabeling(cv::Mat img)
+    : image(img), _labels(image.size(), CV_32SC1, cv::Scalar::all(-1)) {stack = new dot[image.cols * image.rows];}
+
+    ~GreedyLabeling(){delete[] stack;}
+
+    // CPU reference labeling: resets `labels` to -1, then scans the image in
+    // row-major order and flood-fills (4-neighbourhood) from every pixel that
+    // is still unlabelled, giving each fill the next id 0, 1, 2, ...
+    // A neighbour joins the fill when inInt(current, neighbour) holds with
+    // the interval (0, 2).
+    // NOTE(review): `image` is read as unsigned char and `labels` as int, so
+    // this assumes an 8-bit single-channel image and a CV_32SC1 label matrix
+    // at least as large as the image - confirm at call sites.
+    void operator() (cv::Mat labels) const
+    {
+        labels.setTo(cv::Scalar::all(-1));
+        InInterval inInt(0, 2);
+        int cc = -1;
+
+        int* dist_labels = (int*)labels.data;
+        const int pitch = static_cast<int>(labels.step1());
+
+        const unsigned char* source = (const unsigned char*)image.data;
+        const int srcPitch = static_cast<int>(image.step1());
+        const int width = image.cols;
+        const int height = image.rows;
+
+        for (int j = 0; j < height; ++j)
+            for (int i = 0; i < width; ++i)
+            {
+                if (dist_labels[j * pitch + i] != -1) continue;

-    cv::Mat image = readImage("gpu/labeling/aloe-disp.png", cv::IMREAD_GRAYSCALE);
+                // Seed a new component. Pixels are labelled when they are
+                // pushed, so each pixel enters the stack at most once and the
+                // cols * rows capacity can never be exceeded.
+                dist_labels[j * pitch + i] = ++cc;
+                dot* top = stack;
+                *top++ = dot::make(i, j);

-    // cv::threshold(image, image, 150, 255, CV_THRESH_BINARY);
+                // Pop only while the stack is non-empty; nothing below
+                // stack[0] is ever read.
+                while (top != stack)
+                {
+                    const dot p = *--top;
+                    int* dl = &dist_labels[p.y * pitch + p.x];
+                    const unsigned char* sp = &source[p.y * srcPitch + p.x];

-    cv::gpu::GpuMat mask;
-    mask.create(image.rows, image.cols, CV_8UC1);
+                    //right
+                    if (p.x < (width - 1) && dl[+1] == -1 && inInt(sp[0], sp[+1]))
+                    {
+                        dl[+1] = cc;
+                        *top++ = dot::make(p.x + 1, p.y);
+                    }

-    cv::gpu::GpuMat components;
-    components.create(image.rows, image.cols, CV_32SC1);
+                    //left
+                    if (p.x > 0 && dl[-1] == -1 && inInt(sp[0], sp[-1]))
+                    {
+                        dl[-1] = cc;
+                        *top++ = dot::make(p.x - 1, p.y);
+                    }

-    cv::gpu::connectivityMask(cv::gpu::GpuMat(image), mask, cv::Scalar::all(0), cv::Scalar::all(2));
+                    //bottom
+                    if (p.y < (height - 1) && dl[+pitch] == -1 && inInt(sp[0], sp[+srcPitch]))
+                    {
+                        dl[+pitch] = cc;
+                        *top++ = dot::make(p.x, p.y + 1);
+                    }

-    ASSERT_NO_THROW(cv::gpu::labelComponents(mask, components));
+                    //top
+                    if (p.y > 0 && dl[-pitch] == -1 && inInt(sp[0], sp[-srcPitch]))
+                    {
+                        dl[-pitch] = cc;
+                        *top++ = dot::make(p.x, p.y - 1);
+                    }
+                }
+            }
+    }

+    cv::Mat image;   // input image handed to the constructor
+    cv::Mat _labels; // CV_32SC1 label image, same size as image
+    dot* stack;      // work stack owned by this object (new[] / delete[])
+
+private:
+    // Not copyable: `stack` is an owning raw pointer, so a member-wise copy
+    // would delete[] the same buffer twice. Declared, never defined.
+    GreedyLabeling(const GreedyLabeling&);
+    GreedyLabeling& operator=(const GreedyLabeling&);
+};
+
+// Benchmarks connected-component labeling on one grayscale image:
+// cv::gpu::labelComponents when runOnGpu, the GreedyLabeling reference otherwise.
+PERF_TEST_P(Image, Labeling_ConnectedComponents, Values<string>("gpu/labeling/aloe-disp.png"))
+{
    declare.time(1.0);

-    TEST_CYCLE()
+    cv::Mat image = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
+    // Same guard as the other image-based tests: do not benchmark an empty Mat.
+    ASSERT_FALSE(image.empty());
+
+    if (runOnGpu)
    {
-        cv::gpu::labelComponents(mask, components);
+        cv::gpu::GpuMat mask;
+        mask.create(image.rows, image.cols, CV_8UC1);
+
+        cv::gpu::GpuMat components;
+        components.create(image.rows, image.cols, CV_32SC1);
+
+        // The mask is built once, outside the measured loop; only
+        // labelComponents is placed inside TEST_CYCLE.
+        cv::gpu::connectivityMask(cv::gpu::GpuMat(image), mask, cv::Scalar::all(0), cv::Scalar::all(2));
+
+        ASSERT_NO_THROW(cv::gpu::labelComponents(mask, components));
+
+        TEST_CYCLE()
+        {
+            cv::gpu::labelComponents(mask, components);
+        }
    }
-}
+    else
+    {
+        GreedyLabeling host(image);

-INSTANTIATE_TEST_CASE_P(Labeling, ConnectedComponents, testing::Combine(ALL_DEVICES, testing::Values(cv::Size(261, 262))));
+        // One call outside TEST_CYCLE, before the measured loop.
+        host(host._labels);
+
+        TEST_CYCLE()
+        {
+            host(host._labels);
+        }
+    }
+}

-#endif
+} // namespace
--- a/modules/gpu/perf/perf_main.cpp
+++ b/modules/gpu/perf/perf_main.cpp
@ -1,20 +0,0 @@
-#include "perf_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-int main(int argc, char **argv)
-{
-    testing::InitGoogleTest(&argc, argv);
-    perf::TestBase::Init(argc, argv);
-    return RUN_ALL_TESTS();
-}
-
-#else
-
-int main()
-{
-    printf("OpenCV was built without CUDA support\n");
-    return 0;
-}
-
-#endif
--- a/modules/gpu/perf/perf_matop.cpp
+++ b/modules/gpu/perf/perf_matop.cpp
@ -1,141 +1,169 @@
 #include "perf_precomp.hpp"

-#ifdef HAVE_CUDA
+using namespace std;
+using namespace testing;
+
+namespace {

 //////////////////////////////////////////////////////////////////////
 // SetTo

-GPU_PERF_TEST(SetTo, cv::gpu::DeviceInfo, cv::Size, MatType)
+PERF_TEST_P(Sz_Depth_Cn, MatOp_SetTo, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), Values(1, 3, 4)))
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    cv::Size size = GET_PARAM(0);
+    int depth = GET_PARAM(1);
+    int channels = GET_PARAM(2);

-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
+    int type = CV_MAKE_TYPE(depth, channels);

-    cv::gpu::GpuMat src(size, type);
    cv::Scalar val(1, 2, 3, 4);

-    src.setTo(val);
+    if (runOnGpu)
+    {
+        cv::gpu::GpuMat d_src(size, type);
+
+        d_src.setTo(val);

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            d_src.setTo(val);
+        }
+    }
+    else
    {
+        cv::Mat src(size, type);
+
        src.setTo(val);
+
+        TEST_CYCLE()
+        {
+            src.setTo(val);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(MatOp, SetTo, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4),
-                    MatType(CV_64FC1), MatType(CV_64FC3), MatType(CV_64FC4))));
-
 //////////////////////////////////////////////////////////////////////
 // SetToMasked

-GPU_PERF_TEST(SetToMasked, cv::gpu::DeviceInfo, cv::Size, MatType)
+PERF_TEST_P(Sz_Depth_Cn, MatOp_SetToMasked, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), Values(1, 3, 4)))
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    cv::Size size = GET_PARAM(0);
+    int depth = GET_PARAM(1);
+    int channels = GET_PARAM(2);

-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
+    int type = CV_MAKE_TYPE(depth, channels);

-    cv::Mat src_host(size, type);
-    fill(src_host, 0, 255);
+    cv::Mat src(size, type);
+    fillRandom(src);

-    cv::Mat mask_host(size, CV_8UC1);
-    fill(mask_host, 0, 2);
+    cv::Mat mask(size, CV_8UC1);
+    fillRandom(mask, 0, 2);

-    cv::gpu::GpuMat src(src_host);
    cv::Scalar val(1, 2, 3, 4);
-    cv::gpu::GpuMat mask(mask_host);

-    src.setTo(val, mask);
+    if (runOnGpu)
+    {
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_mask(mask);
+
+        d_src.setTo(val, d_mask);

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            d_src.setTo(val, d_mask);
+        }
+    }
+    else
    {
        src.setTo(val, mask);
+
+        TEST_CYCLE()
+        {
+            src.setTo(val, mask);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(MatOp, SetToMasked, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4),
-                    MatType(CV_64FC1), MatType(CV_64FC3), MatType(CV_64FC4))));
-
 //////////////////////////////////////////////////////////////////////
 // CopyToMasked

-GPU_PERF_TEST(CopyToMasked, cv::gpu::DeviceInfo, cv::Size, MatType)
+PERF_TEST_P(Sz_Depth_Cn, MatOp_CopyToMasked, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), Values(1, 3, 4)))
 {
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+    cv::Size size = GET_PARAM(0);
+    int depth = GET_PARAM(1);
+    int channels = GET_PARAM(2);

-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
+    int type = CV_MAKE_TYPE(depth, channels);

-    cv::Mat src_host(size, type);
-    fill(src_host, 0, 255);
+    cv::Mat src(size, type);
+    fillRandom(src);

-    cv::Mat mask_host(size, CV_8UC1);
-    fill(mask_host, 0, 2);
+    cv::Mat mask(size, CV_8UC1);
+    fillRandom(mask, 0, 2);

-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat mask(mask_host);
-    cv::gpu::GpuMat dst;
+    if (runOnGpu)
+    {
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_mask(mask);
+        cv::gpu::GpuMat d_dst;

-    src.copyTo(dst, mask);
+        d_src.copyTo(d_dst, d_mask);

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            d_src.copyTo(d_dst, d_mask);
+        }
+    }
+    else
    {
+        cv::Mat dst;
+
        src.copyTo(dst, mask);
+
+        TEST_CYCLE()
+        {
+            src.copyTo(dst, mask);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(MatOp, CopyToMasked, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4),
-                    MatType(CV_64FC1), MatType(CV_64FC3), MatType(CV_64FC4))));
-
 //////////////////////////////////////////////////////////////////////
 // ConvertTo

-GPU_PERF_TEST(ConvertTo, cv::gpu::DeviceInfo, cv::Size, MatDepth, MatDepth)
-{
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
+DEF_PARAM_TEST(Sz_2Depth, cv::Size, MatDepth, MatDepth);

-    cv::Size size = GET_PARAM(1);
-    int depth1 = GET_PARAM(2);
-    int depth2 = GET_PARAM(3);
+PERF_TEST_P(Sz_2Depth, MatOp_ConvertTo, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), Values(CV_8U, CV_16U, CV_32F, CV_64F)))
+{
+    cv::Size size = GET_PARAM(0);
+    int depth1 = GET_PARAM(1);
+    int depth2 = GET_PARAM(2);

-    cv::Mat src_host(size, depth1);
-    fill(src_host, 0, 255);
+    cv::Mat src(size, depth1);
+    fillRandom(src);

-    cv::gpu::GpuMat src(src_host);
-    cv::gpu::GpuMat dst;
+    if (runOnGpu)
+    {
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;

-    src.convertTo(dst, depth2, 0.5, 1.0);
+        d_src.convertTo(d_dst, depth2, 0.5, 1.0);

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            d_src.convertTo(d_dst, depth2, 0.5, 1.0);
+        }
+    }
+    else
    {
+        cv::Mat dst;
+
        src.convertTo(dst, depth2, 0.5, 1.0);
+
+        TEST_CYCLE()
+        {
+            src.convertTo(dst, depth2, 0.5, 1.0);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(MatOp, ConvertTo, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F), MatDepth(CV_64F)),
-    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F), MatDepth(CV_64F))));
-
-#endif
+} // namespace
--- a/modules/gpu/perf/perf_objdetect.cpp
+++ b/modules/gpu/perf/perf_objdetect.cpp
@ -1,85 +1,131 @@
 #include "perf_precomp.hpp"

-#ifdef HAVE_CUDA
+using namespace std;
+using namespace testing;
+
+namespace {

 ///////////////////////////////////////////////////////////////
 // HOG

-GPU_PERF_TEST_1(HOG, cv::gpu::DeviceInfo)
-{
-    cv::gpu::DeviceInfo devInfo = GetParam();
-    cv::gpu::setDevice(devInfo.deviceID());
+DEF_PARAM_TEST_1(Image, string);

-    cv::Mat img_host = readImage("gpu/hog/road.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_host.empty());
+PERF_TEST_P(Image, ObjDetect_HOG, Values<string>("gpu/hog/road.png"))
+{
+    cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(img.empty());

-    cv::gpu::GpuMat img(img_host);
    std::vector<cv::Rect> found_locations;

-    cv::gpu::HOGDescriptor hog;
-    hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
+    if (runOnGpu)
+    {
+        cv::gpu::GpuMat d_img(img);
+
+        cv::gpu::HOGDescriptor d_hog;
+        d_hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());

-    hog.detectMultiScale(img, found_locations);
+        d_hog.detectMultiScale(d_img, found_locations);

-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            d_hog.detectMultiScale(d_img, found_locations);
+        }
+    }
+    else
    {
+        cv::HOGDescriptor hog;
+        hog.setSVMDetector(cv::HOGDescriptor::getDefaultPeopleDetector()); // CPU detector: the CPU baseline must not call into the gpu module
+
        hog.detectMultiScale(img, found_locations);
+
+        TEST_CYCLE()
+        {
+            hog.detectMultiScale(img, found_locations);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(ObjDetect, HOG, ALL_DEVICES);
-
 ///////////////////////////////////////////////////////////////
 // HaarClassifier

-GPU_PERF_TEST_1(HaarClassifier, cv::gpu::DeviceInfo)
+typedef pair<string, string> pair_string;
+DEF_PARAM_TEST_1(ImageAndCascade, pair_string);
+
+PERF_TEST_P(ImageAndCascade, ObjDetect_HaarClassifier,
+    Values<pair_string>(make_pair("gpu/haarcascade/group_1_640x480_VGA.pgm", "gpu/perf/haarcascade_frontalface_alt.xml")))
 {
-    cv::gpu::DeviceInfo devInfo = GetParam();
-    cv::gpu::setDevice(devInfo.deviceID());
+    cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(img.empty());

-    cv::Mat img_host = readImage("gpu/haarcascade/group_1_640x480_VGA.pgm", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_host.empty());
+    if (runOnGpu)
+    {
+        cv::gpu::CascadeClassifier_GPU d_cascade;
+        ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));

-    cv::gpu::CascadeClassifier_GPU cascade;
+        cv::gpu::GpuMat d_img(img);
+        cv::gpu::GpuMat d_objects_buffer;

-    ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/perf/haarcascade_frontalface_alt.xml")));
+        d_cascade.detectMultiScale(d_img, d_objects_buffer);

-    cv::gpu::GpuMat img(img_host);
-    cv::gpu::GpuMat objects_buffer;
+        TEST_CYCLE()
+        {
+            d_cascade.detectMultiScale(d_img, d_objects_buffer);
+        }
+    }
+    else
+    {
+        cv::CascadeClassifier cascade;
+        ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GetParam().second))); // same cascade file as the GPU branch

-    cascade.detectMultiScale(img, objects_buffer);
+        std::vector<cv::Rect> rects;

-    TEST_CYCLE()
-    {
-        cascade.detectMultiScale(img, objects_buffer);
+        cascade.detectMultiScale(img, rects);
+
+        TEST_CYCLE()
+        {
+            cascade.detectMultiScale(img, rects);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(ObjDetect, HaarClassifier, ALL_DEVICES);
+///////////////////////////////////////////////////////////////
+// LBP cascade

-//===================== LBP cascade ==========================//
-GPU_PERF_TEST_1(LBPClassifier, cv::gpu::DeviceInfo)
+PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier,
+    Values<pair_string>(make_pair("gpu/haarcascade/group_1_640x480_VGA.pgm", "gpu/lbpcascade/lbpcascade_frontalface.xml")))
 {
-    cv::gpu::DeviceInfo devInfo = GetParam();
-    cv::gpu::setDevice(devInfo.deviceID());
-
-    cv::Mat img_host = readImage("gpu/haarcascade/group_1_640x480_VGA.pgm", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_host.empty());
+    cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(img.empty());

+    if (runOnGpu)
+    {
+        cv::gpu::CascadeClassifier_GPU d_cascade;
+        ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));

+        cv::gpu::GpuMat d_img(img);
+        cv::gpu::GpuMat d_gpu_rects;

-    cv::gpu::GpuMat img(img_host);
-        cv::gpu::GpuMat gpu_rects;
-    cv::gpu::CascadeClassifier_GPU cascade;
-    ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/lbpcascade/lbpcascade_frontalface.xml")));
+        d_cascade.detectMultiScale(d_img, d_gpu_rects);

-    cascade.detectMultiScale(img, gpu_rects);
-    TEST_CYCLE()
+        TEST_CYCLE()
+        {
+            d_cascade.detectMultiScale(d_img, d_gpu_rects);
+        }
+    }
+    else
    {
-        cascade.detectMultiScale(img, gpu_rects);
+        cv::CascadeClassifier cascade;
+        ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GetParam().second))); // same cascade file as the GPU branch
+
+        std::vector<cv::Rect> rects;
+
+        cascade.detectMultiScale(img, rects);
+
+        TEST_CYCLE()
+        {
+            cascade.detectMultiScale(img, rects);
+        }
    }
 }

-INSTANTIATE_TEST_CASE_P(ObjDetect, LBPClassifier, ALL_DEVICES);
-
-#endif
+} // namespace
--- a/modules/gpu/perf/perf_precomp.hpp
+++ b/modules/gpu/perf/perf_precomp.hpp
@ -11,6 +11,10 @@

 #include "cvconfig.h"

+#ifdef HAVE_CUDA
+#include <cuda_runtime.h>
+#endif
+
 #include "opencv2/ts/ts.hpp"
 #include "opencv2/ts/ts_perf.hpp"

@ -18,8 +22,12 @@
 #include "opencv2/highgui/highgui.hpp"
 #include "opencv2/gpu/gpu.hpp"
 #include "opencv2/calib3d/calib3d.hpp"
+#include "opencv2/imgproc/imgproc.hpp"
+#include "opencv2/video/video.hpp"
+#include "opencv2/nonfree/nonfree.hpp"
+#include "opencv2/legacy/legacy.hpp"

-#include "perf_utility.hpp"
+#include "utility.hpp"

 #ifdef GTEST_CREATE_SHARED_LIBRARY
 #error no modules except ts should have GTEST_CREATE_SHARED_LIBRARY defined
--- a/modules/gpu/perf/perf_utility.hpp
+++ b/modules/gpu/perf/perf_utility.hpp
@ -1,77 +0,0 @@
-#ifndef __OPENCV_PERF_GPU_UTILITY_HPP__
-#define __OPENCV_PERF_GPU_UTILITY_HPP__
-
-void fill(cv::Mat& m, double a, double b);
-
-using perf::MatType;
-using perf::MatDepth;
-
-CV_ENUM(BorderMode, cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONSTANT, cv::BORDER_REFLECT, cv::BORDER_WRAP)
-        CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::INTER_AREA)
-CV_ENUM(NormType, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_HAMMING)
-
-struct CvtColorInfo
-{
-    int scn;
-    int dcn;
-    int code;
-
-    explicit CvtColorInfo(int scn_=0, int dcn_=0, int code_=0) : scn(scn_), dcn(dcn_), code(code_) {}
-};
-
-void PrintTo(const CvtColorInfo& info, std::ostream* os);
-
-#define IMPLEMENT_PARAM_CLASS(name, type) \
-    class name \
-    { \
-    public: \
-        name ( type arg = type ()) : val_(arg) {} \
-        operator type () const {return val_;} \
-    private: \
-        type val_; \
-    }; \
-    inline void PrintTo( name param, std::ostream* os) \
-    { \
-        *os << #name <<  " = " << testing::PrintToString(static_cast< type >(param)); \
-    }
-
-IMPLEMENT_PARAM_CLASS(Channels, int)
-
-namespace cv { namespace gpu
-{
-    void PrintTo(const cv::gpu::DeviceInfo& info, std::ostream* os);
-}}
-
-#define GPU_PERF_TEST(name, ...) \
-    struct name : perf::TestBaseWithParam< std::tr1::tuple< __VA_ARGS__ > > \
-    { \
-    public: \
-        name() {} \
-    protected: \
-        void PerfTestBody(); \
-    }; \
-    TEST_P(name, perf){ RunPerfTestBody(); } \
-    void name :: PerfTestBody()
-
-#define GPU_PERF_TEST_1(name, param_type) \
-    struct name : perf::TestBaseWithParam< param_type > \
-    { \
-    public: \
-        name() {} \
-    protected: \
-        void PerfTestBody(); \
-    }; \
-    TEST_P(name, perf){ RunPerfTestBody(); } \
-    void name :: PerfTestBody()
-
-#define GPU_TYPICAL_MAT_SIZES testing::Values(perf::szSXGA, perf::sz1080p, cv::Size(1800, 1500))
-
-cv::Mat readImage(const std::string& fileName, int flags = cv::IMREAD_COLOR);
-
-const std::vector<cv::gpu::DeviceInfo>& devices();
-
-#define ALL_DEVICES testing::ValuesIn(devices())
-
-#define GET_PARAM(k) std::tr1::get< k >(GetParam())
-
-#endif // __OPENCV_PERF_GPU_UTILITY_HPP__
--- a/modules/gpu/perf/perf_video.cpp
+++ b/modules/gpu/perf/perf_video.cpp
--- a/modules/gpu/perf/perf_utility.cpp
+++ b/modules/gpu/perf/perf_utility.cpp
@ -4,12 +4,19 @@ using namespace std;
 using namespace cv;
 using namespace cv::gpu;

-void fill(Mat& m, double a, double b)
+bool runOnGpu = true;
+
+void fillRandom(Mat& m, double a, double b)
 {
    RNG rng(123456789);
    rng.fill(m, RNG::UNIFORM, Scalar::all(a), Scalar::all(b));
 }

+Mat readImage(const string& fileName, int flags)
+{
+    return imread(perf::TestBase::getDataPath(fileName), flags);
+}
+
 void PrintTo(const CvtColorInfo& info, ostream* os)
 {
    static const char* str[] =
@ -184,37 +191,3 @@ void PrintTo(const CvtColorInfo& info, ostream* os)

    *os << str[info.code];
 }
-
-void cv::gpu::PrintTo(const DeviceInfo& info, ostream* os)
-{
-    *os << info.name();
-}
-
-Mat readImage(const string& fileName, int flags)
-{
-    return imread(perf::TestBase::getDataPath(fileName), flags);
-}
-
-const vector<DeviceInfo>& devices()
-{
-    static vector<DeviceInfo> devs;
-    static bool first = true;
-
-    if (first)
-    {
-        int deviceCount = getCudaEnabledDeviceCount();
-
-        devs.reserve(deviceCount);
-
-        for (int i = 0; i < deviceCount; ++i)
-        {
-            DeviceInfo info(i);
-            if (info.isCompatible())
-                devs.push_back(info);
-        }
-
-        first = false;
-    }
-
-    return devs;
-}
--- a/modules/gpu/perf/utility.hpp
+++ b/modules/gpu/perf/utility.hpp
@ -0,0 +1,45 @@
+#ifndef __OPENCV_PERF_GPU_UTILITY_HPP__
+#define __OPENCV_PERF_GPU_UTILITY_HPP__
+
+#include "opencv2/core/core.hpp"
+#include "opencv2/core/gpumat.hpp"
+#include "opencv2/imgproc/imgproc.hpp"
+#include "opencv2/ts/ts_perf.hpp"
+
+extern bool runOnGpu;
+
+void fillRandom(cv::Mat& m, double a = 0.0, double b = 255.0);
+cv::Mat readImage(const std::string& fileName, int flags = cv::IMREAD_COLOR);
+
+using perf::MatType;
+using perf::MatDepth;
+
+CV_ENUM(BorderMode, cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONSTANT, cv::BORDER_REFLECT, cv::BORDER_WRAP)
+#define ALL_BORDER_MODES testing::ValuesIn(BorderMode::all())
+CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::INTER_AREA)
+#define ALL_INTERPOLATIONS testing::ValuesIn(Interpolation::all())
+CV_ENUM(NormType, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_HAMMING)
+
+struct CvtColorInfo
+{
+    int scn;
+    int dcn;
+    int code;
+
+    explicit CvtColorInfo(int scn_=0, int dcn_=0, int code_=0) : scn(scn_), dcn(dcn_), code(code_) {}
+};
+void PrintTo(const CvtColorInfo& info, std::ostream* os);
+
+#define GET_PARAM(k) std::tr1::get< k >(GetParam())
+
+#define DEF_PARAM_TEST(name, ...) typedef ::perf::TestBaseWithParam< std::tr1::tuple< __VA_ARGS__ > > name
+#define DEF_PARAM_TEST_1(name, param_type) typedef ::perf::TestBaseWithParam< param_type > name
+
+DEF_PARAM_TEST_1(Sz, cv::Size);
+typedef perf::Size_MatType Sz_Type;
+DEF_PARAM_TEST(Sz_Depth, cv::Size, MatDepth);
+DEF_PARAM_TEST(Sz_Depth_Cn, cv::Size, MatDepth, int);
+
+#define GPU_TYPICAL_MAT_SIZES testing::Values(perf::szSXGA, perf::sz720p, perf::sz1080p)
+
+#endif // __OPENCV_PERF_GPU_UTILITY_HPP__
--- a/modules/gpu/perf_cpu/perf_calib3d.cpp
+++ b/modules/gpu/perf_cpu/perf_calib3d.cpp
@ -1,136 +0,0 @@
-#include "perf_cpu_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-//////////////////////////////////////////////////////////////////////
-// StereoBM
-
-GPU_PERF_TEST_1(StereoBM, cv::gpu::DeviceInfo)
-{
-    cv::Mat img_l = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_l.empty());
-
-    cv::Mat img_r = readImage("gpu/perf/aloeR.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img_r.empty());
-
-    cv::StereoBM bm(0, 256);
-
-    cv::Mat dst;
-
-    bm(img_l, img_r, dst);
-
-    declare.time(5.0);
-
-    TEST_CYCLE()
-    {
-        bm(img_l, img_r, dst);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Calib3D, StereoBM, ALL_DEVICES);
-
-//////////////////////////////////////////////////////////////////////
-// ProjectPoints
-
-IMPLEMENT_PARAM_CLASS(Count, int)
-
-GPU_PERF_TEST(ProjectPoints, cv::gpu::DeviceInfo, Count)
-{
-    int count = GET_PARAM(1);
-
-    cv::Mat src(1, count, CV_32FC3);
-    fill(src, -100, 100);
-
-    cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1);
-    cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);
-    cv::Mat camera_mat = cv::Mat::ones(3, 3, CV_32FC1);
-    cv::Mat dst;
-
-    cv::projectPoints(src, rvec, tvec, camera_mat, cv::noArray(), dst);
-
-    TEST_CYCLE()
-    {
-        cv::projectPoints(src, rvec, tvec, camera_mat, cv::noArray(), dst);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Calib3D, ProjectPoints, testing::Combine(
-    ALL_DEVICES,
-    testing::Values<Count>(5000, 10000, 20000)));
-
-//////////////////////////////////////////////////////////////////////
-// SolvePnPRansac
-
-GPU_PERF_TEST(SolvePnPRansac, cv::gpu::DeviceInfo, Count)
-{
-    int count = GET_PARAM(1);
-
-    cv::Mat object(1, count, CV_32FC3);
-    fill(object, -100, 100);
-
-    cv::Mat camera_mat(3, 3, CV_32FC1);
-    fill(camera_mat, 0.5, 1);
-    camera_mat.at<float>(0, 1) = 0.f;
-    camera_mat.at<float>(1, 0) = 0.f;
-    camera_mat.at<float>(2, 0) = 0.f;
-    camera_mat.at<float>(2, 1) = 0.f;
-
-    cv::Mat dist_coef(1, 8, CV_32F, cv::Scalar::all(0));
-
-    std::vector<cv::Point2f> image_vec;
-    cv::Mat rvec_gold(1, 3, CV_32FC1);
-    fill(rvec_gold, 0, 1);
-    cv::Mat tvec_gold(1, 3, CV_32FC1);
-    fill(tvec_gold, 0, 1);
-    cv::projectPoints(object, rvec_gold, tvec_gold, camera_mat, dist_coef, image_vec);
-
-    cv::Mat image(1, count, CV_32FC2, &image_vec[0]);
-
-    cv::Mat rvec;
-    cv::Mat tvec;
-
-    cv::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
-
-    declare.time(10.0);
-
-    TEST_CYCLE()
-    {
-        cv::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Calib3D, SolvePnPRansac, testing::Combine(
-    ALL_DEVICES,
-    testing::Values<Count>(5000, 10000, 20000)));
-
-//////////////////////////////////////////////////////////////////////
-// ReprojectImageTo3D
-
-GPU_PERF_TEST(ReprojectImageTo3D, cv::gpu::DeviceInfo, cv::Size, MatDepth)
-{
-    cv::Size size = GET_PARAM(1);
-    int depth = GET_PARAM(2);
-
-    cv::Mat src(size, depth);
-    fill(src, 5.0, 30.0);
-
-    cv::Mat Q(4, 4, CV_32FC1);
-    fill(Q, 0.1, 1.0);
-
-    cv::Mat dst;
-
-    cv::reprojectImageTo3D(src, dst, Q);
-
-    TEST_CYCLE()
-    {
-        cv::reprojectImageTo3D(src, dst, Q);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Calib3D, ReprojectImageTo3D, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values<MatDepth>(CV_8U, CV_16S)));
-
-#endif
-
--- a/modules/gpu/perf_cpu/perf_core.cpp
+++ b/modules/gpu/perf_cpu/perf_core.cpp
--- a/modules/gpu/perf_cpu/perf_cpu_precomp.cpp
+++ b/modules/gpu/perf_cpu/perf_cpu_precomp.cpp
@ -1 +0,0 @@
-#include "perf_cpu_precomp.hpp"
--- a/modules/gpu/perf_cpu/perf_cpu_precomp.hpp
+++ b/modules/gpu/perf_cpu/perf_cpu_precomp.hpp
@ -1,32 +0,0 @@
-#ifdef __GNUC__
-#  pragma GCC diagnostic ignored "-Wmissing-declarations"
-#  pragma GCC diagnostic ignored "-Wmissing-prototypes" //OSX
-#endif
-
-#ifndef __OPENCV_PERF_CPU_PRECOMP_HPP__
-#define __OPENCV_PERF_CPU_PRECOMP_HPP__
-
-#include <cstdio>
-#include <iostream>
-
-#include "cvconfig.h"
-
-#include "opencv2/ts/ts.hpp"
-#include "opencv2/ts/ts_perf.hpp"
-
-#include "opencv2/core/core.hpp"
-#include "opencv2/highgui/highgui.hpp"
-#include "opencv2/gpu/gpu.hpp"
-#include "opencv2/imgproc/imgproc.hpp"
-#include "opencv2/video/video.hpp"
-#include "opencv2/calib3d/calib3d.hpp"
-#include "opencv2/nonfree/nonfree.hpp"
-#include "opencv2/legacy/legacy.hpp"
-
-#include "perf_utility.hpp"
-
-#ifdef GTEST_CREATE_SHARED_LIBRARY
-#error no modules except ts should have GTEST_CREATE_SHARED_LIBRARY defined
-#endif
-
-#endif
--- a/modules/gpu/perf_cpu/perf_features2d.cpp
+++ b/modules/gpu/perf_cpu/perf_features2d.cpp
@ -1,187 +0,0 @@
-#include "perf_cpu_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-//////////////////////////////////////////////////////////////////////
-// SURF
-
-GPU_PERF_TEST_1(SURF, cv::gpu::DeviceInfo)
-{
-    cv::Mat img = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    cv::SURF surf;
-
-    std::vector<cv::KeyPoint> keypoints;
-    cv::Mat descriptors;
-
-    surf(img, cv::noArray(), keypoints, descriptors);
-
-    declare.time(50.0);
-
-    TEST_CYCLE()
-    {
-        keypoints.clear();
-        surf(img, cv::noArray(), keypoints, descriptors);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Features2D, SURF, ALL_DEVICES);
-
-//////////////////////////////////////////////////////////////////////
-// FAST
-
-GPU_PERF_TEST_1(FAST, cv::gpu::DeviceInfo)
-{
-    cv::Mat img = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    std::vector<cv::KeyPoint> keypoints;
-
-    cv::FAST(img, keypoints, 20);
-
-    TEST_CYCLE()
-    {
-        keypoints.clear();
-        cv::FAST(img, keypoints, 20);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Features2D, FAST, ALL_DEVICES);
-
-//////////////////////////////////////////////////////////////////////
-// ORB
-
-GPU_PERF_TEST_1(ORB, cv::gpu::DeviceInfo)
-{
-    cv::Mat img = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    cv::ORB orb(4000);
-
-    std::vector<cv::KeyPoint> keypoints;
-    cv::Mat descriptors;
-
-    orb(img, cv::noArray(), keypoints, descriptors);
-
-    TEST_CYCLE()
-    {
-        keypoints.clear();
-        orb(img, cv::noArray(), keypoints, descriptors);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Features2D, ORB, ALL_DEVICES);
-
-//////////////////////////////////////////////////////////////////////
-// BruteForceMatcher_match
-
-IMPLEMENT_PARAM_CLASS(DescriptorSize, int)
-
-GPU_PERF_TEST(BruteForceMatcher_match, cv::gpu::DeviceInfo, DescriptorSize, NormType)
-{
-    int desc_size = GET_PARAM(1);
-    int normType = GET_PARAM(2);
-
-    int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
-
-    cv::Mat query(3000, desc_size, type);
-    fill(query, 0.0, 10.0);
-
-    cv::Mat train(3000, desc_size, type);
-    fill(train, 0.0, 10.0);
-
-    cv::BFMatcher matcher(normType);
-
-    std::vector<cv::DMatch> matches;
-
-    matcher.match(query, train, matches);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        matcher.match(query, train, matches);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Features2D, BruteForceMatcher_match, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(DescriptorSize(64), DescriptorSize(128), DescriptorSize(256)),
-    testing::Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))));
-
-//////////////////////////////////////////////////////////////////////
-// BruteForceMatcher_knnMatch
-
-IMPLEMENT_PARAM_CLASS(K, int)
-
-GPU_PERF_TEST(BruteForceMatcher_knnMatch, cv::gpu::DeviceInfo, DescriptorSize, K, NormType)
-{
-    int desc_size = GET_PARAM(1);
-    int k = GET_PARAM(2);
-    int normType = GET_PARAM(3);
-
-    int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
-
-    cv::Mat query(3000, desc_size, type);
-    fill(query, 0.0, 10.0);
-
-    cv::Mat train(3000, desc_size, type);
-    fill(train, 0.0, 10.0);
-
-    cv::BFMatcher matcher(normType);
-
-    std::vector< std::vector<cv::DMatch> > matches;
-
-    matcher.knnMatch(query, train, matches, k);
-
-    declare.time(30.0);
-
-    TEST_CYCLE()
-    {
-        matcher.knnMatch(query, train, matches, k);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Features2D, BruteForceMatcher_knnMatch, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(DescriptorSize(64), DescriptorSize(128), DescriptorSize(256)),
-    testing::Values(K(2), K(3)),
-    testing::Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))));
-
-//////////////////////////////////////////////////////////////////////
-// BruteForceMatcher_radiusMatch
-
-GPU_PERF_TEST(BruteForceMatcher_radiusMatch, cv::gpu::DeviceInfo, DescriptorSize, NormType)
-{
-    int desc_size = GET_PARAM(1);
-    int normType = GET_PARAM(2);
-
-    int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
-
-    cv::Mat query(3000, desc_size, type);
-    fill(query, 0.0, 1.0);
-
-    cv::Mat train(3000, desc_size, type);
-    fill(train, 0.0, 1.0);
-
-    cv::BFMatcher matcher(normType);
-
-    std::vector< std::vector<cv::DMatch> > matches;
-
-    matcher.radiusMatch(query, train, matches, 2.0);
-
-    declare.time(30.0);
-
-    TEST_CYCLE()
-    {
-        matcher.radiusMatch(query, train, matches, 2.0);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Features2D, BruteForceMatcher_radiusMatch, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(DescriptorSize(64), DescriptorSize(128), DescriptorSize(256)),
-    testing::Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))));
-
-#endif
--- a/modules/gpu/perf_cpu/perf_filters.cpp
+++ b/modules/gpu/perf_cpu/perf_filters.cpp
@ -1,283 +0,0 @@
-#include "perf_cpu_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-IMPLEMENT_PARAM_CLASS(KernelSize, int)
-
-//////////////////////////////////////////////////////////////////////
-// Blur
-
-GPU_PERF_TEST(Blur, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int ksize = GET_PARAM(3);
-
-    cv::Mat src(size, type);
-    fill(src, 0.0, 255.0);
-
-    cv::Mat dst;
-
-    cv::blur(src, dst, cv::Size(ksize, ksize));
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::blur(src, dst, cv::Size(ksize, ksize));
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Filters, Blur, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)),
-    testing::Values(KernelSize(3), KernelSize(5), KernelSize(7))));
-
-//////////////////////////////////////////////////////////////////////
-// Sobel
-
-GPU_PERF_TEST(Sobel, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int ksize = GET_PARAM(3);
-
-    cv::Mat src(size, type);
-    fill(src, 0.0, 255.0);
-
-    cv::Mat dst;
-
-    cv::Sobel(src, dst, -1, 1, 1, ksize);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::Sobel(src, dst, -1, 1, 1, ksize);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Filters, Sobel, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
-    testing::Values(KernelSize(3), KernelSize(5), KernelSize(7), KernelSize(9), KernelSize(11), KernelSize(13), KernelSize(15))));
-
-//////////////////////////////////////////////////////////////////////
-// Scharr
-
-GPU_PERF_TEST(Scharr, cv::gpu::DeviceInfo, cv::Size, MatType)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-
-    cv::Mat src(size, type);
-    fill(src, 0.0, 255.0);
-
-    cv::Mat dst;
-
-    cv::Scharr(src, dst, -1, 1, 0);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::Scharr(src, dst, -1, 1, 0);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Filters, Scharr, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1))));
-
-//////////////////////////////////////////////////////////////////////
-// GaussianBlur
-
-GPU_PERF_TEST(GaussianBlur, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int ksize = GET_PARAM(3);
-
-    cv::Mat src(size, type);
-    fill(src, 0.0, 255.0);
-
-    cv::Mat dst;
-
-    cv::GaussianBlur(src, dst, cv::Size(ksize, ksize), 0.5);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::GaussianBlur(src, dst, cv::Size(ksize, ksize), 0.5);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Filters, GaussianBlur, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
-    testing::Values(KernelSize(3), KernelSize(5), KernelSize(7), KernelSize(9), KernelSize(11), KernelSize(13), KernelSize(15))));
-
-//////////////////////////////////////////////////////////////////////
-// Laplacian
-
-GPU_PERF_TEST(Laplacian, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int ksize = GET_PARAM(3);
-
-    cv::Mat src(size, type);
-    fill(src, 0.0, 255.0);
-
-    cv::Mat dst;
-
-    cv::Laplacian(src, dst, -1, ksize);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::Laplacian(src, dst, -1, ksize);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Filters, Laplacian, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
-    testing::Values(KernelSize(1), KernelSize(3))));
-
-//////////////////////////////////////////////////////////////////////
-// Erode
-
-GPU_PERF_TEST(Erode, cv::gpu::DeviceInfo, cv::Size, MatType)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-
-    cv::Mat src(size, type);
-    fill(src, 0.0, 255.0);
-
-    cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
-
-    cv::Mat dst;
-
-    cv::erode(src, dst, ker);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::erode(src, dst, ker);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Filters, Erode, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4))));
-
-//////////////////////////////////////////////////////////////////////
-// Dilate
-
-GPU_PERF_TEST(Dilate, cv::gpu::DeviceInfo, cv::Size, MatType)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-
-    cv::Mat src(size, type);
-    fill(src, 0.0, 255.0);
-
-    cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
-
-    cv::Mat dst;
-
-    cv::dilate(src, dst, ker);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::dilate(src, dst, ker);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Filters, Dilate, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4))));
-
-//////////////////////////////////////////////////////////////////////
-// MorphologyEx
-
-CV_ENUM(MorphOp, cv::MORPH_OPEN, cv::MORPH_CLOSE, cv::MORPH_GRADIENT, cv::MORPH_TOPHAT, cv::MORPH_BLACKHAT)
-#define ALL_MORPH_OPS testing::Values(MorphOp(cv::MORPH_OPEN), MorphOp(cv::MORPH_CLOSE), MorphOp(cv::MORPH_GRADIENT), MorphOp(cv::MORPH_TOPHAT), MorphOp(cv::MORPH_BLACKHAT))
-
-GPU_PERF_TEST(MorphologyEx, cv::gpu::DeviceInfo, cv::Size, MatType, MorphOp)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int morphOp = GET_PARAM(3);
-
-    cv::Mat src(size, type);
-    fill(src, 0.0, 255.0);
-
-    cv::Mat dst;
-
-    cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
-
-    cv::morphologyEx(src, dst, morphOp, ker);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::morphologyEx(src, dst, morphOp, ker);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Filters, MorphologyEx, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)),
-    ALL_MORPH_OPS));
-
-//////////////////////////////////////////////////////////////////////
-// Filter2D
-
-GPU_PERF_TEST(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KernelSize)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int ksize = GET_PARAM(3);
-
-    cv::Mat src(size, type);
-    fill(src, 0.0, 255.0);
-
-    cv::Mat kernel(ksize, ksize, CV_32FC1);
-    fill(kernel, 0.0, 1.0);
-
-    cv::Mat dst;
-
-    cv::filter2D(src, dst, -1, kernel);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::filter2D(src, dst, -1, kernel);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Filters, Filter2D, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
-    testing::Values(KernelSize(3), KernelSize(5), KernelSize(7), KernelSize(9), KernelSize(11), KernelSize(13), KernelSize(15))));
-
-#endif
--- a/modules/gpu/perf_cpu/perf_imgproc.cpp
+++ b/modules/gpu/perf_cpu/perf_imgproc.cpp
@ -1,771 +0,0 @@
-#include "perf_cpu_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-//////////////////////////////////////////////////////////////////////
-// Remap
-
-GPU_PERF_TEST(Remap, cv::gpu::DeviceInfo, cv::Size, MatType, Interpolation, BorderMode)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int interpolation = GET_PARAM(3);
-    int borderMode = GET_PARAM(4);
-
-    cv::Mat src(size, type);
-    fill(src, 0, 255);
-
-    cv::Mat xmap(size, CV_32FC1);
-    fill(xmap, 0, size.width);
-
-    cv::Mat ymap(size, CV_32FC1);
-    fill(ymap, 0, size.height);
-
-    cv::Mat dst;
-
-    cv::remap(src, dst, xmap, ymap, interpolation, borderMode);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::remap(src, dst, xmap, ymap, interpolation, borderMode);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, Remap, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
-    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
-    testing::Values(BorderMode(cv::BORDER_REFLECT101), BorderMode(cv::BORDER_REPLICATE), BorderMode(cv::BORDER_CONSTANT), BorderMode(cv::BORDER_REFLECT), BorderMode(cv::BORDER_WRAP))));
-
-
-//////////////////////////////////////////////////////////////////////
-// Resize
-
-IMPLEMENT_PARAM_CLASS(Scale, double)
-
-GPU_PERF_TEST(Resize, cv::gpu::DeviceInfo, cv::Size, MatType, Interpolation, Scale)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int interpolation = GET_PARAM(3);
-    double f = GET_PARAM(4);
-
-    cv::Mat src(size, type);
-    fill(src, 0, 255);
-
-    cv::Mat dst;
-
-    cv::resize(src, dst, cv::Size(), f, f, interpolation);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::resize(src, dst, cv::Size(), f, f, interpolation);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, Resize, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
-    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR),
-                    Interpolation(cv::INTER_CUBIC),   Interpolation(cv::INTER_AREA)),
-    testing::Values(Scale(0.5), Scale(0.3), Scale(2.0))));
-
-GPU_PERF_TEST(ResizeArea, cv::gpu::DeviceInfo, cv::Size, MatType, Scale)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int interpolation = cv::INTER_AREA;
-    double f = GET_PARAM(3);
-
-    cv::Mat src_host(size, type);
-    fill(src_host, 0, 255);
-
-    cv::Mat src(src_host);
-    cv::Mat dst;
-
-    cv::resize(src, dst, cv::Size(), f, f, interpolation);
-
-    declare.time(1.0);
-
-    TEST_CYCLE()
-    {
-        cv::resize(src, dst, cv::Size(), f, f, interpolation);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, ResizeArea, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(perf::sz1080p, cv::Size(4096, 2048)),
-    testing::Values(MatType(CV_8UC1)/*,  MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)*/),
-    testing::Values(Scale(0.2),Scale(0.1),Scale(0.05))));
-
-//////////////////////////////////////////////////////////////////////
-// WarpAffine
-
-GPU_PERF_TEST(WarpAffine, cv::gpu::DeviceInfo, cv::Size, MatType, Interpolation, BorderMode)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int interpolation = GET_PARAM(3);
-    int borderMode = GET_PARAM(4);
-
-    cv::Mat src(size, type);
-    fill(src, 0, 255);
-
-    cv::Mat dst;
-
-    const double aplha = CV_PI / 4;
-    double mat[2][3] = { {std::cos(aplha), -std::sin(aplha), src.cols / 2},
-                         {std::sin(aplha),  std::cos(aplha), 0}};
-    cv::Mat M(2, 3, CV_64F, (void*) mat);
-
-    cv::warpAffine(src, dst, M, size, interpolation, borderMode);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::warpAffine(src, dst, M, size, interpolation, borderMode);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, WarpAffine, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
-    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
-    testing::Values(BorderMode(cv::BORDER_REFLECT101), BorderMode(cv::BORDER_REPLICATE), BorderMode(cv::BORDER_CONSTANT), BorderMode(cv::BORDER_REFLECT), BorderMode(cv::BORDER_WRAP))));
-
-//////////////////////////////////////////////////////////////////////
-// WarpPerspective
-
-GPU_PERF_TEST(WarpPerspective, cv::gpu::DeviceInfo, cv::Size, MatType, Interpolation, BorderMode)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int interpolation = GET_PARAM(3);
-    int borderMode = GET_PARAM(4);
-
-    cv::Mat src(size, type);
-    fill(src, 0, 255);
-
-    cv::Mat dst;
-
-    const double aplha = CV_PI / 4;
-    double mat[3][3] = { {std::cos(aplha), -std::sin(aplha), src.cols / 2},
-                         {std::sin(aplha),  std::cos(aplha), 0},
-                         {0.0,              0.0,             1.0}};
-    cv::Mat M(3, 3, CV_64F, (void*) mat);
-
-    cv::warpPerspective(src, dst, M, size, interpolation, borderMode);
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::warpPerspective(src, dst, M, size, interpolation, borderMode);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, WarpPerspective, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
-    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
-    testing::Values(BorderMode(cv::BORDER_REFLECT101), BorderMode(cv::BORDER_REPLICATE), BorderMode(cv::BORDER_CONSTANT), BorderMode(cv::BORDER_REFLECT), BorderMode(cv::BORDER_WRAP))));
-
-//////////////////////////////////////////////////////////////////////
-// CopyMakeBorder
-
-GPU_PERF_TEST(CopyMakeBorder, cv::gpu::DeviceInfo, cv::Size, MatType, BorderMode)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-    int borderType = GET_PARAM(3);
-
-    cv::Mat src(size, type);
-    fill(src, 0, 255);
-
-    cv::Mat dst;
-
-    cv::copyMakeBorder(src, dst, 5, 5, 5, 5, borderType);
-
-    TEST_CYCLE()
-    {
-        cv::copyMakeBorder(src, dst, 5, 5, 5, 5, borderType);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, CopyMakeBorder, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
-    testing::Values(BorderMode(cv::BORDER_REFLECT101), BorderMode(cv::BORDER_REPLICATE), BorderMode(cv::BORDER_CONSTANT), BorderMode(cv::BORDER_REFLECT), BorderMode(cv::BORDER_WRAP))));
-
-//////////////////////////////////////////////////////////////////////
-// Threshold
-
-CV_ENUM(ThreshOp, cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV)
-#define ALL_THRESH_OPS testing::Values(ThreshOp(cv::THRESH_BINARY), ThreshOp(cv::THRESH_BINARY_INV), ThreshOp(cv::THRESH_TRUNC), ThreshOp(cv::THRESH_TOZERO), ThreshOp(cv::THRESH_TOZERO_INV))
-
-GPU_PERF_TEST(Threshold, cv::gpu::DeviceInfo, cv::Size, MatDepth, ThreshOp)
-{
-    cv::Size size = GET_PARAM(1);
-    int depth = GET_PARAM(2);
-    int threshOp = GET_PARAM(3);
-
-    cv::Mat src(size, depth);
-    fill(src, 0, 255);
-
-    cv::Mat dst;
-
-    cv::threshold(src, dst, 100.0, 255.0, threshOp);
-
-    TEST_CYCLE()
-    {
-        cv::threshold(src, dst, 100.0, 255.0, threshOp);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, Threshold, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F), MatDepth(CV_64F)),
-    ALL_THRESH_OPS));
-
-//////////////////////////////////////////////////////////////////////
-// Integral
-
-GPU_PERF_TEST(Integral, cv::gpu::DeviceInfo, cv::Size)
-{
-    cv::Size size = GET_PARAM(1);
-
-    cv::Mat src(size, CV_8UC1);
-    fill(src, 0, 255);
-
-    cv::Mat dst;
-
-    cv::integral(src, dst);
-
-    TEST_CYCLE()
-    {
-        cv::integral(src, dst);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, Integral, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES));
-
-//////////////////////////////////////////////////////////////////////
-// HistEven_OneChannel
-
-GPU_PERF_TEST(HistEven_OneChannel, cv::gpu::DeviceInfo, cv::Size, MatDepth)
-{
-    cv::Size size = GET_PARAM(1);
-    int depth = GET_PARAM(2);
-
-    cv::Mat src(size, depth);
-    fill(src, 0, 255);
-
-    int hbins = 30;
-    float hranges[] = {0.0f, 180.0f};
-    cv::Mat hist;
-    int histSize[] = {hbins};
-    const float* ranges[] = {hranges};
-    int channels[] = {0};
-
-    cv::calcHist(&src, 1, channels, cv::Mat(), hist, 1, histSize, ranges);
-
-    TEST_CYCLE()
-    {
-        cv::calcHist(&src, 1, channels, cv::Mat(), hist, 1, histSize, ranges);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, HistEven_OneChannel, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_16S))));
-
-//////////////////////////////////////////////////////////////////////
-// EqualizeHist
-
-GPU_PERF_TEST(EqualizeHist, cv::gpu::DeviceInfo, cv::Size)
-{
-    cv::Size size = GET_PARAM(1);
-
-    cv::Mat src(size, CV_8UC1);
-    fill(src, 0, 255);
-
-    cv::Mat dst;
-
-    cv::equalizeHist(src, dst);
-
-    TEST_CYCLE()
-    {
-        cv::equalizeHist(src, dst);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, EqualizeHist, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES));
-
-//////////////////////////////////////////////////////////////////////
-// Canny
-
-IMPLEMENT_PARAM_CLASS(AppertureSize, int)
-IMPLEMENT_PARAM_CLASS(L2gradient, bool)
-
-GPU_PERF_TEST(Canny, cv::gpu::DeviceInfo, AppertureSize, L2gradient)
-{
-    int apperture_size = GET_PARAM(1);
-    bool useL2gradient = GET_PARAM(2);
-
-    cv::Mat image = readImage("perf/1280x1024.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(image.empty());
-
-    cv::Mat dst;
-
-    cv::Canny(image, dst, 50.0, 100.0, apperture_size, useL2gradient);
-
-    TEST_CYCLE()
-    {
-        cv::Canny(image, dst, 50.0, 100.0, apperture_size, useL2gradient);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, Canny, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(AppertureSize(3), AppertureSize(5)),
-    testing::Values(L2gradient(false), L2gradient(true))));
-
-//////////////////////////////////////////////////////////////////////
-// MeanShiftFiltering
-
-GPU_PERF_TEST_1(MeanShiftFiltering, cv::gpu::DeviceInfo)
-{
-    cv::Mat img = readImage("gpu/meanshift/cones.png");
-    ASSERT_FALSE(img.empty());
-
-    cv::Mat dst;
-
-    cv::pyrMeanShiftFiltering(img, dst, 50, 50);
-
-    declare.time(15.0);
-
-    TEST_CYCLE()
-    {
-        cv::pyrMeanShiftFiltering(img, dst, 50, 50);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, MeanShiftFiltering, ALL_DEVICES);
-
-//////////////////////////////////////////////////////////////////////
-// Convolve
-
-IMPLEMENT_PARAM_CLASS(KSize, int)
-IMPLEMENT_PARAM_CLASS(Ccorr, bool)
-
-GPU_PERF_TEST(Convolve, cv::gpu::DeviceInfo, cv::Size, KSize, Ccorr)
-{
-    cv::Size size = GET_PARAM(1);
-    int templ_size = GET_PARAM(2);
-    bool ccorr = GET_PARAM(3);
-
-    ASSERT_FALSE(ccorr);
-
-    cv::Mat image(size, CV_32FC1);
-    image.setTo(1.0);
-
-    cv::Mat templ(templ_size, templ_size, CV_32FC1);
-    templ.setTo(1.0);
-
-    cv::Mat dst;
-
-    cv::filter2D(image, dst, image.depth(), templ);
-
-    declare.time(10.0);
-
-    TEST_CYCLE()
-    {
-        cv::filter2D(image, dst, image.depth(), templ);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, Convolve, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(KSize(3), KSize(9), KSize(17), KSize(27), KSize(32), KSize(64)),
-    testing::Values(Ccorr(false), Ccorr(true))));
-
-////////////////////////////////////////////////////////////////////////////////
-// MatchTemplate_8U
-
-CV_ENUM(TemplateMethod, cv::TM_SQDIFF, cv::TM_SQDIFF_NORMED, cv::TM_CCORR, cv::TM_CCORR_NORMED, cv::TM_CCOEFF, cv::TM_CCOEFF_NORMED)
-#define ALL_TEMPLATE_METHODS testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_SQDIFF_NORMED), TemplateMethod(cv::TM_CCORR), TemplateMethod(cv::TM_CCORR_NORMED), TemplateMethod(cv::TM_CCOEFF), TemplateMethod(cv::TM_CCOEFF_NORMED))
-
-IMPLEMENT_PARAM_CLASS(TemplateSize, cv::Size)
-
-GPU_PERF_TEST(MatchTemplate_8U, cv::gpu::DeviceInfo, cv::Size, TemplateSize, Channels, TemplateMethod)
-{
-    cv::Size size = GET_PARAM(1);
-    cv::Size templ_size = GET_PARAM(2);
-    int cn = GET_PARAM(3);
-    int method = GET_PARAM(4);
-
-    cv::Mat image(size, CV_MAKE_TYPE(CV_8U, cn));
-    fill(image, 0, 255);
-
-    cv::Mat templ(templ_size, CV_MAKE_TYPE(CV_8U, cn));
-    fill(templ, 0, 255);
-
-    cv::Mat dst;
-
-    cv::matchTemplate(image, templ, dst, method);
-
-    TEST_CYCLE()
-    {
-        cv::matchTemplate(image, templ, dst, method);
-    }
-};
-
-INSTANTIATE_TEST_CASE_P(ImgProc, MatchTemplate_8U, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16)), TemplateSize(cv::Size(30, 30))),
-    testing::Values(Channels(1), Channels(3), Channels(4)),
-    ALL_TEMPLATE_METHODS));
-
-////////////////////////////////////////////////////////////////////////////////
-// MatchTemplate_32F
-
-GPU_PERF_TEST(MatchTemplate_32F, cv::gpu::DeviceInfo, cv::Size, TemplateSize, Channels, TemplateMethod)
-{
-    cv::Size size = GET_PARAM(1);
-    cv::Size templ_size = GET_PARAM(2);
-    int cn = GET_PARAM(3);
-    int method = GET_PARAM(4);
-
-    cv::Mat image(size, CV_MAKE_TYPE(CV_32F, cn));
-    fill(image, 0, 255);
-
-    cv::Mat templ(templ_size, CV_MAKE_TYPE(CV_32F, cn));
-    fill(templ, 0, 255);
-
-    cv::Mat dst;
-
-    cv::matchTemplate(image, templ, dst, method);
-
-    TEST_CYCLE()
-    {
-        cv::matchTemplate(image, templ, dst, method);
-    }
-};
-
-INSTANTIATE_TEST_CASE_P(ImgProc, MatchTemplate_32F, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16)), TemplateSize(cv::Size(30, 30))),
-    testing::Values(Channels(1), Channels(3), Channels(4)),
-    testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR))));
-
-//////////////////////////////////////////////////////////////////////
-// MulSpectrums
-
-CV_FLAGS(DftFlags, 0, cv::DFT_INVERSE, cv::DFT_SCALE, cv::DFT_ROWS, cv::DFT_COMPLEX_OUTPUT, cv::DFT_REAL_OUTPUT)
-
-GPU_PERF_TEST(MulSpectrums, cv::gpu::DeviceInfo, cv::Size, DftFlags)
-{
-    cv::Size size = GET_PARAM(1);
-    int flag = GET_PARAM(2);
-
-    cv::Mat a(size, CV_32FC2);
-    fill(a, 0, 100);
-
-    cv::Mat b(size, CV_32FC2);
-    fill(b, 0, 100);
-
-    cv::Mat dst;
-
-    cv::mulSpectrums(a, b, dst, flag);
-
-    TEST_CYCLE()
-    {
-        cv::mulSpectrums(a, b, dst, flag);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, MulSpectrums, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(DftFlags(0), DftFlags(cv::DFT_ROWS))));
-
-//////////////////////////////////////////////////////////////////////
-// Dft
-
-GPU_PERF_TEST(Dft, cv::gpu::DeviceInfo, cv::Size, DftFlags)
-{
-    cv::Size size = GET_PARAM(1);
-    int flag = GET_PARAM(2);
-
-    cv::Mat src(size, CV_32FC2);
-    fill(src, 0, 100);
-
-    cv::Mat dst;
-
-    cv::dft(src, dst, flag);
-
-    declare.time(10.0);
-
-    TEST_CYCLE()
-    {
-        cv::dft(src, dst, flag);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, Dft, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(DftFlags(0), DftFlags(cv::DFT_ROWS), DftFlags(cv::DFT_INVERSE))));
-
-//////////////////////////////////////////////////////////////////////
-// CornerHarris
-
-IMPLEMENT_PARAM_CLASS(BlockSize, int)
-IMPLEMENT_PARAM_CLASS(ApertureSize, int)
-
-GPU_PERF_TEST(CornerHarris, cv::gpu::DeviceInfo, MatType, BorderMode, BlockSize, ApertureSize)
-{
-    int type = GET_PARAM(1);
-    int borderType = GET_PARAM(2);
-    int blockSize = GET_PARAM(3);
-    int apertureSize = GET_PARAM(4);
-
-    cv::Mat img = readImage("gpu/stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    img.convertTo(img, type, type == CV_32F ? 1.0 / 255.0 : 1.0);
-
-    cv::Mat dst;
-
-    double k = 0.5;
-
-    cv::cornerHarris(img, dst, blockSize, apertureSize, k, borderType);
-
-    TEST_CYCLE()
-    {
-        cv::cornerHarris(img, dst, blockSize, apertureSize, k, borderType);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, CornerHarris, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_32FC1)),
-    testing::Values(BorderMode(cv::BORDER_REFLECT101), BorderMode(cv::BORDER_REPLICATE), BorderMode(cv::BORDER_REFLECT)),
-    testing::Values(BlockSize(3), BlockSize(5), BlockSize(7)),
-    testing::Values(ApertureSize(0), ApertureSize(3), ApertureSize(5), ApertureSize(7))));
-
-//////////////////////////////////////////////////////////////////////
-// CornerMinEigenVal
-
-GPU_PERF_TEST(CornerMinEigenVal, cv::gpu::DeviceInfo, MatType, BorderMode, BlockSize, ApertureSize)
-{
-    int type = GET_PARAM(1);
-    int borderType = GET_PARAM(2);
-    int blockSize = GET_PARAM(3);
-    int apertureSize = GET_PARAM(4);
-
-    cv::Mat img = readImage("gpu/stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    img.convertTo(img, type, type == CV_32F ? 1.0 / 255.0 : 1.0);
-
-    cv::Mat dst;
-
-    cv::cornerMinEigenVal(img, dst, blockSize, apertureSize, borderType);
-
-    TEST_CYCLE()
-    {
-        cv::cornerMinEigenVal(img, dst, blockSize, apertureSize, borderType);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, CornerMinEigenVal, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_32FC1)),
-    testing::Values(BorderMode(cv::BORDER_REFLECT101), BorderMode(cv::BORDER_REPLICATE), BorderMode(cv::BORDER_REFLECT)),
-    testing::Values(BlockSize(3), BlockSize(5), BlockSize(7)),
-    testing::Values(ApertureSize(0), ApertureSize(3), ApertureSize(5), ApertureSize(7))));
-
-//////////////////////////////////////////////////////////////////////
-// PyrDown
-
-GPU_PERF_TEST(PyrDown, cv::gpu::DeviceInfo, cv::Size, MatType)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-
-    cv::Mat src(size, type);
-    fill(src, 0, 255);
-
-    cv::Mat dst;
-
-    cv::pyrDown(src, dst);
-
-    TEST_CYCLE()
-    {
-        cv::pyrDown(src, dst);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, PyrDown, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4))));
-
-//////////////////////////////////////////////////////////////////////
-// PyrUp
-
-GPU_PERF_TEST(PyrUp, cv::gpu::DeviceInfo, cv::Size, MatType)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-
-    cv::Mat src(size, type);
-    fill(src, 0, 255);
-
-    cv::Mat dst;
-
-    cv::pyrUp(src, dst);
-
-    TEST_CYCLE()
-    {
-        cv::pyrUp(src, dst);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, PyrUp, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4))));
-
-//////////////////////////////////////////////////////////////////////
-// CvtColor
-
-GPU_PERF_TEST(CvtColor, cv::gpu::DeviceInfo, cv::Size, MatDepth, CvtColorInfo)
-{
-    cv::Size size = GET_PARAM(1);
-    int depth = GET_PARAM(2);
-    CvtColorInfo info = GET_PARAM(3);
-
-    cv::Mat src(size, CV_MAKETYPE(depth, info.scn));
-    fill(src, 0, 255);
-
-    cv::Mat dst;
-
-    cv::cvtColor(src, dst, info.code, info.dcn);
-
-    TEST_CYCLE()
-    {
-        cv::cvtColor(src, dst, info.code, info.dcn);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F)),
-    testing::Values(CvtColorInfo(4, 4, cv::COLOR_RGBA2BGRA),
-                    CvtColorInfo(4, 1, cv::COLOR_BGRA2GRAY),
-                    CvtColorInfo(1, 4, cv::COLOR_GRAY2BGRA),
-                    CvtColorInfo(3, 3, cv::COLOR_BGR2XYZ),
-                    CvtColorInfo(3, 3, cv::COLOR_XYZ2BGR),
-                    CvtColorInfo(3, 3, cv::COLOR_BGR2YCrCb),
-                    CvtColorInfo(3, 3, cv::COLOR_YCrCb2BGR),
-                    CvtColorInfo(3, 3, cv::COLOR_BGR2YUV),
-                    CvtColorInfo(3, 3, cv::COLOR_YUV2BGR),
-                    CvtColorInfo(3, 3, cv::COLOR_BGR2HSV),
-                    CvtColorInfo(3, 3, cv::COLOR_HSV2BGR),
-                    CvtColorInfo(3, 3, cv::COLOR_BGR2HLS),
-                    CvtColorInfo(3, 3, cv::COLOR_HLS2BGR),
-                    CvtColorInfo(3, 3, cv::COLOR_BGR2Lab),
-                    CvtColorInfo(3, 3, cv::COLOR_RGB2Lab),
-                    CvtColorInfo(3, 3, cv::COLOR_BGR2Luv),
-                    CvtColorInfo(3, 3, cv::COLOR_RGB2Luv),
-                    CvtColorInfo(3, 3, cv::COLOR_Lab2BGR),
-                    CvtColorInfo(3, 3, cv::COLOR_Lab2RGB),
-                    CvtColorInfo(3, 3, cv::COLOR_Luv2BGR),
-                    CvtColorInfo(3, 3, cv::COLOR_Luv2RGB),
-                    CvtColorInfo(1, 3, cv::COLOR_BayerBG2BGR),
-                    CvtColorInfo(1, 3, cv::COLOR_BayerGB2BGR),
-                    CvtColorInfo(1, 3, cv::COLOR_BayerRG2BGR),
-                    CvtColorInfo(1, 3, cv::COLOR_BayerGR2BGR),
-                    CvtColorInfo(4, 4, cv::COLOR_RGBA2mRGBA))));
-
-//////////////////////////////////////////////////////////////////////
-// HoughLines
-
-IMPLEMENT_PARAM_CLASS(DoSort, bool)
-
-GPU_PERF_TEST(HoughLines, cv::gpu::DeviceInfo, cv::Size, DoSort)
-{
-    declare.time(30.0);
-
-    const cv::Size size = GET_PARAM(1);
-
-    const float rho = 1.0f;
-    const float theta = CV_PI / 180.0f;
-    const int threshold = 300;
-
-    cv::RNG rng(123456789);
-
-    cv::Mat src(size, CV_8UC1, cv::Scalar::all(0));
-
-    const int numLines = rng.uniform(500, 2000);
-    for (int i = 0; i < numLines; ++i)
-    {
-        cv::Point p1(rng.uniform(0, src.cols), rng.uniform(0, src.rows));
-        cv::Point p2(rng.uniform(0, src.cols), rng.uniform(0, src.rows));
-        cv::line(src, p1, p2, cv::Scalar::all(255), 2);
-    }
-
-    std::vector<cv::Vec2f> lines;
-    cv::HoughLines(src, lines, rho, theta, threshold);
-
-    TEST_CYCLE()
-    {
-        cv::HoughLines(src, lines, rho, theta, threshold);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ImgProc, HoughLines, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(DoSort(false), DoSort(true))));
-
-#endif
--- a/modules/gpu/perf_cpu/perf_labeling.cpp
+++ b/modules/gpu/perf_cpu/perf_labeling.cpp
@ -1,158 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-//                          License Agreement
-//               For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2008-2011, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//  * Redistributions of source code must retain the above copyright notice,
-//    this list of conditions and the following disclaimer.
-//
-//  * Redistributions in binary form must reproduce the above copyright notice,
-//    this list of conditions and the following disclaimer in the documentation
-//    and/or other materials provided with the distribution.
-//
-//  * The name of the copyright holders may not be used to endorse or promote products
-//    derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//M*/
-
-#include "perf_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-namespace {
-
-    struct GreedyLabeling
-    {
-        struct dot
-        {
-            int x;
-            int y;
-
-            static dot make(int i, int j)
-            {
-                dot d; d.x = i; d.y = j;
-                return d;
-            }
-        };
-
-        struct InInterval
-        {
-            InInterval(const int& _lo, const int& _hi) : lo(-_lo), hi(_hi) {};
-            const int lo, hi;
-
-            bool operator() (const unsigned char a, const unsigned char b) const
-            {
-                int d = a - b;
-                return lo <= d && d <= hi;
-            }
-        };
-
-        GreedyLabeling(cv::Mat img)
-        : image(img), _labels(image.size(), CV_32SC1, cv::Scalar::all(-1)) {stack = new dot[image.cols * image.rows];}
-
-        ~GreedyLabeling(){delete[] stack;}
-
-        void operator() (cv::Mat labels) const
-        {
-            labels.setTo(cv::Scalar::all(-1));
-            InInterval inInt(0, 2);
-            int cc = -1;
-
-            int* dist_labels = (int*)labels.data;
-            int pitch = labels.step1();
-
-            unsigned char* source = (unsigned char*)image.data;
-            int width = image.cols;
-            int height = image.rows;
-
-            for (int j = 0; j < image.rows; ++j)
-                for (int i = 0; i < image.cols; ++i)
-                {
-                    if (dist_labels[j * pitch + i] != -1) continue;
-
-                    dot* top = stack;
-                    dot p = dot::make(i, j);
-                    cc++;
-
-                    dist_labels[j * pitch + i] = cc;
-
-                    while (top >= stack)
-                    {
-                        int*  dl = &dist_labels[p.y * pitch + p.x];
-                        unsigned char* sp = &source[p.y * image.step1() + p.x];
-
-                        dl[0] = cc;
-
-                        //right
-                        if( p.x < (width - 1) && dl[ +1] == -1 && inInt(sp[0], sp[+1]))
-                            *top++ = dot::make(p.x + 1, p.y);
-
-                        //left
-                        if( p.x > 0 && dl[-1] == -1 && inInt(sp[0], sp[-1]))
-                            *top++ = dot::make(p.x - 1, p.y);
-
-                        //bottom
-                        if( p.y < (height - 1) && dl[+pitch] == -1 && inInt(sp[0], sp[+image.step1()]))
-                            *top++ = dot::make(p.x, p.y + 1);
-
-                        //top
-                        if( p.y > 0 && dl[-pitch] == -1 && inInt(sp[0], sp[-image.step1()]))
-                            *top++ = dot::make(p.x, p.y - 1);
-
-                        p = *--top;
-                    }
-                }
-        }
-
-        cv::Mat image;
-        cv::Mat _labels;
-        dot* stack;
-    };
-}
-
-GPU_PERF_TEST(ConnectedComponents, cv::gpu::DeviceInfo, cv::Size)
-{
-    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
-    cv::gpu::setDevice(devInfo.deviceID());
-
-    cv::Mat image = readImage("gpu/labeling/aloe-disp.png", cv::IMREAD_GRAYSCALE);
-
-    GreedyLabeling host(image);
-
-    host(host._labels);
-
-    declare.time(1.0);
-
-    TEST_CYCLE()
-    {
-        host(host._labels);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Labeling, ConnectedComponents, testing::Combine(ALL_DEVICES, testing::Values(cv::Size(261, 262))));
-
-#endif
--- a/modules/gpu/perf_cpu/perf_main.cpp
+++ b/modules/gpu/perf_cpu/perf_main.cpp
@ -1,20 +0,0 @@
-#include "perf_cpu_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-int main(int argc, char **argv)
-{
-    testing::InitGoogleTest(&argc, argv);
-    perf::TestBase::Init(argc, argv);
-    return RUN_ALL_TESTS();
-}
-
-#else
-
-int main()
-{
-    printf("OpenCV was built without CUDA support\n");
-    return 0;
-}
-
-#endif
--- a/modules/gpu/perf_cpu/perf_matop.cpp
+++ b/modules/gpu/perf_cpu/perf_matop.cpp
@ -1,124 +0,0 @@
-#include "perf_cpu_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-//////////////////////////////////////////////////////////////////////
-// SetTo
-
-GPU_PERF_TEST(SetTo, cv::gpu::DeviceInfo, cv::Size, MatType)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-
-    cv::Mat src(size, type);
-    cv::Scalar val(1, 2, 3, 4);
-
-    src.setTo(val);
-
-    TEST_CYCLE()
-    {
-        src.setTo(val);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(MatOp, SetTo, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4),
-                    MatType(CV_64FC1), MatType(CV_64FC3), MatType(CV_64FC4))));
-
-//////////////////////////////////////////////////////////////////////
-// SetToMasked
-
-GPU_PERF_TEST(SetToMasked, cv::gpu::DeviceInfo, cv::Size, MatType)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-
-    cv::Mat src(size, type);
-    fill(src, 0, 255);
-
-    cv::Mat mask(size, CV_8UC1);
-    fill(mask, 0, 2);
-
-    cv::Scalar val(1, 2, 3, 4);
-
-    src.setTo(val, mask);
-
-    TEST_CYCLE()
-    {
-        src.setTo(val, mask);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(MatOp, SetToMasked, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4),
-                    MatType(CV_64FC1), MatType(CV_64FC3), MatType(CV_64FC4))));
-
-//////////////////////////////////////////////////////////////////////
-// CopyToMasked
-
-GPU_PERF_TEST(CopyToMasked, cv::gpu::DeviceInfo, cv::Size, MatType)
-{
-    cv::Size size = GET_PARAM(1);
-    int type = GET_PARAM(2);
-
-    cv::Mat src(size, type);
-    fill(src, 0, 255);
-
-    cv::Mat mask(size, CV_8UC1);
-    fill(mask, 0, 2);
-
-    cv::Mat dst;
-
-    src.copyTo(dst, mask);
-
-    TEST_CYCLE()
-    {
-        src.copyTo(dst, mask);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(MatOp, CopyToMasked, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
-                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
-                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4),
-                    MatType(CV_64FC1), MatType(CV_64FC3), MatType(CV_64FC4))));
-
-//////////////////////////////////////////////////////////////////////
-// ConvertTo
-
-GPU_PERF_TEST(ConvertTo, cv::gpu::DeviceInfo, cv::Size, MatDepth, MatDepth)
-{
-    cv::Size size = GET_PARAM(1);
-    int depth1 = GET_PARAM(2);
-    int depth2 = GET_PARAM(3);
-
-    cv::Mat src(size, depth1);
-    fill(src, 0, 255);
-
-    cv::Mat dst;
-
-    src.convertTo(dst, depth2, 0.5, 1.0);
-
-    TEST_CYCLE()
-    {
-        src.convertTo(dst, depth2, 0.5, 1.0);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(MatOp, ConvertTo, testing::Combine(
-    ALL_DEVICES,
-    GPU_TYPICAL_MAT_SIZES,
-    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F), MatDepth(CV_64F)),
-    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F), MatDepth(CV_64F))));
-
-#endif
--- a/modules/gpu/perf_cpu/perf_objdetect.cpp
+++ b/modules/gpu/perf_cpu/perf_objdetect.cpp
@ -1,74 +0,0 @@
-#include "perf_cpu_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-///////////////////////////////////////////////////////////////
-// HOG
-
-GPU_PERF_TEST_1(HOG, cv::gpu::DeviceInfo)
-{
-    cv::Mat img = readImage("gpu/hog/road.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    std::vector<cv::Rect> found_locations;
-
-    cv::HOGDescriptor hog;
-    hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
-
-    hog.detectMultiScale(img, found_locations);
-
-    TEST_CYCLE()
-    {
-        hog.detectMultiScale(img, found_locations);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ObjDetect, HOG, ALL_DEVICES);
-
-///////////////////////////////////////////////////////////////
-// HaarClassifier
-
-GPU_PERF_TEST_1(HaarClassifier, cv::gpu::DeviceInfo)
-{
-    cv::Mat img = readImage("gpu/haarcascade/group_1_640x480_VGA.pgm", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    cv::CascadeClassifier cascade;
-
-    ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/perf/haarcascade_frontalface_alt.xml")));
-
-    std::vector<cv::Rect> rects;
-
-    cascade.detectMultiScale(img, rects);
-
-    TEST_CYCLE()
-    {
-        cascade.detectMultiScale(img, rects);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ObjDetect, HaarClassifier, ALL_DEVICES);
-
-//===================== LBP cascade ==========================//
-GPU_PERF_TEST_1(LBPClassifier, cv::gpu::DeviceInfo)
-{
-    cv::Mat img = readImage("gpu/haarcascade/group_1_640x480_VGA.pgm", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
-
-    cv::CascadeClassifier cascade;
-
-    ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/lbpcascade/lbpcascade_frontalface.xml")));
-
-    std::vector<cv::Rect> rects;
-
-    cascade.detectMultiScale(img, rects);
-
-    TEST_CYCLE()
-    {
-        cascade.detectMultiScale(img, rects);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(ObjDetect, LBPClassifier, ALL_DEVICES);
-
-#endif
--- a/modules/gpu/perf_cpu/perf_utility.cpp
+++ b/modules/gpu/perf_cpu/perf_utility.cpp
@ -1,220 +0,0 @@
-#include "perf_cpu_precomp.hpp"
-
-using namespace std;
-using namespace cv;
-using namespace cv::gpu;
-
-void fill(Mat& m, double a, double b)
-{
-    RNG rng(123456789);
-    rng.fill(m, RNG::UNIFORM, a, b);
-}
-
-void PrintTo(const CvtColorInfo& info, ostream* os)
-{
-    static const char* str[] =
-    {
-        "BGR2BGRA",
-        "BGRA2BGR",
-        "BGR2RGBA",
-        "RGBA2BGR",
-        "BGR2RGB",
-        "BGRA2RGBA",
-
-        "BGR2GRAY",
-        "RGB2GRAY",
-        "GRAY2BGR",
-        "GRAY2BGRA",
-        "BGRA2GRAY",
-        "RGBA2GRAY",
-
-        "BGR2BGR565",
-        "RGB2BGR565",
-        "BGR5652BGR",
-        "BGR5652RGB",
-        "BGRA2BGR565",
-        "RGBA2BGR565",
-        "BGR5652BGRA",
-        "BGR5652RGBA",
-
-        "GRAY2BGR565",
-        "BGR5652GRAY",
-
-        "BGR2BGR555",
-        "RGB2BGR555",
-        "BGR5552BGR",
-        "BGR5552RGB",
-        "BGRA2BGR555",
-        "RGBA2BGR555",
-        "BGR5552BGRA",
-        "BGR5552RGBA",
-
-        "GRAY2BGR555",
-        "BGR5552GRAY",
-
-        "BGR2XYZ",
-        "RGB2XYZ",
-        "XYZ2BGR",
-        "XYZ2RGB",
-
-        "BGR2YCrCb",
-        "RGB2YCrCb",
-        "YCrCb2BGR",
-        "YCrCb2RGB",
-
-        "BGR2HSV",
-        "RGB2HSV",
-
-        "",
-        "",
-
-        "BGR2Lab",
-        "RGB2Lab",
-
-        "BayerBG2BGR",
-        "BayerGB2BGR",
-        "BayerRG2BGR",
-        "BayerGR2BGR",
-
-        "BGR2Luv",
-        "RGB2Luv",
-
-        "BGR2HLS",
-        "RGB2HLS",
-
-        "HSV2BGR",
-        "HSV2RGB",
-
-        "Lab2BGR",
-        "Lab2RGB",
-        "Luv2BGR",
-        "Luv2RGB",
-
-        "HLS2BGR",
-        "HLS2RGB",
-
-        "BayerBG2BGR_VNG",
-        "BayerGB2BGR_VNG",
-        "BayerRG2BGR_VNG",
-        "BayerGR2BGR_VNG",
-
-        "BGR2HSV_FULL",
-        "RGB2HSV_FULL",
-        "BGR2HLS_FULL",
-        "RGB2HLS_FULL",
-
-        "HSV2BGR_FULL",
-        "HSV2RGB_FULL",
-        "HLS2BGR_FULL",
-        "HLS2RGB_FULL",
-
-        "LBGR2Lab",
-        "LRGB2Lab",
-        "LBGR2Luv",
-        "LRGB2Luv",
-
-        "Lab2LBGR",
-        "Lab2LRGB",
-        "Luv2LBGR",
-        "Luv2LRGB",
-
-        "BGR2YUV",
-        "RGB2YUV",
-        "YUV2BGR",
-        "YUV2RGB",
-
-        "BayerBG2GRAY",
-        "BayerGB2GRAY",
-        "BayerRG2GRAY",
-        "BayerGR2GRAY",
-
-        //YUV 4:2:0 formats family
-        "YUV2RGB_NV12",
-        "YUV2BGR_NV12",
-        "YUV2RGB_NV21",
-        "YUV2BGR_NV21",
-
-        "YUV2RGBA_NV12",
-        "YUV2BGRA_NV12",
-        "YUV2RGBA_NV21",
-        "YUV2BGRA_NV21",
-
-        "YUV2RGB_YV12",
-        "YUV2BGR_YV12",
-        "YUV2RGB_IYUV",
-        "YUV2BGR_IYUV",
-
-        "YUV2RGBA_YV12",
-        "YUV2BGRA_YV12",
-        "YUV2RGBA_IYUV",
-        "YUV2BGRA_IYUV",
-
-        "YUV2GRAY_420",
-
-        //YUV 4:2:2 formats family
-        "YUV2RGB_UYVY",
-        "YUV2BGR_UYVY",
-        "YUV2RGB_VYUY",
-        "YUV2BGR_VYUY",
-
-        "YUV2RGBA_UYVY",
-        "YUV2BGRA_UYVY",
-        "YUV2RGBA_VYUY",
-        "YUV2BGRA_VYUY",
-
-        "YUV2RGB_YUY2",
-        "YUV2BGR_YUY2",
-        "YUV2RGB_YVYU",
-        "YUV2BGR_YVYU",
-
-        "YUV2RGBA_YUY2",
-        "YUV2BGRA_YUY2",
-        "YUV2RGBA_YVYU",
-        "YUV2BGRA_YVYU",
-
-        "YUV2GRAY_UYVY",
-        "YUV2GRAY_YUY2",
-
-        // alpha premultiplication
-        "RGBA2mRGBA",
-        "mRGBA2RGBA",
-
-        "COLORCVT_MAX"
-    };
-
-    *os << str[info.code];
-}
-
-void cv::gpu::PrintTo(const DeviceInfo& info, ostream* os)
-{
-    *os << info.name();
-}
-
-Mat readImage(const string& fileName, int flags)
-{
-    return imread(perf::TestBase::getDataPath(fileName), flags);
-}
-
-const vector<DeviceInfo>& devices()
-{
-    static vector<DeviceInfo> devs;
-    static bool first = true;
-
-    if (first)
-    {
-        int deviceCount = getCudaEnabledDeviceCount();
-
-        devs.reserve(deviceCount);
-
-        for (int i = 0; i < deviceCount; ++i)
-        {
-            DeviceInfo info(i);
-            if (info.isCompatible())
-                devs.push_back(info);
-        }
-
-        first = false;
-    }
-
-    return devs;
-}
--- a/modules/gpu/perf_cpu/perf_utility.hpp
+++ b/modules/gpu/perf_cpu/perf_utility.hpp
@ -1,77 +0,0 @@
-#ifndef __OPENCV_PERF_GPU_UTILITY_HPP__
-#define __OPENCV_PERF_GPU_UTILITY_HPP__
-
-void fill(cv::Mat& m, double a, double b);
-
-using perf::MatType;
-using perf::MatDepth;
-
-CV_ENUM(BorderMode, cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONSTANT, cv::BORDER_REFLECT, cv::BORDER_WRAP)
-        CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::INTER_AREA)
-CV_ENUM(NormType, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_HAMMING)
-
-struct CvtColorInfo
-{
-    int scn;
-    int dcn;
-    int code;
-
-    explicit CvtColorInfo(int scn_=0, int dcn_=0, int code_=0) : scn(scn_), dcn(dcn_), code(code_) {}
-};
-
-void PrintTo(const CvtColorInfo& info, std::ostream* os);
-
-#define IMPLEMENT_PARAM_CLASS(name, type) \
-    class name \
-    { \
-    public: \
-        name ( type arg = type ()) : val_(arg) {} \
-        operator type () const {return val_;} \
-    private: \
-        type val_; \
-    }; \
-    inline void PrintTo( name param, std::ostream* os) \
-    { \
-        *os << #name <<  " = " << testing::PrintToString(static_cast< type >(param)); \
-    }
-
-IMPLEMENT_PARAM_CLASS(Channels, int)
-
-namespace cv { namespace gpu
-{
-    void PrintTo(const cv::gpu::DeviceInfo& info, std::ostream* os);
-}}
-
-#define GPU_PERF_TEST(name, ...) \
-    struct name : perf::TestBaseWithParam< std::tr1::tuple< __VA_ARGS__ > > \
-    { \
-    public: \
-        name() {} \
-    protected: \
-        void PerfTestBody(); \
-    }; \
-    TEST_P(name, perf){ RunPerfTestBody(); } \
-    void name :: PerfTestBody()
-
-#define GPU_PERF_TEST_1(name, param_type) \
-    struct name : perf::TestBaseWithParam< param_type > \
-    { \
-    public: \
-        name() {} \
-    protected: \
-        void PerfTestBody(); \
-    }; \
-    TEST_P(name, perf){ RunPerfTestBody(); } \
-    void name :: PerfTestBody()
-
-#define GPU_TYPICAL_MAT_SIZES testing::Values(perf::szSXGA, perf::sz1080p, cv::Size(1800, 1500))
-
-cv::Mat readImage(const std::string& fileName, int flags = cv::IMREAD_COLOR);
-
-const std::vector<cv::gpu::DeviceInfo>& devices();
-
-#define ALL_DEVICES testing::ValuesIn(devices())
-
-#define GET_PARAM(k) std::tr1::get< k >(GetParam())
-
-#endif // __OPENCV_PERF_GPU_UTILITY_HPP__
--- a/modules/gpu/perf_cpu/perf_video.cpp
+++ b/modules/gpu/perf_cpu/perf_video.cpp
@ -1,466 +0,0 @@
-#include "perf_cpu_precomp.hpp"
-
-#ifdef HAVE_CUDA
-
-//////////////////////////////////////////////////////
-// GoodFeaturesToTrack
-
-IMPLEMENT_PARAM_CLASS(MinDistance, double)
-
-GPU_PERF_TEST(GoodFeaturesToTrack, cv::gpu::DeviceInfo, MinDistance)
-{
-    double minDistance = GET_PARAM(1);
-
-    cv::Mat image = readImage("gpu/perf/aloe.jpg", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(image.empty());
-
-    cv::Mat corners;
-
-    cv::goodFeaturesToTrack(image, corners, 8000, 0.01, minDistance);
-
-    TEST_CYCLE()
-    {
-        cv::goodFeaturesToTrack(image, corners, 8000, 0.01, minDistance);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Video, GoodFeaturesToTrack, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(MinDistance(0.0), MinDistance(3.0))));
-
-//////////////////////////////////////////////////////
-// PyrLKOpticalFlowSparse
-
-IMPLEMENT_PARAM_CLASS(GraySource, bool)
-IMPLEMENT_PARAM_CLASS(Points, int)
-IMPLEMENT_PARAM_CLASS(WinSize, int)
-IMPLEMENT_PARAM_CLASS(Levels, int)
-IMPLEMENT_PARAM_CLASS(Iters, int)
-
-GPU_PERF_TEST(PyrLKOpticalFlowSparse, cv::gpu::DeviceInfo, GraySource, Points, WinSize, Levels, Iters)
-{
-    bool useGray = GET_PARAM(1);
-    int points = GET_PARAM(2);
-    int win_size = GET_PARAM(3);
-    int levels = GET_PARAM(4);
-    int iters = GET_PARAM(5);
-
-    cv::Mat frame0 = readImage("gpu/opticalflow/frame0.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
-    ASSERT_FALSE(frame0.empty());
-
-    cv::Mat frame1 = readImage("gpu/opticalflow/frame1.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
-    ASSERT_FALSE(frame1.empty());
-
-    cv::Mat gray_frame;
-    if (useGray)
-        gray_frame = frame0;
-    else
-        cv::cvtColor(frame0, gray_frame, cv::COLOR_BGR2GRAY);
-
-    cv::Mat pts;
-    cv::goodFeaturesToTrack(gray_frame, pts, points, 0.01, 0.0);
-
-    cv::Mat nextPts;
-    cv::Mat status;
-
-    cv::calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, cv::noArray(),
-                             cv::Size(win_size, win_size), levels - 1,
-                             cv::TermCriteria(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, iters, 0.01));
-
-    declare.time(20.0);
-
-    TEST_CYCLE()
-    {
-        cv::calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, cv::noArray(),
-                                 cv::Size(win_size, win_size), levels - 1,
-                                 cv::TermCriteria(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, iters, 0.01));
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Video, PyrLKOpticalFlowSparse, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(GraySource(true), GraySource(false)),
-    testing::Values(Points(1000), Points(2000), Points(4000), Points(8000)),
-    testing::Values(WinSize(9), WinSize(13), WinSize(17), WinSize(21)),
-    testing::Values(Levels(1), Levels(2), Levels(3)),
-    testing::Values(Iters(1), Iters(10), Iters(30))));
-
-//////////////////////////////////////////////////////
-// FarnebackOpticalFlowTest
-
-GPU_PERF_TEST_1(FarnebackOpticalFlowTest, cv::gpu::DeviceInfo)
-{
-    cv::Mat frame0 = readImage("gpu/opticalflow/frame0.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame0.empty());
-
-    cv::Mat frame1 = readImage("gpu/opticalflow/frame1.png", cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(frame1.empty());
-
-    cv::Mat flow;
-
-    int numLevels = 5;
-    double pyrScale = 0.5;
-    int winSize = 13;
-    int numIters = 10;
-    int polyN = 5;
-    double polySigma = 1.1;
-    int flags = 0;
-
-    cv::calcOpticalFlowFarneback(frame0, frame1, flow, pyrScale, numLevels, winSize, numIters, polyN, polySigma, flags);
-
-    declare.time(10);
-
-    TEST_CYCLE()
-    {
-        cv::calcOpticalFlowFarneback(frame0, frame1, flow, pyrScale, numLevels, winSize, numIters, polyN, polySigma, flags);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Video, FarnebackOpticalFlowTest, ALL_DEVICES);
-
-//////////////////////////////////////////////////////
-// FGDStatModel
-
-namespace cv
-{
-    template<> void Ptr<CvBGStatModel>::delete_obj()
-    {
-        cvReleaseBGStatModel(&obj);
-    }
-}
-
-GPU_PERF_TEST(FGDStatModel, cv::gpu::DeviceInfo, std::string)
-{
-    std::string inputFile = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
-
-    cv::VideoCapture cap(inputFile);
-    ASSERT_TRUE(cap.isOpened());
-
-    cv::Mat frame;
-    cap >> frame;
-    ASSERT_FALSE(frame.empty());
-
-    IplImage ipl_frame = frame;
-    cv::Ptr<CvBGStatModel> model(cvCreateFGDStatModel(&ipl_frame));
-
-    declare.time(60);
-
-    for (int i = 0; i < 10; ++i)
-    {
-        cap >> frame;
-        ASSERT_FALSE(frame.empty());
-
-        ipl_frame = frame;
-
-        startTimer();
-        next();
-
-        cvUpdateBGStatModel(&ipl_frame, model);
-
-        stopTimer();
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Video, FGDStatModel, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi"))));
-
-//////////////////////////////////////////////////////
-// MOG
-
-IMPLEMENT_PARAM_CLASS(LearningRate, double)
-
-GPU_PERF_TEST(MOG, cv::gpu::DeviceInfo, std::string, Channels, LearningRate)
-{
-    std::string inputFile = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
-    int cn = GET_PARAM(2);
-    double learningRate = GET_PARAM(3);
-
-    cv::VideoCapture cap(inputFile);
-    ASSERT_TRUE(cap.isOpened());
-
-    cv::Mat frame;
-
-    cv::BackgroundSubtractorMOG mog;
-    cv::Mat foreground;
-
-    cap >> frame;
-    ASSERT_FALSE(frame.empty());
-
-    if (cn != 3)
-    {
-        cv::Mat temp;
-        if (cn == 1)
-            cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
-        else
-            cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
-        cv::swap(temp, frame);
-    }
-
-    mog(frame, foreground, learningRate);
-
-    for (int i = 0; i < 10; ++i)
-    {
-        cap >> frame;
-        ASSERT_FALSE(frame.empty());
-
-        if (cn != 3)
-        {
-            cv::Mat temp;
-            if (cn == 1)
-                cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
-            else
-                cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
-            cv::swap(temp, frame);
-        }
-
-        startTimer(); next();
-        mog(frame, foreground, learningRate);
-        stopTimer();
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Video, MOG, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi")),
-    testing::Values(Channels(1), Channels(3)/*, Channels(4)*/),
-    testing::Values(LearningRate(0.0), LearningRate(0.01))));
-
-//////////////////////////////////////////////////////
-// MOG2
-
-GPU_PERF_TEST(MOG2_update, cv::gpu::DeviceInfo, std::string, Channels)
-{
-    std::string inputFile = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
-    int cn = GET_PARAM(2);
-
-    cv::VideoCapture cap(inputFile);
-    ASSERT_TRUE(cap.isOpened());
-
-    cv::Mat frame;
-
-    cv::BackgroundSubtractorMOG2 mog2;
-    cv::Mat foreground;
-
-    cap >> frame;
-    ASSERT_FALSE(frame.empty());
-
-    if (cn != 3)
-    {
-        cv::Mat temp;
-        if (cn == 1)
-            cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
-        else
-            cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
-        cv::swap(temp, frame);
-    }
-
-    mog2(frame, foreground);
-
-    for (int i = 0; i < 10; ++i)
-    {
-        cap >> frame;
-        ASSERT_FALSE(frame.empty());
-
-        if (cn != 3)
-        {
-            cv::Mat temp;
-            if (cn == 1)
-                cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
-            else
-                cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
-            cv::swap(temp, frame);
-        }
-
-        startTimer(); next();
-        mog2(frame, foreground);
-        stopTimer();
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Video, MOG2_update, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi")),
-    testing::Values(Channels(1), Channels(3)/*, Channels(4)*/)));
-
-GPU_PERF_TEST(MOG2_getBackgroundImage, cv::gpu::DeviceInfo, std::string, Channels)
-{
-    std::string inputFile = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
-    int cn = GET_PARAM(2);
-
-    cv::VideoCapture cap(inputFile);
-    ASSERT_TRUE(cap.isOpened());
-
-    cv::Mat frame;
-
-    cv::BackgroundSubtractorMOG2 mog2;
-    cv::Mat foreground;
-
-    for (int i = 0; i < 10; ++i)
-    {
-        cap >> frame;
-        ASSERT_FALSE(frame.empty());
-
-        if (cn != 3)
-        {
-            cv::Mat temp;
-            if (cn == 1)
-                cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
-            else
-                cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
-            cv::swap(temp, frame);
-        }
-
-        mog2(frame, foreground);
-    }
-
-    cv::Mat background;
-    mog2.getBackgroundImage(background);
-
-    TEST_CYCLE()
-    {
-        mog2.getBackgroundImage(background);
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Video, MOG2_getBackgroundImage, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi")),
-    testing::Values(/*Channels(1),*/ Channels(3)/*, Channels(4)*/)));
-
-//////////////////////////////////////////////////////
-// GMG
-
-IMPLEMENT_PARAM_CLASS(MaxFeatures, int)
-
-GPU_PERF_TEST(GMG, cv::gpu::DeviceInfo, std::string, Channels, MaxFeatures)
-{
-    std::string inputFile = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
-    int cn = GET_PARAM(2);
-    int maxFeatures = GET_PARAM(3);
-
-    cv::VideoCapture cap(inputFile);
-    ASSERT_TRUE(cap.isOpened());
-
-    cv::Mat frame;
-    cap >> frame;
-    ASSERT_FALSE(frame.empty());
-
-    if (cn != 3)
-    {
-        cv::Mat temp;
-        if (cn == 1)
-            cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
-        else
-            cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
-        cv::swap(temp, frame);
-    }
-
-    cv::Mat fgmask;
-    cv::Mat zeros(frame.size(), CV_8UC1, cv::Scalar::all(0));
-
-    cv::BackgroundSubtractorGMG gmg;
-    gmg.set("maxFeatures", maxFeatures);
-    gmg.initialize(frame.size(), 0.0, 255.0);
-
-    gmg(frame, fgmask);
-
-    for (int i = 0; i < 150; ++i)
-    {
-        cap >> frame;
-        if (frame.empty())
-        {
-            cap.open(inputFile);
-            cap >> frame;
-        }
-
-        if (cn != 3)
-        {
-            cv::Mat temp;
-            if (cn == 1)
-                cv::cvtColor(frame, temp, cv::COLOR_BGR2GRAY);
-            else
-                cv::cvtColor(frame, temp, cv::COLOR_BGR2BGRA);
-            cv::swap(temp, frame);
-        }
-
-        startTimer(); next();
-        gmg(frame, fgmask);
-        stopTimer();
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Video, GMG, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi")),
-    testing::Values(Channels(1), Channels(3), Channels(4)),
-    testing::Values(MaxFeatures(20), MaxFeatures(40), MaxFeatures(60))));
-
-//////////////////////////////////////////////////////
-// VideoWriter
-
-#ifdef WIN32
-
-GPU_PERF_TEST(VideoWriter, cv::gpu::DeviceInfo, std::string)
-{
-    const double FPS = 25.0;
-
-    std::string inputFile = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
-    std::string outputFile = cv::tempfile(".avi");
-
-    cv::VideoCapture reader(inputFile);
-    ASSERT_TRUE( reader.isOpened() );
-
-    cv::VideoWriter writer;
-
-    cv::Mat frame;
-
-    declare.time(30);
-
-    for (int i = 0; i < 10; ++i)
-    {
-        reader >> frame;
-        ASSERT_FALSE(frame.empty());
-
-        if (!writer.isOpened())
-            writer.open(outputFile, CV_FOURCC('X', 'V', 'I', 'D'), FPS, frame.size());
-
-        startTimer(); next();
-        writer.write(frame);
-        stopTimer();
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Video, VideoWriter, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi"))));
-
-#endif // WIN32
-
-//////////////////////////////////////////////////////
-// VideoReader
-
-GPU_PERF_TEST(VideoReader, cv::gpu::DeviceInfo, std::string)
-{
-    std::string inputFile = perf::TestBase::getDataPath(std::string("gpu/video/") + GET_PARAM(1));
-
-    cv::VideoCapture reader(inputFile);
-    ASSERT_TRUE( reader.isOpened() );
-
-    cv::Mat frame;
-
-    reader >> frame;
-
-    declare.time(20);
-
-    TEST_CYCLE_N(10)
-    {
-        reader >> frame;
-    }
-}
-
-INSTANTIATE_TEST_CASE_P(Video, VideoReader, testing::Combine(
-    ALL_DEVICES,
-    testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi"))));
-
-#endif
--- a/modules/gpu/src/brute_force_matcher.cpp
+++ b/modules/gpu/src/brute_force_matcher.cpp
@ -420,16 +420,16 @@ void cv::gpu::BFMatcher_GPU::matchConvert(const Mat& trainIdx, const Mat& imgIdx
    const float* distance_ptr =  distance.ptr<float>();
    for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
    {
-        int trainIdx = *trainIdx_ptr;
+        int _trainIdx = *trainIdx_ptr;

-        if (trainIdx == -1)
+        if (_trainIdx == -1)
            continue;

-        int imgIdx = *imgIdx_ptr;
+        int _imgIdx = *imgIdx_ptr;

-        float distance = *distance_ptr;
+        float _distance = *distance_ptr;

-        DMatch m(queryIdx, trainIdx, imgIdx, distance);
+        DMatch m(queryIdx, _trainIdx, _imgIdx, _distance);

        matches.push_back(m);
    }
@ -558,13 +558,13 @@ void cv::gpu::BFMatcher_GPU::knnMatchConvert(const Mat& trainIdx, const Mat& dis

        for (int i = 0; i < k; ++i, ++trainIdx_ptr, ++distance_ptr)
        {
-            int trainIdx = *trainIdx_ptr;
+            int _trainIdx = *trainIdx_ptr;

-            if (trainIdx != -1)
+            if (_trainIdx != -1)
            {
-                float distance = *distance_ptr;
+                float _distance = *distance_ptr;

-                DMatch m(queryIdx, trainIdx, 0, distance);
+                DMatch m(queryIdx, _trainIdx, 0, _distance);

                curMatches.push_back(m);
            }
@ -680,15 +680,15 @@ void cv::gpu::BFMatcher_GPU::knnMatch2Convert(const Mat& trainIdx, const Mat& im

        for (int i = 0; i < 2; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
        {
-            int trainIdx = *trainIdx_ptr;
+            int _trainIdx = *trainIdx_ptr;

-            if (trainIdx != -1)
+            if (_trainIdx != -1)
            {
-                int imgIdx = *imgIdx_ptr;
+                int _imgIdx = *imgIdx_ptr;

-                float distance = *distance_ptr;
+                float _distance = *distance_ptr;

-                DMatch m(queryIdx, trainIdx, imgIdx, distance);
+                DMatch m(queryIdx, _trainIdx, _imgIdx, _distance);

                curMatches.push_back(m);
            }
@ -868,25 +868,25 @@ void cv::gpu::BFMatcher_GPU::radiusMatchConvert(const Mat& trainIdx, const Mat&
        const int* trainIdx_ptr = trainIdx.ptr<int>(queryIdx);
        const float* distance_ptr = distance.ptr<float>(queryIdx);

-        const int nMatches = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
+        const int nMatched = std::min(nMatches_ptr[queryIdx], trainIdx.cols);

-        if (nMatches == 0)
+        if (nMatched == 0)
        {
            if (!compactResult)
                matches.push_back(vector<DMatch>());
            continue;
        }

-        matches.push_back(vector<DMatch>(nMatches));
+        matches.push_back(vector<DMatch>(nMatched));
        vector<DMatch>& curMatches = matches.back();

-        for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++distance_ptr)
+        for (int i = 0; i < nMatched; ++i, ++trainIdx_ptr, ++distance_ptr)
        {
-            int trainIdx = *trainIdx_ptr;
+            int _trainIdx = *trainIdx_ptr;

-            float distance = *distance_ptr;
+            float _distance = *distance_ptr;

-            DMatch m(queryIdx, trainIdx, 0, distance);
+            DMatch m(queryIdx, _trainIdx, 0, _distance);

            curMatches[i] = m;
        }
@ -1009,9 +1009,9 @@ void cv::gpu::BFMatcher_GPU::radiusMatchConvert(const Mat& trainIdx, const Mat&
        const int* imgIdx_ptr = imgIdx.ptr<int>(queryIdx);
        const float* distance_ptr = distance.ptr<float>(queryIdx);

-        const int nMatches = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
+        const int nMatched = std::min(nMatches_ptr[queryIdx], trainIdx.cols);

-        if (nMatches == 0)
+        if (nMatched == 0)
        {
            if (!compactResult)
                matches.push_back(vector<DMatch>());
@ -1020,9 +1020,9 @@ void cv::gpu::BFMatcher_GPU::radiusMatchConvert(const Mat& trainIdx, const Mat&

        matches.push_back(vector<DMatch>());
        vector<DMatch>& curMatches = matches.back();
-        curMatches.reserve(nMatches);
+        curMatches.reserve(nMatched);

-        for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
+        for (int i = 0; i < nMatched; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
        {
            int _trainIdx = *trainIdx_ptr;
            int _imgIdx = *imgIdx_ptr;
--- a/modules/gpu/src/calib3d.cpp
+++ b/modules/gpu/src/calib3d.cpp
@ -56,14 +56,14 @@ void cv::gpu::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&, Mat

 #else

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace transform_points 
+    namespace transform_points
    {
        void call(const DevMem2D_<float3> src, const float* rot, const float* transl, DevMem2D_<float3> dst, cudaStream_t stream);
    }

-    namespace project_points 
+    namespace project_points
    {
        void call(const DevMem2D_<float3> src, const float* rot, const float* transl, const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);
    }
@ -154,11 +154,11 @@ namespace
    class TransformHypothesesGenerator
    {
    public:
-        TransformHypothesesGenerator(const Mat& object_, const Mat& image_, const Mat& dist_coef_, 
-                                     const Mat& camera_mat_, int num_points_, int subset_size_, 
+        TransformHypothesesGenerator(const Mat& object_, const Mat& image_, const Mat& dist_coef_,
+                                     const Mat& camera_mat_, int num_points_, int subset_size_,
                                     Mat rot_matrices_, Mat transl_vectors_)
-                : object(&object_), image(&image_), dist_coef(&dist_coef_), camera_mat(&camera_mat_), 
-                  num_points(num_points_), subset_size(subset_size_), rot_matrices(rot_matrices_), 
+                : object(&object_), image(&image_), dist_coef(&dist_coef_), camera_mat(&camera_mat_),
+                  num_points(num_points_), subset_size(subset_size_), rot_matrices(rot_matrices_),
                  transl_vectors(transl_vectors_) {}

        void operator()(const BlockedRange& range) const
@ -211,9 +211,10 @@ namespace

 void cv::gpu::solvePnPRansac(const Mat& object, const Mat& image, const Mat& camera_mat,
                             const Mat& dist_coef, Mat& rvec, Mat& tvec, bool use_extrinsic_guess,
-                             int num_iters, float max_dist, int min_inlier_count, 
+                             int num_iters, float max_dist, int min_inlier_count,
                             vector<int>* inliers)
 {
+    (void)min_inlier_count;
    CV_Assert(object.rows == 1 && object.cols > 0 && object.type() == CV_32FC3);
    CV_Assert(image.rows == 1 && image.cols > 0 && image.type() == CV_32FC2);
    CV_Assert(object.cols == image.cols);
@ -236,7 +237,7 @@ void cv::gpu::solvePnPRansac(const Mat& object, const Mat& image, const Mat& cam
    Mat transl_vectors(1, num_iters * 3, CV_32F);

    // Generate set of hypotheses using small subsets of the input data
-    TransformHypothesesGenerator body(object, image_normalized, empty_dist_coef, eye_camera_mat, 
+    TransformHypothesesGenerator body(object, image_normalized, empty_dist_coef, eye_camera_mat,
                                      num_points, subset_size, rot_matrices, transl_vectors);
    parallel_for(BlockedRange(0, num_iters), body);

@ -246,7 +247,7 @@ void cv::gpu::solvePnPRansac(const Mat& object, const Mat& image, const Mat& cam
    GpuMat d_hypothesis_scores(1, num_iters, CV_32S);
    solve_pnp_ransac::computeHypothesisScores(
            num_iters, num_points, rot_matrices.ptr<float>(), transl_vectors.ptr<float3>(),
-            d_object.ptr<float3>(), d_image_normalized.ptr<float2>(), max_dist * max_dist, 
+            d_object.ptr<float3>(), d_image_normalized.ptr<float2>(), max_dist * max_dist,
            d_hypothesis_scores.ptr<int>());

    // Find the best hypothesis index
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@ -143,7 +143,7 @@ public:
    }

    unsigned int process(const GpuMat& image, GpuMat& objectsBuf, float scaleFactor, int minNeighbors,
-                      bool findLargestObject, bool visualizeInPlace, cv::Size minSize, cv::Size maxObjectSize)
+                      bool findLargestObject, bool visualizeInPlace, cv::Size minSize, cv::Size /*maxObjectSize*/)
    {
        CV_Assert( scaleFactor > 1 && image.depth() == CV_8U);

@ -380,12 +380,12 @@ public:
    LbpCascade(){}
    virtual ~LbpCascade(){}

-    virtual unsigned int process(const GpuMat& image, GpuMat& objects, float scaleFactor, int groupThreshold, bool findLargestObject,
-        bool visualizeInPlace, cv::Size minObjectSize, cv::Size maxObjectSize)
+    virtual unsigned int process(const GpuMat& image, GpuMat& objects, float scaleFactor, int groupThreshold, bool /*findLargestObject*/,
+        bool /*visualizeInPlace*/, cv::Size minObjectSize, cv::Size maxObjectSize)
    {
        CV_Assert(scaleFactor > 1 && image.depth() == CV_8U);

-        const int defaultObjSearchNum = 100;
+        // const int defaultObjSearchNum = 100;
        const float grouping_eps = 0.2f;

        if( !objects.empty() && objects.depth() == CV_32S)
--- a/modules/gpu/src/cuda/ccomponetns.cu
+++ b/modules/gpu/src/cuda/ccomponetns.cu
@ -316,7 +316,7 @@ namespace cv { namespace gpu { namespace device
                        }
                    }

-                changed = Emulation::sycthOr(changed);
+                changed = Emulation::syncthreadsOr(changed);

                if (!changed)
                    break;
@ -474,7 +474,7 @@ namespace cv { namespace gpu { namespace device
                        }
                    }
                }
-            } while (Emulation::sycthOr(changed));
+            } while (Emulation::syncthreadsOr(changed));
        }

        __global__ void flatten(const DevMem2D edges, DevMem2Di comps)
--- a/modules/gpu/src/cuda/hough.cu
+++ b/modules/gpu/src/cuda/hough.cu
@ -59,7 +59,7 @@ namespace cv { namespace gpu { namespace device
        {
            __shared__ int s_queues[4][32 * PIXELS_PER_THREAD];
            __shared__ int s_qsize[4];
-            __shared__ int s_start[4];
+            __shared__ int s_globStart[4];

            const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -73,9 +73,10 @@ namespace cv { namespace gpu { namespace device
            __syncthreads();

            // fill the queue
+            const uchar* srcRow = src.ptr(y);
            for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < src.cols; ++i, xx += blockDim.x)
            {
-                if (src(y, xx))
+                if (srcRow[xx])
                {
                    const unsigned int val = (y << 16) | xx;
                    const int qidx = Emulation::smem::atomicAdd(&s_qsize[threadIdx.y], 1);
@ -89,36 +90,34 @@ namespace cv { namespace gpu { namespace device
            if (threadIdx.x == 0 && threadIdx.y == 0)
            {
                // find how many items are stored in each list
-                int total_size = 0;
+                int totalSize = 0;
                for (int i = 0; i < blockDim.y; ++i)
                {
-                    s_start[i] = total_size;
-                    total_size += s_qsize[i];
+                    s_globStart[i] = totalSize;
+                    totalSize += s_qsize[i];
                }

                // calculate the offset in the global list
-                const int global_offset = atomicAdd(&g_counter, total_size);
+                const int globalOffset = atomicAdd(&g_counter, totalSize);
                for (int i = 0; i < blockDim.y; ++i)
-                    s_start[i] += global_offset;
+                    s_globStart[i] += globalOffset;
            }

            __syncthreads();

            // copy local queues to global queue
            const int qsize = s_qsize[threadIdx.y];
-            for(int i = threadIdx.x; i < qsize; i += blockDim.x)
-            {
-                const unsigned int val = s_queues[threadIdx.y][i];
-                list[s_start[threadIdx.y] + i] = val;
-            }
+            int gidx = s_globStart[threadIdx.y] + threadIdx.x;
+            for(int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
+                list[gidx] = s_queues[threadIdx.y][i];
        }

        int buildPointList_gpu(DevMem2Db src, unsigned int* list)
        {
-            void* counter_ptr;
-            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
+            void* counterPtr;
+            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

-            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
+            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 4);
            const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));
@ -130,10 +129,10 @@ namespace cv { namespace gpu { namespace device

            cudaSafeCall( cudaDeviceSynchronize() );

-            int total_count;
-            cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
+            int totalCount;
+            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

-            return total_count;
+            return totalCount;
        }

        ////////////////////////////////////////////////////////////////////////
@ -144,24 +143,26 @@ namespace cv { namespace gpu { namespace device
            const int n = blockIdx.x;
            const float ang = n * theta;

-            float sin_ang;
-            float cos_ang;
-            sincosf(ang, &sin_ang, &cos_ang);
+            float sinVal;
+            float cosVal;
+            sincosf(ang, &sinVal, &cosVal);
+            sinVal *= irho;
+            cosVal *= irho;

-            const float tabSin = sin_ang * irho;
-            const float tabCos = cos_ang * irho;
+            const int shift = (numrho - 1) / 2;

+            int* accumRow = accum.ptr(n + 1);
            for (int i = threadIdx.x; i < count; i += blockDim.x)
            {
-                const unsigned int qvalue = list[i];
+                const unsigned int val = list[i];

-                const int x = (qvalue & 0x0000FFFF);
-                const int y = (qvalue >> 16) & 0x0000FFFF;
+                const int x = (val & 0xFFFF);
+                const int y = (val >> 16) & 0xFFFF;

-                int r = __float2int_rn(x * tabCos + y * tabSin);
-                r += (numrho - 1) / 2;
+                int r = __float2int_rn(x * cosVal + y * sinVal);
+                r += shift;

-                ::atomicAdd(accum.ptr(n + 1) + r + 1, 1);
+                ::atomicAdd(accumRow + r + 1, 1);
            }
        }

@ -177,30 +178,32 @@ namespace cv { namespace gpu { namespace device
            const int n = blockIdx.x;
            const float ang = n * theta;

-            float sin_ang;
-            float cos_ang;
-            sincosf(ang, &sin_ang, &cos_ang);
+            float sinVal;
+            float cosVal;
+            sincosf(ang, &sinVal, &cosVal);
+            sinVal *= irho;
+            cosVal *= irho;

-            const float tabSin = sin_ang * irho;
-            const float tabCos = cos_ang * irho;
+            const int shift = (numrho - 1) / 2;

            for (int i = threadIdx.x; i < count; i += blockDim.x)
            {
-                const unsigned int qvalue = list[i];
+                const unsigned int val = list[i];

-                const int x = (qvalue & 0x0000FFFF);
-                const int y = (qvalue >> 16) & 0x0000FFFF;
+                const int x = (val & 0xFFFF);
+                const int y = (val >> 16) & 0xFFFF;

-                int r = __float2int_rn(x * tabCos + y * tabSin);
-                r += (numrho - 1) / 2;
+                int r = __float2int_rn(x * cosVal + y * sinVal);
+                r += shift;

                Emulation::smem::atomicAdd(&smem[r + 1], 1);
            }

            __syncthreads();

-            for (int i = threadIdx.x; i < numrho; i += blockDim.x)
-                accum(n + 1, i) = smem[i];
+            int* accumRow = accum.ptr(n + 1);
+            for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
+                accumRow[i] = smem[i];
        }

        void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
@ -225,21 +228,21 @@ namespace cv { namespace gpu { namespace device
        ////////////////////////////////////////////////////////////////////////
        // linesGetResult

-        __global__ void linesGetResult(const DevMem2Di accum, float2* out, int* votes, const int maxSize, const float threshold, const float theta, const float rho, const int numrho)
+        __global__ void linesGetResult(const DevMem2Di accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const float threshold, const int numrho)
        {
            __shared__ int smem[8][32];

-            int r = blockIdx.x * (blockDim.x - 2) + threadIdx.x;
-            int n = blockIdx.y * (blockDim.y - 2) + threadIdx.y;
+            const int x = blockIdx.x * (blockDim.x - 2) + threadIdx.x;
+            const int y = blockIdx.y * (blockDim.y - 2) + threadIdx.y;

-            if (r >= accum.cols || n >= accum.rows)
+            if (x >= accum.cols || y >= accum.rows)
                return;

-            smem[threadIdx.y][threadIdx.x] = accum(n, r);
+            smem[threadIdx.y][threadIdx.x] = accum(y, x);
            __syncthreads();

-            r -= 1;
-            n -= 1;
+            const int r = x - 1;
+            const int n = y - 1;

            if (threadIdx.x == 0 || threadIdx.x == blockDim.x - 1 || threadIdx.y == 0 || threadIdx.y == blockDim.y - 1 || r >= accum.cols - 2 || n >= accum.rows - 2)
                return;
@ -264,32 +267,32 @@ namespace cv { namespace gpu { namespace device

        int linesGetResult_gpu(DevMem2Di accum, float2* out, int* votes, int maxSize, float rho, float theta, float threshold, bool doSort)
        {
-            void* counter_ptr;
-            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
+            void* counterPtr;
+            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );

-            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
+            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );

            const dim3 block(32, 8);
            const dim3 grid(divUp(accum.cols, block.x - 2), divUp(accum.rows, block.y - 2));

-            linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, threshold, theta, rho, accum.cols - 2);
+            linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

-            int total_count;
-            cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
+            int totalCount;
+            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );

-            total_count = ::min(total_count, maxSize);
+            totalCount = ::min(totalCount, maxSize);

-            if (doSort && total_count > 0)
+            if (doSort && totalCount > 0)
            {
-                thrust::device_ptr<float2> out_ptr(out);
-                thrust::device_ptr<int> votes_ptr(votes);
-                thrust::sort_by_key(votes_ptr, votes_ptr + total_count, out_ptr, thrust::greater<int>());
+                thrust::device_ptr<float2> outPtr(out);
+                thrust::device_ptr<int> votesPtr(votes);
+                thrust::sort_by_key(votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
            }

-            return total_count;
+            return totalCount;
        }
    }
 }}}
--- a/modules/gpu/src/cuda/integral_image.cu
+++ b/modules/gpu/src/cuda/integral_image.cu
@ -0,0 +1,385 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "opencv2/gpu/device/common.hpp"
+
+namespace cv { namespace gpu { namespace device
+{
+    namespace imgproc
+    {
+        // Utility function to extract unsigned chars from an unsigned integer.
+        //   in      - packed 32-bit value.
+        //   returns - uchar4 with x = bits 0..7, y = bits 8..15,
+        //             z = bits 16..23, w = bits 24..31 of 'in'.
+        // The masks must be applied with bitwise '&'. A logical '&&' reduces each
+        // masked term to 0 or 1 before the shift, so x would only ever be 0/1 and
+        // y, z, w would always be 0.
+        __device__ uchar4 int_to_uchar4(unsigned int in)
+        {
+            uchar4 bytes;
+            bytes.x = (in & 0x000000ff) >>  0;
+            bytes.y = (in & 0x0000ff00) >>  8;
+            bytes.z = (in & 0x00ff0000) >> 16;
+            bytes.w = (in & 0xff000000) >> 24;
+            return bytes;
+        }
+
+        // Row pass of the integral image: computes running (prefix) sums along
+        // one image row per thread block (the row index is blockIdx.x).
+        //   img      - input viewed as uint4 elements; each of the four 32-bit
+        //              words is unpacked into four bytes via int_to_uchar4, so
+        //              every thread consumes 16 byte values.
+        //   integral - output viewed as uint4 elements; row blockIdx.x receives
+        //              the running sums produced by the threads of the block.
+        // The body, which uses __shfl_up / __shfl_xor, is compiled only for
+        // __CUDA_ARCH__ >= 300; for other targets the kernel body is empty.
+        __global__ void shfl_integral_horizontal(const PtrStep_<uint4> img, PtrStep_<uint4> integral)
+        {
+        #if __CUDA_ARCH__ >= 300
+            // one slot per warp for the warp totals exchanged through shared memory
+            __shared__ int sums[128];
+
+            const int id = threadIdx.x;
+            const int lane_id = id % warpSize;
+            const int warp_id = id / warpSize;
+
+            // each thread loads one uint4 (16 bytes) of the current row
+            const uint4 data = img(blockIdx.x, id);
+
+            const uchar4 a = int_to_uchar4(data.x);
+            const uchar4 b = int_to_uchar4(data.y);
+            const uchar4 c = int_to_uchar4(data.z);
+            const uchar4 d = int_to_uchar4(data.w);
+
+            // thread-local inclusive prefix sum over the 16 unpacked bytes
+            int result[16];
+
+            result[0]  =              a.x;
+            result[1]  = result[0]  + a.y;
+            result[2]  = result[1]  + a.z;
+            result[3]  = result[2]  + a.w;
+
+            result[4]  = result[3]  + b.x;
+            result[5]  = result[4]  + b.y;
+            result[6]  = result[5]  + b.z;
+            result[7]  = result[6]  + b.w;
+
+            result[8]  = result[7]  + c.x;
+            result[9]  = result[8]  + c.y;
+            result[10] = result[9]  + c.z;
+            result[11] = result[10] + c.w;
+
+            result[12] = result[11] + d.x;
+            result[13] = result[12] + d.y;
+            result[14] = result[13] + d.z;
+            result[15] = result[14] + d.w;
+
+            int sum = result[15];
+
+            // the prefix sums for each thread's 16 values are computed,
+            // now the final sums (result[15]) need to be shared
+            // with the other threads and added.  To do this,
+            // the __shfl_up() instruction is used and a shuffle scan
+            // operation is performed to distribute the sums to the correct
+            // threads
+            #pragma unroll
+            for (int i = 1; i < 32; i *= 2)
+            {
+                const int n = __shfl_up(sum, i, 32);
+
+                if (lane_id >= i)
+                {
+                    // note: this inner 'i' shadows the scan offset 'i' of the
+                    // enclosing loop; the offset is only needed above this point
+                    #pragma unroll
+                    for (int i = 0; i < 16; ++i)
+                        result[i] += n;
+
+                    sum += n;
+                }
+            }
+
+            // Now the final sum for the warp must be shared
+            // between warps.  This is done by each warp
+            // having a thread store to shared memory, then
+            // having some other warp load the values and
+            // compute a prefix sum, again by using __shfl_up.
+            // The results are uniformly added back to the warps.
+            // last thread in the warp holding sum of the warp
+            // places that in shared
+            if (threadIdx.x % warpSize == warpSize - 1)
+                sums[warp_id] = result[15];
+
+            __syncthreads();
+
+            if (warp_id == 0)
+            {
+                // NOTE(review): every lane of warp 0 reads sums[lane_id], including
+                // slots of warps that may not exist in this launch; those values
+                // appear to be unused by the read-back below - confirm.
+                int warp_sum = sums[lane_id];
+
+                #pragma unroll
+                for (int i = 1; i <= 32; i *= 2)
+                {
+                    const int n = __shfl_up(warp_sum, i, 32);
+
+                    if (lane_id >= i)
+                        warp_sum += n;
+                }
+
+                sums[lane_id] = warp_sum;
+            }
+
+            __syncthreads();
+
+            int blockSum = 0;
+
+            // fold in unused warp
+            // (warps other than the first add the scanned total of all preceding warps)
+            if (warp_id > 0)
+            {
+                blockSum = sums[warp_id - 1];
+
+                #pragma unroll
+                for (int i = 0; i < 16; ++i)
+                    result[i] += blockSum;
+            }
+
+            // assemble result
+            // Each thread has 16 values to write, which are
+            // now integer data (to avoid overflow).  Instead of
+            // each thread writing consecutive uint4s, the
+            // approach shown here experiments using
+            // the shuffle command to reformat the data
+            // inside the registers so that each thread holds
+            // consecutive data to be written so larger contiguous
+            // segments can be assembled for writing.
+
+            /*
+                For example data that needs to be written as
+
+                GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3
+                but is stored in registers (r0..r3), in four threads (0..3) as:
+
+                threadId   0  1  2  3
+                  r0      x0 y0 z0 w0
+                  r1      x1 y1 z1 w1
+                  r2      x2 y2 z2 w2
+                  r3      x3 y3 z3 w3
+
+                  after apply __shfl_xor operations to move data between registers r1..r3:
+
+                threadId  00 01 10 11
+                          x0 y0 z0 w0
+                 xor(01)->y1 x1 w1 z1
+                 xor(10)->z2 w2 x2 y2
+                 xor(11)->w3 z3 y3 x3
+
+                 and now x0..x3, and z0..z3 can be written out in order by all threads.
+
+                 In the current code, each register above is actually representing
+                 four integers to be written as uint4's to GMEM.
+            */
+
+            result[4]  = __shfl_xor(result[4] , 1, 32);
+            result[5]  = __shfl_xor(result[5] , 1, 32);
+            result[6]  = __shfl_xor(result[6] , 1, 32);
+            result[7]  = __shfl_xor(result[7] , 1, 32);
+
+            result[8]  = __shfl_xor(result[8] , 2, 32);
+            result[9]  = __shfl_xor(result[9] , 2, 32);
+            result[10] = __shfl_xor(result[10], 2, 32);
+            result[11] = __shfl_xor(result[11], 2, 32);
+
+            result[12] = __shfl_xor(result[12], 3, 32);
+            result[13] = __shfl_xor(result[13], 3, 32);
+            result[14] = __shfl_xor(result[14], 3, 32);
+            result[15] = __shfl_xor(result[15], 3, 32);
+
+            // destination row; each group of four threads writes 16 uint4 elements,
+            // in four stores at group offsets +0, +8, +4 and +12
+            uint4* integral_row = integral.ptr(blockIdx.x);
+            uint4 output;
+
+            ///////
+
+            if (threadIdx.x % 4 == 0)
+                output = make_uint4(result[0], result[1], result[2], result[3]);
+
+            if (threadIdx.x % 4 == 1)
+                output = make_uint4(result[4], result[5], result[6], result[7]);
+
+            if (threadIdx.x % 4 == 2)
+                output = make_uint4(result[8], result[9], result[10], result[11]);
+
+            if (threadIdx.x % 4 == 3)
+                output = make_uint4(result[12], result[13], result[14], result[15]);
+
+            integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16] = output;
+
+            ///////
+
+            if (threadIdx.x % 4 == 2)
+                output = make_uint4(result[0], result[1], result[2], result[3]);
+
+            if (threadIdx.x % 4 == 3)
+                output = make_uint4(result[4], result[5], result[6], result[7]);
+
+            if (threadIdx.x % 4 == 0)
+                output = make_uint4(result[8], result[9], result[10], result[11]);
+
+            if (threadIdx.x % 4 == 1)
+                output = make_uint4(result[12], result[13], result[14], result[15]);
+
+            integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 8] = output;
+
+            // continuing from the above example,
+            // this use of __shfl_xor() places the y0..y3 and w0..w3 data
+            // in order.
+
+            #pragma unroll
+            for (int i = 0; i < 16; ++i)
+                result[i] = __shfl_xor(result[i], 1, 32);
+
+            if (threadIdx.x % 4 == 0)
+                output = make_uint4(result[0], result[1], result[2], result[3]);
+
+            if (threadIdx.x % 4 == 1)
+                output = make_uint4(result[4], result[5], result[6], result[7]);
+
+            if (threadIdx.x % 4 == 2)
+                output = make_uint4(result[8], result[9], result[10], result[11]);
+
+            if (threadIdx.x % 4 == 3)
+                output = make_uint4(result[12], result[13], result[14], result[15]);
+
+            integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16 + 4] = output;
+
+            ///////
+
+            if (threadIdx.x % 4 == 2)
+                output = make_uint4(result[0], result[1], result[2], result[3]);
+
+            if (threadIdx.x % 4 == 3)
+                output = make_uint4(result[4], result[5], result[6], result[7]);
+
+            if (threadIdx.x % 4 == 0)
+                output = make_uint4(result[8], result[9], result[10], result[11]);
+
+            if (threadIdx.x % 4 == 1)
+                output = make_uint4(result[12], result[13], result[14], result[15]);
+
+            integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 12] = output;
+        #endif
+        }
+
+        // This kernel computes columnwise prefix sums.  When the data input is
+        // the row sums from above, this completes the integral image.
+        // The approach here is to have each block compute a local set of sums.
+        // First, the data covered by the block is loaded into shared memory,
+        // then instead of performing a sum in shared memory using __syncthreads
+        // between stages, the data is reformatted so that the necessary sums
+        // occur inside warps and the shuffle scan operation is used.
+        // The final set of sums from the block is then propagated, with the block
+        // computing "down" the image and adding the running sum to the local
+        // block sums.
+        //   integral - image updated in place: every element is read, combined
+        //              with the sums of the rows above it, and written back.
+        // The body is compiled only for __CUDA_ARCH__ >= 300; for other targets
+        // the kernel body is empty.
+        __global__ void shfl_integral_vertical(DevMem2D_<unsigned int> integral)
+        {
+        #if __CUDA_ARCH__ >= 300
+            // NOTE(review): only columns 0..7 are indexed below; the 9th column is
+            // presumably padding against shared-memory bank conflicts - confirm.
+            __shared__ unsigned int sums[32][9];
+
+            // column of the image handled by this thread
+            const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
+            const int lane_id = tidx % 8;
+
+            // NOTE(review): threads past the last column return here, before the
+            // __syncthreads() calls below - confirm this is safe when
+            // integral.cols is not a multiple of blockDim.x.
+            if (tidx >= integral.cols)
+                return;
+
+            sums[threadIdx.x][threadIdx.y] = 0;
+            __syncthreads();
+
+            // running total of all rows handled in previous iterations for this column
+            unsigned int stepSum = 0;
+
+            // walk down the image, blockDim.y rows per iteration
+            for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
+            {
+                unsigned int* p = integral.ptr(y) + tidx;
+
+                unsigned int sum = *p;
+
+                sums[threadIdx.x][threadIdx.y] = sum;
+                __syncthreads();
+
+                // place into SMEM
+                // shfl scan reduce the SMEM, reformatting so the column
+                // sums are computed in a warp
+                // then read out properly
+                const int j = threadIdx.x % 8;
+                const int k = threadIdx.x / 8 + threadIdx.y * 4;
+
+                int partial_sum = sums[k][j];
+
+                // scan within groups of 8 lanes (lane_id is taken modulo 8)
+                for (int i = 1; i <= 8; i *= 2)
+                {
+                    int n = __shfl_up(partial_sum, i, 32);
+
+                    if (lane_id >= i)
+                        partial_sum += n;
+                }
+
+                sums[k][j] = partial_sum;
+                __syncthreads();
+
+                // add the scanned total of the rows above within this step
+                if (threadIdx.y > 0)
+                    sum += sums[threadIdx.x][threadIdx.y - 1];
+
+                // add the total carried over from previous steps, then extend the
+                // carry by this step's full column total
+                sum += stepSum;
+                stepSum += sums[threadIdx.x][blockDim.y - 1];
+
+                // all reads of sums must finish before the next iteration overwrites it
+                __syncthreads();
+
+                *p = sum;
+            }
+        #endif
+        }
+
+        // Host-side launcher for the shuffle-based integral image.
+        //   img      - source image, reinterpreted as uint4 elements for the row pass.
+        //   integral - destination buffer; filled by the row pass (as uint4 elements)
+        //              and then updated in place by the column pass.
+        //   stream   - CUDA stream both kernels are enqueued on; the device is
+        //              synchronized here only when this is the default stream (0).
+        // NOTE(review): the row pass uses img.cols / 16 threads per block, so this
+        // presumably expects img.cols to be a multiple of 16 and small enough to
+        // fit one block - confirm against the caller.
+        void shfl_integral_gpu(DevMem2Db img, DevMem2D_<unsigned int> integral, cudaStream_t stream)
+        {
+            // Pass 1: row sums. One block per image row, one thread per 16 values.
+            const int rowThreads = img.cols / 16;
+            const int rowBlocks = img.rows;
+
+            cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
+
+            shfl_integral_horizontal<<<rowBlocks, rowThreads, 0, stream>>>((DevMem2D_<uint4>) img, (DevMem2D_<uint4>) integral);
+            cudaSafeCall( cudaGetLastError() );
+
+            // Pass 2: column sums over the row-pass output, 32x8 threads per block,
+            // a single row of blocks spanning the image width.
+            const dim3 colBlock(32, 8);
+            const dim3 colGrid(divUp(integral.cols, colBlock.x), 1);
+
+            shfl_integral_vertical<<<colGrid, colBlock, 0, stream>>>(integral);
+            cudaSafeCall( cudaGetLastError() );
+
+            // Work on a non-default stream is left asynchronous.
+            if (stream == 0)
+            {
+                cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        }
+    }
+}}}
--- a/modules/gpu/src/cuda/lbp.cu
+++ b/modules/gpu/src/cuda/lbp.cu
@ -279,7 +279,7 @@ namespace cv { namespace gpu { namespace device
                rect.z = __float2int_rn(windowW * scale);
                rect.w = __float2int_rn(windowH * scale);

-                int res = Emulation::smem::atomicInc(classified, (unsigned int)objects.cols);
+                int res = atomicInc(classified, (unsigned int)objects.cols);
                objects(0, res) = rect;
            }
        }
--- a/modules/gpu/src/hog.cpp
+++ b/modules/gpu/src/hog.cpp
@ -315,7 +315,7 @@ void cv::gpu::HOGDescriptor::computeConfidenceMultiScale(const GpuMat& img, vect
  double scale = 1.;
  int levels = 0;

-  for (levels = 0; levels < conf_out.size(); levels++)
+  for (levels = 0; levels < (int)conf_out.size(); levels++)
    {
      scale = conf_out[levels].scale;
      level_scale.push_back(scale);
@ -332,8 +332,8 @@ void cv::gpu::HOGDescriptor::computeConfidenceMultiScale(const GpuMat& img, vect

  for (size_t i = 0; i < level_scale.size(); i++)
    {
-      double scale = level_scale[i];
-      Size sz(cvRound(img.cols / scale), cvRound(img.rows / scale));
+      double _scale = level_scale[i];
+      Size sz(cvRound(img.cols / _scale), cvRound(img.rows / _scale));
      GpuMat smaller_img;

      if (sz == img.size())
--- a/modules/gpu/src/hough.cpp
+++ b/modules/gpu/src/hough.cpp
@ -57,11 +57,27 @@ namespace cv { namespace gpu { namespace device
    namespace hough
    {
        int buildPointList_gpu(DevMem2Db src, unsigned int* list);
+
        void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock, bool has20);
        int linesGetResult_gpu(DevMem2Di accum, float2* out, int* votes, int maxSize, float rho, float theta, float threshold, bool doSort);
    }
 }}}

+//////////////////////////////////////////////////////////
+// HoughLines
+
+// Convenience overload: supplies temporary accumulator and point-list buffers
+// and forwards to the buffered overload below.
+void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines)
+{
+    GpuMat accumTmp;
+    GpuMat listTmp;
+    HoughLines(src, lines, accumTmp, listTmp, rho, theta, threshold, doSort, maxLines);
+}
+
+// Buffered overload: the caller provides the accumulator and the work buffer.
+void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, GpuMat& buf, float rho, float theta, int threshold, bool doSort, int maxLines)
+{
+    // First fill the accumulator from the source image...
+    HoughLinesTransform(src, accum, buf, rho, theta);
+
+    // ...then extract the lines from the accumulator.
+    HoughLinesGet(accum, lines, rho, theta, threshold, doSort, maxLines);
+}
+
 void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf, float rho, float theta)
 {
    using namespace cv::gpu::device::hough;
@ -80,23 +96,23 @@ void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf,
    CV_Assert(numangle > 0 && numrho > 0);

    ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum);
-    accum.setTo(cv::Scalar::all(0));
+    accum.setTo(Scalar::all(0));

-    cv::gpu::DeviceInfo devInfo;
+    DeviceInfo devInfo;

    if (count > 0)
-        linesAccum_gpu(buf.ptr<unsigned int>(), count, accum, rho, theta, devInfo.sharedMemPerBlock(), devInfo.supports(cv::gpu::FEATURE_SET_COMPUTE_20));
+        linesAccum_gpu(buf.ptr<unsigned int>(), count, accum, rho, theta, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20));
 }

 void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines)
 {
-    using namespace cv::gpu::device;
+    using namespace cv::gpu::device::hough;

    CV_Assert(accum.type() == CV_32SC1);

    ensureSizeIsEnough(2, maxLines, CV_32FC2, lines);

-    int count = hough::linesGetResult_gpu(accum, lines.ptr<float2>(0), lines.ptr<int>(1), maxLines, rho, theta, threshold, doSort);
+    int count = linesGetResult_gpu(accum, lines.ptr<float2>(0), lines.ptr<int>(1), maxLines, rho, theta, threshold, doSort);

    if (count > 0)
        lines.cols = count;
@ -104,18 +120,6 @@ void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float
        lines.release();
 }

-void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines)
-{
-    cv::gpu::GpuMat accum, buf;
-    HoughLines(src, lines, accum, buf, rho, theta, threshold, doSort, maxLines);
-}
-
-void cv::gpu::HoughLines(const GpuMat& src, GpuMat& lines, GpuMat& accum, GpuMat& buf, float rho, float theta, int threshold, bool doSort, int maxLines)
-{
-    HoughLinesTransform(src, accum, buf, rho, theta);
-    HoughLinesGet(accum, lines, rho, theta, threshold, doSort, maxLines);
-}
-
 void cv::gpu::HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines_, OutputArray h_votes_)
 {
    if (d_lines.empty())
@ -129,14 +133,14 @@ void cv::gpu::HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines_, Ou
    CV_Assert(d_lines.rows == 2 && d_lines.type() == CV_32FC2);

    h_lines_.create(1, d_lines.cols, CV_32FC2);
-    cv::Mat h_lines = h_lines_.getMat();
+    Mat h_lines = h_lines_.getMat();
    d_lines.row(0).download(h_lines);

    if (h_votes_.needed())
    {
        h_votes_.create(1, d_lines.cols, CV_32SC1);
-        cv::Mat h_votes = h_votes_.getMat();
-        cv::gpu::GpuMat d_votes(1, d_lines.cols, CV_32SC1, const_cast<int*>(d_lines.ptr<int>(1)));
+        Mat h_votes = h_votes_.getMat();
+        GpuMat d_votes(1, d_lines.cols, CV_32SC1, const_cast<int*>(d_lines.ptr<int>(1)));
        d_votes.download(h_votes);
    }
 }
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@ -223,7 +223,7 @@ void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyz, const Mat& Q,
    using namespace cv::gpu::device::imgproc;

    typedef void (*func_t)(const DevMem2Db disp, DevMem2Db xyz, const float* q, cudaStream_t stream);
-    static const func_t funcs[2][4] = 
+    static const func_t funcs[2][4] =
    {
        {reprojectImageTo3D_gpu<uchar, float3>, 0, 0, reprojectImageTo3D_gpu<short, float3>},
        {reprojectImageTo3D_gpu<uchar, float4>, 0, 0, reprojectImageTo3D_gpu<short, float4>}
@ -261,6 +261,12 @@ namespace
    }
 }

+// Npp32s_a is an Npp32s that is allowed to alias objects of other types, so the
+// bit pattern of an Npp32f can be read through it without breaking GCC's
+// strict-aliasing rules.
+// The guard compares major and minor version together: testing __GNUC_MINOR__ > 4
+// on its own rejected newer major releases with a small minor number (e.g. GCC 5.1).
+#if defined __GNUC__ && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 4))
+typedef Npp32s __attribute__((__may_alias__)) Npp32s_a;
+#else
+typedef Npp32s Npp32s_a;
+#endif
+
 void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, const Scalar& value, Stream& s)
 {
    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
@ -308,7 +314,7 @@ void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom
        case CV_32FC1:
            {
                Npp32f val = saturate_cast<Npp32f>(value[0]);
-                Npp32s nVal = *(reinterpret_cast<Npp32s*>(&val));
+                Npp32s nVal = *(reinterpret_cast<Npp32s_a*>(&val));
                nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz,
                    dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
                break;
@ -527,32 +533,86 @@ void cv::gpu::integral(const GpuMat& src, GpuMat& sum, Stream& s)
    integralBuffered(src, sum, buffer, s);
 }

+// Forward declaration of the device-side launcher called by integralBuffered below.
+namespace cv { namespace gpu { namespace device
+{
+    namespace imgproc
+    {
+        void shfl_integral_gpu(DevMem2Db img, DevMem2D_<unsigned int> integral, cudaStream_t stream);
+    }
+}}}
+
 void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& s)
 {
    CV_Assert(src.type() == CV_8UC1);
-    if (sum.cols != src.cols + 1 && sum.rows != src.rows + 1)
-        sum.create(src.rows + 1, src.cols + 1, CV_32S);

-    NcvSize32u roiSize;
-    roiSize.width = src.cols;
-    roiSize.height = src.rows;
+    cudaStream_t stream = StreamAccessor::getStream(s);

-    cudaDeviceProp prop;
-    cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
+    DeviceInfo info;

-    Ncv32u bufSize;
-    ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
-    ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);
+    if (info.supports(WARP_SHUFFLE_FUNCTIONS))
+    {
+        GpuMat src16;

-    cudaStream_t stream = StreamAccessor::getStream(s);
+        if (src.cols % 16 == 0)
+            src16 = src;
+        else
+        {
+            ensureSizeIsEnough(src.rows, ((src.cols + 15) / 16) * 16, src.type(), buffer);

-    NppStStreamHandler h(stream);
+            GpuMat inner = buffer(Rect(0, 0, src.cols, src.rows));

-    ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step),
-        sum.ptr<Ncv32u>(), static_cast<int>(sum.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
+            if (s)
+            {
+                s.enqueueMemSet(buffer, Scalar::all(0));
+                s.enqueueCopy(src, inner);
+            }
+            else
+            {
+                buffer.setTo(Scalar::all(0));
+                src.copyTo(inner);
+            }

-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
+            src16 = buffer;
+        }
+
+        sum.create(src16.rows + 1, src16.cols + 1, CV_32SC1);
+
+        if (s)
+            s.enqueueMemSet(sum, Scalar::all(0));
+        else
+            sum.setTo(Scalar::all(0));
+
+        GpuMat inner = sum(Rect(1, 1, src16.cols, src16.rows));
+
+        cv::gpu::device::imgproc::shfl_integral_gpu(src16, inner, stream);
+
+        if (src16.cols != src.cols)
+            sum = sum(Rect(0, 0, src.cols + 1, src.rows + 1));
+    }
+    else
+    {
+        sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
+
+        NcvSize32u roiSize;
+        roiSize.width = src.cols;
+        roiSize.height = src.rows;
+
+        cudaDeviceProp prop;
+        cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
+
+        Ncv32u bufSize;
+        ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
+        ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);
+
+
+        NppStStreamHandler h(stream);
+
+        ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step),
+            sum.ptr<Ncv32u>(), static_cast<int>(sum.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
 }

 //////////////////////////////////////////////////////////////////////////////
@ -1334,7 +1394,7 @@ Size cv::gpu::ConvolveBuf::estimateBlockSize(Size result_size, Size /*templ_size
    int width = (result_size.width + 2) / 3;
    int height = (result_size.height + 2) / 3;
    width = std::min(width, result_size.width);
-    height = std::min(height, result_size.height);    
+    height = std::min(height, result_size.height);
    return Size(width, height);
 }

@ -1374,7 +1434,7 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,

    cufftHandle planR2C, planC2R;
    cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R));
-    cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C));   
+    cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C));

    cufftSafeCall( cufftSetStream(planR2C, StreamAccessor::getStream(stream)) );
    cufftSafeCall( cufftSetStream(planC2R, StreamAccessor::getStream(stream)) );
--- a/modules/gpu/src/match_template.cpp
+++ b/modules/gpu/src/match_template.cpp
@ -52,9 +52,9 @@ void cv::gpu::matchTemplate(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&)

 #else

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace match_template 
+    namespace match_template
    {
        void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);
        void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);
@ -71,47 +71,47 @@ namespace cv { namespace gpu { namespace device
        void matchTemplatePrepared_CCOFF_8U(int w, int h, const DevMem2D_<unsigned int> image_sum, unsigned int templ_sum, DevMem2Df result, cudaStream_t stream);
        void matchTemplatePrepared_CCOFF_8UC2(
            int w, int h,
-            const DevMem2D_<unsigned int> image_sum_r, 
-            const DevMem2D_<unsigned int> image_sum_g, 
+            const DevMem2D_<unsigned int> image_sum_r,
+            const DevMem2D_<unsigned int> image_sum_g,
            unsigned int templ_sum_r,
-            unsigned int templ_sum_g, 
+            unsigned int templ_sum_g,
            DevMem2Df result, cudaStream_t stream);
        void matchTemplatePrepared_CCOFF_8UC3(
-                int w, int h, 
-                const DevMem2D_<unsigned int> image_sum_r, 
+                int w, int h,
+                const DevMem2D_<unsigned int> image_sum_r,
                const DevMem2D_<unsigned int> image_sum_g,
                const DevMem2D_<unsigned int> image_sum_b,
-                unsigned int templ_sum_r, 
-                unsigned int templ_sum_g, 
-                unsigned int templ_sum_b, 
+                unsigned int templ_sum_r,
+                unsigned int templ_sum_g,
+                unsigned int templ_sum_b,
                DevMem2Df result, cudaStream_t stream);
        void matchTemplatePrepared_CCOFF_8UC4(
-                int w, int h, 
-                const DevMem2D_<unsigned int> image_sum_r, 
+                int w, int h,
+                const DevMem2D_<unsigned int> image_sum_r,
                const DevMem2D_<unsigned int> image_sum_g,
                const DevMem2D_<unsigned int> image_sum_b,
                const DevMem2D_<unsigned int> image_sum_a,
-                unsigned int templ_sum_r, 
-                unsigned int templ_sum_g, 
-                unsigned int templ_sum_b, 
-                unsigned int templ_sum_a, 
+                unsigned int templ_sum_r,
+                unsigned int templ_sum_g,
+                unsigned int templ_sum_b,
+                unsigned int templ_sum_a,
                DevMem2Df result, cudaStream_t stream);


        void matchTemplatePrepared_CCOFF_NORMED_8U(
-                int w, int h, const DevMem2D_<unsigned int> image_sum, 
+                int w, int h, const DevMem2D_<unsigned int> image_sum,
                const DevMem2D_<unsigned long long> image_sqsum,
                unsigned int templ_sum, unsigned long long templ_sqsum,
                DevMem2Df result, cudaStream_t stream);
        void matchTemplatePrepared_CCOFF_NORMED_8UC2(
-                int w, int h, 
+                int w, int h,
                const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
                const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
                unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
                unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
                DevMem2Df result, cudaStream_t stream);
        void matchTemplatePrepared_CCOFF_NORMED_8UC3(
-                int w, int h, 
+                int w, int h,
                const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
                const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
                const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,
@ -120,7 +120,7 @@ namespace cv { namespace gpu { namespace device
                unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
                DevMem2Df result, cudaStream_t stream);
        void matchTemplatePrepared_CCOFF_NORMED_8UC4(
-                int w, int h, 
+                int w, int h,
                const DevMem2D_<unsigned int> image_sum_r, const DevMem2D_<unsigned long long> image_sqsum_r,
                const DevMem2D_<unsigned int> image_sum_g, const DevMem2D_<unsigned long long> image_sqsum_g,
                const DevMem2D_<unsigned int> image_sum_b, const DevMem2D_<unsigned long long> image_sqsum_b,
@ -131,7 +131,7 @@ namespace cv { namespace gpu { namespace device
                unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
                DevMem2Df result, cudaStream_t stream);

-        void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, 
+        void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
                          unsigned long long templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream);

        void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cudaStream_t stream);
@ -140,17 +140,17 @@ namespace cv { namespace gpu { namespace device

 using namespace ::cv::gpu::device::match_template;

-namespace 
+namespace
 {

-    // Evaluates optimal template's area threshold. If 
-    // template's area is less  than the threshold, we use naive match 
+    // Evaluates optimal template's area threshold. If
+    // template's area is less  than the threshold, we use naive match
    // template version, otherwise FFT-based (if available)
    int getTemplateThreshold(int method, int depth)
    {
        switch (method)
        {
-        case CV_TM_CCORR: 
+        case CV_TM_CCORR:
            if (depth == CV_32F) return 250;
            if (depth == CV_8U) return 300;
            break;
@ -162,10 +162,10 @@ namespace
        return 0;
    }

-    
+
    void matchTemplate_CCORR_32F(
            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
-    {        
+    {
        result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
        if (templ.size().area() < getTemplateThreshold(CV_TM_CCORR, CV_32F))
        {
@ -223,10 +223,11 @@ namespace
        normalize_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
    }

-    
+
    void matchTemplate_SQDIFF_32F(
            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
    {
+        (void)buf;
        result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
        matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
    }
@ -362,7 +363,7 @@ namespace
            {
            case 2:
                matchTemplatePrepared_CCOFF_NORMED_8UC2(
-                        templ.cols, templ.rows, 
+                        templ.cols, templ.rows,
                        buf.image_sums[0], buf.image_sqsums[0],
                        buf.image_sums[1], buf.image_sqsums[1],
                        (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
@ -371,7 +372,7 @@ namespace
                break;
            case 3:
                matchTemplatePrepared_CCOFF_NORMED_8UC3(
-                        templ.cols, templ.rows, 
+                        templ.cols, templ.rows,
                        buf.image_sums[0], buf.image_sqsums[0],
                        buf.image_sums[1], buf.image_sqsums[1],
                        buf.image_sums[2], buf.image_sqsums[2],
@ -382,7 +383,7 @@ namespace
                break;
            case 4:
                matchTemplatePrepared_CCOFF_NORMED_8UC4(
-                        templ.cols, templ.rows, 
+                        templ.cols, templ.rows,
                        buf.image_sums[0], buf.image_sqsums[0],
                        buf.image_sums[1], buf.image_sqsums[1],
                        buf.image_sums[2], buf.image_sqsums[2],
@ -391,7 +392,7 @@ namespace
                        (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
                        (unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2],
                        (unsigned int)templ_sum[3], (unsigned long long)templ_sqsum[3],
-                        result, StreamAccessor::getStream(stream));                
+                        result, StreamAccessor::getStream(stream));
                break;
            default:
                CV_Error(CV_StsBadArg, "matchTemplate: unsupported number of channels");
--- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp
+++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp
@ -67,7 +67,11 @@
 // Guaranteed size cross-platform classifier structures
 //
 //==============================================================================
-
+// Ncv32f_a is an Ncv32f that is allowed to alias objects of other types; the
+// structures below use it to store and load float values through their
+// unsigned-integer fields without breaking GCC's strict-aliasing rules.
+// The guard compares major and minor version together: testing __GNUC_MINOR__ > 4
+// on its own rejected newer major releases with a small minor number (e.g. GCC 5.1).
+#if defined __GNUC__ && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 4))
+typedef Ncv32f __attribute__((__may_alias__)) Ncv32f_a;
+#else
+typedef Ncv32f Ncv32f_a;
+#endif

 struct HaarFeature64
 {
@ -87,7 +91,7 @@ struct HaarFeature64

    __host__ NCVStatus setWeight(Ncv32f weight)
    {
-        ((Ncv32f*)&(this->_ui2.y))[0] = weight;
+        ((Ncv32f_a*)&(this->_ui2.y))[0] = weight;
        return NCV_SUCCESS;
    }

@ -102,7 +106,7 @@ struct HaarFeature64

    __device__ __host__ Ncv32f getWeight(void)
    {
-        return *(Ncv32f*)(&this->_ui2.y);
+        return *(Ncv32f_a*)(&this->_ui2.y);
    }
 };

@ -168,14 +172,13 @@ public:
    }
 };

-
 struct HaarClassifierNodeDescriptor32
 {
    uint1 _ui1;

    __host__ NCVStatus create(Ncv32f leafValue)
    {
-        *(Ncv32f *)&this->_ui1 = leafValue;
+        *(Ncv32f_a *)&this->_ui1 = leafValue;
        return NCV_SUCCESS;
    }

@ -187,7 +190,7 @@ struct HaarClassifierNodeDescriptor32

    __host__ Ncv32f getLeafValueHost(void)
    {
-        return *(Ncv32f *)&this->_ui1.x;
+        return *(Ncv32f_a *)&this->_ui1.x;
    }

 #ifdef __CUDACC__
@ -203,6 +206,11 @@ struct HaarClassifierNodeDescriptor32
    }
 };

+// Ncv32u_a is an Ncv32u that is allowed to alias objects of other types; the
+// setters below use it to copy the raw 32 bits of a float or of a node
+// descriptor into an unsigned field without breaking GCC's strict-aliasing rules.
+// The guard compares major and minor version together: testing __GNUC_MINOR__ > 4
+// on its own rejected newer major releases with a small minor number (e.g. GCC 5.1).
+#if defined __GNUC__ && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 4))
+typedef Ncv32u __attribute__((__may_alias__)) Ncv32u_a;
+#else
+typedef Ncv32u Ncv32u_a;
+#endif

 struct HaarClassifierNode128
 {
@ -216,19 +224,19 @@ struct HaarClassifierNode128

    __host__ NCVStatus setThreshold(Ncv32f t)
    {
-        this->_ui4.y = *(Ncv32u *)&t;
+        this->_ui4.y = *(Ncv32u_a *)&t;
        return NCV_SUCCESS;
    }

    __host__ NCVStatus setLeftNodeDesc(HaarClassifierNodeDescriptor32 nl)
    {
-        this->_ui4.z = *(Ncv32u *)&nl;
+        this->_ui4.z = *(Ncv32u_a *)&nl;
        return NCV_SUCCESS;
    }

    __host__ NCVStatus setRightNodeDesc(HaarClassifierNodeDescriptor32 nr)
    {
-        this->_ui4.w = *(Ncv32u *)&nr;
+        this->_ui4.w = *(Ncv32u_a *)&nr;
        return NCV_SUCCESS;
    }

@ -239,7 +247,7 @@ struct HaarClassifierNode128

    __host__ __device__ Ncv32f getThreshold(void)
    {
-        return *(Ncv32f*)&this->_ui4.y;
+        return *(Ncv32f_a*)&this->_ui4.y;
    }

    __host__ __device__ HaarClassifierNodeDescriptor32 getLeftNodeDesc(void)
@ -264,7 +272,7 @@ struct HaarStage64

    __host__ NCVStatus setStageThreshold(Ncv32f t)
    {
-        this->_ui2.x = *(Ncv32u *)&t;
+        this->_ui2.x = *(Ncv32u_a *)&t;
        return NCV_SUCCESS;
    }

@ -290,7 +298,7 @@ struct HaarStage64

    __host__ __device__ Ncv32f getStageThreshold(void)
    {
-        return *(Ncv32f*)&this->_ui2.x;
+        return *(Ncv32f_a*)&this->_ui2.x;
    }

    __host__ __device__ Ncv32u getStartClassifierRootNodeOffset(void)
--- a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
+++ b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
@ -1423,7 +1423,7 @@ NCVStatus compactVector_32u_device(Ncv32u *d_src, Ncv32u srcLen,
                    (d_hierSums.ptr() + partSumOffsets[i],
                     partSumNums[i], NULL,
                     d_hierSums.ptr() + partSumOffsets[i+1],
-                     NULL);
+                     0);
            }
            else
            {
@ -1433,7 +1433,7 @@ NCVStatus compactVector_32u_device(Ncv32u *d_src, Ncv32u srcLen,
                    (d_hierSums.ptr() + partSumOffsets[i],
                     partSumNums[i], NULL,
                     NULL,
-                     NULL);
+                     0);
            }

            ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
@ -1557,16 +1557,21 @@ NCVStatus nppsStCompact_32s(Ncv32s *d_src, Ncv32u srcLen,
 }


+// Ncv32u_a is an Ncv32u that is allowed to alias objects of other types; the
+// compaction wrappers below use it to pass the raw 32 bits of a signed or
+// floating-point "remove" value to the unsigned implementation without
+// breaking GCC's strict-aliasing rules.
+// The guard compares major and minor version together: testing __GNUC_MINOR__ > 4
+// on its own rejected newer major releases with a small minor number (e.g. GCC 5.1).
+#if defined __GNUC__ && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 4))
+typedef Ncv32u __attribute__((__may_alias__)) Ncv32u_a;
+#else
+typedef Ncv32u Ncv32u_a;
+#endif
+
 NCVStatus nppsStCompact_32f(Ncv32f *d_src, Ncv32u srcLen,
                            Ncv32f *d_dst, Ncv32u *p_dstLen,
                            Ncv32f elemRemove, Ncv8u *pBuffer,
                            Ncv32u bufSize, cudaDeviceProp &devProp)
 {
    return nppsStCompact_32u((Ncv32u *)d_src, srcLen, (Ncv32u *)d_dst, p_dstLen,
-                             *(Ncv32u *)&elemRemove, pBuffer, bufSize, devProp);
+                             *(Ncv32u_a *)&elemRemove, pBuffer, bufSize, devProp);
 }

-
 NCVStatus nppsStCompact_32u_host(Ncv32u *h_src, Ncv32u srcLen,
                                 Ncv32u *h_dst, Ncv32u *dstLen, Ncv32u elemRemove)
 {
@ -1602,17 +1607,16 @@ NCVStatus nppsStCompact_32u_host(Ncv32u *h_src, Ncv32u srcLen,
 NCVStatus nppsStCompact_32s_host(Ncv32s *h_src, Ncv32u srcLen,
                                 Ncv32s *h_dst, Ncv32u *dstLen, Ncv32s elemRemove)
 {
-    return nppsStCompact_32u_host((Ncv32u *)h_src, srcLen, (Ncv32u *)h_dst, dstLen, *(Ncv32u *)&elemRemove);
+    return nppsStCompact_32u_host((Ncv32u *)h_src, srcLen, (Ncv32u *)h_dst, dstLen, *(Ncv32u_a *)&elemRemove);
 }


 NCVStatus nppsStCompact_32f_host(Ncv32f *h_src, Ncv32u srcLen,
                                 Ncv32f *h_dst, Ncv32u *dstLen, Ncv32f elemRemove)
 {
-    return nppsStCompact_32u_host((Ncv32u *)h_src, srcLen, (Ncv32u *)h_dst, dstLen, *(Ncv32u *)&elemRemove);
+    return nppsStCompact_32u_host((Ncv32u *)h_src, srcLen, (Ncv32u *)h_dst, dstLen, *(Ncv32u_a *)&elemRemove);
 }

-
 //==============================================================================
 //
 // Filter.cu
--- a/modules/gpu/src/opencv2/gpu/device/emulation.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/emulation.hpp
@ -51,11 +51,11 @@ namespace cv { namespace gpu { namespace device
    struct Emulation
    {

-        static __device__ __forceinline__ int sycthOr(int pred)
+        static __device__ __forceinline__ int syncthreadsOr(int pred)
        {
 #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
                // just campilation stab
-                return false;
+                return 0;
 #else
                return __syncthreads_or(pred);
 #endif
--- a/modules/gpu/src/split_merge.cpp
+++ b/modules/gpu/src/split_merge.cpp
@ -119,7 +119,6 @@ namespace

        int depth = src.depth();
        int num_channels = src.channels();
-        Size size = src.size();

        if (depth == CV_64F)
        {
--- a/modules/gpu/src/video_decoder.cpp
+++ b/modules/gpu/src/video_decoder.cpp
@ -49,36 +49,36 @@ void cv::gpu::detail::VideoDecoder::create(const VideoReader_GPU::FormatInfo& vi
 {
    release();

-    cudaVideoCodec codec = static_cast<cudaVideoCodec>(videoFormat.codec);
-    cudaVideoChromaFormat chromaFormat = static_cast<cudaVideoChromaFormat>(videoFormat.chromaFormat);
+    cudaVideoCodec _codec = static_cast<cudaVideoCodec>(videoFormat.codec);
+    cudaVideoChromaFormat _chromaFormat = static_cast<cudaVideoChromaFormat>(videoFormat.chromaFormat);

-    cudaVideoCreateFlags videoCreateFlags = (codec == cudaVideoCodec_JPEG || codec == cudaVideoCodec_MPEG2) ?
+    cudaVideoCreateFlags videoCreateFlags = (_codec == cudaVideoCodec_JPEG || _codec == cudaVideoCodec_MPEG2) ?
                                            cudaVideoCreate_PreferCUDA :
                                            cudaVideoCreate_PreferCUVID;

    // Validate video format.  These are the currently supported formats via NVCUVID
-    CV_Assert(cudaVideoCodec_MPEG1 == codec ||
-              cudaVideoCodec_MPEG2 == codec ||
-              cudaVideoCodec_MPEG4 == codec ||
-              cudaVideoCodec_VC1   == codec ||
-              cudaVideoCodec_H264  == codec ||
-              cudaVideoCodec_JPEG  == codec ||
-              cudaVideoCodec_YUV420== codec ||
-              cudaVideoCodec_YV12  == codec ||
-              cudaVideoCodec_NV12  == codec ||
-              cudaVideoCodec_YUYV  == codec ||
-              cudaVideoCodec_UYVY  == codec );
-
-    CV_Assert(cudaVideoChromaFormat_Monochrome == chromaFormat ||
-              cudaVideoChromaFormat_420        == chromaFormat ||
-              cudaVideoChromaFormat_422        == chromaFormat ||
-              cudaVideoChromaFormat_444        == chromaFormat);
+    CV_Assert(cudaVideoCodec_MPEG1 == _codec ||
+              cudaVideoCodec_MPEG2 == _codec ||
+              cudaVideoCodec_MPEG4 == _codec ||
+              cudaVideoCodec_VC1   == _codec ||
+              cudaVideoCodec_H264  == _codec ||
+              cudaVideoCodec_JPEG  == _codec ||
+              cudaVideoCodec_YUV420== _codec ||
+              cudaVideoCodec_YV12  == _codec ||
+              cudaVideoCodec_NV12  == _codec ||
+              cudaVideoCodec_YUYV  == _codec ||
+              cudaVideoCodec_UYVY  == _codec );
+
+    CV_Assert(cudaVideoChromaFormat_Monochrome == _chromaFormat ||
+              cudaVideoChromaFormat_420        == _chromaFormat ||
+              cudaVideoChromaFormat_422        == _chromaFormat ||
+              cudaVideoChromaFormat_444        == _chromaFormat);

    // Fill the decoder-create-info struct from the given video-format struct.
    std::memset(&createInfo_, 0, sizeof(CUVIDDECODECREATEINFO));

    // Create video decoder
-    createInfo_.CodecType           = codec;
+    createInfo_.CodecType           = _codec;
    createInfo_.ulWidth             = videoFormat.width;
    createInfo_.ulHeight            = videoFormat.height;
    createInfo_.ulNumDecodeSurfaces = FrameQueue::MaximumSize;
@ -87,7 +87,7 @@ void cv::gpu::detail::VideoDecoder::create(const VideoReader_GPU::FormatInfo& vi
    while (createInfo_.ulNumDecodeSurfaces * videoFormat.width * videoFormat.height > 16 * 1024 * 1024)
        createInfo_.ulNumDecodeSurfaces--;

-    createInfo_.ChromaFormat    = chromaFormat;
+    createInfo_.ChromaFormat    = _chromaFormat;
    createInfo_.OutputFormat    = cudaVideoSurfaceFormat_NV12;
    createInfo_.DeinterlaceMode = cudaVideoDeinterlaceMode_Adaptive;

--- a/modules/gpu/test/main.cpp
+++ b/modules/gpu/test/main.cpp
@ -39,7 +39,7 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "test_precomp.hpp"

 #ifdef HAVE_CUDA

@ -49,87 +49,128 @@ using namespace cv::gpu;
 using namespace cvtest;
 using namespace testing;

-void print_info()
+// Prints the operating system family and word size this binary was built for.
+// The branch is selected at compile time from _WIN32/_WIN64, linux/_LP64 and
+// __APPLE__/_LP64; on any other platform nothing is printed.
+// NOTE(review): `linux` is the legacy macro spelling; strictly conforming
+// compiler modes predefine only `__linux__` -- confirm against the build flags.
+void printOsInfo()
 {
-    printf("\n");
 #if defined _WIN32
 #   if defined _WIN64
-        puts("OS: Windows 64");
+        cout << "OS: Windows x64 \n" << endl;
 #   else
-        puts("OS: Windows 32");
+        cout << "OS: Windows x32 \n" << endl;
 #   endif
 #elif defined linux
 #   if defined _LP64
-        puts("OS: Linux 64");
+        cout << "OS: Linux x64 \n" << endl;
 #   else
-        puts("OS: Linux 32");
+        cout << "OS: Linux x32 \n" << endl;
 #   endif
 #elif defined __APPLE__
 #   if defined _LP64
-        puts("OS: Apple 64");
+        cout << "OS: Apple x64 \n" << endl;
 #   else
-        puts("OS: Apple 32");
+        cout << "OS: Apple x32 \n" << endl;
 #   endif
 #endif
+}

-    int deviceCount = getCudaEnabledDeviceCount();
+// Prints the CUDA driver/runtime versions, the GPU architectures the module
+// was compiled for (CUDA_ARCH_BIN / CUDA_ARCH_PTX) and a short summary of
+// every CUDA-enabled device. Without HAVE_CUDA it only reports that OpenCV
+// was built without CUDA support.
+void printCudaInfo()
+{
+#ifndef HAVE_CUDA
+    cout << "OpenCV was built without CUDA support \n" << endl;
+#else
+    // NOTE(review): the status returned by cudaDriverGetVersion is ignored, so
+    // `driver` is printed even if the call did not succeed.
    int driver;
    cudaDriverGetVersion(&driver);

-    printf("CUDA Driver  version: %d\n", driver);
-    printf("CUDA Runtime version: %d\n", CUDART_VERSION);
-    printf("CUDA device count: %d\n\n", deviceCount);
+    cout << "CUDA Driver  version: " << driver << '\n';
+    cout << "CUDA Runtime version: " << CUDART_VERSION << '\n';
+
+    cout << endl;
+
+    cout << "GPU module was compiled for the following GPU archs:" << endl;
+    cout << "    BIN: " << CUDA_ARCH_BIN << '\n';
+    cout << "    PTX: " << CUDA_ARCH_PTX << '\n';
+
+    cout << endl;
+
+    int deviceCount = getCudaEnabledDeviceCount();
+    cout << "CUDA device count: " << deviceCount << '\n';
+
+    cout << endl;

    for (int i = 0; i < deviceCount; ++i)
    {
        DeviceInfo info(i);

-        printf("Device %d:\n", i);
-        printf("    Name: %s\n", info.name().c_str());
-        printf("    Compute capability version: %d.%d\n", info.majorVersion(), info.minorVersion());
-        printf("    Total memory: %d Mb\n", static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0));
-        printf("    Free  memory: %d Mb\n", static_cast<int>(static_cast<int>(info.freeMemory() / 1024.0) / 1024.0));
-        if (info.isCompatible())
-            puts("    This device is compatible with current GPU module build\n");
-        else
-            puts("    This device is NOT compatible with current GPU module build\n");
-    }
+        cout << "Device [" << i << "] \n";
+        cout << "\t Name: " << info.name() << '\n';
+        cout << "\t Compute capability: " << info.majorVersion() << '.' << info.minorVersion()<< '\n';
+        cout << "\t Multi Processor Count: " << info.multiProcessorCount() << '\n';
+        // Memory sizes are divided by 1024 twice, truncating to int after each
+        // step (presumably bytes -> Mb, as the printed unit suggests).
+        cout << "\t Total memory: " << static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0) << " Mb \n";
+        cout << "\t Free  memory: " << static_cast<int>(static_cast<int>(info.freeMemory() / 1024.0) / 1024.0) << " Mb \n";
+        // Only incompatible devices get an explicit remark.
+        if (!info.isCompatible())
+            cout << "\t !!! This device is NOT compatible with current GPU module build \n";

-    puts("GPU module was compiled for the following GPU archs:");
-    printf("    BIN: %s\n", CUDA_ARCH_BIN);
-    printf("    PTX: %s\n\n", CUDA_ARCH_PTX);
+        cout << endl;
+    }
+#endif
 }

-enum OutputLevel
+// Entry point of the GPU test binary: parses the command line, prints system
+// information, selects the device(s) to test, sets the NVidia test verbosity
+// and runs all Google Test cases. Returns 0 after --print_info_only, the
+// RUN_ALL_TESTS() result otherwise, or -1 if an exception escapes.
+int main(int argc, char** argv)
 {
-    OutputLevelNone,
-    OutputLevelCompact,
-    OutputLevelFull
-};
+    try
+    {
+        CommandLineParser cmd(argc, (const char**)argv,
+            "{ print_info_only | print_info_only | false | Print information about system and exit }"
+            "{ device | device | -1 | Device on which tests will be executed (-1 means all devices) }"
+            "{ nvtest_output_level | nvtest_output_level | compact | NVidia test verbosity level }"
+        );

-extern OutputLevel nvidiaTestOutputLevel;
+        // System information is always printed, even when tests are run.
+        printOsInfo();
+        printCudaInfo();

-int main(int argc, char** argv)
-{
-    TS::ptr()->init("gpu");
-    InitGoogleTest(&argc, argv);
+        if (cmd.get<bool>("print_info_only"))
+            return 0;

-    const char* keys ="{ nvtest_output_level | nvtest_output_level | compact | NVidia test verbosity level }";
+        // Fill the DeviceManager before Google Test is initialised
+        // (presumably so that ALL_DEVICES sees the selected devices -- confirm).
+        int device = cmd.get<int>("device");
+        if (device < 0)
+        {
+            DeviceManager::instance().loadAll();

-    CommandLineParser parser(argc, (const char**)argv, keys);
+            cout << "Run tests on all supported devices \n" << endl;
+        }
+        else
+        {
+            // load() throws for an invalid or incompatible device; the
+            // exception is reported by the handlers below.
+            DeviceManager::instance().load(device);
+
+            DeviceInfo info(device);
+            cout << "Run tests on device " << device << " [" << info.name() << "] \n" << endl;
+        }

-    string outputLevel = parser.get<string>("nvtest_output_level", "none");
+        string outputLevel = cmd.get<string>("nvtest_output_level");

-    if (outputLevel == "none")
-        nvidiaTestOutputLevel = OutputLevelNone;
-    else if (outputLevel == "compact")
-        nvidiaTestOutputLevel = OutputLevelCompact;
-    else if (outputLevel == "full")
-        nvidiaTestOutputLevel = OutputLevelFull;
+        // Any other value leaves nvidiaTestOutputLevel unchanged.
+        if (outputLevel == "none")
+            nvidiaTestOutputLevel = OutputLevelNone;
+        else if (outputLevel == "compact")
+            nvidiaTestOutputLevel = OutputLevelCompact;
+        else if (outputLevel == "full")
+            nvidiaTestOutputLevel = OutputLevelFull;

-    print_info();
+        TS::ptr()->init("gpu");
+        InitGoogleTest(&argc, argv);

-    return RUN_ALL_TESTS();
+        return RUN_ALL_TESTS();
+    }
+    catch (const exception& e)
+    {
+        cerr << e.what() << endl;
+        return -1;
+    }
+    catch (...)
+    {
+        cerr << "Unknown error" << endl;
+        return -1;
+    }
+
+    // Not reached: the try block and both handlers return.
+    return 0;
 }

 #else // HAVE_CUDA
--- a/modules/gpu/test/main_test_nvidia.h
+++ b/modules/gpu/test/main_test_nvidia.h
@ -1,7 +1,7 @@
 #ifndef __main_test_nvidia_h__
 #define __main_test_nvidia_h__

-#include<string>
+#include <string>

 enum OutputLevel
 {
@ -10,6 +10,8 @@ enum OutputLevel
    OutputLevelFull
 };

+extern OutputLevel nvidiaTestOutputLevel;
+
 bool nvidia_NPPST_Integral_Image(const std::string& test_data_path, OutputLevel outputLevel);
 bool nvidia_NPPST_Squared_Integral_Image(const std::string& test_data_path, OutputLevel outputLevel);
 bool nvidia_NPPST_RectStdDev(const std::string& test_data_path, OutputLevel outputLevel);
--- a/modules/gpu/test/nvidia/TestHaarCascadeApplication.cpp
+++ b/modules/gpu/test/nvidia/TestHaarCascadeApplication.cpp
@ -245,8 +245,8 @@ bool TestHaarCascadeApplication::process()

    int devId;
    ncvAssertCUDAReturn(cudaGetDevice(&devId), false);
-    cudaDeviceProp devProp;
-    ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), false);
+    cudaDeviceProp _devProp;
+    ncvAssertCUDAReturn(cudaGetDeviceProperties(&_devProp, devId), false);

    ncvStat = ncvApplyHaarClassifierCascade_device(
        d_integralImage, d_rectStdDev, d_pixelMask,
@ -254,7 +254,7 @@ bool TestHaarCascadeApplication::process()
        haar, h_HaarStages, d_HaarStages, d_HaarNodes, d_HaarFeatures, false,
        searchRoiU, 1, 1.0f,
        *this->allocatorGPU.get(), *this->allocatorCPU.get(),
-        devProp, 0);
+        _devProp, 0);
    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);

    NCVMatrixAlloc<Ncv32u> h_pixelMask_d(*this->allocatorCPU.get(), this->width, this->height);
--- a/modules/gpu/test/nvidia/main_nvidia.cpp
+++ b/modules/gpu/test/nvidia/main_nvidia.cpp
@ -1,4 +1,6 @@
-#pragma warning (disable : 4408 4201 4100)
+#if defined _MSC_VER && _MSC_VER >= 1200
+# pragma warning (disable : 4408 4201 4100)
+#endif

 #include <cstdio>

--- a/modules/gpu/test/test_calib3d.cpp
+++ b/modules/gpu/test/test_calib3d.cpp
@ -39,7 +39,9 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA

 namespace {

@ -329,7 +331,7 @@ TEST_P(ReprojectImageTo3D, Accuracy)

    cv::gpu::GpuMat dst;
    cv::gpu::reprojectImageTo3D(loadMat(disp, useRoi), dst, Q, 3);
-    
+
    cv::Mat dst_gold;
    cv::reprojectImageTo3D(disp, dst_gold, Q, false);

@ -343,3 +345,5 @@ INSTANTIATE_TEST_CASE_P(GPU_Calib3D, ReprojectImageTo3D, testing::Combine(
    WHOLE_SUBMAT));

 } // namespace
+
+#endif // HAVE_CUDA
--- a/modules/gpu/test/test_color.cpp
+++ b/modules/gpu/test/test_color.cpp
@ -39,7 +39,7 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "test_precomp.hpp"

 #ifdef HAVE_CUDA

--- a/modules/gpu/test/test_copy_make_border.cpp
+++ b/modules/gpu/test/test_copy_make_border.cpp
@ -39,7 +39,9 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA

 namespace {

@ -98,3 +100,5 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CopyMakeBorder, testing::Combine(
    WHOLE_SUBMAT));

 } // namespace
+
+#endif // HAVE_CUDA
--- a/modules/gpu/test/test_core.cpp
+++ b/modules/gpu/test/test_core.cpp
@ -39,7 +39,9 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA

 namespace {

@ -3396,3 +3398,5 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Reduce, testing::Combine(
    WHOLE_SUBMAT));

 } // namespace
+
+#endif // HAVE_CUDA
--- a/modules/gpu/test/test_features2d.cpp
+++ b/modules/gpu/test/test_features2d.cpp
@ -39,7 +39,9 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA

 namespace {

@ -984,3 +986,5 @@ INSTANTIATE_TEST_CASE_P(GPU_Features2D, BruteForceMatcher, testing::Combine(
    testing::Values(UseMask(false), UseMask(true))));

 } // namespace
+
+#endif // HAVE_CUDA
--- a/modules/gpu/test/test_filters.cpp
+++ b/modules/gpu/test/test_filters.cpp
@ -39,7 +39,9 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA

 namespace {

@ -552,3 +554,5 @@ INSTANTIATE_TEST_CASE_P(GPU_Filter, Filter2D, testing::Combine(
    WHOLE_SUBMAT));

 } // namespace
+
+#endif // HAVE_CUDA
--- a/modules/gpu/test/test_global_motion.cpp
+++ b/modules/gpu/test/test_global_motion.cpp
@ -39,11 +39,11 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "test_precomp.hpp"

-#include <iostream>
-using namespace std;
+#ifdef HAVE_CUDA

+using namespace std;
 using namespace cv;

 struct CompactPoints : testing::TestWithParam<gpu::DeviceInfo>
@ -85,3 +85,5 @@ TEST_P(CompactPoints, CanCompactizeSmallInput)
 }

 INSTANTIATE_TEST_CASE_P(GPU_GlobalMotion, CompactPoints, ALL_DEVICES);
+
+#endif // HAVE_CUDA
--- a/modules/gpu/test/test_gpumat.cpp
+++ b/modules/gpu/test/test_gpumat.cpp
@ -40,7 +40,9 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA

 namespace {

@ -323,3 +325,5 @@ INSTANTIATE_TEST_CASE_P(GPU_GpuMat, ConvertTo, testing::Combine(
    WHOLE_SUBMAT));

 } // namespace
+
+#endif // HAVE_CUDA
--- a/modules/gpu/test/test_imgproc.cpp
+++ b/modules/gpu/test/test_imgproc.cpp
@ -39,7 +39,9 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA

 namespace {

@ -1127,62 +1129,68 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CornerMinEigen, testing::Combine(
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // HoughLines

-PARAM_TEST_CASE(HoughLines, cv::gpu::DeviceInfo, std::string)
+// Fixture for the GPU HoughLines test, parameterised by device, image size
+// and whether the input is uploaded as a sub-matrix (ROI).
+PARAM_TEST_CASE(HoughLines, cv::gpu::DeviceInfo, cv::Size, UseRoi)
 {
-};
+    // Clears img and draws four white lines: a vertical one at x = 20, a
+    // horizontal one at y = 50 and both image diagonals.
+    void generateLines(cv::Mat& img)
+    {
+        img.setTo(cv::Scalar::all(0));

-void drawLines(cv::Mat& dst, const std::vector<cv::Vec2f>& lines)
-{
-    for (size_t i = 0; i < lines.size(); ++i)
+        cv::line(img, cv::Point(20, 0), cv::Point(20, img.rows), cv::Scalar::all(255));
+        cv::line(img, cv::Point(0, 50), cv::Point(img.cols, 50), cv::Scalar::all(255));
+        cv::line(img, cv::Point(0, 0), cv::Point(img.cols, img.rows), cv::Scalar::all(255));
+        cv::line(img, cv::Point(img.cols, 0), cv::Point(0, img.rows), cv::Scalar::all(255));
+    }
+
+    // Clears dst and draws every (rho, theta) line in white. Each line is
+    // rendered as a segment reaching 1000 units to either side of the point
+    // (rho*cos(theta), rho*sin(theta)).
+    void drawLines(cv::Mat& dst, const std::vector<cv::Vec2f>& lines)
    {
-        float rho = lines[i][0], theta = lines[i][1];
-        cv::Point pt1, pt2;
-        double a = std::cos(theta), b = std::sin(theta);
-        double x0 = a*rho, y0 = b*rho;
-        pt1.x = cvRound(x0 + 1000*(-b));
-        pt1.y = cvRound(y0 + 1000*(a));
-        pt2.x = cvRound(x0 - 1000*(-b));
-        pt2.y = cvRound(y0 - 1000*(a));
-        cv::line(dst, pt1, pt2, cv::Scalar::all(255));
+        dst.setTo(cv::Scalar::all(0));
+
+        for (size_t i = 0; i < lines.size(); ++i)
+        {
+            float rho = lines[i][0], theta = lines[i][1];
+            cv::Point pt1, pt2;
+            double a = std::cos(theta), b = std::sin(theta);
+            double x0 = a*rho, y0 = b*rho;
+            pt1.x = cvRound(x0 + 1000*(-b));
+            pt1.y = cvRound(y0 + 1000*(a));
+            pt2.x = cvRound(x0 - 1000*(-b));
+            pt2.y = cvRound(y0 - 1000*(a));
+            cv::line(dst, pt1, pt2, cv::Scalar::all(255));
+        }
    }
-}
+};

 TEST_P(HoughLines, Accuracy)
 {
+    // Draws four synthetic lines, detects them with the GPU Hough transform,
+    // redraws the detected lines and requires a pixel-exact match with the
+    // input image.
    const cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
    cv::gpu::setDevice(devInfo.deviceID());
-    const std::string fileName = GET_PARAM(1);
+    const cv::Size size = GET_PARAM(1);
+    const bool useRoi = GET_PARAM(2);

    const float rho = 1.0f;
-    const float theta = static_cast<float>(CV_PI / 180);
-    const int threshold = 50;
-
-    cv::Mat img = readImage(fileName, cv::IMREAD_GRAYSCALE);
-    ASSERT_FALSE(img.empty());
+    // CV_PI is a double constant, so the angular step is evaluated in double
+    // and narrowed explicitly; this avoids an implicit double -> float
+    // conversion (and the compiler warning it triggers).
+    const float theta = static_cast<float>(1.5 * CV_PI / 180.0);
+    const int threshold = 100;

-    cv::Mat edges;
-    cv::Canny(img, edges, 50, 200);
+    cv::Mat src(size, CV_8UC1);
+    generateLines(src);

    cv::gpu::GpuMat d_lines;
-    cv::gpu::HoughLines(loadMat(edges), d_lines, rho, theta, threshold);
+    cv::gpu::HoughLines(loadMat(src, useRoi), d_lines, rho, theta, threshold);
+
    std::vector<cv::Vec2f> lines;
    cv::gpu::HoughLinesDownload(d_lines, lines);
-    cv::Mat dst(img.size(), CV_8UC1, cv::Scalar::all(0));
-    drawLines(dst, lines);

-    std::vector<cv::Vec2f> lines_gold;
-    cv::HoughLines(edges, lines_gold, rho, theta, threshold);
-    cv::Mat dst_gold(img.size(), CV_8UC1, cv::Scalar::all(0));
-    drawLines(dst_gold, lines_gold);
+    // drawLines() clears dst itself, so no initial fill value is needed.
+    cv::Mat dst(size, CV_8UC1);
+    drawLines(dst, lines);

-    ASSERT_MAT_NEAR(dst_gold, dst, 0.0);
+    ASSERT_MAT_NEAR(src, dst, 0.0);
 }

 INSTANTIATE_TEST_CASE_P(GPU_ImgProc, HoughLines, testing::Combine(
    ALL_DEVICES,
-    testing::Values(std::string("../cv/shared/pic1.png"),
-                    std::string("../cv/shared/pic3.png"),
-                    std::string("../cv/shared/pic5.png"),
-                    std::string("../cv/shared/pic6.png"))));
+    DIFFERENT_SIZES,
+    WHOLE_SUBMAT));

 } // namespace
+
+#endif // HAVE_CUDA
--- a/modules/gpu/test/test_labeling.cpp
+++ b/modules/gpu/test/test_labeling.cpp
@ -39,9 +39,7 @@
 // the use of this software, even if advised of the possibility of such damage.
 //M*/

-#include "precomp.hpp"
-#include <string>
-#include <iostream>
+#include "test_precomp.hpp"

 #ifdef HAVE_CUDA

@ -141,10 +139,9 @@ namespace {
                    if ( (_labels.at<int>(j,i) == gpu.at<int>(j,i + 1)) && (diff.at<int>(j, i) != diff.at<int>(j,i + 1)))
                    {
                        outliers++;
-                        // std::cout <<  j << " " << i << " " << _labels.at<int>(j,i) << " " << gpu.at<int>(j,i + 1) << " " << diff.at<int>(j, i) << " " << diff.at<int>(j,i + 1) << std::endl;
                    }
                }
-            ASSERT_FALSE(outliers);
+            ASSERT_TRUE(outliers < gpu.cols + gpu.rows);
        }

        cv::Mat image;
@ -164,7 +161,7 @@ struct Labeling : testing::TestWithParam<cv::gpu::DeviceInfo>

    cv::Mat loat_image()
    {
-        return cv::imread(std::string( cvtest::TS::ptr()->get_data_path() ) + "labeling/IMG_0727.JPG");
+        return cv::imread(std::string( cvtest::TS::ptr()->get_data_path() ) + "labeling/label.png");
    }
 };

@ -191,12 +188,8 @@ TEST_P(Labeling, ConnectedComponents)
    ASSERT_NO_THROW(cv::gpu::labelComponents(mask, components));

    host.checkCorrectness(cv::Mat(components));
-    cv::imshow("test", image);
-    cv::waitKey(0);
-    cv::imshow("test", host._labels);
-    cv::waitKey(0);
 }

 INSTANTIATE_TEST_CASE_P(ConnectedComponents, Labeling, ALL_DEVICES);

-#endif
+#endif // HAVE_CUDA
--- a/modules/gpu/test/test_nvidia.cpp
+++ b/modules/gpu/test/test_nvidia.cpp
@ -39,21 +39,15 @@
 //
 //M*/

-#include <main_test_nvidia.h>
-#include "precomp.hpp"
+#include "test_precomp.hpp"
+
+OutputLevel nvidiaTestOutputLevel = OutputLevelCompact;

 #ifdef HAVE_CUDA

 using namespace cvtest;
 using namespace testing;

-//enum OutputLevel
-//{
-//    OutputLevelNone,
-//    OutputLevelCompact,
-//    OutputLevelFull
-//};
-
 struct NVidiaTest : TestWithParam<cv::gpu::DeviceInfo>
 {
    cv::gpu::DeviceInfo devInfo;
@ -73,8 +67,6 @@ struct NVidiaTest : TestWithParam<cv::gpu::DeviceInfo>
 struct NPPST : NVidiaTest {};
 struct NCV : NVidiaTest {};

-OutputLevel nvidiaTestOutputLevel = OutputLevelCompact;
-
 //TEST_P(NPPST, Integral)
 //{
 //    bool res = nvidia_NPPST_Integral_Image(path, nvidiaTestOutputLevel);
--- a/modules/gpu/test/test_objdetect.cpp
+++ b/modules/gpu/test/test_objdetect.cpp
@ -39,8 +39,9 @@
 //
 //M*/

-#include "precomp.hpp"
-#include <string>
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA

 namespace {

@ -302,13 +303,13 @@ PARAM_TEST_CASE(LBP_Read_classifier, cv::gpu::DeviceInfo, int)

 TEST_P(LBP_Read_classifier, Accuracy)
 {
-    cv::gpu::CascadeClassifier_GPU classifier;
+    cv::gpu::CascadeClassifier_GPU classifier;
    std::string classifierXmlPath = std::string(cvtest::TS::ptr()->get_data_path()) + "lbpcascade/lbpcascade_frontalface.xml";
    ASSERT_TRUE(classifier.load(classifierXmlPath));
 }

-INSTANTIATE_TEST_CASE_P(GPU_ObjDetect, LBP_Read_classifier,
-                        testing::Combine(ALL_DEVICES, testing::Values<int>(0)));
+INSTANTIATE_TEST_CASE_P(GPU_ObjDetect, LBP_Read_classifier,
+                        testing::Combine(ALL_DEVICES, testing::Values<int>(0)));


 PARAM_TEST_CASE(LBP_classify, cv::gpu::DeviceInfo, int)
@ -344,31 +345,34 @@ TEST_P(LBP_classify, Accuracy)
    for (; it != rects.end(); ++it)
        cv::rectangle(markedImage, *it, CV_RGB(0, 0, 255));

-    cv::gpu::CascadeClassifier_GPU gpuClassifier;
+    cv::gpu::CascadeClassifier_GPU gpuClassifier;
    ASSERT_TRUE(gpuClassifier.load(classifierXmlPath));

    cv::gpu::GpuMat gpu_rects;
    cv::gpu::GpuMat tested(grey);
    int count = gpuClassifier.detectMultiScale(tested, gpu_rects);

+#if defined (LOG_CASCADE_STATISTIC)
    cv::Mat downloaded(gpu_rects);
-    const cv::Rect* faces = downloaded.ptr<cv::Rect>();
+    const cv::Rect* faces = downloaded.ptr<cv::Rect>();
    for (int i = 0; i < count; i++)
    {
        cv::Rect r = faces[i];

-#if defined (LOG_CASCADE_STATISTIC)
-        std::cout << r.x << " " << r.y  << " " << r.width << " " << r.height << std::endl;
+        std::cout << r.x << " " << r.y  << " " << r.width << " " << r.height << std::endl;
        cv::rectangle(markedImage, r , CV_RGB(255, 0, 0));
-#endif
    }
+#endif

 #if defined (LOG_CASCADE_STATISTIC)
-    cv::imshow("Res", markedImage); cv::waitKey();
+    cv::imshow("Res", markedImage); cv::waitKey();
 #endif
+    (void)count;
 }

 INSTANTIATE_TEST_CASE_P(GPU_ObjDetect, LBP_classify,
-                        testing::Combine(ALL_DEVICES, testing::Values<int>(0)));
+                        testing::Combine(ALL_DEVICES, testing::Values<int>(0)));

 } // namespace
+
+#endif // HAVE_CUDA
--- a/modules/gpu/test/test_precomp.cpp
+++ b/modules/gpu/test/test_precomp.cpp
@ -39,4 +39,4 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "test_precomp.hpp"
--- a/modules/gpu/test/test_precomp.hpp
+++ b/modules/gpu/test/test_precomp.hpp
@ -57,8 +57,10 @@
 #include <limits>
 #include <algorithm>
 #include <iterator>
+#include <stdexcept>

 #include "cvconfig.h"
+
 #include "opencv2/core/core.hpp"
 #include "opencv2/highgui/highgui.hpp"
 #include "opencv2/calib3d/calib3d.hpp"
@ -72,6 +74,7 @@

 #include "utility.hpp"
 #include "interpolation.hpp"
+#include "main_test_nvidia.h"

 #ifdef HAVE_CUDA
    #include <cuda.h>
--- a/modules/gpu/test/test_pyramids.cpp
+++ b/modules/gpu/test/test_pyramids.cpp
@ -39,7 +39,7 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "test_precomp.hpp"

 #ifdef HAVE_CUDA

--- a/modules/gpu/test/test_remap.cpp
+++ b/modules/gpu/test/test_remap.cpp
@ -39,7 +39,7 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "test_precomp.hpp"

 #ifdef HAVE_CUDA

--- a/modules/gpu/test/test_resize.cpp
+++ b/modules/gpu/test/test_resize.cpp
@ -39,8 +39,7 @@
 //
 //M*/

-#include "precomp.hpp"
-#include <iostream>
+#include "test_precomp.hpp"

 #ifdef HAVE_CUDA

--- a/modules/gpu/test/test_threshold.cpp
+++ b/modules/gpu/test/test_threshold.cpp
@ -39,7 +39,7 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "test_precomp.hpp"

 #ifdef HAVE_CUDA

--- a/modules/gpu/test/test_video.cpp
+++ b/modules/gpu/test/test_video.cpp
@ -39,7 +39,9 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA

 //#define DUMP

@ -865,3 +867,5 @@ TEST_P(VideoReader, Regression)
 INSTANTIATE_TEST_CASE_P(GPU_Video, VideoReader, testing::Combine(
    ALL_DEVICES,
    testing::Values(std::string("768x576.avi"), std::string("1920x1080.avi"))));
+
+#endif // HAVE_CUDA
--- a/modules/gpu/test/test_warp_affine.cpp
+++ b/modules/gpu/test/test_warp_affine.cpp
@ -39,7 +39,7 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "test_precomp.hpp"

 #ifdef HAVE_CUDA

--- a/modules/gpu/test/test_warp_perspective.cpp
+++ b/modules/gpu/test/test_warp_perspective.cpp
@ -39,7 +39,7 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "test_precomp.hpp"

 #ifdef HAVE_CUDA

--- a/modules/gpu/test/utility.cpp
+++ b/modules/gpu/test/utility.cpp
@ -39,13 +39,14 @@
 //
 //M*/

-#include "precomp.hpp"
+#include "test_precomp.hpp"

 using namespace std;
 using namespace cv;
 using namespace cv::gpu;
 using namespace cvtest;
 using namespace testing;
+using namespace testing::internal;

 //////////////////////////////////////////////////////////////////////
 // random generators
@ -108,12 +109,12 @@ GpuMat loadMat(const Mat& m, bool useRoi)
 //////////////////////////////////////////////////////////////////////
 // Image load

-Mat readImage(const string& fileName, int flags)
+// Reads an image from the test data directory. The path passed to imread is
+// the test-system data path with fileName appended directly (no separator is
+// inserted here); flags is forwarded to imread unchanged.
+Mat readImage(const std::string& fileName, int flags)
 {
-    return imread(string(cvtest::TS::ptr()->get_data_path()) + fileName, flags);
+    return imread(TS::ptr()->get_data_path() + fileName, flags);
 }

-Mat readImageType(const string& fname, int type)
+Mat readImageType(const std::string& fname, int type)
 {
    Mat src = readImage(fname, CV_MAT_CN(type) == 1 ? IMREAD_GRAYSCALE : IMREAD_COLOR);
    if (CV_MAT_CN(type) == 4)
@ -134,50 +135,51 @@ bool supportFeature(const DeviceInfo& info, FeatureSet feature)
    return TargetArchs::builtWith(feature) && info.supports(feature);
 }

-const vector<DeviceInfo>& devices()
+// Returns the single DeviceManager shared by the test binary; the object is a
+// function-local static created on first use.
+DeviceManager& DeviceManager::instance()
 {
-    static vector<DeviceInfo> devs;
-    static bool first = true;
+    static DeviceManager obj;
+    return obj;
+}

-    if (first)
-    {
-        int deviceCount = getCudaEnabledDeviceCount();
+// Makes device i the only device in the list.
+// Throws std::runtime_error if i is not a valid CUDA device index or if the
+// device is not compatible with the current GPU module build.
+// NOTE(review): the list is cleared before validation, so it is left empty
+// when this function throws.
+void DeviceManager::load(int i)
+{
+    devices_.clear();
+    devices_.reserve(1);

-        devs.reserve(deviceCount);
+    ostringstream msg;

-        for (int i = 0; i < deviceCount; ++i)
-        {
-            DeviceInfo info(i);
-            if (info.isCompatible())
-                devs.push_back(info);
-        }
+    if (i < 0 || i >= getCudaEnabledDeviceCount())
+    {
+        msg << "Incorrect device number - " << i;
+        throw runtime_error(msg.str());
+    }
+
+    DeviceInfo info(i);

-        first = false;
+    if (!info.isCompatible())
+    {
+        msg << "Device " << i << " [" << info.name() << "] is NOT compatible with current GPU module build";
+        throw runtime_error(msg.str());
    }

-    return devs;
+    devices_.push_back(info);
 }

-vector<DeviceInfo> devices(FeatureSet feature)
+// Replaces the list with every CUDA-enabled device that is compatible with
+// the current GPU module build; incompatible devices are skipped silently.
+void DeviceManager::loadAll()
 {
-    const vector<DeviceInfo>& d = devices();
+    int deviceCount = getCudaEnabledDeviceCount();

-    vector<DeviceInfo> devs_filtered;
+    devices_.clear();
+    devices_.reserve(deviceCount);

-    if (TargetArchs::builtWith(feature))
+    for (int i = 0; i < deviceCount; ++i)
    {
-        devs_filtered.reserve(d.size());
-
-        for (size_t i = 0, size = d.size(); i < size; ++i)
+        DeviceInfo info(i);
+        if (info.isCompatible())
        {
-            const DeviceInfo& info = d[i];
-
-            if (info.supports(feature))
-                devs_filtered.push_back(info);
+            devices_.push_back(info);
        }
    }
-
-    return devs_filtered;
 }

 //////////////////////////////////////////////////////////////////////
@ -250,7 +252,7 @@ void minMaxLocGold(const Mat& src, double* minVal_, double* maxVal_, Point* minL

 namespace
 {
-    template <typename T, typename OutT> string printMatValImpl(const Mat& m, Point p)
+    template <typename T, typename OutT> std::string printMatValImpl(const Mat& m, Point p)
    {
        const int cn = m.channels();

@ -269,9 +271,9 @@ namespace
        return ostr.str();
    }

-    string printMatVal(const Mat& m, Point p)
+    std::string printMatVal(const Mat& m, Point p)
    {
-        typedef string (*func_t)(const Mat& m, Point p);
+        typedef std::string (*func_t)(const Mat& m, Point p);

        static const func_t funcs[] =
        {
--- a/modules/gpu/test/utility.hpp
+++ b/modules/gpu/test/utility.hpp
@ -80,14 +80,21 @@ cv::Mat readImageType(const std::string& fname, int type);
 //! return true if device supports specified feature and gpu module was built with support the feature.
 bool supportFeature(const cv::gpu::DeviceInfo& info, cv::gpu::FeatureSet feature);

-//! return all devices compatible with current gpu module build.
-const std::vector<cv::gpu::DeviceInfo>& devices();
+//! holds the list of GPU devices the tests are run on.
+class DeviceManager
+{
+public:
+    //! return the shared DeviceManager object.
+    static DeviceManager& instance();
+
+    //! make device i the only device in the list.
+    void load(int i);
+    //! fill the list with all devices compatible with current gpu module build.
+    void loadAll();

-//! return all devices compatible with current gpu module build which support specified feature.
-std::vector<cv::gpu::DeviceInfo> devices(cv::gpu::FeatureSet feature);
+    //! return the devices selected by the last load() / loadAll() call (empty before any call).
+    const std::vector<cv::gpu::DeviceInfo>& values() const { return devices_; }
+
+private:
+    std::vector<cv::gpu::DeviceInfo> devices_;
+};

-#define ALL_DEVICES testing::ValuesIn(devices())
-#define DEVICES(feature) testing::ValuesIn(devices(feature))
+#define ALL_DEVICES testing::ValuesIn(DeviceManager::instance().values())

 //////////////////////////////////////////////////////////////////////
 // Additional assertion
--- a/modules/highgui/CMakeLists.txt
+++ b/modules/highgui/CMakeLists.txt
@ -193,6 +193,12 @@ elseif(APPLE)
  endif()
 endif()

+# iOS: define HAVE_IOS, compile the Objective-C++ camera sources and link the
+# Apple frameworks listed below.
+if(IOS)
+  add_definitions(-DHAVE_IOS=1)
+  list(APPEND highgui_srcs
+    src/cap_ios_abstract_camera.mm
+    src/cap_ios_photo_camera.mm
+    src/cap_ios_video_camera.mm)
+  # Each "-framework <Name>" pair is quoted so it stays a single list element.
+  list(APPEND HIGHGUI_LIBRARIES
+    "-framework Accelerate"
+    "-framework AVFoundation"
+    "-framework CoreGraphics"
+    "-framework CoreImage"
+    "-framework CoreMedia"
+    "-framework CoreVideo"
+    "-framework QuartzCore"
+    "-framework AssetsLibrary")
+endif()
+
 if(WIN32)
  link_directories("${OpenCV_SOURCE_DIR}/3rdparty/lib") # for ffmpeg wrapper only
  include_directories(AFTER SYSTEM "${OpenCV_SOURCE_DIR}/3rdparty/include") # for directshow in VS2005 and multi-monitor support on MinGW
--- a/modules/highgui/include/opencv2/highgui/cap_ios.h
+++ b/modules/highgui/include/opencv2/highgui/cap_ios.h
@ -0,0 +1,163 @@
+/*
+ *  cap_ios.h
+ *  For iOS video I/O
+ *  by Eduard Feicho on 29/07/12
+ *  Copyright 2012. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+ * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#import <UIKit/UIKit.h>
+#import <Accelerate/Accelerate.h>
+#import <AVFoundation/AVFoundation.h>
+#import <ImageIO/ImageIO.h>
+#include "opencv2/core/core.hpp"
+
+/////////////////////////////////////// CvAbstractCamera /////////////////////////////////////
+
+@class CvAbstractCamera;
+
+// Base class of the iOS camera wrappers declared in this header
+// (CvVideoCamera and CvPhotoCamera both derive from it). It stores the
+// AVFoundation capture session, its video connection, an optional preview
+// layer, the parent UIView and the default capture settings.
+// The method implementations are not part of this header; the notes on the
+// methods below are inferred from their names and should be confirmed against
+// cap_ios_abstract_camera.mm.
+@interface CvAbstractCamera : NSObject
+{
+	AVCaptureSession* captureSession;
+	AVCaptureConnection* videoCaptureConnection;
+	AVCaptureVideoPreviewLayer *captureVideoPreviewLayer;
+	
+	UIDeviceOrientation currentDeviceOrientation;
+	
+	BOOL cameraAvailable;
+	BOOL captureSessionLoaded;
+	BOOL running;
+	BOOL useAVCaptureVideoPreviewLayer;
+	
+	AVCaptureDevicePosition defaultAVCaptureDevicePosition;
+	AVCaptureVideoOrientation defaultAVCaptureVideoOrientation;
+	NSString *const defaultAVCaptureSessionPreset;
+	
+	int defaultFPS;
+	
+	UIView* parentView;
+    
+    int imageWidth;
+    int imageHeight;
+}
+
+@property (nonatomic, retain) AVCaptureSession* captureSession;
+@property (nonatomic, retain) AVCaptureConnection* videoCaptureConnection;
+
+// State flags; read-only for users of the class.
+@property (nonatomic, readonly) BOOL running;
+@property (nonatomic, readonly) BOOL captureSessionLoaded;
+
+// Default capture settings.
+@property (nonatomic, assign) int defaultFPS;
+@property (nonatomic, assign) AVCaptureDevicePosition defaultAVCaptureDevicePosition;
+@property (nonatomic, assign) AVCaptureVideoOrientation defaultAVCaptureVideoOrientation;
+@property (nonatomic, assign) BOOL useAVCaptureVideoPreviewLayer;
+@property (nonatomic, strong) NSString *const defaultAVCaptureSessionPreset;
+
+@property (nonatomic, assign) int imageWidth;
+@property (nonatomic, assign) int imageHeight;
+
+@property (nonatomic, retain) UIView* parentView;
+
+// Presumably start / stop capturing and toggle between the available cameras
+// -- TODO confirm in the implementation file.
+- (void)start;
+- (void)stop;
+- (void)switchCameras;
+
+// Initializer taking the view stored in parentView (assumed from the
+// parameter name -- confirm).
+- (id)initWithParentView:(UIView*)parent;
+
+- (void)createCaptureOutput;
+- (void)createVideoPreviewLayer;
+- (void)updateOrientation;
+
+
+@end
+
+///////////////////////////////// CvVideoCamera ///////////////////////////////////////////
+
+@class CvVideoCamera;
+
+// Delegate protocol of CvVideoCamera. Its only method takes a cv::Mat
+// reference and is therefore declared only when the header is compiled as
+// (Objective-)C++.
+@protocol CvVideoCameraDelegate <NSObject>
+
+#ifdef __cplusplus
+// delegate method for processing image frames
+- (void)processImage:(cv::Mat&)image;
+#endif
+
+@end
+
+// Video camera built on CvAbstractCamera. It adopts
+// AVCaptureVideoDataOutputSampleBufferDelegate and holds a video data output
+// with its dispatch queue, a custom preview layer, a grayscale flag and the
+// AVAssetWriter objects used when recordVideo is set.
+@interface CvVideoCamera : CvAbstractCamera<AVCaptureVideoDataOutputSampleBufferDelegate>
+{
+	AVCaptureVideoDataOutput *videoDataOutput;
+	
+	dispatch_queue_t videoDataOutputQueue;
+	CALayer *customPreviewLayer;
+	
+	BOOL grayscaleMode;
+    
+    BOOL recordVideo;
+    AVAssetWriterInput* recordAssetWriterInput;
+    AVAssetWriterInputPixelBufferAdaptor* recordPixelBufferAdaptor;
+    AVAssetWriter* recordAssetWriter;
+
+	CMTime lastSampleTime;
+    
+}
+
+// Declared `assign`: the camera does not retain its delegate.
+@property (nonatomic, assign) id<CvVideoCameraDelegate> delegate;
+@property (nonatomic, assign) BOOL grayscaleMode;
+
+@property (nonatomic, assign) BOOL recordVideo;
+@property (nonatomic, retain) AVAssetWriterInput* recordAssetWriterInput;
+@property (nonatomic, retain) AVAssetWriterInputPixelBufferAdaptor* recordPixelBufferAdaptor;
+@property (nonatomic, retain) AVAssetWriter* recordAssetWriter;
+
+// NOTE(review): behaviour of the methods below is defined in
+// cap_ios_video_camera.mm, which is not visible here.
+- (void)adjustLayoutToInterfaceOrientation:(UIInterfaceOrientation)interfaceOrientation;
+- (void)layoutPreviewLayer;
+- (void)saveVideo;
+- (NSURL *)videoFileURL;
+
+
+@end
+
+///////////////////////////////// CvPhotoCamera ///////////////////////////////////////////
+
+@class CvPhotoCamera;
+
+// Delegate protocol of CvPhotoCamera: one callback delivering a captured
+// UIImage and one cancel callback. Both receive the camera object.
+@protocol CvPhotoCameraDelegate <NSObject>
+
+- (void)photoCamera:(CvPhotoCamera*)photoCamera capturedImage:(UIImage *)image;
+- (void)photoCameraCancel:(CvPhotoCamera*)photoCamera;
+
+@end
+
+// Still-image camera built on CvAbstractCamera; it adds an
+// AVCaptureStillImageOutput and a CvPhotoCameraDelegate.
+@interface CvPhotoCamera : CvAbstractCamera
+{
+	AVCaptureStillImageOutput *stillImageOutput;
+}
+
+// Declared `assign`: the camera does not retain its delegate.
+@property (nonatomic, assign) id<CvPhotoCameraDelegate> delegate;
+
+// Presumably captures one picture and reports it through the delegate --
+// TODO confirm in cap_ios_photo_camera.mm.
+- (void)takePicture;
+
+@end
--- a/modules/highgui/src/cap_ios_abstract_camera.mm
+++ b/modules/highgui/src/cap_ios_abstract_camera.mm
@ -0,0 +1,408 @@
+/*
+ *  cap_ios_abstract_camera.mm
+ *  For iOS video I/O
+ *  by Eduard Feicho on 29/07/12
+ *  Copyright 2012. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+ * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+
+#import "opencv2/highgui/cap_ios.h"
+#include "precomp.hpp"
+
+#pragma mark - Private Interface
+
+// Class extension: members used only inside the CvAbstractCamera implementation.
+@interface CvAbstractCamera ()
+
+@property (nonatomic, retain) AVCaptureVideoPreviewLayer* captureVideoPreviewLayer;
+
+// handler for UIDeviceOrientationDidChangeNotification
+- (void)deviceOrientationDidChange:(NSNotification*)notification;
+// builds the capture session on first use and starts it running
+- (void)startCaptureSession;
+
+// replaces the session input with the camera at the given position
+- (void)setDesiredCameraPosition:(AVCaptureDevicePosition)desiredPosition;
+
+// derives imageWidth/imageHeight from the configured session preset
+- (void)updateSize;
+
+@end
+
+
+#pragma mark - Implementation
+
+
+@implementation CvAbstractCamera
+
+
+
+#pragma mark Public
+
+@synthesize imageWidth;
+@synthesize imageHeight;
+
+
+@synthesize defaultFPS;
+@synthesize defaultAVCaptureDevicePosition;
+@synthesize defaultAVCaptureVideoOrientation;
+@synthesize defaultAVCaptureSessionPreset;
+
+
+
+@synthesize captureSession;
+@synthesize captureVideoPreviewLayer;
+@synthesize videoCaptureConnection;
+@synthesize running;
+@synthesize captureSessionLoaded;
+@synthesize useAVCaptureVideoPreviewLayer;
+
+@synthesize parentView;
+
+#pragma mark - Constructors
+
+// Default initializer: no parent view, 352x288 preset, custom (non-AVCaptureVideoPreviewLayer) preview.
+- (id)init;
+{
+	self = [super init];
+	if (!self) {
+		return nil;
+	}
+	
+	// subscribe to device orientation changes and remember the current orientation
+	UIDevice *device = [UIDevice currentDevice];
+	NSNotificationCenter *center = [NSNotificationCenter defaultCenter];
+	[center addObserver:self
+			   selector:@selector(deviceOrientationDidChange:)
+				   name:UIDeviceOrientationDidChangeNotification
+				 object:nil];
+	[device beginGeneratingDeviceOrientationNotifications];
+	currentDeviceOrientation = [device orientation];
+	
+	// find out whether this device has a camera at all
+	cameraAvailable = [UIImagePickerController isSourceTypeAvailable:UIImagePickerControllerSourceTypeCamera];
+	NSLog(@"camera available: %@", (cameraAvailable == YES ? @"YES" : @"NO") );
+	
+	running = NO;
+	
+	// camera defaults: front camera, landscape-left, 15 FPS, 352x288
+	self.defaultAVCaptureDevicePosition = AVCaptureDevicePositionFront;
+	self.defaultAVCaptureVideoOrientation = AVCaptureVideoOrientationLandscapeLeft;
+	self.defaultFPS = 15;
+	self.defaultAVCaptureSessionPreset = AVCaptureSessionPreset352x288;
+	
+	// no view to draw into
+	self.parentView = nil;
+	self.useAVCaptureVideoPreviewLayer = NO;
+	
+	return self;
+}
+
+
+
+// Initializer for a camera that previews into `parent`: 640x480 preset, AVCaptureVideoPreviewLayer preview.
+- (id)initWithParentView:(UIView*)parent;
+{
+	self = [super init];
+	if (!self) {
+		return nil;
+	}
+	
+	// subscribe to device orientation changes and remember the current orientation
+	UIDevice *device = [UIDevice currentDevice];
+	NSNotificationCenter *center = [NSNotificationCenter defaultCenter];
+	[center addObserver:self
+			   selector:@selector(deviceOrientationDidChange:)
+				   name:UIDeviceOrientationDidChangeNotification
+				 object:nil];
+	[device beginGeneratingDeviceOrientationNotifications];
+	currentDeviceOrientation = [device orientation];
+	
+	// find out whether this device has a camera at all
+	cameraAvailable = [UIImagePickerController isSourceTypeAvailable:UIImagePickerControllerSourceTypeCamera];
+	NSLog(@"camera available: %@", (cameraAvailable == YES ? @"YES" : @"NO") );
+	
+	running = NO;
+	
+	// camera defaults: front camera, landscape-left, 15 FPS, 640x480
+	self.defaultAVCaptureDevicePosition = AVCaptureDevicePositionFront;
+	self.defaultAVCaptureVideoOrientation = AVCaptureVideoOrientationLandscapeLeft;
+	self.defaultFPS = 15;
+	self.defaultAVCaptureSessionPreset = AVCaptureSessionPreset640x480;
+	
+	// draw the preview into the supplied view
+	self.parentView = parent;
+	self.useAVCaptureVideoPreviewLayer = YES;
+	
+	return self;
+}
+
+
+
+// Undo the registrations made in the initializers: stop observing notifications and
+// balance the beginGeneratingDeviceOrientationNotifications call.
+- (void)dealloc;
+{
+	[[NSNotificationCenter defaultCenter] removeObserver:self];
+	[[UIDevice currentDevice] endGeneratingDeviceOrientationNotifications];
+}
+
+
+#pragma mark - Public interface
+
+
+// Start capturing. Calls from a background thread are re-dispatched to the main thread;
+// calling it while already running is a no-op.
+- (void)start;
+{
+	BOOL onMainThread = [NSThread isMainThread];
+	if (!onMainThread) {
+		NSLog(@"[Camera] Warning: Call start only from main thread");
+		[self performSelectorOnMainThread:@selector(start) withObject:nil waitUntilDone:NO];
+		return;
+	}
+	
+	if (running == YES) {
+		return;
+	}
+	running = YES;
+	
+	// TODO update image size data before actually starting (needed for recording)
+	[self updateSize];
+	
+	if (!cameraAvailable) {
+		return;
+	}
+	[self startCaptureSession];
+}
+
+
+// Stop the running session but keep it configured (captureSessionLoaded is left untouched),
+// so a later -start only needs to restart it.
+- (void)pause;
+{
+	running = NO;
+	[self.captureSession stopRunning];
+}
+
+
+
+// Stop capturing and discard the session, preview layer and connection.
+// captureSessionLoaded is reset so the next -start rebuilds everything.
+- (void)stop;
+{
+	running = NO;
+	
+	// Release any retained subviews of the main view.
+	// e.g. self.myOutlet = nil;
+	
+	[self.captureSession stopRunning];
+	self.captureSession = nil;
+	self.captureVideoPreviewLayer = nil;
+	self.videoCaptureConnection = nil;
+	captureSessionLoaded = NO;
+}
+
+
+
+// use front/back camera
+// Toggle between the front and back camera; a running camera is stopped first and restarted afterwards.
+- (void)switchCameras;
+{
+	BOOL restartAfterSwitch = self.running;
+	if (restartAfterSwitch) {
+		[self stop];
+	}
+	
+	BOOL frontIsSelected = (self.defaultAVCaptureDevicePosition == AVCaptureDevicePositionFront);
+	self.defaultAVCaptureDevicePosition = frontIsSelected ? AVCaptureDevicePositionBack
+	                                                      : AVCaptureDevicePositionFront;
+	
+	if (restartAfterSwitch) {
+		[self start];
+	}
+}
+
+
+
+#pragma mark - Device Orientation Changes
+
+
+// Handler for UIDeviceOrientationDidChangeNotification.
+// Remembers the new orientation when it is portrait or landscape (face up/down and
+// unknown are ignored) and then gives subclasses a chance to react via -updateOrientation.
+- (void)deviceOrientationDidChange:(NSNotification*)notification
+{
+	UIDeviceOrientation orientation = [UIDevice currentDevice].orientation;
+	
+	switch (orientation)
+	{
+		case UIDeviceOrientationPortrait:
+		case UIDeviceOrientationPortraitUpsideDown:
+		case UIDeviceOrientationLandscapeLeft:
+		case UIDeviceOrientationLandscapeRight:
+			currentDeviceOrientation = orientation;
+			break;
+			
+		case UIDeviceOrientationFaceUp:
+		case UIDeviceOrientationFaceDown:
+		default:
+			// keep the last known portrait/landscape orientation
+			break;
+	}
+	// cast explicitly so the argument always matches %d, whatever the enum's underlying type
+	NSLog(@"deviceOrientationDidChange: %d", (int)orientation);
+	
+	[self updateOrientation];
+}
+
+
+
+#pragma mark - Private Interface
+
+// Create the capture session and apply the configured preset, falling back to
+// AVCaptureSessionPresetLow when the configured one is not supported.
+- (void)createCaptureSession;
+{
+	AVCaptureSession *session = [[AVCaptureSession alloc] init];
+	self.captureSession = session;
+	
+	NSString *wantedPreset = self.defaultAVCaptureSessionPreset;
+	if ([session canSetSessionPreset:wantedPreset]) {
+		[session setSessionPreset:wantedPreset];
+		return;
+	}
+	if ([session canSetSessionPreset:AVCaptureSessionPresetLow]) {
+		[session setSessionPreset:AVCaptureSessionPresetLow];
+		return;
+	}
+	NSLog(@"[Camera] Error: could not set session preset");
+}
+
+// Attach the camera at defaultAVCaptureDevicePosition to the session and log device info.
+// NOTE(review): the two log lines describe the system default video device, which is not
+// necessarily the device selected by setDesiredCameraPosition: - confirm this is intended.
+- (void)createCaptureDevice;
+{
+	// setup the device
+	AVCaptureDevice *device = [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeVideo];
+	[self setDesiredCameraPosition:self.defaultAVCaptureDevicePosition];
+	NSLog(@"[Camera] device connected? %@", device.connected ? @"YES" : @"NO");
+	NSLog(@"[Camera] device position %@", (device.position == AVCaptureDevicePositionBack) ? @"back" : @"front");
+}
+
+
+// Create an AVCaptureVideoPreviewLayer for the session and, when a parent view is set,
+// size it to that view and add it as a sublayer.
+- (void)createVideoPreviewLayer;
+{
+	self.captureVideoPreviewLayer = [[AVCaptureVideoPreviewLayer alloc] initWithSession:self.captureSession];
+	
+	// apply the default orientation only where the layer supports it
+	if ([self.captureVideoPreviewLayer isOrientationSupported]) {
+		[self.captureVideoPreviewLayer setOrientation:self.defaultAVCaptureVideoOrientation];
+	}
+	
+	if (parentView != nil) {
+		// fill the parent view, cropping if the aspect ratios differ
+		self.captureVideoPreviewLayer.frame = self.parentView.bounds;
+		self.captureVideoPreviewLayer.videoGravity = AVLayerVideoGravityResizeAspectFill;
+		[self.parentView.layer addSublayer:self.captureVideoPreviewLayer];
+	}
+	NSLog(@"[Camera] created AVCaptureVideoPreviewLayer");
+}
+
+
+
+
+// Make the camera at `desiredPosition` the only input of the capture session.
+//
+// The first video device with a matching position is used. If no device matches, or its
+// input cannot be created, the session's existing inputs are left untouched. Continuous
+// autofocus is enabled on the device when it is supported.
+- (void)setDesiredCameraPosition:(AVCaptureDevicePosition)desiredPosition;
+{
+	for (AVCaptureDevice *device in [AVCaptureDevice devicesWithMediaType:AVMediaTypeVideo]) {
+		if ([device position] != desiredPosition) {
+			continue;
+		}
+		
+		NSError *inputError = nil;
+		AVCaptureDeviceInput *input = [AVCaptureDeviceInput deviceInputWithDevice:device error:&inputError];
+		if (!input) {
+			// never hand a nil input to the session (addInput: would raise); keep the current inputs
+			NSLog(@"error creating input %@", [inputError localizedDescription]);
+			return;
+		}
+		
+		// support for autofocus
+		if ([device isFocusModeSupported:AVCaptureFocusModeContinuousAutoFocus]) {
+			NSError *lockError = nil;
+			if ([device lockForConfiguration:&lockError]) {
+				device.focusMode = AVCaptureFocusModeContinuousAutoFocus;
+				[device unlockForConfiguration];
+			} else {
+				NSLog(@"unable to lock device for autofocos configuration %@", [lockError localizedDescription]);
+			}
+		}
+		
+		[self.captureSession beginConfiguration];
+		
+		// remove the previous inputs first, so the new one is not rejected as a second camera input
+		for (AVCaptureInput *oldInput in self.captureSession.inputs) {
+			[self.captureSession removeInput:oldInput];
+		}
+		if ([self.captureSession canAddInput:input]) {
+			[self.captureSession addInput:input];
+		} else {
+			NSLog(@"[Camera] Error: could not add input for the requested camera position");
+		}
+		
+		[self.captureSession commitConfiguration];
+		return;
+	}
+	
+	NSLog(@"[Camera] Warning: no camera found at the requested position");
+}
+
+
+
+// Build the capture pipeline on first use (session, device input, subclass-specific output,
+// preview) and start the session. Does nothing when no camera is available.
+- (void)startCaptureSession
+{
+	if (!cameraAvailable) {
+		return;
+	}
+	
+	// the pipeline is built once and reused until -stop resets captureSessionLoaded
+	if (self.captureSessionLoaded == NO) {
+		[self createCaptureSession];
+		[self createCaptureDevice];
+		[self createCaptureOutput];
+		
+		// setup preview layer
+		if (self.useAVCaptureVideoPreviewLayer) {
+			[self createVideoPreviewLayer];
+		} else {
+			[self createCustomVideoPreview];
+		}
+		
+		captureSessionLoaded = YES;
+	}
+	
+	[self.captureSession startRunning];
+}
+
+
+// Abstract hook: subclasses attach their capture output to the session here.
+// The base implementation always raises NSInternalInconsistencyException.
+- (void)createCaptureOutput;
+{
+	[NSException raise:NSInternalInconsistencyException
+				format:@"You must override %@ in a subclass", NSStringFromSelector(_cmd)];
+}
+
+// Abstract hook called when useAVCaptureVideoPreviewLayer is NO: subclasses install their own preview.
+// The base implementation always raises NSInternalInconsistencyException.
+- (void)createCustomVideoPreview;
+{
+	[NSException raise:NSInternalInconsistencyException
+				format:@"You must override %@ in a subclass", NSStringFromSelector(_cmd)];
+}
+
+// Hook called after every device orientation notification; the base class does nothing.
+- (void)updateOrientation;
+{
+	// nothing to do here
+}
+
+
+// Set imageWidth/imageHeight from the configured session preset.
+// Only the fixed-size presets 352x288 and 1280x720 deviate from 640x480.
+- (void)updateSize;
+{
+	NSString *preset = self.defaultAVCaptureSessionPreset;
+	
+	// 640x480 covers AVCaptureSessionPreset640x480 and any unknown preset.
+	// TODO: find the correct resolution for the Photo, High, Medium and Low presets,
+	// which currently also report 640x480.
+	int width = 640;
+	int height = 480;
+	
+	if ([preset isEqualToString:AVCaptureSessionPreset352x288]) {
+		width = 352;
+		height = 288;
+	} else if ([preset isEqualToString:AVCaptureSessionPreset1280x720]) {
+		width = 1280;
+		height = 720;
+	}
+	
+	self.imageWidth = width;
+	self.imageHeight = height;
+}
+
+@end
--- a/modules/highgui/src/cap_ios_photo_camera.mm
+++ b/modules/highgui/src/cap_ios_photo_camera.mm
@ -0,0 +1,165 @@
+/*
+ *  cap_ios_photo_camera.mm
+ *  For iOS video I/O
+ *  by Eduard Feicho on 29/07/12
+ *  Copyright 2012. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+ * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+
+#import "opencv2/highgui/cap_ios.h"
+#include "precomp.hpp"
+
+#pragma mark - Private Interface
+
+
+// Class extension: exposes the still image output as a retained property inside the implementation.
+@interface CvPhotoCamera ()
+
+@property (nonatomic, retain) AVCaptureStillImageOutput* stillImageOutput;
+
+@end
+
+
+
+#pragma mark - Implementation
+
+
+@implementation CvPhotoCamera
+
+
+
+#pragma mark Public
+
+@synthesize stillImageOutput;
+@synthesize delegate;
+
+
+#pragma mark - Public interface
+
+
+// Asynchronously capture one still image and deliver it to the delegate on the main queue.
+//
+// cameraAvailable doubles as an "idle" flag: it is cleared while a capture is in flight,
+// so overlapping calls are ignored, and it is set again once the capture has finished,
+// whether it succeeded or failed. The session is paused while the delegate runs.
+- (void)takePicture
+{
+	if (cameraAvailable == NO) {
+		return;
+	}
+	cameraAvailable = NO;
+	
+	
+	[self.stillImageOutput captureStillImageAsynchronouslyFromConnection:self.videoCaptureConnection
+													   completionHandler:
+	 ^(CMSampleBufferRef imageSampleBuffer, NSError *error)
+	 {
+		 if (error == nil && imageSampleBuffer != NULL)
+		 {
+			 // TODO check
+			 //			 NSNumber* imageOrientation = [UIImage cgImageOrientationForUIDeviceOrientation:currentDeviceOrientation];
+			 //			 CMSetAttachment(imageSampleBuffer, kCGImagePropertyOrientation, imageOrientation, 1);
+			 
+			 NSData *jpegData = [AVCaptureStillImageOutput jpegStillImageNSDataRepresentation:imageSampleBuffer];
+			 
+			 dispatch_async(dispatch_get_main_queue(), ^{
+				 [self.captureSession stopRunning];
+				 
+				 // Make sure we create objects on the main thread in the main context
+				 UIImage* newImage = [UIImage imageWithData:jpegData];
+				 
+				 //UIImageOrientation orientation = [newImage imageOrientation];
+				 
+				 // TODO: only apply rotation, don't scale, since we can set this directly in the camera
+				 /*
+				  switch (orientation) {
+				  case UIImageOrientationUp:
+				  case UIImageOrientationDown:
+				  newImage = [newImage imageWithAppliedRotationAndMaxSize:CGSizeMake(640.0, 480.0)];
+				  break;
+				  case UIImageOrientationLeft:
+				  case UIImageOrientationRight:
+				  newImage = [newImage imageWithMaxSize:CGSizeMake(640.0, 480.0)];
+				  default:
+				  break;
+				  }
+				  */
+				 
+				 // We have captured the image, we can allow the user to take another picture
+				 cameraAvailable = YES;
+				 
+				 NSLog(@"CvPhotoCamera captured image");
+				 if (self.delegate) {
+					 [self.delegate photoCamera:self capturedImage:newImage];
+				 }
+				 
+				 [self.captureSession startRunning];
+			 });
+		 }
+		 else
+		 {
+			 // A failed capture must not leave the camera marked as unavailable forever:
+			 // that would block every later takePicture and startCaptureSession call.
+			 NSLog(@"[Camera] Error: could not capture still image: %@", [error localizedDescription]);
+			 dispatch_async(dispatch_get_main_queue(), ^{
+				 cameraAvailable = YES;
+			 });
+		 }
+	 }];
+	
+
+}
+
+// Stop the camera (see superclass) and drop the still image output so it is recreated on the next start.
+- (void)stop;
+{
+	[super stop];
+	self.stillImageOutput = nil;
+}
+
+
+#pragma mark - Private Interface
+
+
+// Create a JPEG still image output, add it to the session and remember the first of its
+// connections that carries video as videoCaptureConnection.
+- (void)createStillImageOutput;
+{
+	// setup still image output with jpeg codec
+	self.stillImageOutput = [[AVCaptureStillImageOutput alloc] init];
+	NSDictionary *outputSettings = [NSDictionary dictionaryWithObjectsAndKeys:AVVideoCodecJPEG, AVVideoCodecKey, nil];
+	[self.stillImageOutput setOutputSettings:outputSettings];
+	// NOTE(review): added without a canAddOutput: check, unlike CvVideoCamera's data output
+	[self.captureSession addOutput:self.stillImageOutput];
+	
+	for (AVCaptureConnection *connection in self.stillImageOutput.connections) {
+		for (AVCaptureInputPort *port in [connection inputPorts]) {
+			if ([port.mediaType isEqual:AVMediaTypeVideo]) {
+				self.videoCaptureConnection = connection;
+				break;
+			}
+		}
+		// the inner break only leaves the port loop; stop scanning connections once one is found
+		if (self.videoCaptureConnection) {
+			break;
+		}
+	}
+	NSLog(@"[Camera] still image output created");
+}
+
+
+// Superclass hook: the photo camera's capture output is the still image output.
+- (void)createCaptureOutput;
+{
+	[self createStillImageOutput];
+}
+
+// Superclass hook: overridden as a no-op so the base implementation's exception is not raised.
+- (void)createCustomVideoPreview;
+{
+	//do nothing, always use AVCaptureVideoPreviewLayer
+}
+
+
+@end
--- a/modules/highgui/src/cap_ios_video_camera.mm
+++ b/modules/highgui/src/cap_ios_video_camera.mm
@ -0,0 +1,585 @@
+/*
+ *  cap_ios_video_camera.mm
+ *  For iOS video I/O
+ *  by Eduard Feicho on 29/07/12
+ *  Copyright 2012. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+ * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#import "opencv2/highgui/cap_ios.h"
+#include "precomp.hpp"
+
+#import <AssetsLibrary/AssetsLibrary.h>
+
+
+// Convert an angle from degrees to radians.
+static CGFloat DegreesToRadians(CGFloat degrees) {return degrees * M_PI / 180;};
+
+#pragma mark - Private Interface
+
+
+
+
+// Class extension: helpers and properties used only inside the CvVideoCamera implementation.
+@interface CvVideoCamera ()
+
+// creates the AVCaptureVideoDataOutput, the custom preview layer and the delegate queue
+- (void)createVideoDataOutput;
+// creates the AVAssetWriter objects used for recording
+- (void)createVideoFileOutput;
+
+
+@property (nonatomic, retain) CALayer *customPreviewLayer;
+@property (nonatomic, retain) AVCaptureVideoDataOutput *videoDataOutput;
+
+@end
+
+
+
+#pragma mark - Implementation
+
+
+
+@implementation CvVideoCamera
+
+
+
+
+@synthesize delegate;
+@synthesize grayscaleMode;
+
+@synthesize customPreviewLayer;
+@synthesize videoDataOutput;
+
+@synthesize recordVideo;
+//@synthesize videoFileOutput;
+@synthesize recordAssetWriterInput;
+@synthesize recordPixelBufferAdaptor;
+@synthesize recordAssetWriter;
+
+
+
+#pragma mark - Constructors
+
+// Initialize with a parent view, then switch to the custom preview layer and disable recording.
+- (id)initWithParentView:(UIView*)parent;
+{
+	self = [super initWithParentView:parent];
+	if (!self) {
+		return nil;
+	}
+	
+	// frames are shown in a custom CALayer rather than an AVCaptureVideoPreviewLayer
+	self.useAVCaptureVideoPreviewLayer = NO;
+	self.recordVideo = NO;
+	return self;
+}
+
+
+
+#pragma mark - Public interface
+
+
+// Start the camera (see superclass) and, when recording is enabled, remove a stale
+// file left at the recording path by an earlier run.
+- (void)start;
+{
+	[super start];
+	
+	if (self.recordVideo == YES) {
+		NSFileManager *fileManager = [NSFileManager defaultManager];
+		NSString *recordingPath = [self videoFileString];
+		
+		if ([fileManager fileExistsAtPath:recordingPath]) {
+			// rely on the return value: the error object is only meaningful when removal fails
+			NSError *error = nil;
+			if ([fileManager removeItemAtPath:recordingPath error:&error]) {
+				NSLog(@"[Camera] Delete file %@", recordingPath);
+			} else {
+				NSLog(@"[Camera] Unable to delete file %@: %@", recordingPath, [error localizedDescription]);
+			}
+		}
+	}
+}
+
+
+
+// Stop the camera (see superclass), release the video data output and its queue,
+// finish any recording in progress and remove the custom preview layer.
+// Safe to call more than once.
+- (void)stop;
+{
+	[super stop];
+	
+	self.videoDataOutput = nil;
+	if (videoDataOutputQueue) {
+		dispatch_release(videoDataOutputQueue);
+		// clear the handle so a second stop does not release the queue again
+		videoDataOutputQueue = NULL;
+	}
+	
+	if (self.recordVideo == YES) {
+		
+		if (self.recordAssetWriter.status == AVAssetWriterStatusWriting) {
+			[self.recordAssetWriter finishWriting];
+			NSLog(@"[Camera] recording stopped");
+		} else {
+			NSLog(@"[Camera] Recording Error: asset writer status is not writing");
+		}
+		
+		self.recordAssetWriter = nil;
+		self.recordAssetWriterInput = nil;
+		self.recordPixelBufferAdaptor = nil;
+	}
+	
+	[self.customPreviewLayer removeFromSuperlayer];
+	self.customPreviewLayer = nil;
+}
+
+// TODO fix
+// Rotate and centre the custom preview layer for the given interface orientation.
+// The rotation is the sum of an angle chosen from the interface orientation and an
+// angle chosen from defaultAVCaptureVideoOrientation, modulo 360; for 90/270 degrees
+// the layer's width and height are swapped. Does nothing without a parent view.
+- (void)adjustLayoutToInterfaceOrientation:(UIInterfaceOrientation)interfaceOrientation;
+{	
+	
+	NSLog(@"layout preview layer");
+	if (self.parentView != nil) {
+		
+		CALayer* layer = self.customPreviewLayer;
+		CGRect bounds = self.customPreviewLayer.bounds;
+		int rotation_angle = 0;
+		bool flip_bounds = false;
+		
+		// angle contributed by the interface orientation
+		switch (interfaceOrientation) {
+            case UIInterfaceOrientationPortrait:
+				NSLog(@"to Portrait");
+                rotation_angle = 270;
+                break;
+            case UIInterfaceOrientationPortraitUpsideDown:
+                rotation_angle = 90;
+				NSLog(@"to UpsideDown");
+				break;
+            case UIInterfaceOrientationLandscapeLeft:
+                rotation_angle = 0;
+				NSLog(@"to LandscapeLeft");
+				break;
+            case UIInterfaceOrientationLandscapeRight:
+                rotation_angle = 180;
+				NSLog(@"to LandscapeRight");
+				break;
+            default:
+                break; // leave the layer in its last known orientation
+        }
+		
+		// additional angle contributed by the configured capture orientation
+		switch (defaultAVCaptureVideoOrientation) {
+			case AVCaptureVideoOrientationLandscapeRight:
+				rotation_angle += 180;
+				break;
+			case AVCaptureVideoOrientationPortraitUpsideDown:
+				rotation_angle += 270;
+				break;
+			case AVCaptureVideoOrientationPortrait:
+				rotation_angle += 90;
+				// NOTE(review): no break here; falls through to the LandscapeLeft case,
+				// which only breaks, so the result is unaffected
+			case AVCaptureVideoOrientationLandscapeLeft:
+				break;
+			default:
+				break;
+		}
+		rotation_angle = rotation_angle % 360;
+		
+		// a quarter turn swaps the layer's width and height
+		if (rotation_angle == 90 || rotation_angle == 270) {
+			flip_bounds = true;
+		}
+		
+		if (flip_bounds) {
+			NSLog(@"flip bounds");
+			bounds = CGRectMake(0, 0, bounds.size.height, bounds.size.width);
+		}
+		
+		layer.position = CGPointMake(self.parentView.frame.size.width/2., self.parentView.frame.size.height/2.);
+		// NOTE(review): layer is customPreviewLayer, so this bounds assignment is
+		// overwritten by `layer.bounds = bounds` below
+		self.customPreviewLayer.bounds = CGRectMake(0, 0, self.parentView.frame.size.width, self.parentView.frame.size.height);
+		
+		layer.affineTransform = CGAffineTransformMakeRotation( DegreesToRadians(rotation_angle) );
+		layer.bounds = bounds;
+	}
+
+}
+
+// TODO fix
+// Rotate and centre the custom preview layer for the last recorded device orientation.
+// The rotation is the sum of an angle chosen from currentDeviceOrientation and an
+// angle chosen from defaultAVCaptureVideoOrientation, modulo 360; for 90/270 degrees
+// the layer's width and height are swapped. Does nothing without a parent view.
+- (void)layoutPreviewLayer;
+{
+	NSLog(@"layout preview layer");
+	if (self.parentView != nil) {
+		
+		CALayer* layer = self.customPreviewLayer;
+		CGRect bounds = self.customPreviewLayer.bounds;
+		int rotation_angle = 0;
+		bool flip_bounds = false;
+		
+		// angle contributed by the device orientation
+		switch (currentDeviceOrientation) {
+            case UIDeviceOrientationPortrait:
+                rotation_angle = 270;
+                break;
+            case UIDeviceOrientationPortraitUpsideDown:
+                rotation_angle = 90;
+				break;
+            case UIDeviceOrientationLandscapeLeft:
+				NSLog(@"left");
+                rotation_angle = 180;
+				break;
+            case UIDeviceOrientationLandscapeRight:
+				NSLog(@"right");
+                rotation_angle = 0;
+				break;
+            case UIDeviceOrientationFaceUp:
+            case UIDeviceOrientationFaceDown:
+            default:
+                break; // leave the layer in its last known orientation
+        }
+		
+		// additional angle contributed by the configured capture orientation
+		switch (defaultAVCaptureVideoOrientation) {
+			case AVCaptureVideoOrientationLandscapeRight:
+				rotation_angle += 180;
+				break;
+			case AVCaptureVideoOrientationPortraitUpsideDown:
+				rotation_angle += 270;
+				break;
+			case AVCaptureVideoOrientationPortrait:
+				rotation_angle += 90;
+				// NOTE(review): no break here; falls through to the LandscapeLeft case,
+				// which only breaks, so the result is unaffected
+			case AVCaptureVideoOrientationLandscapeLeft:
+				break;
+			default:
+				break;
+		}
+		rotation_angle = rotation_angle % 360;
+		
+		// a quarter turn swaps the layer's width and height
+		if (rotation_angle == 90 || rotation_angle == 270) {
+			flip_bounds = true;
+		}
+		
+		if (flip_bounds) {
+			NSLog(@"flip bounds");
+			bounds = CGRectMake(0, 0, bounds.size.height, bounds.size.width);
+		}
+		
+		layer.position = CGPointMake(self.parentView.frame.size.width/2., self.parentView.frame.size.height/2.);
+		layer.affineTransform = CGAffineTransformMakeRotation( DegreesToRadians(rotation_angle) );
+		layer.bounds = bounds;
+	}
+	
+}
+
+
+
+
+#pragma mark - Private Interface
+
+
+
+// Create the video data output, attach it to the session, configure its connection
+// (frame rate, mirroring, orientation), create the custom preview layer and install
+// this object as sample buffer delegate on a dedicated serial queue.
+- (void)createVideoDataOutput;
+{
+	// Make a video data output
+	self.videoDataOutput = [AVCaptureVideoDataOutput new];
+	
+	// Grayscale mode uses YUV (YpCbCr 4:2:0) so the Y plane can be read directly as intensity values;
+	// colour mode uses BGRA
+	OSType format = kCVPixelFormatType_32BGRA;
+	if (self.grayscaleMode) {
+		format = kCVPixelFormatType_420YpCbCr8BiPlanarFullRange;
+	}
+	NSNumber *formatNumber = [NSNumber numberWithUnsignedInt:format];
+	self.videoDataOutput.videoSettings = [NSDictionary dictionaryWithObject:formatNumber
+	                                                                 forKey:(id)kCVPixelBufferPixelFormatTypeKey];
+	
+	// drop frames that arrive while the delegate queue is still busy
+	[self.videoDataOutput setAlwaysDiscardsLateVideoFrames:YES];
+	
+	if ( [self.captureSession canAddOutput:self.videoDataOutput] ) {
+		[self.captureSession addOutput:self.videoDataOutput];
+	}
+	
+	// all remaining settings apply to the output's video connection
+	AVCaptureConnection *videoConnection = [self.videoDataOutput connectionWithMediaType:AVMediaTypeVideo];
+	[videoConnection setEnabled:YES];
+	
+	// set default FPS
+	CMTime frameDuration = CMTimeMake(1, self.defaultFPS);
+	if (videoConnection.supportsVideoMinFrameDuration) {
+		videoConnection.videoMinFrameDuration = frameDuration;
+	}
+	if (videoConnection.supportsVideoMaxFrameDuration) {
+		videoConnection.videoMaxFrameDuration = frameDuration;
+	}
+	
+	// mirror the image for the front camera only (more intuitive)
+	if (videoConnection.supportsVideoMirroring) {
+		BOOL usingFrontCamera = (self.defaultAVCaptureDevicePosition == AVCaptureDevicePositionFront);
+		videoConnection.videoMirrored = usingFrontCamera ? YES : NO;
+	}
+	
+	// set default video orientation
+	if (videoConnection.supportsVideoOrientation) {
+		videoConnection.videoOrientation = self.defaultAVCaptureVideoOrientation;
+	}
+	
+	// create a custom preview layer sized like the parent view
+	CGSize parentSize = self.parentView.frame.size;
+	self.customPreviewLayer = [CALayer layer];
+	self.customPreviewLayer.bounds = CGRectMake(0, 0, parentSize.width, parentSize.height);
+	[self layoutPreviewLayer];
+	
+	// a serial dispatch queue guarantees that video frames are delivered in order
+	// see the header doc for setSampleBufferDelegate:queue: for more information
+	videoDataOutputQueue = dispatch_queue_create("VideoDataOutputQueue", DISPATCH_QUEUE_SERIAL);
+	[self.videoDataOutput setSampleBufferDelegate:self queue:videoDataOutputQueue];
+	
+	NSLog(@"[Camera] created AVCaptureVideoDataOutput at %d FPS", self.defaultFPS);
+}
+
+
+
+// Create the recording pipeline: an H.264 AVAssetWriterInput sized imageWidth x imageHeight,
+// a pixel buffer adaptor matching the capture pixel format, and an MPEG-4 AVAssetWriter
+// targeting -videoFileURL.
+- (void)createVideoFileOutput;
+{
+	/* Video File Output in H.264, via AVAssetWriter */
+    NSLog(@"Create Video with dimensions %dx%d", self.imageWidth, self.imageHeight);
+    
+	NSDictionary *outputSettings
+	 = [NSDictionary dictionaryWithObjectsAndKeys:[NSNumber numberWithInt:self.imageWidth], AVVideoWidthKey,
+	 											  [NSNumber numberWithInt:self.imageHeight], AVVideoHeightKey,
+	 											  AVVideoCodecH264, AVVideoCodecKey,
+			 									  nil
+	 ];
+    
+	
+	self.recordAssetWriterInput = [AVAssetWriterInput assetWriterInputWithMediaType:AVMediaTypeVideo outputSettings:outputSettings];
+	
+	
+	// same pixel format choice as the video data output (grayscale: bi-planar YpCbCr, colour: BGRA)
+	int pixelBufferFormat = (self.grayscaleMode == YES) ? kCVPixelFormatType_420YpCbCr8BiPlanarFullRange : kCVPixelFormatType_32BGRA;
+	
+	self.recordPixelBufferAdaptor =
+	           [[AVAssetWriterInputPixelBufferAdaptor alloc] 
+	                initWithAssetWriterInput:self.recordAssetWriterInput 
+	                sourcePixelBufferAttributes:[NSDictionary dictionaryWithObjectsAndKeys:[NSNumber numberWithInt:pixelBufferFormat], kCVPixelBufferPixelFormatTypeKey, nil]];
+	
+	NSError* error = nil;
+    NSLog(@"Create AVAssetWriter with url: %@", [self videoFileURL]);
+	self.recordAssetWriter = [AVAssetWriter assetWriterWithURL:[self videoFileURL]
+                                                      fileType:AVFileTypeMPEG4
+                                                         error:&error];
+	// a creation failure is only logged; the following calls still run
+	if (error != nil) {
+		NSLog(@"[Camera] Unable to create AVAssetWriter: %@", error);
+	}
+	
+	[self.recordAssetWriter addInput:self.recordAssetWriterInput];
+	self.recordAssetWriterInput.expectsMediaDataInRealTime = YES;
+    
+    NSLog(@"[Camera] created AVAssetWriter");
+}
+
+
+// Superclass hook: always create the video data output, and the recording pipeline
+// as well when recordVideo is enabled.
+- (void)createCaptureOutput;
+{
+	[self createVideoDataOutput];
+	if (self.recordVideo == YES) {
+		[self createVideoFileOutput];
+	}
+}
+
+// Superclass hook: show the custom preview layer by adding it to the parent view's layer.
+- (void)createCustomVideoPreview;
+{
+	[self.parentView.layer addSublayer:self.customPreviewLayer];
+}
+
+
+#pragma mark - Protocol AVCaptureVideoDataOutputSampleBufferDelegate
+
+
+/**
+ * AVCaptureVideoDataOutputSampleBufferDelegate callback.
+ *
+ * Wraps the sample buffer's pixel data in a cv::Mat (plane 0 only for the bi-planar
+ * YpCbCr format, the whole buffer otherwise), passes it to the delegate's processImage:
+ * if the delegate implements it, renders the resulting matrix into customPreviewLayer
+ * on the main queue and, when recordVideo is YES, appends the pixel buffer to the
+ * asset writer.
+ *
+ * Does nothing when no delegate is set.
+ */
+- (void)captureOutput:(AVCaptureOutput *)captureOutput didOutputSampleBuffer:(CMSampleBufferRef)sampleBuffer fromConnection:(AVCaptureConnection *)connection
+{
+	if (self.delegate) {
+		
+		// convert from Core Media to Core Video
+		CVImageBufferRef imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer);
+		CVPixelBufferLockBaseAddress(imageBuffer, 0);
+		
+		void* bufferAddress;
+		size_t width;
+		size_t height;
+		size_t bytesPerRow;
+		int format_opencv;
+		
+		OSType format = CVPixelBufferGetPixelFormatType(imageBuffer);
+		if (format == kCVPixelFormatType_420YpCbCr8BiPlanarFullRange) {
+			
+			// only plane 0 is exposed, as a single-channel 8-bit image
+			format_opencv = CV_8UC1;
+			
+			bufferAddress = CVPixelBufferGetBaseAddressOfPlane(imageBuffer, 0);
+			width = CVPixelBufferGetWidthOfPlane(imageBuffer, 0);
+			height = CVPixelBufferGetHeightOfPlane(imageBuffer, 0);
+			bytesPerRow = CVPixelBufferGetBytesPerRowOfPlane(imageBuffer, 0);
+			
+		} else { // expect kCVPixelFormatType_32BGRA
+			
+			format_opencv = CV_8UC4;
+			
+			bufferAddress = CVPixelBufferGetBaseAddress(imageBuffer);
+			width = CVPixelBufferGetWidth(imageBuffer);
+			height = CVPixelBufferGetHeight(imageBuffer);
+			bytesPerRow = CVPixelBufferGetBytesPerRow(imageBuffer);
+			
+		}
+		
+		// delegate image processing to the delegate; the matrix header points
+		// directly at the locked pixel buffer
+		cv::Mat image(height, width, format_opencv, bufferAddress, bytesPerRow);
+		
+		if ([self.delegate respondsToSelector:@selector(processImage:)]) {
+			[self.delegate processImage:image];
+		}
+		
+		// check if matrix data pointer or dimensions were changed by the delegate
+		bool iOSimage = false;
+		if (height == image.rows && width == image.cols && format_opencv == image.type() && bufferAddress == image.data && bytesPerRow == image.step) {
+			iOSimage = true;
+		}
+		
+		
+		// (create color space, create graphics context, render buffer)
+		CGColorSpaceRef colorSpace;
+		CGBitmapInfo bitmapInfo;
+		
+		// basically we decide if it's a grayscale, rgb or rgba image
+		if (image.channels() == 1) {
+			colorSpace = CGColorSpaceCreateDeviceGray();
+			bitmapInfo = kCGImageAlphaNone;
+		} else {
+			colorSpace = CGColorSpaceCreateDeviceRGB();
+			if (image.channels() == 3) {
+				bitmapInfo = kCGImageAlphaNone;
+			} else {
+				bitmapInfo = kCGImageAlphaPremultipliedFirst;
+			}
+			// untouched camera buffers use little-endian byte order,
+			// matrices replaced by the delegate use big-endian
+			if (iOSimage) {
+				bitmapInfo |= kCGBitmapByteOrder32Little;
+			} else {
+				bitmapInfo |= kCGBitmapByteOrder32Big;
+			}
+		}
+		
+		CGImageRef dstImage = NULL;
+		
+		if (iOSimage) {
+			CGContextRef context = CGBitmapContextCreate(bufferAddress, width, height, 8, bytesPerRow, colorSpace, bitmapInfo);
+			dstImage = CGBitmapContextCreateImage(context);
+			CGContextRelease(context);
+		} else {
+			
+			// NOTE(review): the length assumes the delegate's matrix is stored without
+			// row padding, while image.step is passed as bytesPerRow below - confirm
+			NSData *data = [NSData dataWithBytes:image.data length:image.elemSize()*image.total()];
+			CGDataProviderRef provider = CGDataProviderCreateWithCFData((__bridge CFDataRef)data);
+			
+			// Creating CGImage from cv::Mat
+			dstImage = CGImageCreate(image.cols,                                 // width
+									 image.rows,                                 // height
+									 8,                                          // bits per component
+									 8 * image.elemSize(),                       // bits per pixel
+									 image.step,                                 // bytesPerRow
+									 colorSpace,                                 // colorspace
+									 bitmapInfo,                                 // bitmap info
+									 provider,                                   // CGDataProviderRef
+									 NULL,                                       // decode
+									 false,                                      // should interpolate
+									 kCGRenderingIntentDefault                   // intent
+									 );
+			
+			CGDataProviderRelease(provider);
+		}
+		
+		
+		// render buffer
+		dispatch_sync(dispatch_get_main_queue(), ^{
+			self.customPreviewLayer.contents = (__bridge id)dstImage;
+		});
+		
+		
+		if (self.recordVideo == YES) {
+			lastSampleTime = CMSampleBufferGetPresentationTimeStamp(sampleBuffer);
+			
+			// start the writer session on the first frame that finds it not writing
+			BOOL writerIsWriting = YES;
+			if (self.recordAssetWriter.status != AVAssetWriterStatusWriting) {
+				[self.recordAssetWriter startWriting];
+				[self.recordAssetWriter startSessionAtSourceTime:lastSampleTime];
+				if (self.recordAssetWriter.status != AVAssetWriterStatusWriting) {
+					NSLog(@"[Camera] Recording Error: asset writer status is not writing: %@", self.recordAssetWriter.error);
+					// do not return here: the image, the color space and the
+					// pixel buffer lock below must still be released
+					writerIsWriting = NO;
+				} else {
+					NSLog(@"[Camera] Video recording started");
+				}
+			}
+			
+			if (writerIsWriting && self.recordAssetWriterInput.readyForMoreMediaData) {
+				if (! [self.recordPixelBufferAdaptor appendPixelBuffer:imageBuffer
+											  	  withPresentationTime:lastSampleTime] ) {
+					NSLog(@"Video Writing Error");
+				}
+			}
+		
+		}
+		
+		
+		// cleanup
+		CGImageRelease(dstImage);
+		
+		CGColorSpaceRelease(colorSpace);
+		
+		CVPixelBufferUnlockBaseAddress(imageBuffer, 0);
+	}
+}
+
+
+// Resizes the custom preview layer to the parent view's current frame size
+// and re-runs the preview layer layout.
+- (void)updateOrientation;
+{
+	NSLog(@"rotate..");
+	
+	CGFloat parentWidth = self.parentView.frame.size.width;
+	CGFloat parentHeight = self.parentView.frame.size.height;
+	CGRect layerBounds = CGRectMake(0, 0, parentWidth, parentHeight);
+	
+	self.customPreviewLayer.bounds = layerBounds;
+	[self layoutPreviewLayer];
+}
+
+
+// Writes the recorded movie file to the saved photos album, provided video
+// recording is enabled and the file is compatible with the album.
+- (void)saveVideo;
+{
+    if (self.recordVideo != NO) {
+        ALAssetsLibrary *assetsLibrary = [[ALAssetsLibrary alloc] init];
+        BOOL compatible = [assetsLibrary videoAtPathIsCompatibleWithSavedPhotosAlbum:[self videoFileURL]];
+        if (compatible) {
+            // completion is intentionally a no-op
+            [assetsLibrary writeVideoAtPathToSavedPhotosAlbum:[self videoFileURL]
+                                              completionBlock:^(NSURL *assetURL, NSError *error){}];
+        }
+    }
+}
+
+
+// Returns a file URL for "output.mov" inside the temporary directory and
+// logs a message when a file already exists at that path.
+- (NSURL *)videoFileURL;
+{
+    NSString *moviePath = [NSString stringWithFormat:@"%@%@", NSTemporaryDirectory(), @"output.mov"];
+    NSURL *movieURL = [NSURL fileURLWithPath:moviePath];
+    
+    BOOL alreadyThere = [[NSFileManager defaultManager] fileExistsAtPath:moviePath];
+    if (alreadyThere) {
+        NSLog(@"file exists");
+    }
+    
+    return movieURL;
+}
+
+
+
+// Returns the path of "output.mov" inside the temporary directory as a string.
+- (NSString *)videoFileString;
+{
+    return [NSString stringWithFormat:@"%@%@", NSTemporaryDirectory(), @"output.mov"];
+}
+
+@end
--- a/modules/imgproc/perf/perf_remap.cpp
+++ b/modules/imgproc/perf/perf_remap.cpp
@ -0,0 +1,68 @@
+#include "perf_precomp.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace perf;
+using namespace testing;
+using std::tr1::make_tuple;
+using std::tr1::get;
+
+// Parameter enumerations for the remap performance test below.
+// Source/destination matrix types:
+CV_ENUM(MatrixType, CV_16UC1, CV_16SC1, CV_32FC1)
+// Types of the first coordinate map (map1):
+CV_ENUM(MapType, CV_16SC2, CV_32FC1, CV_32FC2)
+// Interpolation modes passed to remap():
+CV_ENUM(InterType, INTER_LINEAR, INTER_CUBIC, INTER_LANCZOS4, INTER_NEAREST)
+
+// Fixture parameterized by (image size, matrix type, map type, interpolation mode).
+typedef TestBaseWithParam< tr1::tuple<Size, MatrixType, MapType, InterType> > TestRemap;
+
+// Measures cv::remap() with a horizontally mirroring coordinate map for every
+// combination of image size, source type, map type and interpolation mode.
+PERF_TEST_P( TestRemap, Remap,
+             Combine(
+                Values( szVGA, sz1080p ),
+                ValuesIn( MatrixType::all() ),
+                ValuesIn( MapType::all() ),
+                ValuesIn( InterType::all() )
+             )
+)
+{
+    Size sz;
+    int src_type, map1_type, inter_type;
+
+    sz         = get<0>(GetParam());
+    src_type   = get<1>(GetParam());
+    map1_type  = get<2>(GetParam());
+    inter_type = get<3>(GetParam());
+
+    Mat src(sz, src_type);
+    Mat map1(sz, map1_type);
+    Mat dst(sz, src_type);
+
+    // a separate y-map is only needed when map1 holds x coordinates alone (CV_32FC1);
+    // otherwise map2 stays empty
+    Mat map2(map1_type == CV_32FC1 ? sz : Size(), CV_32FC1);
+
+    RNG rng;
+    rng.fill(src, RNG::UNIFORM, 0, 256);
+
+    for (int j = 0; j < map1.rows; ++j)
+    {
+        for (int i = 0; i < map1.cols; ++i)
+        {
+            // mirrored x coordinate; "cols - 1 - i" keeps it inside [0, cols - 1]
+            // ("cols - i" would map column 0 to x == cols, outside the source)
+            const int x = src.cols - 1 - i;
+
+            switch (map1_type)
+            {
+                case CV_32FC1:
+                    map1.at<float>(j, i) = static_cast<float>(x);
+                    map2.at<float>(j, i) = static_cast<float>(j);
+                    break;
+                case CV_32FC2:
+                    map1.at<Vec2f>(j, i)[0] = static_cast<float>(x);
+                    map1.at<Vec2f>(j, i)[1] = static_cast<float>(j);
+                    break;
+                case CV_16SC2:
+                    map1.at<Vec2s>(j, i)[0] = static_cast<short>(x);
+                    map1.at<Vec2s>(j, i)[1] = static_cast<short>(j);
+                    break;
+                default:
+                    CV_Assert(0);
+            }
+        }
+    }
+
+    declare.in(src, WARMUP_RNG).out(dst).time(20);
+
+    TEST_CYCLE() remap(src, dst, map1, map2, inter_type);
+
+    SANITY_CHECK(dst);
+}
--- a/modules/imgproc/perf/perf_resize.cpp
+++ b/modules/imgproc/perf/perf_resize.cpp
@ -59,11 +59,11 @@ PERF_TEST_P(MatInfo_Size_Size, resizeDownLinear,
 typedef tr1::tuple<MatType, Size, int> MatInfo_Size_Scale_t;
 typedef TestBaseWithParam<MatInfo_Size_Scale_t> MatInfo_Size_Scale;

-PERF_TEST_P(MatInfo_Size_Scale, resizeAreaFast,
+PERF_TEST_P(MatInfo_Size_Scale, ResizeAreaFast,
            testing::Combine(
                testing::Values(CV_8UC1, CV_8UC4),
                testing::Values(szVGA, szqHD, sz720p, sz1080p),
-                testing::Values(2, 4)
+                testing::Values(2)
                )
            )
 {
@ -84,3 +84,31 @@ PERF_TEST_P(MatInfo_Size_Scale, resizeAreaFast,
    //difference equal to 1 is allowed because of different possible rounding modes: round-to-nearest vs bankers' rounding
    SANITY_CHECK(dst, 1);
 }
+
+
+// Fixture parameterized by (matrix type, source size, scale factor).
+typedef TestBaseWithParam<tr1::tuple<MatType, Size, double> > MatInfo_Size_Scale_Area;
+
+// Measures cv::resize() with INTER_AREA for non-integer scale factors; the
+// destination size is the source size multiplied by the factor and rounded.
+PERF_TEST_P(MatInfo_Size_Scale_Area, ResizeArea,
+            testing::Combine(
+                testing::Values(CV_8UC1, CV_8UC4),
+                testing::Values(szVGA, szqHD, sz720p, sz1080p),
+                testing::Values(2.4, 3.4, 1.3)
+                )
+            )
+{
+    const int type = get<0>(GetParam());
+    const Size srcSize = get<1>(GetParam());
+    const double factor = get<2>(GetParam());
+
+    const int dstWidth = cvRound(srcSize.width * factor);
+    const int dstHeight = cvRound(srcSize.height * factor);
+
+    cv::Mat src(srcSize, type);
+    cv::Mat dst(Size(dstWidth, dstHeight), type);
+
+    declare.in(src, WARMUP_RNG);
+    declare.out(dst);
+
+    TEST_CYCLE() resize(src, dst, dst.size(), 0, 0, INTER_AREA);
+
+    // a difference of 1 is tolerated: rounding modes may differ
+    // (round-to-nearest vs bankers' rounding)
+    SANITY_CHECK(dst, 1);
+}
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
--- a/Show More
+++ b/Show More