Merge release 2.4.4

pull/614/merge
Andrey Kamaev 12 years ago
commit 9e12b7c3c2
  1. CMakeLists.txt (2)
  2. android/libinfo/info.c (2)
  3. android/package/AndroidManifest.xml (4)
  4. android/service/engine/jni/NativeService/PackageInfo.cpp (20)
  5. android/service/engine/jni/Tests/PackageInfoTest.cpp (14)
  6. android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java (26)
  7. cmake/OpenCVCompilerOptions.cmake (2)
  8. cmake/OpenCVDetectCXXCompiler.cmake (22)
  9. cmake/OpenCVDetectPython.cmake (13)
  10. cmake/OpenCVGenConfig.cmake (13)
  11. cmake/templates/OpenCV.mk.in (4)
  12. cmake/templates/OpenCVConfig.cmake.in (13)
  13. data/haarcascades/haarcascade_smile.xml (8353)
  14. doc/tutorials/introduction/android_binary_package/O4A_SDK.rst (19)
  15. doc/tutorials/introduction/android_binary_package/android_dev_intro.rst (16)
  16. doc/tutorials/introduction/android_binary_package/images/eclipse_10_crystal_clean.png (BIN)
  17. doc/tutorials/introduction/android_binary_package/images/eclipse_7_select_projects.png (BIN)
  18. doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg4.png (BIN)
  19. doc/tutorials/introduction/android_binary_package/images/emulator_canny.png (BIN)
  20. doc/tutorials/introduction/desktop_java/images/eclipse_run.png (BIN)
  21. doc/tutorials/introduction/desktop_java/java_dev_intro.rst (63)
  22. modules/gpu/perf/perf_calib3d.cpp (219)
  23. modules/gpu/perf/perf_core.cpp (1328)
  24. modules/gpu/perf/perf_denoising.cpp (172)
  25. modules/gpu/perf/perf_features2d.cpp (328)
  26. modules/gpu/perf/perf_filters.cpp (263)
  27. modules/gpu/perf/perf_imgproc.cpp (1591)
  28. modules/gpu/perf/perf_labeling.cpp (55)
  29. modules/gpu/perf/perf_main.cpp (6)
  30. modules/gpu/perf/perf_matop.cpp (159)
  31. modules/gpu/perf/perf_objdetect.cpp (145)
  32. modules/gpu/perf/perf_video.cpp (654)
  33. modules/gpu/perf/utility.cpp (9)
  34. modules/gpu/perf/utility.hpp (49)
  35. modules/gpu/src/cuda/bgfg_mog.cu (4)
  36. modules/gpu/src/cuda/ccomponetns.cu (4)
  37. modules/gpu/src/cuda/element_operations.cu (9)
  38. modules/gpu/src/element_operations.cpp (36)
  39. modules/gpu/src/graphcuts.cpp (8)
  40. modules/gpu/src/imgproc.cpp (1)
  41. modules/gpu/src/mssegmentation.cpp (1)
  42. modules/gpu/src/pyrlk.cpp (2)
  43. modules/gpu/src/warp.cpp (12)
  44. modules/gpu/test/test_bgfg.cpp (31)
  45. modules/gpu/test/test_core.cpp (2)
  46. modules/gpu/test/test_warp_affine.cpp (2)
  47. modules/gpu/test/test_warp_perspective.cpp (2)
  48. modules/highgui/CMakeLists.txt (2)
  49. modules/java/CMakeLists.txt (12)
  50. modules/java/generator/gen_java.py (26)
  51. modules/ocl/src/arithm.cpp (11)
  52. modules/ocl/src/blend.cpp (2)
  53. modules/ocl/src/brute_force_matcher.cpp (37)
  54. modules/ocl/src/imgproc.cpp (5)
  55. modules/ocl/src/kernels/brute_force_match.cl (341)
  56. modules/ocl/src/kernels/nonfree_surf.cl (77)
  57. modules/ocl/src/kernels/objdetect_hog.cl (9)
  58. modules/ocl/src/surf.cpp (142)
  59. samples/c/smiledetect.cpp (247)
  60. samples/java/ant/build.xml (2)
  61. samples/java/ant/src/SimpleSample.java (4)
  62. samples/java/eclipse/HelloCV/src/Main.java (4)
  63. samples/java/sbt/src/main/scala/Main.scala (5)
  64. samples/ocl/performance.cpp (16)

@ -184,7 +184,7 @@ OCV_OPTION(INSTALL_TO_MANGLED_PATHS "Enables mangled install paths, that help wi
OCV_OPTION(ENABLE_PRECOMPILED_HEADERS "Use precompiled headers" ON IF (NOT IOS) )
OCV_OPTION(ENABLE_SOLUTION_FOLDERS "Solution folder in Visual Studio or in other IDEs" (MSVC_IDE OR CMAKE_GENERATOR MATCHES Xcode) IF (CMAKE_VERSION VERSION_GREATER "2.8.0") )
OCV_OPTION(ENABLE_PROFILING "Enable profiling in the GCC compiler (Add flags: -g -pg)" OFF IF CMAKE_COMPILER_IS_GNUCXX )
OCV_OPTION(ENABLE_OMIT_FRAME_POINTER "Enable -fomit-frame-pointer for GCC" ON IF CMAKE_COMPILER_IS_GNUCXX )
OCV_OPTION(ENABLE_OMIT_FRAME_POINTER "Enable -fomit-frame-pointer for GCC" ON IF CMAKE_COMPILER_IS_GNUCXX AND NOT (APPLE AND CMAKE_COMPILER_IS_CLANGCXX) )
OCV_OPTION(ENABLE_POWERPC "Enable PowerPC for GCC" ON IF (CMAKE_COMPILER_IS_GNUCXX AND CMAKE_SYSTEM_PROCESSOR MATCHES powerpc.*) )
OCV_OPTION(ENABLE_FAST_MATH "Enable -ffast-math (not recommended for GCC 4.6.x)" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) )
OCV_OPTION(ENABLE_SSE "Enable SSE instructions" ON IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) )

@ -7,7 +7,7 @@ const char* GetLibraryList(void);
JNIEXPORT jstring JNICALL Java_org_opencv_android_StaticHelper_getLibraryList(JNIEnv *, jclass);
#define PACKAGE_NAME "org.opencv.lib_v" CVAUX_STR(CV_VERSION_EPOCH) CVAUX_STR(CV_VERSION_MAJOR) "_" ANDROID_PACKAGE_PLATFORM
#define PACKAGE_REVISION CVAUX_STR(CV_VERSION_MINOR) "." CVAUX_STR(ANDROID_PACKAGE_RELEASE)
#define PACKAGE_REVISION CVAUX_STR(CV_VERSION_MINOR) "." CVAUX_STR(CV_VERSION_REVISION) "." CVAUX_STR(ANDROID_PACKAGE_RELEASE)
const char* GetPackageName(void)
{
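For illustration, a minimal self-contained sketch of how the widened revision string is assembled; the numeric values below are hypothetical stand-ins, not the real OpenCV version defines:

    // Sketch only: illustrative values, not the actual OpenCV version macros.
    #include <iostream>

    #define CVAUX_STR_EXP(s) #s
    #define CVAUX_STR(s) CVAUX_STR_EXP(s)

    #define CV_VERSION_MINOR        4   // hypothetical
    #define CV_VERSION_REVISION     1   // hypothetical
    #define ANDROID_PACKAGE_RELEASE 2   // hypothetical

    #define PACKAGE_REVISION CVAUX_STR(CV_VERSION_MINOR) "." CVAUX_STR(CV_VERSION_REVISION) "." CVAUX_STR(ANDROID_PACKAGE_RELEASE)

    int main()
    {
        std::cout << PACKAGE_REVISION << std::endl;  // prints "4.1.2"; the old two-part form would give "4.2"
        return 0;
    }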

@ -1,8 +1,8 @@
<?xml version="1.0" encoding="utf-8"?>
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
package="org.opencv.lib_v@OPENCV_VERSION_MAJOR@@OPENCV_VERSION_MINOR@_@ANDROID_PACKAGE_PLATFORM@"
android:versionCode="@OPENCV_VERSION_PATCH@@ANDROID_PACKAGE_RELEASE@"
android:versionName="@OPENCV_VERSION_PATCH@.@ANDROID_PACKAGE_RELEASE@" >
android:versionCode="@OPENCV_VERSION_PATCH@@OPENCV_VERSION_TWEAK@@ANDROID_PACKAGE_RELEASE@"
android:versionName="@OPENCV_VERSION_PATCH@.@OPENCV_VERSION_TWEAK@.@ANDROID_PACKAGE_RELEASE@" >
<uses-sdk android:minSdkVersion="@ANDROID_SDK_VERSION@" />
<uses-feature android:name="android.hardware.touchscreen" android:required="false"/>

@ -136,7 +136,17 @@ inline int SplitVersion(const vector<string>& features, const string& package_ve
// Taking release and build number from package revision
vector<string> tmp2 = SplitStringVector(package_version, '.');
result += atoi(tmp2[0].c_str())*100 + atoi(tmp2[1].c_str());
if (tmp2.size() == 2)
{
// the 2nd digit is revision
result += atoi(tmp2[0].c_str())*100 + 00;
}
else
{
// the 2nd digit is part of library version
// the 3rd digit is revision
result += atoi(tmp2[0].c_str())*100 + atoi(tmp2[1].c_str());
}
}
else
{
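A minimal sketch of the resulting arithmetic, mirroring the new PackageInfoTest cases further below (an org.opencv.lib_v23 package, hence a base value of 2030000):

    // Illustration of how "4.1" vs "4.1.5" package revisions map onto the numeric version.
    #include <cassert>

    int main()
    {
        const int base = 2030000;  // org.opencv.lib_v23_* package

        // "4.1": the second component is the package revision and is not encoded
        assert(base + 4 * 100 + 0 == 2030400);

        // "4.1.5": the second component belongs to the library version,
        // the third is the package revision (again not encoded)
        assert(base + 4 * 100 + 1 == 2030401);
        return 0;
    }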
@ -194,10 +204,10 @@ inline int SplitPlatfrom(const vector<string>& features)
* Example: armv7_neon
*/
PackageInfo::PackageInfo(int version, int platform, int cpu_id, std::string install_path):
Version(version),
Platform(platform),
CpuID(cpu_id),
InstallPath("")
Version(version),
Platform(platform),
CpuID(cpu_id),
InstallPath("")
{
#ifndef __SUPPORT_TEGRA3
Platform = PLATFORM_UNKNOWN;

@ -157,6 +157,20 @@ TEST(PackageInfo, MipsFromFullName)
}
#endif
TEST(PackageInfo, Check2DigitRevision)
{
PackageInfo info("org.opencv.lib_v23_armv7a_neon", "/data/data/org.opencv.lib_v23_armv7_neon", "4.1");
EXPECT_EQ(2030400, info.GetVersion());
EXPECT_EQ(ARCH_ARMv7 | FEATURES_HAS_NEON, info.GetCpuID());
}
TEST(PackageInfo, Check3DigitRevision)
{
PackageInfo info("org.opencv.lib_v23_armv7a_neon", "/data/data/org.opencv.lib_v23_armv7_neon", "4.1.5");
EXPECT_EQ(2030401, info.GetVersion());
EXPECT_EQ(ARCH_ARMv7 | FEATURES_HAS_NEON, info.GetCpuID());
}
TEST(PackageInfo, Comparator1)
{
PackageInfo info1(2040000, PLATFORM_UNKNOWN, ARCH_X86);

@ -299,10 +299,9 @@ public class ManagerActivity extends Activity
else
NativeLibDir = "/data/data/" + mInstalledPackageInfo[i].packageName + "/lib";
OpenCVLibraryInfo NativeInfo = new OpenCVLibraryInfo(NativeLibDir);
if (PackageName.equals("org.opencv.engine"))
{
OpenCVLibraryInfo NativeInfo = new OpenCVLibraryInfo(NativeLibDir);
if (NativeInfo.status())
{
PublicName = "Built-in OpenCV library";
@ -348,9 +347,7 @@ public class ManagerActivity extends Activity
if (null != ActivePackagePath)
{
int start = ActivePackagePath.indexOf(mInstalledPackageInfo[i].packageName);
int stop = start + mInstalledPackageInfo[i].packageName.length();
if (start >= 0 && ActivePackagePath.charAt(stop) == '/')
if (ActivePackagePath.equals(NativeLibDir))
{
temp.put("Activity", "y");
Tags = "active";
@ -405,13 +402,22 @@ public class ManagerActivity extends Activity
if (OpenCVersion == null || PackageVersion == null)
return "unknown";
int dot = PackageVersion.indexOf(".");
if (dot == -1 || OpenCVersion.length() == 0)
String[] revisions = PackageVersion.split("\\.");
if (revisions.length <= 1 || OpenCVersion.length() == 0)
return "unknown";
else
return OpenCVersion.substring(0, OpenCVersion.length()-1) + "." +
OpenCVersion.toCharArray()[OpenCVersion.length()-1] + "." +
PackageVersion.substring(0, dot) + " rev " + PackageVersion.substring(dot+1);
if (revisions.length == 2)
// the 2nd digit is revision
return OpenCVersion.substring(0, OpenCVersion.length()-1) + "." +
OpenCVersion.toCharArray()[OpenCVersion.length()-1] + "." +
revisions[0] + " rev " + revisions[1];
else
// the 2nd digit is part of library version
// the 3rd digit is revision
return OpenCVersion.substring(0, OpenCVersion.length()-1) + "." +
OpenCVersion.toCharArray()[OpenCVersion.length()-1] + "." +
revisions[0] + "." + revisions[1] + " rev " + revisions[2];
}
protected String ConvertPackageName(String Name, String Version)

@ -91,7 +91,7 @@ if(CMAKE_COMPILER_IS_GNUCXX)
endif()
# We need pthread's
if(UNIX AND NOT ANDROID)
if(UNIX AND NOT ANDROID AND NOT (APPLE AND CMAKE_COMPILER_IS_CLANGCXX))
add_extra_compiler_option(-pthread)
endif()

@ -5,17 +5,17 @@ if(CMAKE_CL_64)
set(MSVC64 1)
endif()
if(NOT APPLE)
if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
set(CMAKE_COMPILER_IS_GNUCXX 1)
set(CMAKE_COMPILER_IS_CLANGCXX 1)
set(ENABLE_PRECOMPILED_HEADERS OFF CACHE BOOL "" FORCE)
endif()
if(CMAKE_C_COMPILER_ID STREQUAL "Clang")
set(CMAKE_COMPILER_IS_GNUCC 1)
set(CMAKE_COMPILER_IS_CLANGCC 1)
set(ENABLE_PRECOMPILED_HEADERS OFF CACHE BOOL "" FORCE)
endif()
if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
set(CMAKE_COMPILER_IS_GNUCXX 1)
set(CMAKE_COMPILER_IS_CLANGCXX 1)
endif()
if(CMAKE_C_COMPILER_ID STREQUAL "Clang")
set(CMAKE_COMPILER_IS_GNUCC 1)
set(CMAKE_COMPILER_IS_CLANGCC 1)
endif()
if((CMAKE_COMPILER_IS_CLANGCXX OR CMAKE_COMPILER_IS_CLANGCC) AND NOT CMAKE_GENERATOR MATCHES "Xcode")
set(ENABLE_PRECOMPILED_HEADERS OFF CACHE BOOL "" FORCE)
endif()
# ----------------------------------------------------------------------------

@ -19,18 +19,25 @@ unset(HAVE_SPHINX CACHE)
if(PYTHON_EXECUTABLE)
if(PYTHON_VERSION_STRING)
set(PYTHON_VERSION_MAJOR_MINOR "${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}")
string(REGEX MATCH "[0-9]+.[0-9]+.[0-9]+" PYTHON_VERSION_FULL "${PYTHON_VERSION_STRING}")
set(PYTHON_VERSION_FULL "${PYTHON_VERSION_STRING}")
else()
execute_process(COMMAND ${PYTHON_EXECUTABLE} --version
ERROR_VARIABLE PYTHON_VERSION_FULL
ERROR_STRIP_TRAILING_WHITESPACE)
string(REGEX MATCH "[0-9]+.[0-9]+" PYTHON_VERSION_MAJOR_MINOR "${PYTHON_VERSION_FULL}")
string(REGEX MATCH "[0-9]+.[0-9]+.[0-9]+" PYTHON_VERSION_FULL "${PYTHON_VERSION_FULL}")
endif()
if("${PYTHON_VERSION_FULL}" MATCHES "[0-9]+.[0-9]+.[0-9]+")
set(PYTHON_VERSION_FULL "${CMAKE_MATCH_0}")
elseif("${PYTHON_VERSION_FULL}" MATCHES "[0-9]+.[0-9]+")
set(PYTHON_VERSION_FULL "${CMAKE_MATCH_0}")
else()
unset(PYTHON_VERSION_FULL)
endif()
if(NOT ANDROID AND NOT IOS)
if(CMAKE_VERSION VERSION_GREATER 2.8.8)
if(CMAKE_VERSION VERSION_GREATER 2.8.8 AND PYTHON_VERSION_FULL)
find_host_package(PythonLibs ${PYTHON_VERSION_FULL} EXACT)
else()
find_host_package(PythonLibs ${PYTHON_VERSION_FULL})

@ -53,6 +53,10 @@ if(OpenCV_LIB_COMPONENTS)
list(REMOVE_ITEM OPENCV_MODULES_CONFIGCMAKE ${OpenCV_LIB_COMPONENTS})
endif()
if(BUILD_FAT_JAVA_LIB AND HAVE_opencv_java)
list(APPEND OPENCV_MODULES_CONFIGCMAKE opencv_java)
endif()
macro(ocv_generate_dependencies_map_configcmake suffix configuration)
set(OPENCV_DEPENDENCIES_MAP_${suffix} "")
set(OPENCV_PROCESSED_LIBS "")
@ -126,8 +130,13 @@ configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/OpenCVConfig-version.cmake.
set(OpenCV_INCLUDE_DIRS_CONFIGCMAKE "\"\${OpenCV_INSTALL_PATH}/${OPENCV_INCLUDE_INSTALL_PATH}/opencv" "\${OpenCV_INSTALL_PATH}/${OPENCV_INCLUDE_INSTALL_PATH}\"")
set(OpenCV2_INCLUDE_DIRS_CONFIGCMAKE "\"\"")
set(OpenCV_LIB_DIRS_CONFIGCMAKE "\"\${OpenCV_INSTALL_PATH}/${OPENCV_LIB_INSTALL_PATH}\"")
set(OpenCV_3RDPARTY_LIB_DIRS_CONFIGCMAKE "\"\${OpenCV_INSTALL_PATH}/${OPENCV_3P_LIB_INSTALL_PATH}\"")
if(ANDROID)
set(OpenCV_LIB_DIRS_CONFIGCMAKE "\"\${OpenCV_INSTALL_PATH}/sdk/native/libs/\${ANDROID_NDK_ABI_NAME}\"")
set(OpenCV_3RDPARTY_LIB_DIRS_CONFIGCMAKE "\"\${OpenCV_INSTALL_PATH}/sdk/native/3rdparty/libs/\${ANDROID_NDK_ABI_NAME}\"")
else()
set(OpenCV_LIB_DIRS_CONFIGCMAKE "\"\${OpenCV_INSTALL_PATH}/${OPENCV_LIB_INSTALL_PATH}\"")
set(OpenCV_3RDPARTY_LIB_DIRS_CONFIGCMAKE "\"\${OpenCV_INSTALL_PATH}/${OPENCV_3P_LIB_INSTALL_PATH}\"")
endif()
if(INSTALL_TO_MANGLED_PATHS)
string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OpenCV_3RDPARTY_LIB_DIRS_CONFIGCMAKE "${OPENCV_3P_LIB_INSTALL_PATH}")
set(OpenCV_3RDPARTY_LIB_DIRS_CONFIGCMAKE "\"\${OpenCV_INSTALL_PATH}/${OpenCV_3RDPARTY_LIB_DIRS_CONFIGCMAKE}\"")

@ -92,7 +92,7 @@ define add_opencv_camera_module
include $(PREBUILT_SHARED_LIBRARY)
endef
ifeq ($(OPENCV_MK_ALREADY_INCLUDED),)
ifeq ($(OPENCV_MK_$(OPENCV_TARGET_ARCH_ABI)_ALREADY_INCLUDED),)
ifeq ($(OPENCV_INSTALL_MODULES),on)
$(foreach module,$(OPENCV_LIBS),$(eval $(call add_opencv_module,$(module))))
endif
@ -105,7 +105,7 @@ ifeq ($(OPENCV_MK_ALREADY_INCLUDED),)
endif
#turn off module installation to prevent their redefinition
OPENCV_MK_ALREADY_INCLUDED:=on
OPENCV_MK_$(OPENCV_TARGET_ARCH_ABI)_ALREADY_INCLUDED:=on
endif
ifeq ($(OPENCV_LOCAL_CFLAGS),)

@ -150,6 +150,7 @@ endif()
# ==============================================================
if(NOT OpenCV_FIND_COMPONENTS)
set(OpenCV_FIND_COMPONENTS ${OpenCV_LIB_COMPONENTS})
list(REMOVE_ITEM OpenCV_FIND_COMPONENTS opencv_java)
if(GTest_FOUND OR GTEST_FOUND)
list(REMOVE_ITEM OpenCV_FIND_COMPONENTS opencv_ts)
endif()
@ -200,7 +201,7 @@ foreach(__opttype OPT DBG)
#indicate that this module is also found
string(TOUPPER "${__cvdep}" __cvdep)
set(${__cvdep}_FOUND 1)
else()
elseif(EXISTS "${OpenCV_3RDPARTY_LIB_DIR_${__opttype}}/${OpenCV_${__cvdep}_LIBNAME_${__opttype}}")
list(APPEND OpenCV_LIBS_${__opttype} "${OpenCV_3RDPARTY_LIB_DIR_${__opttype}}/${OpenCV_${__cvdep}_LIBNAME_${__opttype}}")
endif()
endforeach()
@ -220,7 +221,7 @@ foreach(__opttype OPT DBG)
endif()
# CUDA
if(OpenCV_CUDA_VERSION AND WIN32 AND NOT OpenCV_SHARED)
if(OpenCV_CUDA_VERSION AND (CMAKE_CROSSCOMPILING OR (WIN32 AND NOT OpenCV_SHARED)))
if(NOT CUDA_FOUND)
find_package(CUDA ${OpenCV_CUDA_VERSION} EXACT REQUIRED)
else()
@ -303,3 +304,11 @@ else()
SET(OpenCV_LIB_DIR ${OpenCV_LIB_DIR_OPT} ${OpenCV_3RDPARTY_LIB_DIR_OPT})
endif()
set(OpenCV_LIBRARIES ${OpenCV_LIBS})
if(CMAKE_CROSSCOMPILING AND OpenCV_SHARED AND (CMAKE_SYSTEM_NAME MATCHES "Linux"))
foreach(dir ${OpenCV_LIB_DIR})
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath-link,${dir}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-rpath-link,${dir}")
set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -Wl,-rpath-link,${dir}")
endforeach()
endif()

File diff suppressed because it is too large.

@ -159,7 +159,7 @@ Get the OpenCV4Android SDK
unzip ~/Downloads/OpenCV-2.4.4-android-sdk.zip
.. |opencv_android_bin_pack| replace:: OpenCV-2.4.4-android-sdk.zip
.. |opencv_android_bin_pack| replace:: :file:`OpenCV-2.4.4-android-sdk.zip`
.. _opencv_android_bin_pack_url: http://sourceforge.net/projects/opencvlibrary/files/opencv-android/2.4.4/OpenCV-2.4.4-android-sdk.zip/download
.. |opencv_android_bin_pack_url| replace:: |opencv_android_bin_pack|
.. |seven_zip| replace:: 7-Zip
@ -184,7 +184,7 @@ Import OpenCV library and samples to the Eclipse
You can simply reference it in your projects.
Each sample included into the |opencv_android_bin_pack| is a regular Android project that already
references OpenCV library.Follow the steps below to import OpenCV and samples into the workspace:
references OpenCV library. Follow the steps below to import OpenCV and samples into the workspace:
.. note:: OpenCV samples are indeed **dependent** on OpenCV library project so don't forget to import it to your workspace as well.
@ -246,8 +246,8 @@ Running OpenCV Samples
----------------------
At this point you should be able to build and run the samples. Keep in mind, that
``face-detection``, ``Tutorial 3` and ``Tutorial 4`` include some native code and
require Android NDK and CDT plugin for Eclipse to build working applications. If you haven't
``face-detection`` and ``Tutorial 2 - Mixed Processing`` include some native code and
require Android NDK and NDK/CDT plugin for Eclipse to build working applications. If you haven't
installed these tools, see the corresponding section of :ref:`Android_Dev_Intro`.
.. warning:: Please consider that some samples use Android Java Camera API, which is accessible
@ -295,7 +295,7 @@ Well, running samples from Eclipse is very simple:
.. code-block:: sh
:linenos:
<Android SDK path>/platform-tools/adb install <OpenCV4Android SDK path>/apk/OpenCV_2.4.4_Manager_armv7a-neon.apk
<Android SDK path>/platform-tools/adb install <OpenCV4Android SDK path>/apk/OpenCV_2.4.4_Manager_2.6_armv7a-neon.apk
.. note:: ``armeabi``, ``armv7a-neon``, ``arm7a-neon-android8``, ``mips`` and ``x86`` stand for
platform targets:
@ -326,15 +326,16 @@ Well, running samples from Eclipse is very simple:
When done, you will be able to run OpenCV samples on your device/emulator seamlessly.
* Here is ``Tutorial 2 - Use OpenCV Camera`` sample, running on top of stock camera-preview of the emulator.
* Here is ``Sample - image-manipulations`` sample, running on top of stock camera-preview of the emulator.
.. image:: images/emulator_canny.png
:height: 600px
:alt: Tutorial 1 Basic - 1. Add OpenCV - running Canny
:alt: 'Sample - image-manipulations' running Canny
:align: center
What's next
===========
Now, when you have your instance of OpenCV4Adroid SDK set up and configured, you may want to proceed to using OpenCV in your own application. You can learn how to do that in a separate :ref:`dev_with_OCV_on_Android` tutorial.
Now, when you have your instance of OpenCV4Adroid SDK set up and configured,
you may want to proceed to using OpenCV in your own application.
You can learn how to do that in a separate :ref:`dev_with_OCV_on_Android` tutorial.

@ -103,8 +103,8 @@ You need the following software to be installed in order to develop for Android
Here is Google's `install guide <http://developer.android.com/sdk/installing.html>`_ for the SDK.
.. note:: You can choose downloading ``ADT Bundle package`` that in addition to Android SDK Tools includes
Eclipse + ADT + CDT plugins, Android Platform-tools, the latest Android platform and the latest
.. note:: You can choose downloading **ADT Bundle package** that in addition to Android SDK Tools includes
Eclipse + ADT + NDK/CDT plugins, Android Platform-tools, the latest Android platform and the latest
Android system image for the emulator - this is the best choice for those who is setting up Android
development environment the first time!
@ -112,15 +112,15 @@ You need the following software to be installed in order to develop for Android
for use on amd64 and ia64 systems to be installed. You can install them with the
following command:
.. code-block:: bash
.. code-block:: bash
sudo apt-get install ia32-libs
sudo apt-get install ia32-libs
For Red Hat based systems the following command might be helpful:
For Red Hat based systems the following command might be helpful:
.. code-block:: bash
.. code-block:: bash
sudo yum install libXtst.i386
sudo yum install libXtst.i386
#. **Android SDK components**
@ -148,7 +148,7 @@ You need the following software to be installed in order to develop for Android
Check the `Android SDK System Requirements <http://developer.android.com/sdk/requirements.html>`_
document for a list of Eclipse versions that are compatible with the Android SDK.
For OpenCV 2.4.x we recommend **Eclipse 3.7 (Indigo)** or later versions. They work well for
For OpenCV 2.4.x we recommend **Eclipse 3.7 (Indigo)** or **Eclipse 4.2 (Juno)**. They work well for
OpenCV under both Windows and Linux.
If you have no Eclipse installed, you can get it from the `official site <http://www.eclipse.org/downloads/>`_.

Binary image files changed (contents not shown):

  doc/tutorials/introduction/android_binary_package/images/eclipse_10_crystal_clean.png (49 KiB → 46 KiB)
  doc/tutorials/introduction/android_binary_package/images/eclipse_7_select_projects.png (28 KiB → 40 KiB)
  doc/tutorials/introduction/android_binary_package/images/eclipse_cdt_cfg4.png (50 KiB → 51 KiB)
  doc/tutorials/introduction/android_binary_package/images/emulator_canny.png (27 KiB → 25 KiB)
  doc/tutorials/introduction/desktop_java/images/eclipse_run.png (70 KiB → 74 KiB)

@ -5,7 +5,7 @@
Introduction to Java Development
********************************
Last updated: 12 February, 2013.
Last updated: 28 February, 2013.
As of OpenCV 2.4.4, OpenCV supports desktop Java development using nearly the same interface as for
Android development. This guide will help you to create your first Java (or Scala) application using OpenCV.
@ -28,10 +28,14 @@ In this guide, we will:
The same process was used to create the samples in the :file:`samples/java` folder of the OpenCV repository,
so consult those files if you get lost.
Get OpenCV with desktop Java support
************************************
Get proper OpenCV
*****************
Starting from version 2.4.4 OpenCV includes desktop Java bindings.
Download
########
The most simple way to get it is downloading the appropriate package of **version 2.4.4 or higher** from the
`OpenCV SourceForge repository <http://sourceforge.net/projects/opencvlibrary/files/>`_.
@ -45,8 +49,8 @@ In order to build OpenCV with Java bindings you need :abbr:`JDK (Java Developmen
(we recommend `Oracle/Sun JDK 6 or 7 <http://www.oracle.com/technetwork/java/javase/downloads/>`_),
`Apache Ant <http://ant.apache.org/>`_ and `Python` v2.6 or higher to be installed.
Build OpenCV
############
Build
#####
Let's build OpenCV:
@ -83,6 +87,16 @@ through the CMake output for any Java-related tools that aren't found and instal
:alt: CMake output
:align: center
.. note:: If ``CMake`` can't find Java in your system set the ``JAVA_HOME``
environment variable with the path to installed JDK
before running it. E.g.:
.. code-block:: bash
export JAVA_HOME=/usr/lib/jvm/java-6-oracle
cmake -DBUILD_SHARED_LIBS=OFF ..
Now start the build:
.. code-block:: bash
@ -95,20 +109,20 @@ or
msbuild /m OpenCV.sln /t:Build /p:Configuration=Release /v:m
Besides all this will create a ``jar`` containing the Java interface (:file:`bin/opencv_2.4.4.jar`)
Besides all this will create a ``jar`` containing the Java interface (:file:`bin/opencv-244.jar`)
and a native dynamic library containing Java bindings and all the OpenCV stuff
(:file:`bin/Release/opencv_java244.dll` or :file:`bin/libopencv_java244.so` respectively).
(:file:`bin/Release/opencv_java244.dll` or :file:`lib/libopencv_java244.so` respectively).
We'll use these files later.
Create a simple Java sample and an Ant build file for it
********************************************************
Java sample with Ant
********************
.. note::
The described sample is provided with OpenCV library in the :file:`opencv/samples/java/ant` folder.
* Create a folder where you'll develop this sample application.
* In this folder create an XML file with the following content using any text editor:
* In this folder create the :file:`build.xml` file with the following content using any text editor:
.. code-block:: xml
:linenos:
@ -135,7 +149,7 @@ Create a simple Java sample and an Ant build file for it
<target name="compile">
<mkdir dir="${classes.dir}"/>
<javac srcdir="${src.dir}" destdir="${classes.dir}" classpathref="classpath"/>
<javac includeantruntime="false" srcdir="${src.dir}" destdir="${classes.dir}" classpathref="classpath"/>
</target>
<target name="jar" depends="compile">
@ -181,15 +195,17 @@ Create a simple Java sample and an Ant build file for it
* Put the following Java code into the :file:`SimpleSample.java` file:
.. code-block:: java
import org.opencv.core.Core;
import org.opencv.core.Mat;
import org.opencv.core.CvType;
import org.opencv.core.Scalar;
class SimpleSample {
static{ System.loadLibrary("opencv_java244"); }
static{ System.loadLibrary(Core.NATIVE_LIBRARY_NAME); }
public static void main(String[] args) {
System.out.println("Welcome to OpenCV " + Core.VERSION);
Mat m = new Mat(5, 10, CvType.CV_8UC1, new Scalar(0));
System.out.println("OpenCV Mat: " + m);
Mat mr1 = m.row(1);
@ -219,8 +235,8 @@ Create a simple Java sample and an Ant build file for it
:alt: run app with Ant
:align: center
Create a simple Java project in Eclipse
***************************************
Java project in Eclipse
***********************
Now let's look at the possiblity of using OpenCV in Java when developing in Eclipse IDE.
@ -293,12 +309,13 @@ Now let's look at the possiblity of using OpenCV in Java when developing in Ecli
* Put some simple OpenCV calls there, e.g.:
.. code-block:: java
import org.opencv.core.Core;
import org.opencv.core.CvType;
import org.opencv.core.Mat;
public class Main {
public static void main(String[] args) {
System.loadLibrary("opencv_java244");
System.loadLibrary(Core.NATIVE_LIBRARY_NAME);
Mat m = Mat.eye(3, 3, CvType.CV_8UC1);
System.out.println("m = " + m.dump());
}
@ -310,8 +327,8 @@ Now let's look at the possiblity of using OpenCV in Java when developing in Ecli
:alt: Eclipse: run
:align: center
Create an SBT project and samples in Java and Scala
***************************************************
SBT project for Java and Scala
******************************
Now we'll create a simple Java application using SBT. This serves as a brief introduction to
those unfamiliar with this build tool. We're using SBT because it is particularly easy and powerful.
@ -409,8 +426,8 @@ You should see something like this:
:alt: SBT run
:align: center
Copy the OpenCV jar and write a simple application
********************************************************
Running SBT samples
###################
Now we'll create a simple face detection application using OpenCV.
@ -424,7 +441,7 @@ You can optionally rerun ``sbt eclipse`` to update your Eclipse project.
cp <opencv_dir>/build/bin/opencv_<version>.jar lib/
sbt eclipse
Next, create the directory src/main/resources and download this Lena image into it:
Next, create the directory :file:`src/main/resources` and download this Lena image into it:
.. image:: images/lena.png
:alt: Lena
@ -433,7 +450,7 @@ Next, create the directory src/main/resources and download this Lena image into
Make sure it's called :file:`"lena.png"`.
Items in the resources directory are available to the Java application at runtime.
Next, copy :file:`lbpcascade_frontalface.xml` from :file:`opencv/data/` into the :file:`resources`
Next, copy :file:`lbpcascade_frontalface.xml` from :file:`opencv/data/lbpcascades/` into the :file:`resources`
directory:
.. code-block:: bash
@ -490,12 +507,12 @@ Now modify src/main/java/HelloOpenCV.java so it contains the following Java code
System.out.println("Hello, OpenCV");
// Load the native library.
System.loadLibrary("opencv_java244");
System.loadLibrary(Core.NATIVE_LIBRARY_NAME);
new DetectFaceDemo().run();
}
}
Note the call to ``System.loadLibrary("opencv_java244")``.
Note the call to ``System.loadLibrary(Core.NATIVE_LIBRARY_NAME)``.
This command must be executed exactly once per Java process prior to using any native OpenCV methods.
If you don't call it, you will get ``UnsatisfiedLink errors``.
You will also get errors if you try to load OpenCV when it has already been loaded.

@ -3,15 +3,14 @@
using namespace std;
using namespace testing;
namespace {
//////////////////////////////////////////////////////////////////////
// StereoBM
typedef std::tr1::tuple<string, string> pair_string;
DEF_PARAM_TEST_1(ImagePair, pair_string);
PERF_TEST_P(ImagePair, Calib3D_StereoBM, Values(pair_string("gpu/perf/aloe.png", "gpu/perf/aloeR.png")))
PERF_TEST_P(ImagePair, Calib3D_StereoBM,
Values(pair_string("gpu/perf/aloe.png", "gpu/perf/aloeR.png")))
{
declare.time(5.0);
@ -28,18 +27,13 @@ PERF_TEST_P(ImagePair, Calib3D_StereoBM, Values(pair_string("gpu/perf/aloe.png",
{
cv::gpu::StereoBM_GPU d_bm(preset, ndisp);
cv::gpu::GpuMat d_imgLeft(imgLeft);
cv::gpu::GpuMat d_imgRight(imgRight);
cv::gpu::GpuMat d_dst;
d_bm(d_imgLeft, d_imgRight, d_dst);
const cv::gpu::GpuMat d_imgLeft(imgLeft);
const cv::gpu::GpuMat d_imgRight(imgRight);
cv::gpu::GpuMat dst;
TEST_CYCLE()
{
d_bm(d_imgLeft, d_imgRight, d_dst);
}
TEST_CYCLE() d_bm(d_imgLeft, d_imgRight, dst);
GPU_SANITY_CHECK(d_dst);
GPU_SANITY_CHECK(dst);
}
else
{
@ -47,12 +41,7 @@ PERF_TEST_P(ImagePair, Calib3D_StereoBM, Values(pair_string("gpu/perf/aloe.png",
cv::Mat dst;
bm(imgLeft, imgRight, dst);
TEST_CYCLE()
{
bm(imgLeft, imgRight, dst);
}
TEST_CYCLE() bm(imgLeft, imgRight, dst);
CPU_SANITY_CHECK(dst);
}
@ -61,7 +50,8 @@ PERF_TEST_P(ImagePair, Calib3D_StereoBM, Values(pair_string("gpu/perf/aloe.png",
//////////////////////////////////////////////////////////////////////
// StereoBeliefPropagation
PERF_TEST_P(ImagePair, Calib3D_StereoBeliefPropagation, Values(pair_string("gpu/stereobp/aloe-L.png", "gpu/stereobp/aloe-R.png")))
PERF_TEST_P(ImagePair, Calib3D_StereoBeliefPropagation,
Values(pair_string("gpu/stereobp/aloe-L.png", "gpu/stereobp/aloe-R.png")))
{
declare.time(10.0);
@ -77,29 +67,25 @@ PERF_TEST_P(ImagePair, Calib3D_StereoBeliefPropagation, Values(pair_string("gpu/
{
cv::gpu::StereoBeliefPropagation d_bp(ndisp);
cv::gpu::GpuMat d_imgLeft(imgLeft);
cv::gpu::GpuMat d_imgRight(imgRight);
cv::gpu::GpuMat d_dst;
const cv::gpu::GpuMat d_imgLeft(imgLeft);
const cv::gpu::GpuMat d_imgRight(imgRight);
cv::gpu::GpuMat dst;
d_bp(d_imgLeft, d_imgRight, d_dst);
TEST_CYCLE() d_bp(d_imgLeft, d_imgRight, dst);
TEST_CYCLE()
{
d_bp(d_imgLeft, d_imgRight, d_dst);
}
GPU_SANITY_CHECK(d_dst);
GPU_SANITY_CHECK(dst);
}
else
{
FAIL() << "No such CPU implementation analogy.";
FAIL_NO_CPU();
}
}
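The verbose FAIL() << "No such CPU implementation analogy." calls are replaced throughout these tests by a FAIL_NO_CPU() helper coming from the reworked perf utility headers; a plausible sketch of such a wrapper (an assumption for illustration, not the verbatim definition from utility.hpp):

    // Assumed shape of the helper; the real definition lives in modules/gpu/perf/utility.hpp.
    #define FAIL_NO_CPU() FAIL() << "No such CPU implementation analogy"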
//////////////////////////////////////////////////////////////////////
// StereoConstantSpaceBP
PERF_TEST_P(ImagePair, Calib3D_StereoConstantSpaceBP, Values(pair_string("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-R.png")))
PERF_TEST_P(ImagePair, Calib3D_StereoConstantSpaceBP,
Values(pair_string("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-R.png")))
{
declare.time(10.0);
@ -115,29 +101,25 @@ PERF_TEST_P(ImagePair, Calib3D_StereoConstantSpaceBP, Values(pair_string("gpu/st
{
cv::gpu::StereoConstantSpaceBP d_csbp(ndisp);
cv::gpu::GpuMat d_imgLeft(imgLeft);
cv::gpu::GpuMat d_imgRight(imgRight);
cv::gpu::GpuMat d_dst;
d_csbp(d_imgLeft, d_imgRight, d_dst);
const cv::gpu::GpuMat d_imgLeft(imgLeft);
const cv::gpu::GpuMat d_imgRight(imgRight);
cv::gpu::GpuMat dst;
TEST_CYCLE()
{
d_csbp(d_imgLeft, d_imgRight, d_dst);
}
TEST_CYCLE() d_csbp(d_imgLeft, d_imgRight, dst);
GPU_SANITY_CHECK(d_dst);
GPU_SANITY_CHECK(dst);
}
else
{
FAIL() << "No such CPU implementation analogy.";
FAIL_NO_CPU();
}
}
//////////////////////////////////////////////////////////////////////
// DisparityBilateralFilter
PERF_TEST_P(ImagePair, Calib3D_DisparityBilateralFilter, Values(pair_string("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-disp.png")))
PERF_TEST_P(ImagePair, Calib3D_DisparityBilateralFilter,
Values(pair_string("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-disp.png")))
{
const cv::Mat img = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
@ -151,22 +133,17 @@ PERF_TEST_P(ImagePair, Calib3D_DisparityBilateralFilter, Values(pair_string("gpu
{
cv::gpu::DisparityBilateralFilter d_filter(ndisp);
cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_disp(disp);
cv::gpu::GpuMat d_dst;
const cv::gpu::GpuMat d_img(img);
const cv::gpu::GpuMat d_disp(disp);
cv::gpu::GpuMat dst;
d_filter(d_disp, d_img, d_dst);
TEST_CYCLE() d_filter(d_disp, d_img, dst);
TEST_CYCLE()
{
d_filter(d_disp, d_img, d_dst);
}
GPU_SANITY_CHECK(d_dst);
GPU_SANITY_CHECK(dst);
}
else
{
FAIL() << "No such CPU implementation analogy.";
FAIL_NO_CPU();
}
}
@ -175,45 +152,42 @@ PERF_TEST_P(ImagePair, Calib3D_DisparityBilateralFilter, Values(pair_string("gpu
DEF_PARAM_TEST_1(Count, int);
PERF_TEST_P(Count, Calib3D_TransformPoints, Values(5000, 10000, 20000))
PERF_TEST_P(Count, Calib3D_TransformPoints,
Values(5000, 10000, 20000))
{
const int count = GetParam();
cv::Mat src(1, count, CV_32FC3);
fillRandom(src, -100, 100);
declare.in(src, WARMUP_RNG);
const cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1);
const cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::transformPoints(d_src, rvec, tvec, d_dst);
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
TEST_CYCLE()
{
cv::gpu::transformPoints(d_src, rvec, tvec, d_dst);
}
TEST_CYCLE() cv::gpu::transformPoints(d_src, rvec, tvec, dst);
GPU_SANITY_CHECK(d_dst);
GPU_SANITY_CHECK(dst);
}
else
{
FAIL() << "No such CPU implementation analogy.";
FAIL_NO_CPU();
}
}
//////////////////////////////////////////////////////////////////////
// ProjectPoints
PERF_TEST_P(Count, Calib3D_ProjectPoints, Values(5000, 10000, 20000))
PERF_TEST_P(Count, Calib3D_ProjectPoints,
Values(5000, 10000, 20000))
{
const int count = GetParam();
cv::Mat src(1, count, CV_32FC3);
fillRandom(src, -100, 100);
declare.in(src, WARMUP_RNG);
const cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1);
const cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);
@ -221,28 +195,18 @@ PERF_TEST_P(Count, Calib3D_ProjectPoints, Values(5000, 10000, 20000))
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
cv::gpu::projectPoints(d_src, rvec, tvec, camera_mat, cv::Mat(), d_dst);
TEST_CYCLE() cv::gpu::projectPoints(d_src, rvec, tvec, camera_mat, cv::Mat(), dst);
TEST_CYCLE()
{
cv::gpu::projectPoints(d_src, rvec, tvec, camera_mat, cv::Mat(), d_dst);
}
GPU_SANITY_CHECK(d_dst);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
cv::projectPoints(src, rvec, tvec, camera_mat, cv::noArray(), dst);
TEST_CYCLE()
{
cv::projectPoints(src, rvec, tvec, camera_mat, cv::noArray(), dst);
}
TEST_CYCLE() cv::projectPoints(src, rvec, tvec, camera_mat, cv::noArray(), dst);
CPU_SANITY_CHECK(dst);
}
@ -251,17 +215,18 @@ PERF_TEST_P(Count, Calib3D_ProjectPoints, Values(5000, 10000, 20000))
//////////////////////////////////////////////////////////////////////
// SolvePnPRansac
PERF_TEST_P(Count, Calib3D_SolvePnPRansac, Values(5000, 10000, 20000))
PERF_TEST_P(Count, Calib3D_SolvePnPRansac,
Values(5000, 10000, 20000))
{
declare.time(10.0);
const int count = GetParam();
cv::Mat object(1, count, CV_32FC3);
fillRandom(object, -100, 100);
declare.in(object, WARMUP_RNG);
cv::Mat camera_mat(3, 3, CV_32FC1);
fillRandom(camera_mat, 0.5, 1);
cv::randu(camera_mat, 0.5, 1);
camera_mat.at<float>(0, 1) = 0.f;
camera_mat.at<float>(1, 0) = 0.f;
camera_mat.at<float>(2, 0) = 0.f;
@ -269,79 +234,66 @@ PERF_TEST_P(Count, Calib3D_SolvePnPRansac, Values(5000, 10000, 20000))
const cv::Mat dist_coef(1, 8, CV_32F, cv::Scalar::all(0));
std::vector<cv::Point2f> image_vec;
cv::Mat rvec_gold(1, 3, CV_32FC1);
fillRandom(rvec_gold, 0, 1);
cv::randu(rvec_gold, 0, 1);
cv::Mat tvec_gold(1, 3, CV_32FC1);
fillRandom(tvec_gold, 0, 1);
cv::randu(tvec_gold, 0, 1);
std::vector<cv::Point2f> image_vec;
cv::projectPoints(object, rvec_gold, tvec_gold, camera_mat, dist_coef, image_vec);
cv::Mat image(1, count, CV_32FC2, &image_vec[0]);
const cv::Mat image(1, count, CV_32FC2, &image_vec[0]);
cv::Mat rvec;
cv::Mat tvec;
if (PERF_RUN_GPU())
{
cv::gpu::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
TEST_CYCLE() cv::gpu::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
TEST_CYCLE()
{
cv::gpu::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
}
GPU_SANITY_CHECK(rvec, 1e-3);
GPU_SANITY_CHECK(tvec, 1e-3);
}
else
{
cv::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
TEST_CYCLE() cv::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
TEST_CYCLE()
{
cv::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
}
CPU_SANITY_CHECK(rvec, 1e-6);
CPU_SANITY_CHECK(tvec, 1e-6);
}
CPU_SANITY_CHECK(rvec);
CPU_SANITY_CHECK(tvec);
}
//////////////////////////////////////////////////////////////////////
// ReprojectImageTo3D
PERF_TEST_P(Sz_Depth, Calib3D_ReprojectImageTo3D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16S)))
PERF_TEST_P(Sz_Depth, Calib3D_ReprojectImageTo3D,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16S)))
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
cv::Mat src(size, depth);
fillRandom(src, 5.0, 30.0);
declare.in(src, WARMUP_RNG);
cv::Mat Q(4, 4, CV_32FC1);
fillRandom(Q, 0.1, 1.0);
cv::randu(Q, 0.1, 1.0);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
cv::gpu::reprojectImageTo3D(d_src, d_dst, Q);
TEST_CYCLE() cv::gpu::reprojectImageTo3D(d_src, dst, Q);
TEST_CYCLE()
{
cv::gpu::reprojectImageTo3D(d_src, d_dst, Q);
}
GPU_SANITY_CHECK(d_dst);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
cv::reprojectImageTo3D(src, dst, Q);
TEST_CYCLE()
{
cv::reprojectImageTo3D(src, dst, Q);
}
TEST_CYCLE() cv::reprojectImageTo3D(src, dst, Q);
CPU_SANITY_CHECK(dst);
}
@ -350,32 +302,27 @@ PERF_TEST_P(Sz_Depth, Calib3D_ReprojectImageTo3D, Combine(GPU_TYPICAL_MAT_SIZES,
//////////////////////////////////////////////////////////////////////
// DrawColorDisp
PERF_TEST_P(Sz_Depth, Calib3D_DrawColorDisp, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16S)))
PERF_TEST_P(Sz_Depth, Calib3D_DrawColorDisp,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16S)))
{
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
cv::Mat src(size, type);
fillRandom(src, 0, 255);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
cv::gpu::drawColorDisp(d_src, d_dst, 255);
TEST_CYCLE() cv::gpu::drawColorDisp(d_src, dst, 255);
TEST_CYCLE()
{
cv::gpu::drawColorDisp(d_src, d_dst, 255);
}
GPU_SANITY_CHECK(d_dst);
GPU_SANITY_CHECK(dst);
}
else
{
FAIL() << "No such CPU implementation analogy.";
FAIL_NO_CPU();
}
}
} // namespace
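Taken together, these calib3d changes converge on a single compact shape for the GPU performance tests: const GpuMat inputs, declare.in(..., WARMUP_RNG) instead of fillRandom, a one-line TEST_CYCLE() body, and a sanity check on a plain dst. A hedged sketch of that shape with placeholder operations (someGpuOp and someCpuOp are not real OpenCV functions):

    // Sketch of the refactored perf-test pattern; operation names are placeholders.
    PERF_TEST_P(Sz_Depth, Example_Pattern,
                Combine(GPU_TYPICAL_MAT_SIZES,
                        Values(CV_8U, CV_16S)))
    {
        const cv::Size size = GET_PARAM(0);
        const int depth = GET_PARAM(1);

        cv::Mat src(size, depth);
        declare.in(src, WARMUP_RNG);           // replaces the old fillRandom(src)

        if (PERF_RUN_GPU())
        {
            const cv::gpu::GpuMat d_src(src);  // inputs are now const
            cv::gpu::GpuMat dst;

            TEST_CYCLE() someGpuOp(d_src, dst);  // single-statement cycle body

            GPU_SANITY_CHECK(dst);
        }
        else
        {
            cv::Mat dst;

            TEST_CYCLE() someCpuOp(src, dst);

            CPU_SANITY_CHECK(dst);
        }
    }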

File diff suppressed because it is too large.

@ -3,8 +3,7 @@
using namespace std;
using namespace testing;
#define GPU_DENOISING_IMAGE_SIZES testing::Values(perf::szVGA, perf::szXGA, perf::sz720p, perf::sz1080p)
#define GPU_DENOISING_IMAGE_SIZES testing::Values(perf::szVGA, perf::sz720p)
//////////////////////////////////////////////////////////////////////
// BilateralFilter
@ -12,96 +11,86 @@ using namespace testing;
DEF_PARAM_TEST(Sz_Depth_Cn_KernelSz, cv::Size, MatDepth, MatCn, int);
PERF_TEST_P(Sz_Depth_Cn_KernelSz, Denoising_BilateralFilter,
Combine(GPU_DENOISING_IMAGE_SIZES, Values(CV_8U, CV_32F), GPU_CHANNELS_1_3, Values(3, 5, 9)))
Combine(GPU_DENOISING_IMAGE_SIZES,
Values(CV_8U, CV_32F),
GPU_CHANNELS_1_3,
Values(3, 5, 9)))
{
declare.time(60.0);
cv::Size size = GET_PARAM(0);
int depth = GET_PARAM(1);
int channels = GET_PARAM(2);
int kernel_size = GET_PARAM(3);
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
const int channels = GET_PARAM(2);
const int kernel_size = GET_PARAM(3);
float sigma_color = 7;
float sigma_spatial = 5;
int borderMode = cv::BORDER_REFLECT101;
const float sigma_color = 7;
const float sigma_spatial = 5;
const int borderMode = cv::BORDER_REFLECT101;
int type = CV_MAKE_TYPE(depth, channels);
const int type = CV_MAKE_TYPE(depth, channels);
cv::Mat src(size, type);
fillRandom(src);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::bilateralFilter(d_src, d_dst, kernel_size, sigma_color, sigma_spatial, borderMode);
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
TEST_CYCLE()
{
cv::gpu::bilateralFilter(d_src, d_dst, kernel_size, sigma_color, sigma_spatial, borderMode);
}
TEST_CYCLE() cv::gpu::bilateralFilter(d_src, dst, kernel_size, sigma_color, sigma_spatial, borderMode);
GPU_SANITY_CHECK(d_dst);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
cv::bilateralFilter(src, dst, kernel_size, sigma_color, sigma_spatial, borderMode);
TEST_CYCLE()
{
cv::bilateralFilter(src, dst, kernel_size, sigma_color, sigma_spatial, borderMode);
}
TEST_CYCLE() cv::bilateralFilter(src, dst, kernel_size, sigma_color, sigma_spatial, borderMode);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// nonLocalMeans
DEF_PARAM_TEST(Sz_Depth_Cn_WinSz_BlockSz, cv::Size, MatDepth, MatCn, int, int);
PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_NonLocalMeans,
Combine(GPU_DENOISING_IMAGE_SIZES, Values<MatDepth>(CV_8U), GPU_CHANNELS_1_3, Values(21), Values(5, 7)))
Combine(GPU_DENOISING_IMAGE_SIZES,
Values<MatDepth>(CV_8U),
GPU_CHANNELS_1_3,
Values(21),
Values(5)))
{
declare.time(60.0);
cv::Size size = GET_PARAM(0);
int depth = GET_PARAM(1);
int channels = GET_PARAM(2);
int search_widow_size = GET_PARAM(3);
int block_size = GET_PARAM(4);
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
const int channels = GET_PARAM(2);
const int search_widow_size = GET_PARAM(3);
const int block_size = GET_PARAM(4);
float h = 10;
int borderMode = cv::BORDER_REFLECT101;
const float h = 10;
const int borderMode = cv::BORDER_REFLECT101;
int type = CV_MAKE_TYPE(depth, channels);
const int type = CV_MAKE_TYPE(depth, channels);
cv::Mat src(size, type);
fillRandom(src);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
cv::gpu::nonLocalMeans(d_src, d_dst, h, search_widow_size, block_size, borderMode);
TEST_CYCLE() cv::gpu::nonLocalMeans(d_src, dst, h, search_widow_size, block_size, borderMode);
TEST_CYCLE()
{
cv::gpu::nonLocalMeans(d_src, d_dst, h, search_widow_size, block_size, borderMode);
}
GPU_SANITY_CHECK(d_dst);
GPU_SANITY_CHECK(dst);
}
else
{
FAIL() << "No such CPU implementation analogy";
FAIL_NO_CPU();
}
}
@ -112,46 +101,41 @@ PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_NonLocalMeans,
DEF_PARAM_TEST(Sz_Depth_Cn_WinSz_BlockSz, cv::Size, MatDepth, MatCn, int, int);
PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_FastNonLocalMeans,
Combine(GPU_DENOISING_IMAGE_SIZES, Values<MatDepth>(CV_8U), GPU_CHANNELS_1_3, Values(21), Values(7)))
Combine(GPU_DENOISING_IMAGE_SIZES,
Values<MatDepth>(CV_8U),
GPU_CHANNELS_1_3,
Values(21),
Values(7)))
{
declare.time(150.0);
cv::Size size = GET_PARAM(0);
int depth = GET_PARAM(1);
declare.time(60.0);
int search_widow_size = GET_PARAM(2);
int block_size = GET_PARAM(3);
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
const int search_widow_size = GET_PARAM(2);
const int block_size = GET_PARAM(3);
float h = 10;
int type = CV_MAKE_TYPE(depth, 1);
const float h = 10;
const int type = CV_MAKE_TYPE(depth, 1);
cv::Mat src(size, type);
fillRandom(src);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::FastNonLocalMeansDenoising fnlmd;
fnlmd.simpleMethod(d_src, d_dst, h, search_widow_size, block_size);
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
TEST_CYCLE()
{
fnlmd.simpleMethod(d_src, d_dst, h, search_widow_size, block_size);
}
TEST_CYCLE() fnlmd.simpleMethod(d_src, dst, h, search_widow_size, block_size);
GPU_SANITY_CHECK(d_dst);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
cv::fastNlMeansDenoising(src, dst, h, block_size, search_widow_size);
TEST_CYCLE()
{
cv::fastNlMeansDenoising(src, dst, h, block_size, search_widow_size);
}
TEST_CYCLE() cv::fastNlMeansDenoising(src, dst, h, block_size, search_widow_size);
CPU_SANITY_CHECK(dst);
}
@ -163,47 +147,41 @@ PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_FastNonLocalMeans,
DEF_PARAM_TEST(Sz_Depth_WinSz_BlockSz, cv::Size, MatDepth, int, int);
PERF_TEST_P(Sz_Depth_WinSz_BlockSz, Denoising_FastNonLocalMeansColored,
Combine(GPU_DENOISING_IMAGE_SIZES, Values<MatDepth>(CV_8U), Values(21), Values(7)))
Combine(GPU_DENOISING_IMAGE_SIZES,
Values<MatDepth>(CV_8U),
Values(21),
Values(7)))
{
declare.time(350.0);
cv::Size size = GET_PARAM(0);
int depth = GET_PARAM(1);
declare.time(60.0);
int search_widow_size = GET_PARAM(2);
int block_size = GET_PARAM(3);
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
const int search_widow_size = GET_PARAM(2);
const int block_size = GET_PARAM(3);
float h = 10;
int type = CV_MAKE_TYPE(depth, 3);
const float h = 10;
const int type = CV_MAKE_TYPE(depth, 3);
cv::Mat src(size, type);
fillRandom(src);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::FastNonLocalMeansDenoising fnlmd;
fnlmd.labMethod(d_src, d_dst, h, h, search_widow_size, block_size);
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
TEST_CYCLE()
{
fnlmd.labMethod(d_src, d_dst, h, h, search_widow_size, block_size);
}
TEST_CYCLE() fnlmd.labMethod(d_src, dst, h, h, search_widow_size, block_size);
GPU_SANITY_CHECK(d_dst);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
cv::fastNlMeansDenoisingColored(src, dst, h, h, block_size, search_widow_size);
TEST_CYCLE()
{
cv::fastNlMeansDenoisingColored(src, dst, h, h, block_size, search_widow_size);
}
TEST_CYCLE() cv::fastNlMeansDenoisingColored(src, dst, h, h, block_size, search_widow_size);
CPU_SANITY_CHECK(dst);
}
}
}

@ -3,139 +3,194 @@
using namespace std;
using namespace testing;
namespace {
struct KeypointIdxCompare
{
std::vector<cv::KeyPoint>* keypoints;
explicit KeypointIdxCompare(std::vector<cv::KeyPoint>* _keypoints) : keypoints(_keypoints) {}
bool operator ()(size_t i1, size_t i2) const
{
cv::KeyPoint kp1 = (*keypoints)[i1];
cv::KeyPoint kp2 = (*keypoints)[i2];
if (kp1.pt.x != kp2.pt.x)
return kp1.pt.x < kp2.pt.x;
if (kp1.pt.y != kp2.pt.y)
return kp1.pt.y < kp2.pt.y;
if (kp1.response != kp2.response)
return kp1.response < kp2.response;
return kp1.octave < kp2.octave;
}
};
static void sortKeyPoints(std::vector<cv::KeyPoint>& keypoints, cv::InputOutputArray _descriptors = cv::noArray())
{
std::vector<size_t> indexies(keypoints.size());
for (size_t i = 0; i < indexies.size(); ++i)
indexies[i] = i;
std::sort(indexies.begin(), indexies.end(), KeypointIdxCompare(&keypoints));
std::vector<cv::KeyPoint> new_keypoints;
cv::Mat new_descriptors;
new_keypoints.resize(keypoints.size());
cv::Mat descriptors;
if (_descriptors.needed())
{
descriptors = _descriptors.getMat();
new_descriptors.create(descriptors.size(), descriptors.type());
}
for (size_t i = 0; i < indexies.size(); ++i)
{
size_t new_idx = indexies[i];
new_keypoints[i] = keypoints[new_idx];
if (!new_descriptors.empty())
descriptors.row((int) new_idx).copyTo(new_descriptors.row((int) i));
}
keypoints.swap(new_keypoints);
if (_descriptors.needed())
new_descriptors.copyTo(_descriptors);
}
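The new sortKeyPoints helper sorts keypoints (and, optionally, the matching descriptor rows) into a canonical order by position, response and octave, presumably so that regression data can be compared independently of the order in which the GPU detectors emit results; the SURF, FAST and ORB tests below call it right before the sanity checks. A minimal usage sketch:

    // Canonicalize keypoint (and matching descriptor row) order before comparison.
    std::vector<cv::KeyPoint> gpu_keypoints;   // downloaded from the GPU result
    cv::Mat gpu_descriptors;                   // one descriptor row per keypoint

    sortKeyPoints(gpu_keypoints, gpu_descriptors);  // rows are permuted consistently

    SANITY_CHECK_KEYPOINTS(gpu_keypoints);
    SANITY_CHECK(gpu_descriptors, 1e-3);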
//////////////////////////////////////////////////////////////////////
// SURF
DEF_PARAM_TEST_1(Image, string);
PERF_TEST_P(Image, Features2D_SURF, Values<string>("gpu/perf/aloe.png"))
PERF_TEST_P(Image, Features2D_SURF,
Values<string>("gpu/perf/aloe.png"))
{
declare.time(50.0);
cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
const cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
if (PERF_RUN_GPU())
{
cv::gpu::SURF_GPU d_surf;
cv::gpu::GpuMat d_img(img);
const cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_keypoints, d_descriptors;
d_surf(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
TEST_CYCLE() d_surf(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
std::vector<cv::KeyPoint> gpu_keypoints;
d_surf.downloadKeypoints(d_keypoints, gpu_keypoints);
TEST_CYCLE()
{
d_surf(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
}
cv::Mat gpu_descriptors(d_descriptors);
GPU_SANITY_CHECK(d_descriptors, 1e-4);
GPU_SANITY_CHECK_KEYPOINTS(SURF, d_keypoints);
sortKeyPoints(gpu_keypoints, gpu_descriptors);
SANITY_CHECK_KEYPOINTS(gpu_keypoints);
SANITY_CHECK(gpu_descriptors, 1e-3);
}
else
{
cv::SURF surf;
std::vector<cv::KeyPoint> keypoints;
cv::Mat descriptors;
surf(img, cv::noArray(), keypoints, descriptors);
std::vector<cv::KeyPoint> cpu_keypoints;
cv::Mat cpu_descriptors;
TEST_CYCLE()
{
keypoints.clear();
surf(img, cv::noArray(), keypoints, descriptors);
}
TEST_CYCLE() surf(img, cv::noArray(), cpu_keypoints, cpu_descriptors);
SANITY_CHECK_KEYPOINTS(keypoints);
SANITY_CHECK(descriptors, 1e-4);
SANITY_CHECK_KEYPOINTS(cpu_keypoints);
SANITY_CHECK(cpu_descriptors);
}
}
//////////////////////////////////////////////////////////////////////
// FAST
PERF_TEST_P(Image, Features2D_FAST, Values<string>("gpu/perf/aloe.png"))
DEF_PARAM_TEST(Image_Threshold_NonMaxSupression, string, int, bool);
PERF_TEST_P(Image_Threshold_NonMaxSupression, Features2D_FAST,
Combine(Values<string>("gpu/perf/aloe.png"),
Values(20),
Bool()))
{
cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
const cv::Mat img = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
const int threshold = GET_PARAM(1);
const bool nonMaxSuppersion = GET_PARAM(2);
if (PERF_RUN_GPU())
{
cv::gpu::FAST_GPU d_fast(20);
cv::gpu::FAST_GPU d_fast(threshold, nonMaxSuppersion, 0.5);
cv::gpu::GpuMat d_img(img);
const cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_keypoints;
d_fast(d_img, cv::gpu::GpuMat(), d_keypoints);
TEST_CYCLE() d_fast(d_img, cv::gpu::GpuMat(), d_keypoints);
std::vector<cv::KeyPoint> gpu_keypoints;
d_fast.downloadKeypoints(d_keypoints, gpu_keypoints);
TEST_CYCLE()
{
d_fast(d_img, cv::gpu::GpuMat(), d_keypoints);
}
sortKeyPoints(gpu_keypoints);
GPU_SANITY_CHECK_RESPONSE(FAST, d_keypoints);
SANITY_CHECK_KEYPOINTS(gpu_keypoints);
}
else
{
std::vector<cv::KeyPoint> keypoints;
std::vector<cv::KeyPoint> cpu_keypoints;
cv::FAST(img, keypoints, 20);
TEST_CYCLE() cv::FAST(img, cpu_keypoints, threshold, nonMaxSuppersion);
TEST_CYCLE()
{
keypoints.clear();
cv::FAST(img, keypoints, 20);
}
SANITY_CHECK_KEYPOINTS(keypoints);
SANITY_CHECK_KEYPOINTS(cpu_keypoints);
}
}
//////////////////////////////////////////////////////////////////////
// ORB
PERF_TEST_P(Image, Features2D_ORB, Values<string>("gpu/perf/aloe.png"))
DEF_PARAM_TEST(Image_NFeatures, string, int);
PERF_TEST_P(Image_NFeatures, Features2D_ORB,
Combine(Values<string>("gpu/perf/aloe.png"),
Values(4000)))
{
cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
const cv::Mat img = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
const int nFeatures = GET_PARAM(1);
if (PERF_RUN_GPU())
{
cv::gpu::ORB_GPU d_orb(4000);
cv::gpu::ORB_GPU d_orb(nFeatures);
cv::gpu::GpuMat d_img(img);
const cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_keypoints, d_descriptors;
d_orb(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
TEST_CYCLE() d_orb(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
std::vector<cv::KeyPoint> gpu_keypoints;
d_orb.downloadKeyPoints(d_keypoints, gpu_keypoints);
cv::Mat gpu_descriptors(d_descriptors);
TEST_CYCLE()
{
d_orb(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
}
gpu_keypoints.resize(10);
gpu_descriptors = gpu_descriptors.rowRange(0, 10);
GPU_SANITY_CHECK_KEYPOINTS(ORB, d_keypoints);
GPU_SANITY_CHECK(d_descriptors);
sortKeyPoints(gpu_keypoints, gpu_descriptors);
SANITY_CHECK_KEYPOINTS(gpu_keypoints);
SANITY_CHECK(gpu_descriptors);
}
else
{
cv::ORB orb(4000);
std::vector<cv::KeyPoint> keypoints;
cv::Mat descriptors;
cv::ORB orb(nFeatures);
orb(img, cv::noArray(), keypoints, descriptors);
std::vector<cv::KeyPoint> cpu_keypoints;
cv::Mat cpu_descriptors;
TEST_CYCLE()
{
keypoints.clear();
orb(img, cv::noArray(), keypoints, descriptors);
}
TEST_CYCLE() orb(img, cv::noArray(), cpu_keypoints, cpu_descriptors);
SANITY_CHECK_KEYPOINTS(keypoints);
SANITY_CHECK(descriptors);
SANITY_CHECK_KEYPOINTS(cpu_keypoints);
SANITY_CHECK(cpu_descriptors);
}
}
@ -144,166 +199,165 @@ PERF_TEST_P(Image, Features2D_ORB, Values<string>("gpu/perf/aloe.png"))
DEF_PARAM_TEST(DescSize_Norm, int, NormType);
PERF_TEST_P(DescSize_Norm, Features2D_BFMatch, Combine(Values(64, 128, 256), Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))))
PERF_TEST_P(DescSize_Norm, Features2D_BFMatch,
Combine(Values(64, 128, 256),
Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))))
{
declare.time(20.0);
int desc_size = GET_PARAM(0);
int normType = GET_PARAM(1);
const int desc_size = GET_PARAM(0);
const int normType = GET_PARAM(1);
int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
const int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
cv::Mat query(3000, desc_size, type);
fillRandom(query);
declare.in(query, WARMUP_RNG);
cv::Mat train(3000, desc_size, type);
fillRandom(train);
declare.in(train, WARMUP_RNG);
if (PERF_RUN_GPU())
{
cv::gpu::BFMatcher_GPU d_matcher(normType);
cv::gpu::GpuMat d_query(query);
cv::gpu::GpuMat d_train(train);
const cv::gpu::GpuMat d_query(query);
const cv::gpu::GpuMat d_train(train);
cv::gpu::GpuMat d_trainIdx, d_distance;
d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
TEST_CYCLE() d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
TEST_CYCLE()
{
d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
}
std::vector<cv::DMatch> gpu_matches;
d_matcher.matchDownload(d_trainIdx, d_distance, gpu_matches);
GPU_SANITY_CHECK(d_trainIdx);
GPU_SANITY_CHECK(d_distance);
SANITY_CHECK_MATCHES(gpu_matches);
}
else
{
cv::BFMatcher matcher(normType);
std::vector<cv::DMatch> matches;
matcher.match(query, train, matches);
std::vector<cv::DMatch> cpu_matches;
TEST_CYCLE()
{
matcher.match(query, train, matches);
}
TEST_CYCLE() matcher.match(query, train, cpu_matches);
SANITY_CHECK(matches);
SANITY_CHECK_MATCHES(cpu_matches);
}
}
//////////////////////////////////////////////////////////////////////
// BFKnnMatch
static void toOneRowMatches(const std::vector< std::vector<cv::DMatch> >& src, std::vector<cv::DMatch>& dst)
{
dst.clear();
for (size_t i = 0; i < src.size(); ++i)
for (size_t j = 0; j < src[i].size(); ++j)
dst.push_back(src[i][j]);
}
DEF_PARAM_TEST(DescSize_K_Norm, int, int, NormType);
PERF_TEST_P(DescSize_K_Norm, Features2D_BFKnnMatch, Combine(
Values(64, 128, 256),
Values(2, 3),
Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))))
PERF_TEST_P(DescSize_K_Norm, Features2D_BFKnnMatch,
Combine(Values(64, 128, 256),
Values(2, 3),
Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2))))
{
declare.time(30.0);
int desc_size = GET_PARAM(0);
int k = GET_PARAM(1);
int normType = GET_PARAM(2);
const int desc_size = GET_PARAM(0);
const int k = GET_PARAM(1);
const int normType = GET_PARAM(2);
int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
const int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
cv::Mat query(3000, desc_size, type);
fillRandom(query);
declare.in(query, WARMUP_RNG);
cv::Mat train(3000, desc_size, type);
fillRandom(train);
declare.in(train, WARMUP_RNG);
if (PERF_RUN_GPU())
{
cv::gpu::BFMatcher_GPU d_matcher(normType);
cv::gpu::GpuMat d_query(query);
cv::gpu::GpuMat d_train(train);
const cv::gpu::GpuMat d_query(query);
const cv::gpu::GpuMat d_train(train);
cv::gpu::GpuMat d_trainIdx, d_distance, d_allDist;
d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, k);
TEST_CYCLE() d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, k);
std::vector< std::vector<cv::DMatch> > matchesTbl;
d_matcher.knnMatchDownload(d_trainIdx, d_distance, matchesTbl);
TEST_CYCLE()
{
d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, k);
}
std::vector<cv::DMatch> gpu_matches;
toOneRowMatches(matchesTbl, gpu_matches);
GPU_SANITY_CHECK(d_trainIdx);
GPU_SANITY_CHECK(d_distance);
SANITY_CHECK_MATCHES(gpu_matches);
}
else
{
cv::BFMatcher matcher(normType);
std::vector< std::vector<cv::DMatch> > matches;
std::vector< std::vector<cv::DMatch> > matchesTbl;
matcher.knnMatch(query, train, matches, k);
TEST_CYCLE() matcher.knnMatch(query, train, matchesTbl, k);
TEST_CYCLE()
{
matcher.knnMatch(query, train, matches, k);
}
std::vector<cv::DMatch> cpu_matches;
toOneRowMatches(matchesTbl, cpu_matches);
SANITY_CHECK(matches);
SANITY_CHECK_MATCHES(cpu_matches);
}
}
//////////////////////////////////////////////////////////////////////
// BFRadiusMatch
PERF_TEST_P(DescSize_Norm, Features2D_BFRadiusMatch, Combine(Values(64, 128, 256), Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))))
PERF_TEST_P(DescSize_Norm, Features2D_BFRadiusMatch,
Combine(Values(64, 128, 256),
Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2))))
{
declare.time(30.0);
int desc_size = GET_PARAM(0);
int normType = GET_PARAM(1);
const int desc_size = GET_PARAM(0);
const int normType = GET_PARAM(1);
int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
const int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
const float maxDistance = 10000;
cv::Mat query(3000, desc_size, type);
fillRandom(query, 0.0, 1.0);
declare.in(query, WARMUP_RNG);
cv::Mat train(3000, desc_size, type);
fillRandom(train, 0.0, 1.0);
declare.in(train, WARMUP_RNG);
if (PERF_RUN_GPU())
{
cv::gpu::BFMatcher_GPU d_matcher(normType);
cv::gpu::GpuMat d_query(query);
cv::gpu::GpuMat d_train(train);
const cv::gpu::GpuMat d_query(query);
const cv::gpu::GpuMat d_train(train);
cv::gpu::GpuMat d_trainIdx, d_nMatches, d_distance;
d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, 2.0);
TEST_CYCLE() d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, maxDistance);
TEST_CYCLE()
{
d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, 2.0);
}
std::vector< std::vector<cv::DMatch> > matchesTbl;
d_matcher.radiusMatchDownload(d_trainIdx, d_distance, d_nMatches, matchesTbl);
GPU_SANITY_CHECK(d_trainIdx);
GPU_SANITY_CHECK(d_distance);
std::vector<cv::DMatch> gpu_matches;
toOneRowMatches(matchesTbl, gpu_matches);
SANITY_CHECK_MATCHES(gpu_matches);
}
else
{
cv::BFMatcher matcher(normType);
std::vector< std::vector<cv::DMatch> > matches;
std::vector< std::vector<cv::DMatch> > matchesTbl;
matcher.radiusMatch(query, train, matches, 2.0);
TEST_CYCLE() matcher.radiusMatch(query, train, matchesTbl, maxDistance);
TEST_CYCLE()
{
matcher.radiusMatch(query, train, matches, 2.0);
}
std::vector<cv::DMatch> cpu_matches;
toOneRowMatches(matchesTbl, cpu_matches);
SANITY_CHECK(matches);
SANITY_CHECK_MATCHES(cpu_matches);
}
}
} // namespace
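Note: the refactored matcher tests above all follow one shape: call the operation once outside the timed loop as a warm-up, time it with TEST_CYCLE, then download the result and run it through the sanity check. A minimal CPU-side sketch of the same matching calls, using only the public cv::BFMatcher API on random descriptors (matrix sizes here are illustrative, not taken from the tests):

#include <opencv2/core/core.hpp>
#include <opencv2/features2d/features2d.hpp>
#include <vector>
#include <cstdio>

static void flatten(const std::vector< std::vector<cv::DMatch> >& src,
                    std::vector<cv::DMatch>& dst)
{
    // Same idea as toOneRowMatches(): concatenate the per-query match lists.
    dst.clear();
    for (size_t i = 0; i < src.size(); ++i)
        for (size_t j = 0; j < src[i].size(); ++j)
            dst.push_back(src[i][j]);
}

int main()
{
    cv::Mat query(3000, 64, CV_32F), train(3000, 64, CV_32F);
    cv::randu(query, cv::Scalar::all(0), cv::Scalar::all(1));
    cv::randu(train, cv::Scalar::all(0), cv::Scalar::all(1));

    cv::BFMatcher matcher(cv::NORM_L2);

    std::vector<cv::DMatch> matches;
    matcher.match(query, train, matches);          // 1-NN match

    std::vector< std::vector<cv::DMatch> > knn;
    matcher.knnMatch(query, train, knn, 2);        // k-NN match, k = 2

    std::vector<cv::DMatch> flat;
    flatten(knn, flat);
    std::printf("%d matches, %d knn entries\n", (int)matches.size(), (int)flat.size());
    return 0;
}

The GPU branch differs only in that matchSingle/knnMatchSingle write device-side index and distance tables which matchDownload/knnMatchDownload convert back into DMatch vectors.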

@ -3,48 +3,39 @@
using namespace std;
using namespace testing;
namespace {
//////////////////////////////////////////////////////////////////////
// Blur
DEF_PARAM_TEST(Sz_Type_KernelSz, cv::Size, MatType, int);
PERF_TEST_P(Sz_Type_KernelSz, Filters_Blur, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4), Values(3, 5, 7)))
PERF_TEST_P(Sz_Type_KernelSz, Filters_Blur,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8UC1, CV_8UC4),
Values(3, 5, 7)))
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
int ksize = GET_PARAM(2);
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
const int ksize = GET_PARAM(2);
cv::Mat src(size, type);
fillRandom(src);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::blur(d_src, d_dst, cv::Size(ksize, ksize));
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
TEST_CYCLE()
{
cv::gpu::blur(d_src, d_dst, cv::Size(ksize, ksize));
}
TEST_CYCLE() cv::gpu::blur(d_src, dst, cv::Size(ksize, ksize));
GPU_SANITY_CHECK(d_dst);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
cv::blur(src, dst, cv::Size(ksize, ksize));
TEST_CYCLE()
{
cv::blur(src, dst, cv::Size(ksize, ksize));
}
TEST_CYCLE() cv::blur(src, dst, cv::Size(ksize, ksize));
CPU_SANITY_CHECK(dst);
}
@ -57,38 +48,28 @@ PERF_TEST_P(Sz_Type_KernelSz, Filters_Sobel, Combine(GPU_TYPICAL_MAT_SIZES, Valu
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
int ksize = GET_PARAM(2);
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
const int ksize = GET_PARAM(2);
cv::Mat src(size, type);
fillRandom(src);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
cv::gpu::GpuMat d_buf;
cv::gpu::Sobel(d_src, d_dst, -1, 1, 1, d_buf, ksize);
TEST_CYCLE() cv::gpu::Sobel(d_src, dst, -1, 1, 1, d_buf, ksize);
TEST_CYCLE()
{
cv::gpu::Sobel(d_src, d_dst, -1, 1, 1, d_buf, ksize);
}
GPU_SANITY_CHECK(d_dst);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
cv::Sobel(src, dst, -1, 1, 1, ksize);
TEST_CYCLE()
{
cv::Sobel(src, dst, -1, 1, 1, ksize);
}
TEST_CYCLE() cv::Sobel(src, dst, -1, 1, 1, ksize);
CPU_SANITY_CHECK(dst);
}
@ -101,37 +82,27 @@ PERF_TEST_P(Sz_Type, Filters_Scharr, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
cv::Mat src(size, type);
fillRandom(src);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
cv::gpu::GpuMat d_buf;
cv::gpu::Scharr(d_src, d_dst, -1, 1, 0, d_buf);
TEST_CYCLE() cv::gpu::Scharr(d_src, dst, -1, 1, 0, d_buf);
TEST_CYCLE()
{
cv::gpu::Scharr(d_src, d_dst, -1, 1, 0, d_buf);
}
GPU_SANITY_CHECK(d_dst);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
cv::Scharr(src, dst, -1, 1, 0);
TEST_CYCLE()
{
cv::Scharr(src, dst, -1, 1, 0);
}
TEST_CYCLE() cv::Scharr(src, dst, -1, 1, 0);
CPU_SANITY_CHECK(dst);
}
@ -144,38 +115,28 @@ PERF_TEST_P(Sz_Type_KernelSz, Filters_GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZE
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
int ksize = GET_PARAM(2);
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
const int ksize = GET_PARAM(2);
cv::Mat src(size, type);
fillRandom(src);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
cv::gpu::GpuMat d_buf;
cv::gpu::GaussianBlur(d_src, d_dst, cv::Size(ksize, ksize), d_buf, 0.5);
TEST_CYCLE()
{
cv::gpu::GaussianBlur(d_src, d_dst, cv::Size(ksize, ksize), d_buf, 0.5);
}
TEST_CYCLE() cv::gpu::GaussianBlur(d_src, dst, cv::Size(ksize, ksize), d_buf, 0.5);
GPU_SANITY_CHECK(d_dst);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
cv::GaussianBlur(src, dst, cv::Size(ksize, ksize), 0.5);
TEST_CYCLE()
{
cv::GaussianBlur(src, dst, cv::Size(ksize, ksize), 0.5);
}
TEST_CYCLE() cv::GaussianBlur(src, dst, cv::Size(ksize, ksize), 0.5);
CPU_SANITY_CHECK(dst);
}
@ -188,37 +149,27 @@ PERF_TEST_P(Sz_Type_KernelSz, Filters_Laplacian, Combine(GPU_TYPICAL_MAT_SIZES,
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
int ksize = GET_PARAM(2);
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
const int ksize = GET_PARAM(2);
cv::Mat src(size, type);
fillRandom(src);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::Laplacian(d_src, d_dst, -1, ksize);
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
TEST_CYCLE()
{
cv::gpu::Laplacian(d_src, d_dst, -1, ksize);
}
TEST_CYCLE() cv::gpu::Laplacian(d_src, dst, -1, ksize);
GPU_SANITY_CHECK(d_dst);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
cv::Laplacian(src, dst, -1, ksize);
TEST_CYCLE()
{
cv::Laplacian(src, dst, -1, ksize);
}
TEST_CYCLE() cv::Laplacian(src, dst, -1, ksize);
CPU_SANITY_CHECK(dst);
}
@ -231,39 +182,29 @@ PERF_TEST_P(Sz_Type, Filters_Erode, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
cv::Mat src(size, type);
fillRandom(src);
declare.in(src, WARMUP_RNG);
cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
const cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
cv::gpu::GpuMat d_buf;
cv::gpu::erode(d_src, d_dst, ker, d_buf);
TEST_CYCLE()
{
cv::gpu::erode(d_src, d_dst, ker, d_buf);
}
TEST_CYCLE() cv::gpu::erode(d_src, dst, ker, d_buf);
GPU_SANITY_CHECK(d_dst);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
cv::erode(src, dst, ker);
TEST_CYCLE()
{
cv::erode(src, dst, ker);
}
TEST_CYCLE() cv::erode(src, dst, ker);
CPU_SANITY_CHECK(dst);
}
@ -276,39 +217,29 @@ PERF_TEST_P(Sz_Type, Filters_Dilate, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
cv::Mat src(size, type);
fillRandom(src);
declare.in(src, WARMUP_RNG);
cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
const cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
cv::gpu::GpuMat d_buf;
cv::gpu::dilate(d_src, d_dst, ker, d_buf);
TEST_CYCLE() cv::gpu::dilate(d_src, dst, ker, d_buf);
TEST_CYCLE()
{
cv::gpu::dilate(d_src, d_dst, ker, d_buf);
}
GPU_SANITY_CHECK(d_dst);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
cv::dilate(src, dst, ker);
TEST_CYCLE()
{
cv::dilate(src, dst, ker);
}
TEST_CYCLE() cv::dilate(src, dst, ker);
CPU_SANITY_CHECK(dst);
}
@ -326,41 +257,31 @@ PERF_TEST_P(Sz_Type_Op, Filters_MorphologyEx, Combine(GPU_TYPICAL_MAT_SIZES, Val
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
int morphOp = GET_PARAM(2);
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
const int morphOp = GET_PARAM(2);
cv::Mat src(size, type);
fillRandom(src);
declare.in(src, WARMUP_RNG);
cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
const cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
cv::gpu::GpuMat d_buf1;
cv::gpu::GpuMat d_buf2;
cv::gpu::morphologyEx(d_src, d_dst, morphOp, ker, d_buf1, d_buf2);
TEST_CYCLE() cv::gpu::morphologyEx(d_src, dst, morphOp, ker, d_buf1, d_buf2);
TEST_CYCLE()
{
cv::gpu::morphologyEx(d_src, d_dst, morphOp, ker, d_buf1, d_buf2);
}
GPU_SANITY_CHECK(d_dst);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
cv::morphologyEx(src, dst, morphOp, ker);
TEST_CYCLE()
{
cv::morphologyEx(src, dst, morphOp, ker);
}
TEST_CYCLE() cv::morphologyEx(src, dst, morphOp, ker);
CPU_SANITY_CHECK(dst);
}
@ -373,43 +294,31 @@ PERF_TEST_P(Sz_Type_KernelSz, Filters_Filter2D, Combine(GPU_TYPICAL_MAT_SIZES, V
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
int ksize = GET_PARAM(2);
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
const int ksize = GET_PARAM(2);
cv::Mat src(size, type);
fillRandom(src);
declare.in(src, WARMUP_RNG);
cv::Mat kernel(ksize, ksize, CV_32FC1);
fillRandom(kernel, 0.0, 1.0);
declare.in(kernel, WARMUP_RNG);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::filter2D(d_src, d_dst, -1, kernel);
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
TEST_CYCLE()
{
cv::gpu::filter2D(d_src, d_dst, -1, kernel);
}
TEST_CYCLE() cv::gpu::filter2D(d_src, dst, -1, kernel);
GPU_SANITY_CHECK(d_dst);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
cv::filter2D(src, dst, -1, kernel);
TEST_CYCLE()
{
cv::filter2D(src, dst, -1, kernel);
}
TEST_CYCLE() cv::filter2D(src, dst, -1, kernel);
CPU_SANITY_CHECK(dst);
}
}
} // namespace
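Note: every filter test pairs a single warm-up call with a TEST_CYCLE loop over the same call and finishes with a sanity check. Outside the perf framework the same measurement can be sketched with cv::getTickCount(); the image size and kernel width below are illustrative only:

#include <opencv2/core/core.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <cstdio>

int main()
{
    cv::Mat src(720, 1280, CV_8UC4), dst;
    cv::randu(src, cv::Scalar::all(0), cv::Scalar::all(255));

    const int ksize = 5;

    // Warm up once so allocations are not measured, as the perf tests do.
    cv::blur(src, dst, cv::Size(ksize, ksize));

    double t0 = (double)cv::getTickCount();
    for (int i = 0; i < 10; ++i)
        cv::blur(src, dst, cv::Size(ksize, ksize));
    double ms = 1000.0 * ((double)cv::getTickCount() - t0) / cv::getTickFrequency() / 10;
    std::printf("blur %dx%d: %.2f ms/iter\n", ksize, ksize, ms);

    // The GPU branch in the tests does the same with cv::gpu::blur on GpuMat inputs.
    return 0;
}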


File diff suppressed because it is too large.

@ -3,8 +3,6 @@
using namespace std;
using namespace testing;
namespace {
DEF_PARAM_TEST_1(Image, string);
struct GreedyLabeling
@ -100,28 +98,45 @@ struct GreedyLabeling
dot* stack;
};
PERF_TEST_P(Image, Labeling_ConnectedComponents, Values<string>("gpu/labeling/aloe-disp.png"))
PERF_TEST_P(Image, DISABLED_Labeling_ConnectivityMask,
Values<string>("gpu/labeling/aloe-disp.png"))
{
declare.time(1.0);
cv::Mat image = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
const cv::Mat image = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(image.empty());
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_image(image);
cv::gpu::GpuMat mask;
mask.create(image.rows, image.cols, CV_8UC1);
cv::gpu::GpuMat components;
components.create(image.rows, image.cols, CV_32SC1);
TEST_CYCLE() cv::gpu::connectivityMask(d_image, mask, cv::Scalar::all(0), cv::Scalar::all(2));
cv::gpu::connectivityMask(cv::gpu::GpuMat(image), mask, cv::Scalar::all(0), cv::Scalar::all(2));
GPU_SANITY_CHECK(mask);
}
else
{
FAIL_NO_CPU();
}
}
ASSERT_NO_THROW(cv::gpu::labelComponents(mask, components));
PERF_TEST_P(Image, DISABLED_Labeling_ConnectedComponents,
Values<string>("gpu/labeling/aloe-disp.png"))
{
declare.time(1.0);
TEST_CYCLE()
{
cv::gpu::labelComponents(mask, components);
}
const cv::Mat image = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(image.empty());
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_mask;
cv::gpu::connectivityMask(cv::gpu::GpuMat(image), d_mask, cv::Scalar::all(0), cv::Scalar::all(2));
cv::gpu::GpuMat components;
TEST_CYCLE() cv::gpu::labelComponents(d_mask, components);
GPU_SANITY_CHECK(components);
}
@ -129,17 +144,9 @@ PERF_TEST_P(Image, Labeling_ConnectedComponents, Values<string>("gpu/labeling/al
{
GreedyLabeling host(image);
host(host._labels);
TEST_CYCLE() host(host._labels);
declare.time(1.0);
TEST_CYCLE()
{
host(host._labels);
}
CPU_SANITY_CHECK(host._labels);
cv::Mat components = host._labels;
CPU_SANITY_CHECK(components);
}
}
} // namespace
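Note: the labeling benchmark is now split so each test times exactly one GPU call: connectivityMask in the first, labelComponents (on a mask built outside the timed loop) in the second. A hedged sketch of that call sequence, assuming an OpenCV 2.4 build with the gpu module and a CUDA device; the image name is illustrative:

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    cv::Mat image = cv::imread("aloe-disp.png", cv::IMREAD_GRAYSCALE);
    if (image.empty())
        return 1;

    cv::gpu::GpuMat d_image(image);

    // Step 1: build the connectivity mask on the device.
    cv::gpu::GpuMat d_mask;
    cv::gpu::connectivityMask(d_image, d_mask, cv::Scalar::all(0), cv::Scalar::all(2));

    // Step 2: label connected components from that mask.
    cv::gpu::GpuMat d_components;
    cv::gpu::labelComponents(d_mask, d_components);

    cv::Mat components(d_components);   // download for inspection
    return 0;
}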

@ -1,7 +1,5 @@
#include "perf_precomp.hpp"
namespace{
static void printOsInfo()
{
#if defined _WIN32
@ -69,6 +67,4 @@ static void printCudaInfo()
#endif
}
}
CV_PERF_TEST_MAIN(gpu, printCudaInfo())
CV_PERF_TEST_MAIN(gpu, printCudaInfo())

@ -3,137 +3,112 @@
using namespace std;
using namespace testing;
namespace {
//////////////////////////////////////////////////////////////////////
// SetTo
PERF_TEST_P(Sz_Depth_Cn, MatOp_SetTo, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), GPU_CHANNELS_1_3_4))
PERF_TEST_P(Sz_Depth_Cn, MatOp_SetTo,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F, CV_64F),
GPU_CHANNELS_1_3_4))
{
cv::Size size = GET_PARAM(0);
int depth = GET_PARAM(1);
int channels = GET_PARAM(2);
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
const int channels = GET_PARAM(2);
int type = CV_MAKE_TYPE(depth, channels);
const int type = CV_MAKE_TYPE(depth, channels);
cv::Scalar val(1, 2, 3, 4);
const cv::Scalar val(1, 2, 3, 4);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(size, type);
d_src.setTo(val);
cv::gpu::GpuMat dst(size, type);
TEST_CYCLE()
{
d_src.setTo(val);
}
TEST_CYCLE() dst.setTo(val);
GPU_SANITY_CHECK(d_src);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat src(size, type);
cv::Mat dst(size, type);
src.setTo(val);
TEST_CYCLE() dst.setTo(val);
TEST_CYCLE()
{
src.setTo(val);
}
CPU_SANITY_CHECK(src);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// SetToMasked
PERF_TEST_P(Sz_Depth_Cn, MatOp_SetToMasked, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), GPU_CHANNELS_1_3_4))
PERF_TEST_P(Sz_Depth_Cn, MatOp_SetToMasked,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F, CV_64F),
GPU_CHANNELS_1_3_4))
{
cv::Size size = GET_PARAM(0);
int depth = GET_PARAM(1);
int channels = GET_PARAM(2);
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
const int channels = GET_PARAM(2);
int type = CV_MAKE_TYPE(depth, channels);
const int type = CV_MAKE_TYPE(depth, channels);
cv::Mat src(size, type);
fillRandom(src);
cv::Mat mask(size, CV_8UC1);
fillRandom(mask, 0, 2);
declare.in(src, mask, WARMUP_RNG);
cv::Scalar val(1, 2, 3, 4);
const cv::Scalar val(1, 2, 3, 4);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_mask(mask);
d_src.setTo(val, d_mask);
cv::gpu::GpuMat dst(src);
const cv::gpu::GpuMat d_mask(mask);
TEST_CYCLE()
{
d_src.setTo(val, d_mask);
}
TEST_CYCLE() dst.setTo(val, d_mask);
GPU_SANITY_CHECK(d_src);
GPU_SANITY_CHECK(dst, 1e-10);
}
else
{
src.setTo(val, mask);
cv::Mat dst = src;
TEST_CYCLE()
{
src.setTo(val, mask);
}
TEST_CYCLE() dst.setTo(val, mask);
CPU_SANITY_CHECK(src);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// CopyToMasked
PERF_TEST_P(Sz_Depth_Cn, MatOp_CopyToMasked, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), GPU_CHANNELS_1_3_4))
PERF_TEST_P(Sz_Depth_Cn, MatOp_CopyToMasked,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F, CV_64F),
GPU_CHANNELS_1_3_4))
{
cv::Size size = GET_PARAM(0);
int depth = GET_PARAM(1);
int channels = GET_PARAM(2);
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
const int channels = GET_PARAM(2);
int type = CV_MAKE_TYPE(depth, channels);
const int type = CV_MAKE_TYPE(depth, channels);
cv::Mat src(size, type);
fillRandom(src);
cv::Mat mask(size, CV_8UC1);
fillRandom(mask, 0, 2);
declare.in(src, mask, WARMUP_RNG);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_mask(mask);
cv::gpu::GpuMat d_dst;
const cv::gpu::GpuMat d_src(src);
const cv::gpu::GpuMat d_mask(mask);
cv::gpu::GpuMat dst(d_src.size(), d_src.type(), cv::Scalar::all(0));
d_src.copyTo(d_dst, d_mask);
TEST_CYCLE() d_src.copyTo(dst, d_mask);
TEST_CYCLE()
{
d_src.copyTo(d_dst, d_mask);
}
GPU_SANITY_CHECK(d_dst);
GPU_SANITY_CHECK(dst, 1e-10);
}
else
{
cv::Mat dst;
cv::Mat dst(src.size(), src.type(), cv::Scalar::all(0));
src.copyTo(dst, mask);
TEST_CYCLE()
{
src.copyTo(dst, mask);
}
TEST_CYCLE() src.copyTo(dst, mask);
CPU_SANITY_CHECK(dst);
}
@ -144,42 +119,36 @@ PERF_TEST_P(Sz_Depth_Cn, MatOp_CopyToMasked, Combine(GPU_TYPICAL_MAT_SIZES, Valu
DEF_PARAM_TEST(Sz_2Depth, cv::Size, MatDepth, MatDepth);
PERF_TEST_P(Sz_2Depth, MatOp_ConvertTo, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), Values(CV_8U, CV_16U, CV_32F, CV_64F)))
PERF_TEST_P(Sz_2Depth, MatOp_ConvertTo,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F, CV_64F),
Values(CV_8U, CV_16U, CV_32F, CV_64F)))
{
cv::Size size = GET_PARAM(0);
int depth1 = GET_PARAM(1);
int depth2 = GET_PARAM(2);
const cv::Size size = GET_PARAM(0);
const int depth1 = GET_PARAM(1);
const int depth2 = GET_PARAM(2);
cv::Mat src(size, depth1);
fillRandom(src);
declare.in(src, WARMUP_RNG);
const double a = 0.5;
const double b = 1.0;
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
d_src.convertTo(d_dst, depth2, 0.5, 1.0);
TEST_CYCLE() d_src.convertTo(dst, depth2, a, b);
TEST_CYCLE()
{
d_src.convertTo(d_dst, depth2, 0.5, 1.0);
}
GPU_SANITY_CHECK(d_dst);
GPU_SANITY_CHECK(dst, 1e-10);
}
else
{
cv::Mat dst;
src.convertTo(dst, depth2, 0.5, 1.0);
TEST_CYCLE()
{
src.convertTo(dst, depth2, 0.5, 1.0);
}
TEST_CYCLE() src.convertTo(dst, depth2, a, b);
CPU_SANITY_CHECK(dst);
}
}
} // namespace
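Note: the MatOp tests cover four basic operations (setTo, masked setTo, masked copyTo, convertTo), and the GPU branch now times a call on a dedicated destination rather than re-running the warm-up in place. The CPU equivalents, on illustrative sizes:

#include <opencv2/core/core.hpp>

int main()
{
    cv::Mat src(720, 1280, CV_8UC3);
    cv::randu(src, cv::Scalar::all(0), cv::Scalar::all(255));

    cv::Mat mask(src.size(), CV_8UC1);
    cv::randu(mask, cv::Scalar::all(0), cv::Scalar::all(2));   // 0/1 mask, as in the tests

    const cv::Scalar val(1, 2, 3, 4);

    cv::Mat dst(src.size(), src.type());
    dst.setTo(val);                              // SetTo
    dst.setTo(val, mask);                        // SetToMasked

    cv::Mat copied(src.size(), src.type(), cv::Scalar::all(0));
    src.copyTo(copied, mask);                    // CopyToMasked: unmasked pixels keep 0

    cv::Mat converted;
    src.convertTo(converted, CV_32F, 0.5, 1.0);  // ConvertTo: dst = src * 0.5 + 1.0
    return 0;
}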

@ -3,90 +3,47 @@
using namespace std;
using namespace testing;
namespace {
///////////////////////////////////////////////////////////////
// HOG
DEF_PARAM_TEST_1(Image, string);
PERF_TEST_P(Image, ObjDetect_HOG, Values<string>("gpu/hog/road.png"))
PERF_TEST_P(Image, ObjDetect_HOG,
Values<string>("gpu/hog/road.png",
"gpu/caltech/image_00000009_0.png",
"gpu/caltech/image_00000032_0.png",
"gpu/caltech/image_00000165_0.png",
"gpu/caltech/image_00000261_0.png",
"gpu/caltech/image_00000469_0.png",
"gpu/caltech/image_00000527_0.png",
"gpu/caltech/image_00000574_0.png"))
{
cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
const cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
std::vector<cv::Rect> found_locations;
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_img(img);
const cv::gpu::GpuMat d_img(img);
std::vector<cv::Rect> gpu_found_locations;
cv::gpu::HOGDescriptor d_hog;
d_hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
d_hog.detectMultiScale(d_img, found_locations);
TEST_CYCLE() d_hog.detectMultiScale(d_img, gpu_found_locations);
TEST_CYCLE()
{
d_hog.detectMultiScale(d_img, found_locations);
}
SANITY_CHECK(gpu_found_locations);
}
else
{
cv::HOGDescriptor hog;
hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
hog.detectMultiScale(img, found_locations);
TEST_CYCLE()
{
hog.detectMultiScale(img, found_locations);
}
}
SANITY_CHECK(found_locations);
}
//===========test for CalTech data =============//
DEF_PARAM_TEST_1(HOG, string);
PERF_TEST_P(HOG, CalTech, Values<string>("gpu/caltech/image_00000009_0.png", "gpu/caltech/image_00000032_0.png",
"gpu/caltech/image_00000165_0.png", "gpu/caltech/image_00000261_0.png", "gpu/caltech/image_00000469_0.png",
"gpu/caltech/image_00000527_0.png", "gpu/caltech/image_00000574_0.png"))
{
cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
std::vector<cv::Rect> found_locations;
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_img(img);
cv::gpu::HOGDescriptor d_hog;
d_hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
d_hog.detectMultiScale(d_img, found_locations);
std::vector<cv::Rect> cpu_found_locations;
TEST_CYCLE()
{
d_hog.detectMultiScale(d_img, found_locations);
}
}
else
{
cv::HOGDescriptor hog;
hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
hog.detectMultiScale(img, found_locations);
TEST_CYCLE() hog.detectMultiScale(img, cpu_found_locations);
TEST_CYCLE()
{
hog.detectMultiScale(img, found_locations);
}
SANITY_CHECK(cpu_found_locations);
}
SANITY_CHECK(found_locations);
}
///////////////////////////////////////////////////////////////
@ -96,9 +53,9 @@ typedef pair<string, string> pair_string;
DEF_PARAM_TEST_1(ImageAndCascade, pair_string);
PERF_TEST_P(ImageAndCascade, ObjDetect_HaarClassifier,
Values<pair_string>(make_pair("gpu/haarcascade/group_1_640x480_VGA.pgm", "gpu/perf/haarcascade_frontalface_alt.xml")))
Values<pair_string>(make_pair("gpu/haarcascade/group_1_640x480_VGA.pgm", "gpu/perf/haarcascade_frontalface_alt.xml")))
{
cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
const cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
if (PERF_RUN_GPU())
@ -106,33 +63,28 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_HaarClassifier,
cv::gpu::CascadeClassifier_GPU d_cascade;
ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_objects_buffer;
const cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat objects_buffer;
int detections_num = 0;
d_cascade.detectMultiScale(d_img, d_objects_buffer);
TEST_CYCLE() detections_num = d_cascade.detectMultiScale(d_img, objects_buffer);
TEST_CYCLE()
{
d_cascade.detectMultiScale(d_img, d_objects_buffer);
}
GPU_SANITY_CHECK(d_objects_buffer);
std::vector<cv::Rect> gpu_rects(detections_num);
cv::Mat gpu_rects_mat(1, detections_num, cv::DataType<cv::Rect>::type, &gpu_rects[0]);
objects_buffer.colRange(0, detections_num).download(gpu_rects_mat);
cv::groupRectangles(gpu_rects, 3, 0.2);
SANITY_CHECK(gpu_rects);
}
else
{
cv::CascadeClassifier cascade;
ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/perf/haarcascade_frontalface_alt.xml")));
std::vector<cv::Rect> rects;
cascade.detectMultiScale(img, rects);
std::vector<cv::Rect> cpu_rects;
TEST_CYCLE()
{
cascade.detectMultiScale(img, rects);
}
TEST_CYCLE() cascade.detectMultiScale(img, cpu_rects);
CPU_SANITY_CHECK(rects);
SANITY_CHECK(cpu_rects);
}
}
@ -140,9 +92,9 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_HaarClassifier,
// LBP cascade
PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier,
Values<pair_string>(make_pair("gpu/haarcascade/group_1_640x480_VGA.pgm", "gpu/lbpcascade/lbpcascade_frontalface.xml")))
Values<pair_string>(make_pair("gpu/haarcascade/group_1_640x480_VGA.pgm", "gpu/lbpcascade/lbpcascade_frontalface.xml")))
{
cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
const cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
if (PERF_RUN_GPU())
@ -150,34 +102,27 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier,
cv::gpu::CascadeClassifier_GPU d_cascade;
ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_gpu_rects;
const cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat objects_buffer;
int detections_num = 0;
d_cascade.detectMultiScale(d_img, d_gpu_rects);
TEST_CYCLE() detections_num = d_cascade.detectMultiScale(d_img, objects_buffer);
TEST_CYCLE()
{
d_cascade.detectMultiScale(d_img, d_gpu_rects);
}
GPU_SANITY_CHECK(d_gpu_rects);
std::vector<cv::Rect> gpu_rects(detections_num);
cv::Mat gpu_rects_mat(1, detections_num, cv::DataType<cv::Rect>::type, &gpu_rects[0]);
objects_buffer.colRange(0, detections_num).download(gpu_rects_mat);
cv::groupRectangles(gpu_rects, 3, 0.2);
SANITY_CHECK(gpu_rects);
}
else
{
cv::CascadeClassifier cascade;
ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/lbpcascade/lbpcascade_frontalface.xml")));
std::vector<cv::Rect> rects;
cascade.detectMultiScale(img, rects);
std::vector<cv::Rect> cpu_rects;
TEST_CYCLE()
{
cascade.detectMultiScale(img, rects);
}
TEST_CYCLE() cascade.detectMultiScale(img, cpu_rects);
CPU_SANITY_CHECK(rects);
SANITY_CHECK(cpu_rects);
}
}
} // namespace
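Note: both detector tests now time into a result vector distinct from the warm-up output and sanity-check the detections themselves instead of the raw GPU buffer. The CPU path reduces to the standard HOG and cascade calls; the image and cascade paths below are illustrative:

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/objdetect/objdetect.hpp>
#include <vector>

int main()
{
    cv::Mat img = cv::imread("road.png", cv::IMREAD_GRAYSCALE);
    if (img.empty())
        return 1;

    // HOG pedestrian detector with the default people model.
    cv::HOGDescriptor hog;
    hog.setSVMDetector(cv::HOGDescriptor::getDefaultPeopleDetector());
    std::vector<cv::Rect> people;
    hog.detectMultiScale(img, people);

    // Haar cascade face detector, followed by the same rectangle grouping
    // the GPU branch applies to its downloaded detections.
    cv::CascadeClassifier cascade;
    if (!cascade.load("haarcascade_frontalface_alt.xml"))
        return 1;
    std::vector<cv::Rect> faces;
    cascade.detectMultiScale(img, faces);
    cv::groupRectangles(faces, 3, 0.2);
    return 0;
}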

File diff suppressed because it is too large.

@ -2,13 +2,6 @@
using namespace std;
using namespace cv;
using namespace cv::gpu;
void fillRandom(Mat& m, double a, double b)
{
RNG rng(123456789);
rng.fill(m, RNG::UNIFORM, Scalar::all(a), Scalar::all(b));
}
Mat readImage(const string& fileName, int flags)
{
@ -188,4 +181,4 @@ void PrintTo(const CvtColorInfo& info, ostream* os)
};
*os << str[info.code];
}
}

@ -2,11 +2,9 @@
#define __OPENCV_PERF_GPU_UTILITY_HPP__
#include "opencv2/core/core.hpp"
#include "opencv2/core/gpumat.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/ts/ts_perf.hpp"
void fillRandom(cv::Mat& m, double a = 0.0, double b = 255.0);
cv::Mat readImage(const std::string& fileName, int flags = cv::IMREAD_COLOR);
using perf::MatType;
@ -17,12 +15,13 @@ CV_ENUM(BorderMode, cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONS
CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::INTER_AREA)
#define ALL_INTERPOLATIONS testing::ValuesIn(Interpolation::all())
CV_ENUM(NormType, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_HAMMING, cv::NORM_MINMAX)
const int Gray = 1, TwoChannel = 2, BGR = 3, BGRA = 4;
enum { Gray = 1, TwoChannel = 2, BGR = 3, BGRA = 4 };
CV_ENUM(MatCn, Gray, TwoChannel, BGR, BGRA)
#define GPU_CHANNELS_1_3_4 testing::Values(Gray, BGR, BGRA)
#define GPU_CHANNELS_1_3 testing::Values(Gray, BGR)
#define GPU_CHANNELS_1_3_4 testing::Values(MatCn(Gray), MatCn(BGR), MatCn(BGRA))
#define GPU_CHANNELS_1_3 testing::Values(MatCn(Gray), MatCn(BGR))
struct CvtColorInfo
{
@ -30,7 +29,8 @@ struct CvtColorInfo
int dcn;
int code;
explicit CvtColorInfo(int scn_=0, int dcn_=0, int code_=0) : scn(scn_), dcn(dcn_), code(code_) {}
CvtColorInfo() {}
explicit CvtColorInfo(int scn_, int dcn_, int code_) : scn(scn_), dcn(dcn_), code(code_) {}
};
void PrintTo(const CvtColorInfo& info, std::ostream* os);
@ -46,39 +46,18 @@ DEF_PARAM_TEST(Sz_Depth_Cn, cv::Size, MatDepth, MatCn);
#define GPU_TYPICAL_MAT_SIZES testing::Values(perf::sz720p, perf::szSXGA, perf::sz1080p)
#define GPU_SANITY_CHECK(dmat, ...) \
do{ \
cv::Mat d##dmat(dmat); \
SANITY_CHECK(d##dmat, ## __VA_ARGS__); \
} while(0)
#define FAIL_NO_CPU() FAIL() << "No such CPU implementation analogy"
#define CPU_SANITY_CHECK(cmat, ...) \
#define GPU_SANITY_CHECK(mat, ...) \
do{ \
SANITY_CHECK(cmat, ## __VA_ARGS__); \
cv::Mat gpu_##mat(mat); \
SANITY_CHECK(gpu_##mat, ## __VA_ARGS__); \
} while(0)
#define GPU_SANITY_CHECK_KEYPOINTS(alg, dmat, ...) \
do{ \
cv::Mat d##dmat(dmat); \
cv::Mat __pt_x = d##dmat.row(cv::gpu::alg##_GPU::X_ROW); \
cv::Mat __pt_y = d##dmat.row(cv::gpu::alg##_GPU::Y_ROW); \
cv::Mat __angle = d##dmat.row(cv::gpu::alg##_GPU::ANGLE_ROW); \
cv::Mat __octave = d##dmat.row(cv::gpu::alg##_GPU::OCTAVE_ROW); \
cv::Mat __size = d##dmat.row(cv::gpu::alg##_GPU::SIZE_ROW); \
::perf::Regression::add(this, std::string(#dmat) + "-pt-x-row", __pt_x, ## __VA_ARGS__); \
::perf::Regression::add(this, std::string(#dmat) + "-pt-y-row", __pt_y, ## __VA_ARGS__); \
::perf::Regression::add(this, std::string(#dmat) + "-angle-row", __angle, ## __VA_ARGS__); \
::perf::Regression::add(this, std::string(#dmat) + "octave-row", __octave, ## __VA_ARGS__); \
::perf::Regression::add(this, std::string(#dmat) + "-pt-size-row", __size, ## __VA_ARGS__); \
} while(0)
#define GPU_SANITY_CHECK_RESPONSE(alg, dmat, ...) \
do{ \
cv::Mat d##dmat(dmat); \
cv::Mat __response = d##dmat.row(cv::gpu::alg##_GPU::RESPONSE_ROW); \
::perf::Regression::add(this, std::string(#dmat) + "-response-row", __response, ## __VA_ARGS__); \
#define CPU_SANITY_CHECK(mat, ...) \
do{ \
cv::Mat cpu_##mat(mat); \
SANITY_CHECK(cpu_##mat, ## __VA_ARGS__); \
} while(0)
#define FAIL_NO_CPU() FAIL() << "No such CPU implementation analogy"
#endif // __OPENCV_PERF_GPU_UTILITY_HPP__
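Note: the reworked sanity-check macros drop the keypoint/response helpers and settle on one pattern: wrap the body in do { ... } while(0) so the macro acts as a single statement, download the GPU matrix into a prefix-named cv::Mat, and hand that to SANITY_CHECK. A stripped-down sketch of the same idiom (the names here are invented for illustration):

#include <cstdio>

// Prefix the temporary with the argument name so two checks in one test
// never collide, and keep the whole expansion a single statement.
#define CHECK_VALUE(v)                          \
    do {                                        \
        int checked_##v = (v);                  \
        std::printf(#v " = %d\n", checked_##v); \
    } while (0)

int main()
{
    int width = 640, height = 480;
    if (width > 0)
        CHECK_VALUE(width);   // expands safely even as the sole body of an if
    CHECK_VALUE(height);
    return 0;
}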

@ -648,7 +648,7 @@ namespace cv { namespace gpu { namespace device
tWeight += gmm_weight(mode * frame.rows + y, x);
if (tWeight > c_TB)
break;
};
}
}
fgmask(y, x) = background ? 0 : isShadow ? c_shadowVal : 255;
@ -761,4 +761,4 @@ namespace cv { namespace gpu { namespace device
}}}
#endif /* CUDA_DISABLER */
#endif /* CUDA_DISABLER */

@ -194,10 +194,10 @@ namespace cv { namespace gpu { namespace device
if ( y > 0 && connected(intensity, image(y - 1, x)))
c |= UP;
if ( x - 1 < image.cols && connected(intensity, image(y, x + 1)))
if ( x + 1 < image.cols && connected(intensity, image(y, x + 1)))
c |= RIGHT;
if ( y - 1 < image.rows && connected(intensity, image(y + 1, x)))
if ( y + 1 < image.rows && connected(intensity, image(y + 1, x)))
c |= DOWN;
components(y, x) = c;
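Note: the connectivity-mask kernel fix above is a plain off-by-one: the guards for the right and bottom neighbours tested x - 1 < image.cols and y - 1 < image.rows, which are always true for positive x and y, so the last column and row read one element past the image. The corrected guards test the index that is actually read; the same pattern in ordinary host code (sizes invented for illustration):

#include <vector>
#include <cstdio>

int main()
{
    const int cols = 8, rows = 6;
    std::vector<int> img(cols * rows, 1);

    int edges = 0;
    for (int y = 0; y < rows; ++y)
        for (int x = 0; x < cols; ++x)
        {
            // Guard with x + 1 / y + 1: the index actually being read.
            if (x + 1 < cols && img[y * cols + x] == img[y * cols + (x + 1)])
                ++edges;
            if (y + 1 < rows && img[y * cols + x] == img[(y + 1) * cols + x])
                ++edges;
        }

    std::printf("edges: %d\n", edges);
    return 0;
}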

@ -2284,15 +2284,18 @@ namespace arithm
template void bitScalarAnd<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<uint>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<uint>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<uint>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
}
//////////////////////////////////////////////////////////////////////////

@ -2280,11 +2280,11 @@ namespace
{
typedef void (*bit_scalar_func_t)(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream);
template <bit_scalar_func_t func> struct BitScalar
template <typename T, bit_scalar_func_t func> struct BitScalar
{
static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream)
{
func(src, static_cast<unsigned int>(sc.val[0]), dst, stream);
func(src, saturate_cast<T>(sc.val[0]), dst, stream);
}
};
@ -2292,14 +2292,12 @@ namespace
{
static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream)
{
Scalar_<unsigned int> isc = sc;
unsigned int packedVal = 0;
packedVal |= (isc.val[0] & 0xffff);
packedVal |= (isc.val[1] & 0xffff) << 8;
packedVal |= (isc.val[2] & 0xffff) << 16;
packedVal |= (isc.val[3] & 0xffff) << 24;
packedVal |= (saturate_cast<unsigned char>(sc.val[0]) & 0xffff);
packedVal |= (saturate_cast<unsigned char>(sc.val[1]) & 0xffff) << 8;
packedVal |= (saturate_cast<unsigned char>(sc.val[2]) & 0xffff) << 16;
packedVal |= (saturate_cast<unsigned char>(sc.val[3]) & 0xffff) << 24;
func(src, packedVal, dst, stream);
}
@ -2330,7 +2328,7 @@ namespace
oSizeROI.width = src.cols;
oSizeROI.height = src.rows;
const npp_t pConstants[] = {static_cast<npp_t>(sc.val[0]), static_cast<npp_t>(sc.val[1]), static_cast<npp_t>(sc.val[2]), static_cast<npp_t>(sc.val[3])};
const npp_t pConstants[] = {saturate_cast<npp_t>(sc.val[0]), saturate_cast<npp_t>(sc.val[1]), saturate_cast<npp_t>(sc.val[2]), saturate_cast<npp_t>(sc.val[3])};
nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step), pConstants, dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI) );
@ -2350,7 +2348,7 @@ namespace
oSizeROI.width = src.cols;
oSizeROI.height = src.rows;
nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step), static_cast<npp_t>(sc.val[0]), dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI) );
nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step), saturate_cast<npp_t>(sc.val[0]), dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
@ -2365,11 +2363,11 @@ void cv::gpu::bitwise_and(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stre
typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[5][4] =
{
{BitScalar< bitScalarAnd<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarAnd<unsigned int> >::call},
{BitScalar<unsigned char, bitScalarAnd<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarAnd<unsigned int> >::call},
{0,0,0,0},
{BitScalar< bitScalarAnd<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
{BitScalar<unsigned short, bitScalarAnd<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
{0,0,0,0},
{BitScalar< bitScalarAnd<unsigned int> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call}
{BitScalar<int, bitScalarAnd<int> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call}
};
const int depth = src.depth();
@ -2390,11 +2388,11 @@ void cv::gpu::bitwise_or(const GpuMat& src, const Scalar& sc, GpuMat& dst, Strea
typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[5][4] =
{
{BitScalar< bitScalarOr<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOr<unsigned int> >::call},
{BitScalar<unsigned char, bitScalarOr<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOr<unsigned int> >::call},
{0,0,0,0},
{BitScalar< bitScalarOr<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
{BitScalar<unsigned short, bitScalarOr<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
{0,0,0,0},
{BitScalar< bitScalarOr<unsigned int> >::call , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call}
{BitScalar<int, bitScalarOr<int> >::call , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call}
};
const int depth = src.depth();
@ -2415,11 +2413,11 @@ void cv::gpu::bitwise_xor(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stre
typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[5][4] =
{
{BitScalar< bitScalarXor<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarXor<unsigned int> >::call},
{BitScalar<unsigned char, bitScalarXor<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarXor<unsigned int> >::call},
{0,0,0,0},
{BitScalar< bitScalarXor<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
{BitScalar<unsigned short, bitScalarXor<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
{0,0,0,0},
{BitScalar< bitScalarXor<unsigned int> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call}
{BitScalar<int, bitScalarXor<int> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call}
};
const int depth = src.depth();
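Note: two related changes run through the scalar bitwise operations: the per-element functions are instantiated for int as well as unsigned int, and every conversion from the double-valued cv::Scalar now goes through cv::saturate_cast rather than static_cast, so out-of-range scalars clamp instead of wrapping. The BitScalar4 path also packs the four channel values into one 32-bit word. A small sketch of both behaviours (values invented for illustration):

#include <opencv2/core/core.hpp>
#include <cstdio>

int main()
{
    // saturate_cast clamps, a plain cast wraps.
    std::printf("%d %d\n", (int)cv::saturate_cast<unsigned char>(300.0),   // 255
                           (int)(unsigned char)300);                       // 44

    // Packing a 4-channel scalar into one word, one byte per channel,
    // as the BitScalar4 caller does for CV_8UC4 images.
    cv::Scalar sc(10, 20, 300, -5);
    unsigned int packed = 0;
    packed |= (unsigned int)cv::saturate_cast<unsigned char>(sc.val[0]);
    packed |= (unsigned int)cv::saturate_cast<unsigned char>(sc.val[1]) << 8;
    packed |= (unsigned int)cv::saturate_cast<unsigned char>(sc.val[2]) << 16;
    packed |= (unsigned int)cv::saturate_cast<unsigned char>(sc.val[3]) << 24;
    std::printf("0x%08x\n", packed);   // prints 0x00ff140a: -5 clamps to 0, 300 clamps to 255
    return 0;
}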

@ -104,12 +104,12 @@ void cv::gpu::connectivityMask(const GpuMat& image, GpuMat& mask, const cv::Scal
void cv::gpu::labelComponents(const GpuMat& mask, GpuMat& components, int flags, Stream& s)
{
if (!TargetArchs::builtWith(SHARED_ATOMICS) || !DeviceInfo().supports(SHARED_ATOMICS))
CV_Error(CV_StsNotImplemented, "The device doesn't support shared atomics and communicative synchronization!");
CV_Assert(!mask.empty() && mask.type() == CV_8U);
if (mask.size() != components.size() || components.type() != CV_32SC1)
components.create(mask.size(), CV_32SC1);
if (!deviceSupports(SHARED_ATOMICS))
CV_Error(CV_StsNotImplemented, "The device doesn't support shared atomics and communicative synchronization!");
components.create(mask.size(), CV_32SC1);
cudaStream_t stream = StreamAccessor::getStream(s);
device::ccl::labelComponents(mask, components, flags, stream);

@ -517,6 +517,7 @@ void cv::gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, d
CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
dst.create(dsize, src.type());
dst.setTo(Scalar::all(0));
funcs[src.depth()][src.channels() - 1](src, dst, dsize, angle, xShift, yShift, interpolation, StreamAccessor::getStream(stream));
}

@ -380,6 +380,7 @@ void cv::gpu::meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr,
dstcol[0] = static_cast<uchar>(sumcol[0] / comps.size[parent]);
dstcol[1] = static_cast<uchar>(sumcol[1] / comps.size[parent]);
dstcol[2] = static_cast<uchar>(sumcol[2] / comps.size[parent]);
dstcol[3] = 255;
}
}
}

@ -206,6 +206,8 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextI
ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[0]);
ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[1]);
ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[1]);
uPyr_[0].setTo(Scalar::all(0));
vPyr_[0].setTo(Scalar::all(0));
uPyr_[1].setTo(Scalar::all(0));
vPyr_[1].setTo(Scalar::all(0));

@ -232,10 +232,8 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
};
bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation];
#ifdef linux
// NPP bug on float data
useNpp = useNpp && src.depth() != CV_32F;
#endif
// NPP bug on float data
useNpp = useNpp && src.depth() != CV_32F;
if (useNpp)
{
@ -372,10 +370,8 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
};
bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation];
#ifdef linux
// NPP bug on float data
useNpp = useNpp && src.depth() != CV_32F;
#endif
// NPP bug on float data
useNpp = useNpp && src.depth() != CV_32F;
if (useNpp)
{

@ -207,11 +207,17 @@ INSTANTIATE_TEST_CASE_P(GPU_Video, MOG, testing::Combine(
//////////////////////////////////////////////////////
// MOG2
PARAM_TEST_CASE(MOG2, cv::gpu::DeviceInfo, std::string, UseGray, UseRoi)
namespace
{
IMPLEMENT_PARAM_CLASS(DetectShadow, bool)
}
PARAM_TEST_CASE(MOG2, cv::gpu::DeviceInfo, std::string, UseGray, DetectShadow, UseRoi)
{
cv::gpu::DeviceInfo devInfo;
std::string inputFile;
bool useGray;
bool detectShadow;
bool useRoi;
virtual void SetUp()
@ -220,10 +226,9 @@ PARAM_TEST_CASE(MOG2, cv::gpu::DeviceInfo, std::string, UseGray, UseRoi)
cv::gpu::setDevice(devInfo.deviceID());
inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "video/" + GET_PARAM(1);
useGray = GET_PARAM(2);
useRoi = GET_PARAM(3);
detectShadow = GET_PARAM(3);
useRoi = GET_PARAM(4);
}
};
@ -237,9 +242,11 @@ GPU_TEST_P(MOG2, Update)
ASSERT_FALSE(frame.empty());
cv::gpu::MOG2_GPU mog2;
mog2.bShadowDetection = detectShadow;
cv::gpu::GpuMat foreground = createMat(frame.size(), CV_8UC1, useRoi);
cv::BackgroundSubtractorMOG2 mog2_gold;
mog2_gold.set("detectShadows", detectShadow);
cv::Mat foreground_gold;
for (int i = 0; i < 10; ++i)
@ -258,11 +265,14 @@ GPU_TEST_P(MOG2, Update)
mog2_gold(frame, foreground_gold);
double norm = cv::norm(foreground_gold, cv::Mat(foreground), cv::NORM_L1);
norm /= foreground_gold.size().area();
ASSERT_LE(norm, 0.09);
if (detectShadow)
{
ASSERT_MAT_SIMILAR(foreground_gold, foreground, 1e-2);
}
else
{
ASSERT_MAT_NEAR(foreground_gold, foreground, 0);
}
}
}
@ -277,9 +287,11 @@ GPU_TEST_P(MOG2, getBackgroundImage)
cv::Mat frame;
cv::gpu::MOG2_GPU mog2;
mog2.bShadowDetection = detectShadow;
cv::gpu::GpuMat foreground;
cv::BackgroundSubtractorMOG2 mog2_gold;
mog2_gold.set("detectShadows", detectShadow);
cv::Mat foreground_gold;
for (int i = 0; i < 10; ++i)
@ -305,6 +317,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Video, MOG2, testing::Combine(
ALL_DEVICES,
testing::Values(std::string("768x576.avi")),
testing::Values(UseGray(true), UseGray(false)),
testing::Values(DetectShadow(true), DetectShadow(false)),
WHOLE_SUBMAT));
//////////////////////////////////////////////////////
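Note: the MOG2 tests gain a DetectShadow parameter and apply it on both sides: bShadowDetection on the GPU object and the "detectShadows" property on the CPU gold. With shadows off the masks are compared exactly; with shadows on only a small relative difference is allowed. A CPU-only sketch of the gold path, assuming an OpenCV 2.4 build; the video name is illustrative:

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/video/background_segm.hpp>

int main()
{
    cv::VideoCapture cap("768x576.avi");
    if (!cap.isOpened())
        return 1;

    cv::BackgroundSubtractorMOG2 mog2;
    mog2.set("detectShadows", true);   // shadow pixels get a dedicated label value

    cv::Mat frame, fgmask;
    for (int i = 0; i < 10 && cap.read(frame); ++i)
        mog2(frame, fgmask);           // 2.4-style operator(): update model, output foreground mask

    return 0;
}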

@ -1873,7 +1873,7 @@ PARAM_TEST_CASE(Bitwise_Scalar, cv::gpu::DeviceInfo, cv::Size, MatDepth, Channel
cv::gpu::setDevice(devInfo.deviceID());
src = randomMat(size, CV_MAKE_TYPE(depth, channels));
cv::Scalar_<int> ival = randomScalar(0.0, 255.0);
cv::Scalar_<int> ival = randomScalar(0.0, std::numeric_limits<int>::max());
val = ival;
}
};

@ -252,6 +252,8 @@ PARAM_TEST_CASE(WarpAffineNPP, cv::gpu::DeviceInfo, MatType, Inverse, Interpolat
GPU_TEST_P(WarpAffineNPP, Accuracy)
{
cv::Mat src = readImageType("stereobp/aloe-L.png", type);
ASSERT_FALSE(src.empty());
cv::Mat M = createTransfomMatrix(src.size(), CV_PI / 4);
int flags = interpolation;
if (inverse)

@ -255,6 +255,8 @@ PARAM_TEST_CASE(WarpPerspectiveNPP, cv::gpu::DeviceInfo, MatType, Inverse, Inter
GPU_TEST_P(WarpPerspectiveNPP, Accuracy)
{
cv::Mat src = readImageType("stereobp/aloe-L.png", type);
ASSERT_FALSE(src.empty());
cv::Mat M = createTransfomMatrix(src.size(), CV_PI / 4);
int flags = interpolation;
if (inverse)

@ -275,7 +275,7 @@ if(WIN32 AND WITH_FFMPEG)
COMMAND ${CMAKE_COMMAND} -E copy "${ffmpeg_path}" "${EXECUTABLE_OUTPUT_PATH}/Release/${ffmpeg_bare_name_ver}"
COMMAND ${CMAKE_COMMAND} -E copy "${ffmpeg_path}" "${EXECUTABLE_OUTPUT_PATH}/Debug/${ffmpeg_bare_name_ver}"
COMMENT "Copying ${ffmpeg_path} to the output directory")
elseif(MSVC)
elseif(MSVC AND (CMAKE_GENERATOR MATCHES "Visual"))
add_custom_command(TARGET ${the_module} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy "${ffmpeg_path}" "${EXECUTABLE_OUTPUT_PATH}/${CMAKE_BUILD_TYPE}/${ffmpeg_bare_name_ver}"
COMMENT "Copying ${ffmpeg_path} to the output directory")

@ -217,6 +217,12 @@ endif(ANDROID AND ANDROID_EXECUTABLE)
set(step3_depends ${step2_depends} ${step3_input_files} ${copied_files})
if(ANDROID)
set(LIB_NAME_SUFIX "")
else()
set(LIB_NAME_SUFIX "${OPENCV_VERSION_MAJOR}${OPENCV_VERSION_MINOR}${OPENCV_VERSION_PATCH}")
endif()
# step 4: build jar
if(ANDROID)
set(JAR_FILE "${OpenCV_BINARY_DIR}/bin/classes.jar")
@ -241,7 +247,7 @@ if(ANDROID)
)
endif()
else(ANDROID)
set(JAR_NAME opencv-${OPENCV_VERSION}.jar)
set(JAR_NAME opencv-${LIB_NAME_SUFIX}.jar)
set(JAR_FILE "${OpenCV_BINARY_DIR}/bin/${JAR_NAME}")
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/build.xml.in" "${OpenCV_BINARY_DIR}/build.xml" IMMEDIATE @ONLY)
list(APPEND step3_depends "${OpenCV_BINARY_DIR}/build.xml")
@ -294,8 +300,8 @@ endif()
# Additional target properties
set_target_properties(${the_module} PROPERTIES
OUTPUT_NAME "${the_module}${OPENCV_DLLVERSION}"
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
OUTPUT_NAME "${the_module}${LIB_NAME_SUFIX}"
#DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH}
RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
INSTALL_NAME_DIR ${OPENCV_LIB_INSTALL_PATH}

@ -557,6 +557,15 @@ func_arg_fix = {
}, # '', i.e. no class
} # func_arg_fix
def getLibVersion(version_hpp_path):
version_file = open(version_hpp_path, "rt").read()
epoch = re.search("^W*#\W*define\W+CV_VERSION_EPOCH\W+(\d+)\W*$", version_file, re.MULTILINE).group(1)
major = re.search("^W*#\W*define\W+CV_VERSION_MAJOR\W+(\d+)\W*$", version_file, re.MULTILINE).group(1)
minor = re.search("^W*#\W*define\W+CV_VERSION_MINOR\W+(\d+)\W*$", version_file, re.MULTILINE).group(1)
revision = re.search("^W*#\W*define\W+CV_VERSION_REVISION\W+(\d+)\W*$", version_file, re.MULTILINE).group(1)
return (epoch, major, minor, revision)
class ConstInfo(object):
def __init__(self, cname, name, val, addedManually=False):
self.cname = cname
@ -719,13 +728,16 @@ $imports
public class %(jc)s {
""" % { 'm' : self.module, 'jc' : jname } )
# self.java_code[class_name]["jn_code"].write("""
# //
# // native stuff
# //
# static { System.loadLibrary("opencv_java"); }
#""" )
if class_name == 'Core':
(epoch, major, minor, revision) = getLibVersion(
(os.path.dirname(__file__) or '.') + '/../../core/include/opencv2/core/version.hpp')
version_str = '.'.join( (epoch, major, minor, revision) )
version_suffix = ''.join( (epoch, major, minor) )
self.classes[class_name].imports.add("java.lang.String")
self.java_code[class_name]["j_code"].write("""
public static final String VERSION = "%(v)s", NATIVE_LIBRARY_NAME = "opencv_java%(vs)s";
public static final int VERSION_EPOCH = %(ep)s, VERSION_MAJOR = %(ma)s, VERSION_MINOR = %(mi)s, VERSION_REVISION = %(re)s;
""" % { 'v' : version_str, 'vs' : version_suffix, 'ep' : epoch, 'ma' : major, 'mi' : minor, 're' : revision } )
def add_class(self, decl):
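Note: getLibVersion() pulls the four CV_VERSION_* components out of version.hpp with regular expressions; the generator then emits them as Core.VERSION and NATIVE_LIBRARY_NAME constants, using epoch+major+minor as the library suffix. The same extraction, sketched in C++11 with std::regex purely for illustration (the header contents are inlined as a stand-in):

#include <regex>
#include <string>
#include <cstdio>

int main()
{
    // Stand-in for the contents of opencv2/core/version.hpp.
    const std::string header =
        "#define CV_VERSION_EPOCH    2\n"
        "#define CV_VERSION_MAJOR    4\n"
        "#define CV_VERSION_MINOR    4\n"
        "#define CV_VERSION_REVISION 0\n";

    const char* names[] = { "EPOCH", "MAJOR", "MINOR", "REVISION" };
    std::string parts[4];
    for (int i = 0; i < 4; ++i)
    {
        std::regex re(std::string("#\\s*define\\s+CV_VERSION_") + names[i] + "\\s+(\\d+)");
        std::smatch m;
        if (std::regex_search(header, m, re))
            parts[i] = m[1];
    }

    // "2.4.4.0" plus the "244" suffix used for opencv_java244 / opencv-244.jar.
    std::printf("%s.%s.%s.%s  suffix=%s%s%s\n",
                parts[0].c_str(), parts[1].c_str(), parts[2].c_str(), parts[3].c_str(),
                parts[0].c_str(), parts[1].c_str(), parts[2].c_str());
    return 0;
}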

@ -2122,12 +2122,16 @@ void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2,
};
int dst_step1 = dst.cols * dst.elemSize();
int src1_step = (int) src1.step;
int src2_step = (int) src2.step;
int dst_step = (int) dst.step;
float alpha_f = alpha, beta_f = beta, gama_f = gama;
std::vector<std::pair<size_t , const void *> > args;
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.step ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1_step ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.offset));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.step ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2_step ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2.offset));
if(src1.clCxt -> impl -> double_support != 0)
@ -2138,14 +2142,13 @@ void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2,
}
else
{
float alpha_f = alpha, beta_f = beta, gama_f = gama;
args.push_back( std::make_pair( sizeof(cl_float), (void *)&alpha_f ));
args.push_back( std::make_pair( sizeof(cl_float), (void *)&beta_f ));
args.push_back( std::make_pair( sizeof(cl_float), (void *)&gama_f ));
}
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1.rows ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
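Note: two host-side fixes in the OpenCL addWeighted wrapper above. The kernel arguments are stored as (size, pointer) pairs and only dereferenced when the kernel is launched, so passing &src1.step with sizeof(cl_int) handed the kernel part of a size_t, and declaring alpha_f/beta_f/gama_f inside the else block left pointers to block-scoped floats in the argument list. Both are cured by copying into suitably typed locals that outlive the list; a minimal illustration of the lifetime issue, with no OpenCL dependency:

#include <cstddef>
#include <vector>
#include <utility>
#include <cstdio>

int main()
{
    std::vector< std::pair<size_t, const void*> > args;

    size_t step = 5120;            // e.g. cv::Mat::step, a size_t
    int step_as_int = (int)step;   // copy to an int before taking its address
    args.push_back(std::make_pair(sizeof(int), (const void*)&step_as_int));

    double alpha = 0.25;
    float alpha_f = (float)alpha;  // must be declared where it outlives 'args'
    args.push_back(std::make_pair(sizeof(float), (const void*)&alpha_f));

    // "Launch": only now are the stored pointers dereferenced, so every
    // pointee must still be alive at this point.
    std::printf("step=%d alpha=%.2f\n",
                *(const int*)args[0].second, *(const float*)args[1].second);
    return 0;
}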

@ -73,7 +73,7 @@ void cv::ocl::blendLinear(const oclMat &img1, const oclMat &img2, const oclMat &
size_t localSize[] = {256, 1, 1};
std::vector< std::pair<size_t, const void *> > args;
result.create(img1.size(), CV_MAKE_TYPE(depth,img1.channels()));
if(globalSize[0] != 0)
{
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data ));

@ -60,7 +60,7 @@ namespace cv
}
template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &mask,
void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
const oclMat &trainIdx, const oclMat &distance, int distType)
{
cv::ocl::Context *ctx = query.clCxt;
@ -75,7 +75,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat
{
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&train.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
//args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&distance.data ));
args.push_back( std::make_pair( smemSize, (void *)NULL));
@ -101,7 +101,7 @@ void matchUnrolledCached(const oclMat /*query*/, const oclMat * /*trains*/, int
}
template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
void match(const oclMat &query, const oclMat &train, const oclMat &mask,
void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
const oclMat &trainIdx, const oclMat &distance, int distType)
{
cv::ocl::Context *ctx = query.clCxt;
@ -115,7 +115,7 @@ void match(const oclMat &query, const oclMat &train, const oclMat &mask,
{
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&train.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
//args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&distance.data ));
args.push_back( std::make_pair( smemSize, (void *)NULL));
@ -141,7 +141,7 @@ void match(const oclMat /*query*/, const oclMat * /*trains*/, int /*n*/, const o
//radius_matchUnrolledCached
template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask,
void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/,
const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
{
cv::ocl::Context *ctx = query.clCxt;
@ -157,7 +157,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&train.data ));
args.push_back( std::make_pair( sizeof(cl_float), (void *)&maxDistance ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
//args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&distance.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&nMatches.data ));
@ -181,7 +181,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist
//radius_match
template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
void radius_match(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask,
void radius_match(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/,
const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
{
cv::ocl::Context *ctx = query.clCxt;
@ -196,7 +196,7 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&train.data ));
args.push_back( std::make_pair( sizeof(cl_float), (void *)&maxDistance ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
//args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&distance.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&nMatches.data ));
@ -470,7 +470,7 @@ void matchDispatcher(const oclMat &query, const oclMat &train, int n, float maxD
//knn match Dispatcher
template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &mask,
void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
const oclMat &trainIdx, const oclMat &distance, int distType)
{
cv::ocl::Context *ctx = query.clCxt;
@ -485,7 +485,7 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
{
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&train.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
//args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&distance.data ));
args.push_back( std::make_pair( smemSize, (void *)NULL));
@ -505,7 +505,7 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
}
template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
void knn_match(const oclMat &query, const oclMat &train, const oclMat &mask,
void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
const oclMat &trainIdx, const oclMat &distance, int distType)
{
cv::ocl::Context *ctx = query.clCxt;
@ -519,7 +519,7 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &mask,
{
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&train.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
//args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&distance.data ));
args.push_back( std::make_pair( smemSize, (void *)NULL));
@ -538,7 +538,7 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &mask,
}
template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat &mask, const oclMat &allDist, int distType)
void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &allDist, int distType)
{
cv::ocl::Context *ctx = query.clCxt;
size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
@ -552,7 +552,7 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat
{
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&train.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
//args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&allDist.data ));
args.push_back( std::make_pair( smemSize, (void *)NULL));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&block_size ));
@ -571,7 +571,7 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat
}
template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
void calcDistance(const oclMat &query, const oclMat &train, const oclMat &mask, const oclMat &allDist, int distType)
void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &allDist, int distType)
{
cv::ocl::Context *ctx = query.clCxt;
size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
@ -584,7 +584,7 @@ void calcDistance(const oclMat &query, const oclMat &train, const oclMat &mask,
{
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&train.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
//args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mask.data ));
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&allDist.data ));
args.push_back( std::make_pair( smemSize, (void *)NULL));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&block_size ));
@ -1005,6 +1005,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchConvert(const Mat &trainIdx, cons
void cv::ocl::BruteForceMatcher_OCL_base::match(const oclMat &query, const oclMat &train, std::vector<DMatch> &matches, const oclMat &mask)
{
assert(mask.empty()); // mask is not supported at the moment
oclMat trainIdx, distance;
matchSingle(query, train, trainIdx, distance, mask);
matchDownload(trainIdx, distance, matches);
@ -1448,7 +1449,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatch(const oclMat &query, std::vec
// radiusMatchSingle
void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchSingle(const oclMat &query, const oclMat &train,
oclMat &trainIdx, oclMat &distance, oclMat &nMatches, float maxDistance, const oclMat &mask)
oclMat &trainIdx, oclMat &distance, oclMat &nMatches, float maxDistance, const oclMat &mask)
{
if (query.empty() || train.empty())
return;
@ -1694,4 +1695,4 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatch(const oclMat &query, std::
oclMat trainIdx, imgIdx, distance, nMatches;
radiusMatchCollection(query, trainIdx, imgIdx, distance, nMatches, maxDistance, masks);
radiusMatchDownload(trainIdx, imgIdx, distance, nMatches, matches, compactResult);
}
}
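The mask arguments above are commented out because per-pair masks are not supported by the OCL brute-force matcher in this release; match() now asserts that the caller passes an empty mask. A minimal host-side usage sketch under that assumption (the DistType enum value and the upload step are the only details not taken from the hunks above, so treat them as assumptions):

#include "opencv2/ocl/ocl.hpp"
#include <vector>

// Hedged usage sketch: match float descriptors on the device, passing an
// empty oclMat as the mask, as required by the assert added above.
void matchDescriptorsOCL(const cv::Mat &queryDesc, const cv::Mat &trainDesc,
                         std::vector<cv::DMatch> &matches)
{
    cv::ocl::oclMat d_query(queryDesc), d_train(trainDesc);          // upload to the device
    cv::ocl::BruteForceMatcher_OCL_base matcher(cv::ocl::BruteForceMatcher_OCL_base::L2Dist);
    matcher.match(d_query, d_train, matches, cv::ocl::oclMat());     // mask must stay empty
}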

@ -288,13 +288,14 @@ namespace cv
args.push_back( std::make_pair(sizeof(cl_int), (void *)&map1.cols));
args.push_back( std::make_pair(sizeof(cl_int), (void *)&map1.rows));
args.push_back( std::make_pair(sizeof(cl_int), (void *)&cols));
if(src.clCxt -> impl -> double_support != 0)
float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]};
if(src.clCxt -> impl -> double_support != 0)
{
args.push_back( std::make_pair(sizeof(cl_double4), (void *)&borderValue));
}
else
{
float borderFloat[4] = {(float)borderValue[0], (float)borderValue[1], (float)borderValue[2], (float)borderValue[3]};
args.push_back( std::make_pair(sizeof(cl_float4), (void *)&borderFloat));
}
}
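The hunk above hoists borderFloat out of the else branch: the argument list stores a pointer to it, so declaring it inside the branch left a dangling pointer once the branch closed. The kernel now receives a double4 border value only when the device reports double support, and a narrowed float4 otherwise. A hedged standalone sketch of that narrowing (the helper name and the caller-provided storage are illustrative):

#include "opencv2/core/core.hpp"
#include <CL/cl.h>
#include <utility>
#include <vector>

// Hedged sketch: append a cv::Scalar border value to a kernel argument list
// as double4 on fp64-capable devices, or as a narrowed float4 otherwise.
// borderFloat is supplied by the caller so it outlives the argument list.
static void pushBorderArg(std::vector< std::pair<size_t, const void *> > &args,
                          const cv::Scalar &borderValue, bool doubleSupport,
                          cl_float borderFloat[4])
{
    if (doubleSupport)
    {
        args.push_back(std::make_pair(sizeof(cl_double4), (const void *)&borderValue));
    }
    else
    {
        for (int i = 0; i < 4; ++i)
            borderFloat[i] = (cl_float)borderValue[i];               // narrow each channel
        args.push_back(std::make_pair(sizeof(cl_float4), (const void *)borderFloat));
    }
}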

@ -5,11 +5,13 @@ int bit1Count(float x)
{
int c = 0;
int ix = (int)x;
for (int i = 0 ; i < 32 ; i++)
{
c += ix & 0x1;
ix >>= 1;
}
return (float)c;
}
/* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size
@ -18,7 +20,7 @@ local size: dim0 is block_size, dim1 is block_size.
__kernel void BruteForceMatch_UnrollMatch(
__global float *query,
__global float *train,
__global float *mask,
//__global float *mask,
__global int *bestTrainIdx,
__global float *bestDistance,
__local float *sharebuffer,
@ -30,7 +32,7 @@ __kernel void BruteForceMatch_UnrollMatch(
int train_cols,
int step,
int distType
)
)
{
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
@ -40,6 +42,7 @@ __kernel void BruteForceMatch_UnrollMatch(
__local float *s_train = sharebuffer + block_size * max_desc_len;
int queryIdx = groupidx * block_size + lidy;
// load the query into local memory.
for (int i = 0 ; i < max_desc_len / block_size; i ++)
{
@ -52,9 +55,11 @@ __kernel void BruteForceMatch_UnrollMatch(
// loopUnrolledCached to find the best trainIdx and best distance.
volatile int imgIdx = 0;
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
{
float result = 0;
for (int i = 0 ; i < max_desc_len / block_size ; i++)
{
//load a block_size * block_size block into local train.
@ -67,28 +72,34 @@ __kernel void BruteForceMatch_UnrollMatch(
/* there are three types in the reducer: the first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which
sums (v1 - v2) * (v1 - v2); the third is Hamming, which is popc(v1 ^ v2), where popc counts the bits set to 1 */
switch(distType)
switch (distType)
{
case 0:
for (int j = 0 ; j < block_size ; j++)
{
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; j++)
{
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; j++)
{
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
}
break;
case 0:
for (int j = 0 ; j < block_size ; j++)
{
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; j++)
{
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; j++)
{
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
}
break;
}
barrier(CLK_LOCAL_MEM_FENCE);
@ -105,8 +116,8 @@ __kernel void BruteForceMatch_UnrollMatch(
}
barrier(CLK_LOCAL_MEM_FENCE);
__local float *s_distance = (__local float*)(sharebuffer);
__local int* s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
__local float *s_distance = (__local float *)(sharebuffer);
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
//find BestMatch
s_distance += lidy * block_size;
@ -136,7 +147,7 @@ __kernel void BruteForceMatch_UnrollMatch(
__kernel void BruteForceMatch_Match(
__global float *query,
__global float *train,
__global float *mask,
//__global float *mask,
__global int *bestTrainIdx,
__global float *bestDistance,
__local float *sharebuffer,
@ -147,7 +158,7 @@ __kernel void BruteForceMatch_Match(
int train_cols,
int step,
int distType
)
)
{
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
@ -166,6 +177,7 @@ __kernel void BruteForceMatch_Match(
{
//Dist dist;
float result = 0;
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
{
const int loadx = lidx + i * block_size;
@ -184,28 +196,34 @@ __kernel void BruteForceMatch_Match(
/* there are three types in the reducer: the first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which
sums (v1 - v2) * (v1 - v2); the third is Hamming, which is popc(v1 ^ v2), where popc counts the bits set to 1 */
switch(distType)
switch (distType)
{
case 0:
for (int j = 0 ; j < block_size ; j++)
{
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; j++)
{
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; j++)
{
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[(uint)j * block_size + lidx]);
}
break;
case 0:
for (int j = 0 ; j < block_size ; j++)
{
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; j++)
{
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; j++)
{
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
}
break;
}
barrier(CLK_LOCAL_MEM_FENCE);
@ -256,7 +274,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
__global float *query,
__global float *train,
float maxDistance,
__global float *mask,
//__global float *mask,
__global int *bestTrainIdx,
__global float *bestDistance,
__global int *nMatches,
@ -271,7 +289,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
int step,
int ostep,
int distType
)
)
{
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
@ -285,6 +303,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
__local float *s_train = sharebuffer + block_size * block_size;
float result = 0;
for (int i = 0 ; i < max_desc_len / block_size ; ++i)
{
//load a block_size * block_size block into local train.
@ -299,27 +318,33 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
/* there are three types in the reducer: the first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which
sums (v1 - v2) * (v1 - v2); the third is Hamming, which is popc(v1 ^ v2), where popc counts the bits set to 1 */
switch(distType)
switch (distType)
{
case 0:
for (int j = 0 ; j < block_size ; ++j)
{
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; ++j)
{
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; ++j)
{
result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
}
break;
case 0:
for (int j = 0 ; j < block_size ; ++j)
{
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; ++j)
{
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; ++j)
{
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
}
break;
}
barrier(CLK_LOCAL_MEM_FENCE);
@ -329,7 +354,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
{
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
if(ind < bestTrainIdx_cols)
if (ind < bestTrainIdx_cols)
{
//bestImgIdx = imgIdx;
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
@ -343,7 +368,7 @@ __kernel void BruteForceMatch_RadiusMatch(
__global float *query,
__global float *train,
float maxDistance,
__global float *mask,
//__global float *mask,
__global int *bestTrainIdx,
__global float *bestDistance,
__global int *nMatches,
@ -357,7 +382,7 @@ __kernel void BruteForceMatch_RadiusMatch(
int step,
int ostep,
int distType
)
)
{
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
@ -371,6 +396,7 @@ __kernel void BruteForceMatch_RadiusMatch(
__local float *s_train = sharebuffer + block_size * block_size;
float result = 0;
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i)
{
//load a block_size * block_size block into local train.
@ -385,27 +411,33 @@ __kernel void BruteForceMatch_RadiusMatch(
/* there are three types in the reducer: the first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which
sums (v1 - v2) * (v1 - v2); the third is Hamming, which is popc(v1 ^ v2), where popc counts the bits set to 1 */
switch(distType)
switch (distType)
{
case 0:
for (int j = 0 ; j < block_size ; ++j)
{
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; ++j)
{
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; ++j)
{
result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
}
break;
case 0:
for (int j = 0 ; j < block_size ; ++j)
{
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; ++j)
{
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; ++j)
{
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
}
break;
}
barrier(CLK_LOCAL_MEM_FENCE);
@ -415,7 +447,7 @@ __kernel void BruteForceMatch_RadiusMatch(
{
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
if(ind < bestTrainIdx_cols)
if (ind < bestTrainIdx_cols)
{
//bestImgIdx = imgIdx;
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
@ -428,7 +460,7 @@ __kernel void BruteForceMatch_RadiusMatch(
__kernel void BruteForceMatch_knnUnrollMatch(
__global float *query,
__global float *train,
__global float *mask,
//__global float *mask,
__global int2 *bestTrainIdx,
__global float2 *bestDistance,
__local float *sharebuffer,
@ -440,7 +472,7 @@ __kernel void BruteForceMatch_knnUnrollMatch(
int train_cols,
int step,
int distType
)
)
{
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
@ -464,9 +496,11 @@ __kernel void BruteForceMatch_knnUnrollMatch(
//loopUnrolledCached
volatile int imgIdx = 0;
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
{
float result = 0;
for (int i = 0 ; i < max_desc_len / block_size ; i++)
{
const int loadX = lidx + i * block_size;
@ -480,28 +514,34 @@ __kernel void BruteForceMatch_knnUnrollMatch(
/* there are three types in the reducer: the first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which
sums (v1 - v2) * (v1 - v2); the third is Hamming, which is popc(v1 ^ v2), where popc counts the bits set to 1 */
switch(distType)
switch (distType)
{
case 0:
for (int j = 0 ; j < block_size ; j++)
{
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; j++)
{
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; j++)
{
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
}
break;
case 0:
for (int j = 0 ; j < block_size ; j++)
{
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; j++)
{
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; j++)
{
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
}
break;
}
barrier(CLK_LOCAL_MEM_FENCE);
@ -549,6 +589,7 @@ __kernel void BruteForceMatch_knnUnrollMatch(
for (int i = 0 ; i < block_size ; i++)
{
float val = s_distance[i];
if (val < bestDistance1)
{
bestDistance2 = bestDistance1;
@ -602,7 +643,7 @@ __kernel void BruteForceMatch_knnUnrollMatch(
__kernel void BruteForceMatch_knnMatch(
__global float *query,
__global float *train,
__global float *mask,
//__global float *mask,
__global int2 *bestTrainIdx,
__global float2 *bestDistance,
__local float *sharebuffer,
@ -613,7 +654,7 @@ __kernel void BruteForceMatch_knnMatch(
int train_cols,
int step,
int distType
)
)
{
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
@ -632,7 +673,8 @@ __kernel void BruteForceMatch_knnMatch(
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
{
float result = 0.0f;
for (int i = 0 ; i < (query_cols + block_size -1) / block_size ; i++)
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
{
const int loadx = lidx + i * block_size;
//load query and train into local memory
@ -650,28 +692,34 @@ __kernel void BruteForceMatch_knnMatch(
/* there are three types in the reducer: the first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which
sums (v1 - v2) * (v1 - v2); the third is Hamming, which is popc(v1 ^ v2), where popc counts the bits set to 1 */
switch(distType)
switch (distType)
{
case 0:
for (int j = 0 ; j < block_size ; j++)
{
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; j++)
{
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; j++)
{
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[(uint)j * block_size + lidx]);
}
break;
case 0:
for (int j = 0 ; j < block_size ; j++)
{
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; j++)
{
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; j++)
{
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
}
break;
}
barrier(CLK_LOCAL_MEM_FENCE);
@ -719,6 +767,7 @@ __kernel void BruteForceMatch_knnMatch(
for (int i = 0 ; i < block_size ; i++)
{
float val = s_distance[i];
if (val < bestDistance1)
{
bestDistance2 = bestDistance1;
@ -772,7 +821,7 @@ __kernel void BruteForceMatch_knnMatch(
kernel void BruteForceMatch_calcDistanceUnrolled(
__global float *query,
__global float *train,
__global float *mask,
//__global float *mask,
__global float *allDist,
__local float *sharebuffer,
int block_size,
@ -790,7 +839,7 @@ kernel void BruteForceMatch_calcDistanceUnrolled(
kernel void BruteForceMatch_calcDistance(
__global float *query,
__global float *train,
__global float *mask,
//__global float *mask,
__global float *allDist,
__local float *sharebuffer,
int block_size,
@ -808,9 +857,9 @@ kernel void BruteForceMatch_findBestMatch(
__global float *allDist,
__global int *bestTrainIdx,
__global float *bestDistance,
int k,
int block_size
)
int k,
int block_size
)
{
/* Todo */
}
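All of the kernels above pick one of three per-element reducers from distType: 0 accumulates L1 distance, 1 accumulates squared L2 distance, and 2 accumulates Hamming distance through bit1Count, a manual popcount fallback. A plain C++ restatement of those reducers for reference (hedged; it mirrors the kernel arithmetic, nothing more):

#include <cmath>
#include <cstdint>

// Software popcount, equivalent to the kernel's bit1Count fallback.
inline int bit1Count(uint32_t x)
{
    int c = 0;
    for (int i = 0; i < 32; ++i) { c += x & 1u; x >>= 1; }
    return c;
}

// One reduction step per descriptor element: distType 0 = L1, 1 = squared L2,
// 2 = Hamming on the values cast to unsigned integers (as the kernel does).
inline float accumulateDistance(float q, float t, int distType, float acc)
{
    switch (distType)
    {
    case 0: return acc + std::fabs(q - t);
    case 1: { float d = q - t; return acc + d * d; }
    case 2: return acc + bit1Count((uint32_t)q ^ (uint32_t)t);
    default: return acc;
    }
}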

@ -78,7 +78,7 @@ uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int col
// dynamically change the precision used for floating type
#if defined (__ATI__) || defined (__NVIDIA__)
#if defined DOUBLE_SUPPORT
#define F double
#else
#define F float
@ -299,7 +299,7 @@ __kernel
__global const float * det,
__global const float * trace,
__global int4 * maxPosBuffer,
volatile __global unsigned int* maxCounter,
volatile __global int* maxCounter,
int counter_offset,
int det_step, // the step of det in bytes
int trace_step, // the step of trace in bytes
@ -408,7 +408,7 @@ __kernel
if(condmax)
{
unsigned int ind = atomic_inc(maxCounter);
int ind = atomic_inc(maxCounter);
if (ind < c_max_candidates)
{
@ -427,7 +427,7 @@ __kernel
__global float * det,
__global float * trace,
__global int4 * maxPosBuffer,
volatile __global unsigned int* maxCounter,
volatile __global int* maxCounter,
int counter_offset,
int det_step, // the step of det in bytes
int trace_step, // the step of trace in bytes
@ -525,7 +525,7 @@ __kernel
if(condmax)
{
unsigned int ind = atomic_inc(maxCounter);
int ind = atomic_inc(maxCounter);
if (ind < c_max_candidates)
{
@ -585,7 +585,7 @@ __kernel
__global const float * det,
__global const int4 * maxPosBuffer,
__global float * keypoints,
volatile __global unsigned int * featureCounter,
volatile __global int * featureCounter,
int det_step,
int keypoints_step,
int c_img_rows,
@ -684,7 +684,7 @@ __kernel
if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
{
// Get a new feature index.
unsigned int ind = atomic_inc(featureCounter);
int ind = atomic_inc(featureCounter);
if (ind < c_max_features)
{
@ -737,19 +737,19 @@ __constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448
__constant float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
__constant float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
void reduce_32_sum(volatile __local float * data, float partial_reduction, int tid)
void reduce_32_sum(volatile __local float * data, volatile float* partial_reduction, int tid)
{
#define op(A, B) (A)+(B)
data[tid] = partial_reduction;
#define op(A, B) (*A)+(B)
data[tid] = *partial_reduction;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 16]);
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 8 ]);
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 4 ]);
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 2 ]);
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]);
}
#undef op
}
@ -831,7 +831,7 @@ __kernel
{
const int dir = (i * 4 + get_local_id(1)) * ORI_SEARCH_INC;
float sumx = 0.0f, sumy = 0.0f;
volatile float sumx = 0.0f, sumy = 0.0f;
int d = abs(convert_int_rte(s_angle[get_local_id(0)]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{
@ -856,8 +856,8 @@ __kernel
sumx += s_X[get_local_id(0) + 96];
sumy += s_Y[get_local_id(0) + 96];
}
reduce_32_sum(s_sumx + get_local_id(1) * 32, sumx, get_local_id(0));
reduce_32_sum(s_sumy + get_local_id(1) * 32, sumy, get_local_id(0));
reduce_32_sum(s_sumx + get_local_id(1) * 32, &sumx, get_local_id(0));
reduce_32_sum(s_sumy + get_local_id(1) * 32, &sumy, get_local_id(0));
const float temp_mod = sumx * sumx + sumy * sumy;
if (temp_mod > best_mod)
@ -892,14 +892,32 @@ __kernel
kp_dir += 2.0f * CV_PI_F;
kp_dir *= 180.0f / CV_PI_F;
kp_dir = 360.0f - kp_dir;
if (fabs(kp_dir - 360.f) < FLT_EPSILON)
kp_dir = 0.f;
//kp_dir = 360.0f - kp_dir;
//if (fabs(kp_dir - 360.f) < FLT_EPSILON)
// kp_dir = 0.f;
featureDir[get_group_id(0)] = kp_dir;
}
}
__kernel
void icvSetUpright(
__global float * keypoints,
int keypoints_step,
int nFeatures
)
{
keypoints_step /= sizeof(*keypoints);
__global float* featureDir = keypoints + ANGLE_ROW * keypoints_step;
if(get_global_id(0) < nFeatures)
{
featureDir[get_global_id(0)] = 90.0f;
}
}
#undef ORI_SEARCH_INC
#undef ORI_WIN
#undef ORI_SAMPLES
@ -993,10 +1011,7 @@ void calc_dx_dy(
const float centerX = featureX[get_group_id(0)];
const float centerY = featureY[get_group_id(0)];
const float size = featureSize[get_group_id(0)];
float descriptor_dir = 360.0f - featureDir[get_group_id(0)];
if (fabs(descriptor_dir - 360.f) < FLT_EPSILON)
descriptor_dir = 0.f;
descriptor_dir *= (float)(CV_PI_F / 180.0f);
float descriptor_dir = featureDir[get_group_id(0)] * (float)(CV_PI_F / 180.0f);
/* The sampling intervals and wavelet sized for selecting an orientation
and building the keypoint descriptor are defined relative to 's' */
@ -1125,11 +1140,15 @@ __kernel
{
sdxabs[tid] = fabs(sdx[tid]); // |dx| array
sdyabs[tid] = fabs(sdy[tid]); // |dy| array
//barrier(CLK_LOCAL_MEM_FENCE);
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 25)
{
reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
//barrier(CLK_LOCAL_MEM_FENCE);
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 25)
{
volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2);
// write dx, dy, |dx|, |dy|
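reduce_32_sum now receives partial_reduction through a pointer and the per-lane accumulators are declared volatile, so each lane's running sum is actually updated across the strided steps instead of working on a stale copy. A sequential C++ model of what a single lane does in that 32-wide tree reduction (hedged; the real kernel relies on the 32 lanes executing these steps in lockstep, with a local-memory barrier after the initial store):

// Hedged single-lane model of reduce_32_sum: lane `tid` stores its partial sum,
// then lanes 0..15 repeatedly fold in the value `stride` slots away.
void reduce32SumModel(float data[32], float &partial, int tid)
{
    data[tid] = partial;
    // barrier(CLK_LOCAL_MEM_FENCE) sits here in the kernel
    if (tid < 16)
    {
        for (int stride = 16; stride >= 1; stride >>= 1)
            data[tid] = partial = partial + data[tid + stride];
    }
    // once all 32 lanes have run these steps, data[0] holds the sum of the 32 partials
}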

@ -140,6 +140,10 @@ float reduce_smem(volatile __local float* smem, int size)
if (tid < 32)
{
if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
@ -224,6 +228,11 @@ __kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr
{
volatile __local float* smem = products;
smem[tid] = product = product + smem[tid + 32];
}
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 16)
{
volatile __local float* smem = products;
smem[tid] = product = product + smem[tid + 16];
smem[tid] = product = product + smem[tid + 8];
smem[tid] = product = product + smem[tid + 4];

@ -56,6 +56,21 @@ namespace cv
{
///////////////////////////OpenCL kernel strings///////////////////////////
extern const char *nonfree_surf;
const char* noImage2dOption = "-D DISABLE_IMAGE2D";
static void openCLExecuteKernelSURF(Context *clCxt , const char **source, std::string kernelName, size_t globalThreads[3],
size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args, int channels, int depth)
{
if(support_image2d())
{
openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth);
}
else
{
openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, noImage2dOption);
}
}
}
}
@ -79,10 +94,6 @@ static inline int calcSize(int octave, int layer)
return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
}
namespace
{
const char* noImage2dOption = "-D DISABLE_IMAGE2D";
}
class SURF_OCL_Invoker
{
@ -99,15 +110,16 @@ public:
void icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols);
void icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, unsigned int maxCounter,
void icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, int maxCounter,
oclMat &keypoints, oclMat &counters, int octave, int layer_rows, int maxFeatures);
void icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures);
void icvSetUpright_gpu(const oclMat &keypoints, int nFeatures);
void compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures);
// end of kernel callers declarations
SURF_OCL_Invoker(SURF_OCL &surf, const oclMat &img, const oclMat &mask) :
surf_(surf),
img_cols(img.cols), img_rows(img.rows),
@ -181,8 +193,8 @@ public:
icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer, counters, 1 + octave,
octave, use_mask, surf_.nOctaveLayers, layer_rows, layer_cols);
unsigned int maxCounter = Mat(counters).at<unsigned int>(1 + octave);
maxCounter = std::min(maxCounter, static_cast<unsigned int>(maxCandidates));
int maxCounter = ((Mat)counters).at<int>(1 + octave);
maxCounter = std::min(maxCounter, static_cast<int>(maxCandidates));
if (maxCounter > 0)
{
@ -190,15 +202,29 @@ public:
keypoints, counters, octave, layer_rows, maxFeatures);
}
}
unsigned int featureCounter = Mat(counters).at<unsigned int>(0);
featureCounter = std::min(featureCounter, static_cast<unsigned int>(maxFeatures));
int featureCounter = Mat(counters).at<int>(0);
featureCounter = std::min(featureCounter, static_cast<int>(maxFeatures));
keypoints.cols = featureCounter;
if (surf_.upright)
keypoints.row(SURF_OCL::ANGLE_ROW).setTo(Scalar::all(90.0));
{
//keypoints.row(SURF_OCL::ANGLE_ROW).setTo(Scalar::all(90.0));
setUpright(keypoints);
}
else
{
findOrientation(keypoints);
}
}
void setUpright(oclMat &keypoints)
{
const int nFeatures = keypoints.cols;
if(nFeatures > 0)
{
icvSetUpright_gpu(keypoints, keypoints.cols);
}
}
void findOrientation(oclMat &keypoints)
@ -483,14 +509,7 @@ void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat &det, oclMat &trace, i
divUp(max_samples_i, localThreads[1]) *localThreads[1] *(nOctaveLayers + 2),
1
};
if(support_image2d())
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat &trace, oclMat &maxPosBuffer, oclMat &maxCounter, int counterOffset,
@ -536,17 +555,10 @@ void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat &det, const oclMat
1
};
if(support_image2d())
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, unsigned int maxCounter,
void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMat &maxPosBuffer, int maxCounter,
oclMat &keypoints, oclMat &counters, int octave, int layer_rows, int maxFeatures)
{
Context *clCxt = det.clCxt;
@ -568,14 +580,7 @@ void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat &det, const oclMa
size_t localThreads[3] = {3, 3, 3};
size_t globalThreads[3] = {maxCounter *localThreads[0], localThreads[1], 1};
if(support_image2d())
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeatures)
@ -602,16 +607,27 @@ void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeat
size_t localThreads[3] = {32, 4, 1};
size_t globalThreads[3] = {nFeatures *localThreads[0], localThreads[1], 1};
if(support_image2d())
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
void SURF_OCL_Invoker::icvSetUpright_gpu(const oclMat &keypoints, int nFeatures)
{
Context *clCxt = counters.clCxt;
std::string kernelName = "icvSetUpright";
std::vector< std::pair<size_t, const void *> > args;
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&keypoints.data));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&keypoints.step));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&nFeatures));
size_t localThreads[3] = {256, 1, 1};
size_t globalThreads[3] = {nFeatures, 1, 1};
openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const oclMat &keypoints, int nFeatures)
{
// compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
@ -647,14 +663,8 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.rows));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.cols));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.step));
if(support_image2d())
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
kernelName = "normalize_descriptors64";
@ -667,14 +677,8 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
args.clear();
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
if(support_image2d())
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
@ -702,14 +706,8 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.rows));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.cols));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&_img.step));
if(support_image2d())
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
kernelName = "normalize_descriptors128";
@ -722,14 +720,8 @@ void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat &descriptors, const
args.clear();
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&descriptors.data));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&descriptors.step));
if(support_image2d())
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
else
{
openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1, noImage2dOption);
}
openCLExecuteKernelSURF(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
}
}
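With the upright flag set, the detector now launches the new icvSetUpright kernel to write a fixed 90° angle into the keypoint matrix instead of filling that row on the host. A hedged usage sketch of the detector in that mode (the operator() overload is assumed to mirror cv::gpu::SURF_GPU, and the class is assumed to be declared in opencv2/ocl/ocl.hpp for this release; the input is expected to be an 8-bit grayscale image):

#include "opencv2/ocl/ocl.hpp"
#include <vector>

// Hedged sketch: run OCL SURF in upright mode; orientation estimation is
// skipped and every keypoint angle is set to 90 degrees by icvSetUpright.
void detectUprightSURF(const cv::Mat &gray, std::vector<cv::KeyPoint> &keypoints)
{
    cv::ocl::SURF_OCL surf;                      // default parameters
    surf.upright = true;
    cv::ocl::oclMat d_img(gray), d_descriptors;
    surf(d_img, cv::ocl::oclMat(), keypoints, d_descriptors);
}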

@ -0,0 +1,247 @@
#include "opencv2/objdetect/objdetect.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include <cctype>
#include <iostream>
#include <iterator>
#include <stdio.h>
using namespace std;
using namespace cv;
static void help()
{
cout << "\nThis program demonstrates the smile detector.\n"
"Usage:\n"
"./smiledetect [--cascade=<cascade_path> this is the frontal face classifier]\n"
" [--smile-cascade=[<smile_cascade_path>]]\n"
" [--scale=<image scale greater or equal to 1, try 2.0 for example. The larger the faster the processing>]\n"
" [--try-flip]\n"
" [video_filename|camera_index]\n\n"
"Example:\n"
"./smiledetect --cascade=\"../../data/haarcascades/haarcascade_frontalface_alt.xml\" --smile-cascade=\"../../data/haarcascades/haarcascade_smile.xml\" --scale=2.0\n\n"
"During execution:\n\tHit any key to quit.\n"
"\tUsing OpenCV version " << CV_VERSION << "\n" << endl;
}
void detectAndDraw( Mat& img, CascadeClassifier& cascade,
CascadeClassifier& nestedCascade,
double scale, bool tryflip );
string cascadeName = "../../data/haarcascades/haarcascade_frontalface_alt.xml";
string nestedCascadeName = "../../data/haarcascades/haarcascade_smile.xml";
int main( int argc, const char** argv )
{
CvCapture* capture = 0;
Mat frame, frameCopy, image;
const string scaleOpt = "--scale=";
size_t scaleOptLen = scaleOpt.length();
const string cascadeOpt = "--cascade=";
size_t cascadeOptLen = cascadeOpt.length();
const string nestedCascadeOpt = "--smile-cascade";
size_t nestedCascadeOptLen = nestedCascadeOpt.length();
const string tryFlipOpt = "--try-flip";
size_t tryFlipOptLen = tryFlipOpt.length();
string inputName;
bool tryflip = false;
help();
CascadeClassifier cascade, nestedCascade;
double scale = 1;
for( int i = 1; i < argc; i++ )
{
cout << "Processing " << i << " " << argv[i] << endl;
if( cascadeOpt.compare( 0, cascadeOptLen, argv[i], cascadeOptLen ) == 0 )
{
cascadeName.assign( argv[i] + cascadeOptLen );
cout << " from which we have cascadeName= " << cascadeName << endl;
}
else if( nestedCascadeOpt.compare( 0, nestedCascadeOptLen, argv[i], nestedCascadeOptLen ) == 0 )
{
if( argv[i][nestedCascadeOpt.length()] == '=' )
nestedCascadeName.assign( argv[i] + nestedCascadeOpt.length() + 1 );
}
else if( scaleOpt.compare( 0, scaleOptLen, argv[i], scaleOptLen ) == 0 )
{
if( !sscanf( argv[i] + scaleOpt.length(), "%lf", &scale ) || scale < 1 )
scale = 1;
cout << " from which we read scale = " << scale << endl;
}
else if( tryFlipOpt.compare( 0, tryFlipOptLen, argv[i], tryFlipOptLen ) == 0 )
{
tryflip = true;
cout << " will try to flip image horizontally to detect assymetric objects\n";
}
else if( argv[i][0] == '-' )
{
cerr << "WARNING: Unknown option " << argv[i] << endl;
}
else
inputName.assign( argv[i] );
}
if( !cascade.load( cascadeName ) )
{
cerr << "ERROR: Could not load face cascade" << endl;
help();
return -1;
}
if( !nestedCascade.load( nestedCascadeName ) )
{
cerr << "ERROR: Could not load smile cascade" << endl;
help();
return -1;
}
if( inputName.empty() || (isdigit(inputName.c_str()[0]) && inputName.c_str()[1] == '\0') )
{
capture = cvCaptureFromCAM( inputName.empty() ? 0 : inputName.c_str()[0] - '0' );
int c = inputName.empty() ? 0 : inputName.c_str()[0] - '0' ;
if(!capture) cout << "Capture from CAM " << c << " didn't work" << endl;
}
else if( inputName.size() )
{
capture = cvCaptureFromAVI( inputName.c_str() );
if(!capture) cout << "Capture from AVI didn't work" << endl;
}
cvNamedWindow( "result", 1 );
if( capture )
{
cout << "In capture ..." << endl;
cout << endl << "NOTE: Smile intensity will only be valid after a first smile has been detected" << endl;
for(;;)
{
IplImage* iplImg = cvQueryFrame( capture );
frame = iplImg;
if( frame.empty() )
break;
if( iplImg->origin == IPL_ORIGIN_TL )
frame.copyTo( frameCopy );
else
flip( frame, frameCopy, 0 );
detectAndDraw( frameCopy, cascade, nestedCascade, scale, tryflip );
if( waitKey( 10 ) >= 0 )
goto _cleanup_;
}
waitKey(0);
_cleanup_:
cvReleaseCapture( &capture );
}
else
{
cerr << "ERROR: Could not initiate capture" << endl;
help();
return -1;
}
cvDestroyWindow("result");
return 0;
}
void detectAndDraw( Mat& img, CascadeClassifier& cascade,
CascadeClassifier& nestedCascade,
double scale, bool tryflip)
{
int i = 0;
vector<Rect> faces, faces2;
const static Scalar colors[] = { CV_RGB(0,0,255),
CV_RGB(0,128,255),
CV_RGB(0,255,255),
CV_RGB(0,255,0),
CV_RGB(255,128,0),
CV_RGB(255,255,0),
CV_RGB(255,0,0),
CV_RGB(255,0,255)} ;
Mat gray, smallImg( cvRound (img.rows/scale), cvRound(img.cols/scale), CV_8UC1 );
cvtColor( img, gray, CV_BGR2GRAY );
resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
equalizeHist( smallImg, smallImg );
cascade.detectMultiScale( smallImg, faces,
1.1, 2, 0
//|CV_HAAR_FIND_BIGGEST_OBJECT
//|CV_HAAR_DO_ROUGH_SEARCH
|CV_HAAR_SCALE_IMAGE
,
Size(30, 30) );
if( tryflip )
{
flip(smallImg, smallImg, 1);
cascade.detectMultiScale( smallImg, faces2,
1.1, 2, 0
//|CV_HAAR_FIND_BIGGEST_OBJECT
//|CV_HAAR_DO_ROUGH_SEARCH
|CV_HAAR_SCALE_IMAGE
,
Size(30, 30) );
for( vector<Rect>::const_iterator r = faces2.begin(); r != faces2.end(); r++ )
{
faces.push_back(Rect(smallImg.cols - r->x - r->width, r->y, r->width, r->height));
}
}
for( vector<Rect>::iterator r = faces.begin(); r != faces.end(); r++, i++ )
{
Mat smallImgROI;
vector<Rect> nestedObjects;
Point center;
Scalar color = colors[i%8];
int radius;
double aspect_ratio = (double)r->width/r->height;
if( 0.75 < aspect_ratio && aspect_ratio < 1.3 )
{
center.x = cvRound((r->x + r->width*0.5)*scale);
center.y = cvRound((r->y + r->height*0.5)*scale);
radius = cvRound((r->width + r->height)*0.25*scale);
circle( img, center, radius, color, 3, 8, 0 );
}
else
rectangle( img, cvPoint(cvRound(r->x*scale), cvRound(r->y*scale)),
cvPoint(cvRound((r->x + r->width-1)*scale), cvRound((r->y + r->height-1)*scale)),
color, 3, 8, 0);
const int half_height=cvRound((float)r->height/2);
r->y=r->y + half_height;
r->height = half_height;
smallImgROI = smallImg(*r);
nestedCascade.detectMultiScale( smallImgROI, nestedObjects,
1.1, 0, 0
//|CV_HAAR_FIND_BIGGEST_OBJECT
//|CV_HAAR_DO_ROUGH_SEARCH
//|CV_HAAR_DO_CANNY_PRUNING
|CV_HAAR_SCALE_IMAGE
,
Size(30, 30) );
// The number of detected neighbors depends on image size (and also illumination, etc.). The
// following steps use a floating minimum and maximum of neighbors. Intensity thus estimated will be
// accurate only after a first smile has been displayed by the user.
const int smile_neighbors = (int)nestedObjects.size();
static int max_neighbors=-1;
static int min_neighbors=-1;
if (min_neighbors == -1) min_neighbors = smile_neighbors;
max_neighbors = MAX(max_neighbors, smile_neighbors);
// Draw rectangle on the left side of the image reflecting smile intensity
float intensityZeroOne = ((float)smile_neighbors - min_neighbors) / (max_neighbors - min_neighbors + 1);
int rect_height = cvRound((float)img.rows * intensityZeroOne);
CvScalar col = CV_RGB((float)255 * intensityZeroOne, 0, 0);
rectangle(img, cvPoint(0, img.rows), cvPoint(img.cols/10, img.rows - rect_height), col, -1);
}
cv::imshow( "result", img );
}
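The intensity bar drawn at the left of the frame is derived from the smile-cascade neighbor count, rescaled against the first observed count and the running maximum, which is why it only becomes meaningful after a real smile has raised the maximum. The normalization in isolation (a hedged sketch; the helper name is hypothetical):

#include <algorithm>

// Hedged sketch of the intensity estimate used above: the first observed
// neighbor count becomes the floor, the running maximum the ceiling.
float smileIntensity(int smile_neighbors)
{
    static int min_neighbors = -1;
    static int max_neighbors = -1;
    if (min_neighbors == -1) min_neighbors = smile_neighbors;
    max_neighbors = std::max(max_neighbors, smile_neighbors);
    return ((float)smile_neighbors - min_neighbors) / (max_neighbors - min_neighbors + 1);
}

For example, if the first frame reports 3 neighbors and a later smile peaks at 11, a frame with 7 neighbors maps to (7 - 3) / (11 - 3 + 1) ≈ 0.44 of the image height.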

@ -20,7 +20,7 @@
<target name="compile">
<mkdir dir="${classes.dir}"/>
<javac srcdir="${src.dir}" destdir="${classes.dir}" classpathref="classpath"/>
<javac includeantruntime="false" srcdir="${src.dir}" destdir="${classes.dir}" classpathref="classpath"/>
</target>
<target name="jar" depends="compile">

@ -1,12 +1,14 @@
import org.opencv.core.Core;
import org.opencv.core.Mat;
import org.opencv.core.CvType;
import org.opencv.core.Scalar;
class SimpleSample {
static{ System.loadLibrary("opencv_java244"); }
static{ System.loadLibrary(Core.NATIVE_LIBRARY_NAME); }
public static void main(String[] args) {
System.out.println("Welcome to OpenCV " + Core.VERSION);
Mat m = new Mat(5, 10, CvType.CV_8UC1, new Scalar(0));
System.out.println("OpenCV Mat: " + m);
Mat mr1 = m.row(1);

@ -1,10 +1,12 @@
import org.opencv.core.Core;
import org.opencv.core.CvType;
import org.opencv.core.Mat;
public class Main {
public static void main(String[] args) {
System.loadLibrary("opencv_java244");
System.out.println("Welcome to OpenCV " + Core.VERSION);
System.loadLibrary(Core.NATIVE_LIBRARY_NAME);
Mat m = Mat.eye(3, 3, CvType.CV_8UC1);
System.out.println("m = " + m.dump());
}

@ -8,11 +8,14 @@
* You're invited to submit your own examples, in any JVM language of
* your choosing so long as you can get them to build.
*/
import org.opencv.core.Core
object Main extends App {
// We must load the native library before using any OpenCV functions.
// You must load this library _exactly once_ per Java invocation.
// If you load it more than once, you will get a java.lang.UnsatisfiedLinkError.
System.loadLibrary("opencv_java")
System.loadLibrary(Core.NATIVE_LIBRARY_NAME)
ScalaCorrespondenceMatchingDemo.run()
ScalaDetectFaceDemo.run()

@ -25,7 +25,7 @@ using namespace cv;
// This program test most of the functions in ocl module and generate data metrix of x-factor in .csv files
// All images needed in this test are in samples/gpu folder.
// For haar template, please rename it to facedetect.xml
// For the Haar template, haarcascade_frontalface_alt.xml should be in the working directory
void gen(Mat &mat, int rows, int cols, int type, Scalar low, Scalar high);
string abspath(const string &relpath);
@ -707,7 +707,7 @@ TEST(matchTemplate)
for (size_t j = 0; j < sizeof(all_type) / sizeof(int); j++)
{
for(templ_size = 5; templ_size <=5; templ_size *= 5)
for(templ_size = 5; templ_size <= 5; templ_size *= 5)
{
gen(src, size, size, all_type[j], 0, 1);
@ -749,7 +749,7 @@ TEST(matchTemplate)
for (size_t j = 0; j < sizeof(all_type_8U) / sizeof(int); j++)
{
for(templ_size = 5; templ_size < 200; templ_size *= 5)
for(templ_size = 5; templ_size <= 5; templ_size *= 5)
{
SUBTEST << src.cols << 'x' << src.rows << "; " << type_name_8U[j] << "; templ " << templ_size << 'x' << templ_size << "; CCORR_NORMED";
@ -1063,9 +1063,9 @@ TEST(Haar)
CascadeClassifier faceCascadeCPU;
if (!faceCascadeCPU.load(abspath("facedetect.xml")))
if (!faceCascadeCPU.load(abspath("haarcascade_frontalface_alt.xml")))
{
throw runtime_error("can't load facedetect.xml");
throw runtime_error("can't load haarcascade_frontalface_alt.xml");
}
vector<Rect> faces;
@ -1079,9 +1079,9 @@ TEST(Haar)
#ifdef USE_OPENCL
ocl::CascadeClassifier_GPU faceCascade;
if (!faceCascade.load(abspath("facedetect.xml")))
if (!faceCascade.load(abspath("haarcascade_frontalface_alt.xml")))
{
throw runtime_error("can't load facedetect.xml");
throw runtime_error("can't load haarcascade_frontalface_alt.xml");
}
ocl::oclMat d_img(img);
@ -4345,7 +4345,7 @@ int main(int argc, const char *argv[])
{
if (device == devidx)
{
ocl::setDevice(oclinfo[i], j);
ocl::setDevice(oclinfo[i], (int)j);
TestSystem::instance().setRecordName(oclinfo[i].DeviceName[j]);
printf("\nuse %d: %s\n", devidx, oclinfo[i].DeviceName[j].c_str());
goto END_DEV;
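The device-selection fix above casts the device index to int before calling ocl::setDevice. For context, that selection step looks roughly like this in isolation (a hedged sketch; cv::ocl::getDevice and the Info::DeviceName field are used as in the sample, and the helper name is hypothetical):

#include "opencv2/ocl/ocl.hpp"
#include <cstdio>
#include <vector>

// Hedged sketch: enumerate OpenCL devices and make device `devidx` current,
// mirroring the selection loop in the performance sample.
static bool selectOclDevice(int devidx)
{
    std::vector<cv::ocl::Info> oclinfo;
    cv::ocl::getDevice(oclinfo);                                   // enumerate available devices
    int device = 0;
    for (size_t i = 0; i < oclinfo.size(); ++i)
        for (size_t j = 0; j < oclinfo[i].DeviceName.size(); ++j, ++device)
            if (device == devidx)
            {
                cv::ocl::setDevice(oclinfo[i], (int)j);
                printf("use %d: %s\n", devidx, oclinfo[i].DeviceName[j].c_str());
                return true;
            }
    return false;                                                  // requested index not found
}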
