Merge branch 'master' of git://github.com/Itseez/opencv into ocl_hough

pull/267/head
Suenghoon Park 12 years ago
commit ecb6c20915
  1. 26
      CMakeLists.txt
  2. 22
      android/android.toolchain.cmake
  3. 4
      android/service/CMakeLists.txt
  4. 22
      android/service/ReadMe.txt
  5. 44
      android/service/doc/UseCases.rst
  6. 4
      android/service/engine/AndroidManifest.xml
  7. 4
      android/service/engine/jni/BinderComponent/OpenCVEngine.cpp
  8. 12
      android/service/engine/jni/JNIWrapper/OpenCVLibraryInfo.cpp
  9. 8
      android/service/engine/jni/NativeService/CommonPackageManager.cpp
  10. 8
      android/service/engine/jni/NativeService/PackageInfo.cpp
  11. 2
      android/service/engine/jni/include/EngineCommon.h
  12. 2
      android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java
  13. 28
      android/service/readme.txt
  14. 2
      cmake/OpenCVCompilerOptions.cmake
  15. 27
      cmake/OpenCVDetectCUDA.cmake
  16. 2
      cmake/OpenCVPCHSupport.cmake
  17. 21
      cmake/templates/cvconfig.h.cmake
  18. 25
      doc/CMakeLists.txt
  19. 2
      doc/tutorials/imgproc/shapedescriptors/bounding_rects_circles/bounding_rects_circles.rst
  20. 24
      doc/tutorials/introduction/android_binary_package/O4A_SDK.rst
  21. 16
      doc/tutorials/introduction/android_binary_package/dev_with_OCV_on_Android.rst
  22. BIN
      doc/tutorials/introduction/android_binary_package/images/install_opencv_manager_with_adb.png
  23. 8
      doc/user_guide/ug_mat.rst
  24. 1
      modules/androidcamera/src/camera_activity.cpp
  25. 2
      modules/calib3d/perf/perf_pnp.cpp
  26. 8
      modules/calib3d/src/calibration.cpp
  27. 4
      modules/contrib/src/chamfermatching.cpp
  28. 2
      modules/contrib/src/spinimages.cpp
  29. 1
      modules/core/CMakeLists.txt
  30. 27
      modules/core/include/opencv2/core/core.hpp
  31. 14
      modules/core/include/opencv2/core/cuda_devptrs.hpp
  32. 18
      modules/core/include/opencv2/core/gpumat.hpp
  33. 35
      modules/core/include/opencv2/core/internal.hpp
  34. 295
      modules/core/include/opencv2/core/opengl_interop.hpp
  35. 10
      modules/core/include/opencv2/core/operations.hpp
  36. 2
      modules/core/src/command_line_parser.cpp
  37. 108
      modules/core/src/cuda/matrix_operations.cu
  38. 2718
      modules/core/src/gl_core_3_1.cpp
  39. 1331
      modules/core/src/gl_core_3_1.hpp
  40. 198
      modules/core/src/gpumat.cpp
  41. 83
      modules/core/src/matrix.cpp
  42. 1761
      modules/core/src/opengl_interop.cpp
  43. 31
      modules/core/src/persistence.cpp
  44. 33
      modules/core/src/system.cpp
  45. 4
      modules/features2d/perf/perf_batchDistance.cpp
  46. 2
      modules/features2d/perf/perf_fast.cpp
  47. 6
      modules/features2d/src/keypoint.cpp
  48. 10
      modules/features2d/src/matchers.cpp
  49. 2
      modules/features2d/src/precomp.hpp
  50. 16
      modules/gpu/CMakeLists.txt
  51. 10
      modules/gpu/app/nv_perf_test/CMakeLists.txt
  52. BIN
      modules/gpu/app/nv_perf_test/im1_1280x800.jpg
  53. BIN
      modules/gpu/app/nv_perf_test/im2_1280x800.jpg
  54. 489
      modules/gpu/app/nv_perf_test/main.cpp
  55. 85
      modules/gpu/doc/object_detection.rst
  56. 0
      modules/gpu/include/opencv2/gpu/device/block.hpp
  57. 10
      modules/gpu/include/opencv2/gpu/device/common.hpp
  58. 2
      modules/gpu/include/opencv2/gpu/device/detail/color_detail.hpp
  59. 361
      modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp
  60. 498
      modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp
  61. 841
      modules/gpu/include/opencv2/gpu/device/detail/reduction_detail.hpp
  62. 1
      modules/gpu/include/opencv2/gpu/device/emulation.hpp
  63. 123
      modules/gpu/include/opencv2/gpu/device/functional.hpp
  64. 197
      modules/gpu/include/opencv2/gpu/device/reduce.hpp
  65. 158
      modules/gpu/include/opencv2/gpu/device/saturate_cast.hpp
  66. 24
      modules/gpu/include/opencv2/gpu/device/utility.hpp
  67. 10
      modules/gpu/include/opencv2/gpu/device/vec_distance.hpp
  68. 2
      modules/gpu/include/opencv2/gpu/device/vec_math.hpp
  69. 145
      modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp
  70. 232
      modules/gpu/include/opencv2/gpu/gpu.hpp
  71. 26
      modules/gpu/misc/carma.toolchain.cmake
  72. 882
      modules/gpu/perf/perf_core.cpp
  73. 111
      modules/gpu/perf/perf_imgproc.cpp
  74. 1
      modules/gpu/perf/perf_objdetect.cpp
  75. 279
      modules/gpu/perf/perf_softcascade.cpp
  76. 167
      modules/gpu/perf/perf_video.cpp
  77. 16
      modules/gpu/src/arithm.cpp
  78. 84
      modules/gpu/src/brute_force_matcher.cpp
  79. 2
      modules/gpu/src/cascadeclassifier.cpp
  80. 218
      modules/gpu/src/cuda/bf_knnmatch.cu
  81. 129
      modules/gpu/src/cuda/bf_match.cu
  82. 115
      modules/gpu/src/cuda/bf_radius_match.cu
  83. 25
      modules/gpu/src/cuda/calib3d.cu
  84. 410
      modules/gpu/src/cuda/canny.cu
  85. 1
      modules/gpu/src/cuda/ccomponetns.cu
  86. 53
      modules/gpu/src/cuda/column_filter.0.cu
  87. 53
      modules/gpu/src/cuda/column_filter.1.cu
  88. 53
      modules/gpu/src/cuda/column_filter.10.cu
  89. 53
      modules/gpu/src/cuda/column_filter.11.cu
  90. 53
      modules/gpu/src/cuda/column_filter.12.cu
  91. 53
      modules/gpu/src/cuda/column_filter.13.cu
  92. 53
      modules/gpu/src/cuda/column_filter.14.cu
  93. 53
      modules/gpu/src/cuda/column_filter.2.cu
  94. 53
      modules/gpu/src/cuda/column_filter.3.cu
  95. 53
      modules/gpu/src/cuda/column_filter.4.cu
  96. 53
      modules/gpu/src/cuda/column_filter.5.cu
  97. 53
      modules/gpu/src/cuda/column_filter.6.cu
  98. 53
      modules/gpu/src/cuda/column_filter.7.cu
  99. 53
      modules/gpu/src/cuda/column_filter.8.cu
  100. 53
      modules/gpu/src/cuda/column_filter.9.cu
  101. Some files were not shown because too many files have changed in this diff Show More

@ -110,12 +110,13 @@ endif()
# Optional 3rd party components # Optional 3rd party components
# =================================================== # ===================================================
OCV_OPTION(WITH_1394 "Include IEEE1394 support" ON IF (UNIX AND NOT ANDROID AND NOT IOS) ) OCV_OPTION(WITH_1394 "Include IEEE1394 support" ON IF (UNIX AND NOT ANDROID AND NOT IOS AND NOT CARMA) )
OCV_OPTION(WITH_AVFOUNDATION "Use AVFoundation for Video I/O" ON IF IOS) OCV_OPTION(WITH_AVFOUNDATION "Use AVFoundation for Video I/O" ON IF IOS)
OCV_OPTION(WITH_CARBON "Use Carbon for UI instead of Cocoa" OFF IF APPLE ) OCV_OPTION(WITH_CARBON "Use Carbon for UI instead of Cocoa" OFF IF APPLE )
OCV_OPTION(WITH_CUBLAS "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_CUDA "Include NVidia Cuda Runtime support" ON IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) ) OCV_OPTION(WITH_CUDA "Include NVidia Cuda Runtime support" ON IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_CUFFT "Include NVidia Cuda Fast Fourier Transform (FFT) library support" ON IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) ) OCV_OPTION(WITH_CUFFT "Include NVidia Cuda Fast Fourier Transform (FFT) library support" ON IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_CUBLAS "Include NVidia Cuda Basic Linear Algebra Subprograms (BLAS) library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS) )
OCV_OPTION(WITH_NVCUVID "Include NVidia Video Decoding library support" OFF IF (CMAKE_VERSION VERSION_GREATER "2.8" AND NOT ANDROID AND NOT IOS AND NOT APPLE) )
OCV_OPTION(WITH_EIGEN "Include Eigen2/Eigen3 support" ON) OCV_OPTION(WITH_EIGEN "Include Eigen2/Eigen3 support" ON)
OCV_OPTION(WITH_FFMPEG "Include FFMPEG support" ON IF (NOT ANDROID AND NOT IOS)) OCV_OPTION(WITH_FFMPEG "Include FFMPEG support" ON IF (NOT ANDROID AND NOT IOS))
OCV_OPTION(WITH_GSTREAMER "Include Gstreamer support" ON IF (UNIX AND NOT APPLE AND NOT ANDROID) ) OCV_OPTION(WITH_GSTREAMER "Include Gstreamer support" ON IF (UNIX AND NOT APPLE AND NOT ANDROID) )
@ -139,9 +140,9 @@ OCV_OPTION(WITH_VIDEOINPUT "Build HighGUI with DirectShow support" ON
OCV_OPTION(WITH_XIMEA "Include XIMEA cameras support" OFF IF (NOT ANDROID AND NOT APPLE) ) OCV_OPTION(WITH_XIMEA "Include XIMEA cameras support" OFF IF (NOT ANDROID AND NOT APPLE) )
OCV_OPTION(WITH_XINE "Include Xine support (GPL)" OFF IF (UNIX AND NOT APPLE AND NOT ANDROID) ) OCV_OPTION(WITH_XINE "Include Xine support (GPL)" OFF IF (UNIX AND NOT APPLE AND NOT ANDROID) )
OCV_OPTION(WITH_CLP "Include Clp support (EPL)" OFF) OCV_OPTION(WITH_CLP "Include Clp support (EPL)" OFF)
OCV_OPTION(WITH_OPENCL "Include OpenCL Runtime support" OFF IF (NOT ANDROID AND NOT IOS) ) OCV_OPTION(WITH_OPENCL "Include OpenCL Runtime support" OFF IF (NOT ANDROID AND NOT IOS AND NOT CARMA) )
OCV_OPTION(WITH_OPENCLAMDFFT "Include AMD OpenCL FFT library support" OFF IF (NOT ANDROID AND NOT IOS) ) OCV_OPTION(WITH_OPENCLAMDFFT "Include AMD OpenCL FFT library support" OFF IF (NOT ANDROID AND NOT IOS AND NOT CARMA) )
OCV_OPTION(WITH_OPENCLAMDBLAS "Include AMD OpenCL BLAS library support" OFF IF (NOT ANDROID AND NOT IOS) ) OCV_OPTION(WITH_OPENCLAMDBLAS "Include AMD OpenCL BLAS library support" OFF IF (NOT ANDROID AND NOT IOS AND NOT CARMA) )
# OpenCV build components # OpenCV build components
@ -160,12 +161,12 @@ OCV_OPTION(BUILD_ANDROID_SERVICE "Build OpenCV Manager for Google Play" OFF I
OCV_OPTION(BUILD_ANDROID_PACKAGE "Build platform-specific package for Google Play" OFF IF ANDROID ) OCV_OPTION(BUILD_ANDROID_PACKAGE "Build platform-specific package for Google Play" OFF IF ANDROID )
# 3rd party libs # 3rd party libs
OCV_OPTION(BUILD_ZLIB "Build zlib from source" WIN32 OR APPLE ) OCV_OPTION(BUILD_ZLIB "Build zlib from source" WIN32 OR APPLE OR CARMA )
OCV_OPTION(BUILD_TIFF "Build libtiff from source" WIN32 OR ANDROID OR APPLE ) OCV_OPTION(BUILD_TIFF "Build libtiff from source" WIN32 OR ANDROID OR APPLE OR CARMA )
OCV_OPTION(BUILD_JASPER "Build libjasper from source" WIN32 OR ANDROID OR APPLE ) OCV_OPTION(BUILD_JASPER "Build libjasper from source" WIN32 OR ANDROID OR APPLE OR CARMA )
OCV_OPTION(BUILD_JPEG "Build libjpeg from source" WIN32 OR ANDROID OR APPLE ) OCV_OPTION(BUILD_JPEG "Build libjpeg from source" WIN32 OR ANDROID OR APPLE OR CARMA )
OCV_OPTION(BUILD_PNG "Build libpng from source" WIN32 OR ANDROID OR APPLE ) OCV_OPTION(BUILD_PNG "Build libpng from source" WIN32 OR ANDROID OR APPLE OR CARMA )
OCV_OPTION(BUILD_OPENEXR "Build openexr from source" WIN32 OR ANDROID OR APPLE ) OCV_OPTION(BUILD_OPENEXR "Build openexr from source" WIN32 OR ANDROID OR APPLE OR CARMA )
# OpenCV installation options # OpenCV installation options
@ -458,7 +459,7 @@ if(BUILD_EXAMPLES OR BUILD_ANDROID_EXAMPLES OR INSTALL_PYTHON_EXAMPLES)
add_subdirectory(samples) add_subdirectory(samples)
endif() endif()
if(BUILD_ANDROID_SERVICE) if(ANDROID)
add_subdirectory(android/service) add_subdirectory(android/service)
endif() endif()
@ -564,6 +565,7 @@ if(ANDROID)
status("") status("")
status(" Android: ") status(" Android: ")
status(" Android ABI:" ${ANDROID_ABI}) status(" Android ABI:" ${ANDROID_ABI})
status(" STL type:" ${ANDROID_STL})
status(" Native API level:" android-${ANDROID_NATIVE_API_LEVEL}) status(" Native API level:" android-${ANDROID_NATIVE_API_LEVEL})
status(" SDK target:" "${ANDROID_SDK_TARGET}") status(" SDK target:" "${ANDROID_SDK_TARGET}")
if(BUILD_WITH_ANDROID_NDK) if(BUILD_WITH_ANDROID_NDK)

@ -280,6 +280,9 @@
# - November 2012 # - November 2012
# [+] updated for NDK r8c # [+] updated for NDK r8c
# [+] added support for clang compiler # [+] added support for clang compiler
# - December 2012
# [~] fixed ccache full path search
# [+] updated for NDK r8d
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
cmake_minimum_required( VERSION 2.6.3 ) cmake_minimum_required( VERSION 2.6.3 )
@ -302,7 +305,7 @@ set( CMAKE_SYSTEM_VERSION 1 )
# rpath makes low sence for Android # rpath makes low sence for Android
set( CMAKE_SKIP_RPATH TRUE CACHE BOOL "If set, runtime paths are not added when using shared libraries." ) set( CMAKE_SKIP_RPATH TRUE CACHE BOOL "If set, runtime paths are not added when using shared libraries." )
set( ANDROID_SUPPORTED_NDK_VERSIONS ${ANDROID_EXTRA_NDK_VERSIONS} -r8c -r8b -r8 -r7c -r7b -r7 -r6b -r6 -r5c -r5b -r5 "" ) set( ANDROID_SUPPORTED_NDK_VERSIONS ${ANDROID_EXTRA_NDK_VERSIONS} -r8d -r8c -r8b -r8 -r7c -r7b -r7 -r6b -r6 -r5c -r5b -r5 "" )
if(NOT DEFINED ANDROID_NDK_SEARCH_PATHS) if(NOT DEFINED ANDROID_NDK_SEARCH_PATHS)
if( CMAKE_HOST_WIN32 ) if( CMAKE_HOST_WIN32 )
file( TO_CMAKE_PATH "$ENV{PROGRAMFILES}" ANDROID_NDK_SEARCH_PATHS ) file( TO_CMAKE_PATH "$ENV{PROGRAMFILES}" ANDROID_NDK_SEARCH_PATHS )
@ -962,7 +965,11 @@ if( BUILD_WITH_ANDROID_NDK )
set( ANDROID_STL_INCLUDE_DIRS "${ANDROID_NDK}/sources/cxx-stl/gabi++/include" ) set( ANDROID_STL_INCLUDE_DIRS "${ANDROID_NDK}/sources/cxx-stl/gabi++/include" )
set( __libstl "${ANDROID_NDK}/sources/cxx-stl/gabi++/libs/${ANDROID_NDK_ABI_NAME}/libgabi++_static.a" ) set( __libstl "${ANDROID_NDK}/sources/cxx-stl/gabi++/libs/${ANDROID_NDK_ABI_NAME}/libgabi++_static.a" )
elseif( ANDROID_STL MATCHES "stlport" ) elseif( ANDROID_STL MATCHES "stlport" )
if( NOT ANDROID_NDK_RELEASE STRLESS "r8d" )
set( ANDROID_EXCEPTIONS ON )
else()
set( ANDROID_EXCEPTIONS OFF ) set( ANDROID_EXCEPTIONS OFF )
endif()
if( ANDROID_NDK_RELEASE STRLESS "r7" ) if( ANDROID_NDK_RELEASE STRLESS "r7" )
set( ANDROID_RTTI OFF ) set( ANDROID_RTTI OFF )
else() else()
@ -974,7 +981,13 @@ if( BUILD_WITH_ANDROID_NDK )
set( ANDROID_EXCEPTIONS ON ) set( ANDROID_EXCEPTIONS ON )
set( ANDROID_RTTI ON ) set( ANDROID_RTTI ON )
if( EXISTS "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}" ) if( EXISTS "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}" )
if( ARMEABI_V7A AND ANDROID_COMPILER_VERSION VERSION_EQUAL "4.7" AND ANDROID_NDK_RELEASE STREQUAL "r8d" )
# gnustl binary for 4.7 compiler is buggy :(
# TODO: look for right fix
set( __libstl "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/4.6" )
else()
set( __libstl "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}" ) set( __libstl "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}" )
endif()
else() else()
set( __libstl "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++" ) set( __libstl "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++" )
endif() endif()
@ -1031,6 +1044,9 @@ endif()
# ccache support # ccache support
__INIT_VARIABLE( _ndk_ccache NDK_CCACHE ENV_NDK_CCACHE ) __INIT_VARIABLE( _ndk_ccache NDK_CCACHE ENV_NDK_CCACHE )
if( _ndk_ccache ) if( _ndk_ccache )
if( DEFINED NDK_CCACHE AND NOT EXISTS NDK_CCACHE )
unset( NDK_CCACHE CACHE )
endif()
find_program( NDK_CCACHE "${_ndk_ccache}" DOC "The path to ccache binary") find_program( NDK_CCACHE "${_ndk_ccache}" DOC "The path to ccache binary")
else() else()
unset( NDK_CCACHE CACHE ) unset( NDK_CCACHE CACHE )
@ -1260,7 +1276,7 @@ endif()
if( ANDROID_COMPILER_VERSION VERSION_EQUAL "4.6" ) if( ANDROID_COMPILER_VERSION VERSION_EQUAL "4.6" )
if( ANDROID_GOLD_LINKER AND (CMAKE_HOST_UNIX OR ANDROID_NDK_RELEASE STRGREATER "r8b") AND (ARMEABI OR ARMEABI_V7A OR X86) ) if( ANDROID_GOLD_LINKER AND (CMAKE_HOST_UNIX OR ANDROID_NDK_RELEASE STRGREATER "r8b") AND (ARMEABI OR ARMEABI_V7A OR X86) )
set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -fuse-ld=gold" ) set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -fuse-ld=gold" )
elseif( ANDROID_NDK_RELEASE STREQUAL "r8c") elseif( ANDROID_NDK_RELEASE STRGREATER "r8b")
set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -fuse-ld=bfd" ) set( ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} -fuse-ld=bfd" )
elseif( ANDROID_NDK_RELEASE STREQUAL "r8b" AND ARMEABI AND NOT _CMAKE_IN_TRY_COMPILE ) elseif( ANDROID_NDK_RELEASE STREQUAL "r8b" AND ARMEABI AND NOT _CMAKE_IN_TRY_COMPILE )
message( WARNING "The default bfd linker from arm GCC 4.6 toolchain can fail with 'unresolvable R_ARM_THM_CALL relocation' error message. See https://code.google.com/p/android/issues/detail?id=35342 message( WARNING "The default bfd linker from arm GCC 4.6 toolchain can fail with 'unresolvable R_ARM_THM_CALL relocation' error message. See https://code.google.com/p/android/issues/detail?id=35342
@ -1520,7 +1536,7 @@ endif()
# BUILD_WITH_STANDALONE_TOOLCHAIN : TRUE if standalone toolchain is used # BUILD_WITH_STANDALONE_TOOLCHAIN : TRUE if standalone toolchain is used
# ANDROID_NDK_HOST_SYSTEM_NAME : "windows", "linux-x86" or "darwin-x86" depending on host platform # ANDROID_NDK_HOST_SYSTEM_NAME : "windows", "linux-x86" or "darwin-x86" depending on host platform
# ANDROID_NDK_ABI_NAME : "armeabi", "armeabi-v7a", "x86" or "mips" depending on ANDROID_ABI # ANDROID_NDK_ABI_NAME : "armeabi", "armeabi-v7a", "x86" or "mips" depending on ANDROID_ABI
# ANDROID_NDK_RELEASE : one of r5, r5b, r5c, r6, r6b, r7, r7b, r7c, r8, r8b, r8c; set only for NDK # ANDROID_NDK_RELEASE : one of r5, r5b, r5c, r6, r6b, r7, r7b, r7c, r8, r8b, r8c, r8d; set only for NDK
# ANDROID_ARCH_NAME : "arm" or "x86" or "mips" depending on ANDROID_ABI # ANDROID_ARCH_NAME : "arm" or "x86" or "mips" depending on ANDROID_ABI
# ANDROID_SYSROOT : path to the compiler sysroot # ANDROID_SYSROOT : path to the compiler sysroot
# TOOL_OS_SUFFIX : "" or ".exe" depending on host platform # TOOL_OS_SUFFIX : "" or ".exe" depending on host platform

@ -1,2 +1,6 @@
if(BUILD_ANDROID_SERVICE)
add_subdirectory(engine) add_subdirectory(engine)
#add_subdirectory(engine_test) #add_subdirectory(engine_test)
endif()
install(FILES "readme.txt" DESTINATION "apk/" COMPONENT main)

@ -1,22 +0,0 @@
***************
Package Content
***************
The package provides new OpenCV SDK that uses OpenCV Manager for library initialization. OpenCV Manager provides the following benefits:
* Less memory usage. All apps use the same binaries from the service and do not keep native libs inside themselves;
* Hardware specific optimizations for all supported platforms;
* Trusted OpenCV library source. All packages with OpenCV are published on Google Play service;
* Regular updates and bug fixes;
The package consists of a Library Project for Java development with Eclipse, C++ headers and libraries for native application development, javadoc samples and prebuilt binaries for ARM and X86 platforms.
To try the new SDK on a serial device with Google Play, just install a sample package and follow the application messages (Google Play service access will be needed).
To start an example on a device without Google Play, you first need to install the OpenCV Manager package and the OpenCV binary pack for your platform from the apk folder.
See docs/doc/tutorials/introduction/android_binary_package/android_binary_package.html and docs/android/refmain.html for details about service.
On-line documentation will be available at address: http://docs.opencv.org/trunk
********
Contacts
********
Please send all feedback to Alexander Smorkalov mailto: alexander.smorkalov@itseez.com

@ -1,6 +1,40 @@
*******************************************
Manager Workflow Manager Workflow
******************************************* ****************
.. _manager_selection:
OpenCV Manager selection
------------------------
Since version 1.7 several packages of OpenCV Manager are built. Every package includes the OpenCV library
for the package's target platform. The internal library is used in most cases, except the rare one when
an arm-v7a processor without the NEON instruction set is detected. In this case an additional binary package
for arm-v7a is used. The new package selection logic in most cases simplifies OpenCV installation
on end-user devices. In most cases OpenCV Manager may be installed automatically from Google Play.
When Google Play is not available (e.g. on an emulator or a developer board), you can
install it manually using the adb tool.
.. code-block:: sh
:linenos:
adb install OpenCV-2.4.3-android-sdk/apk/OpenCV_2.4.3_Manager_2.0_<platform_name>.apk
Use the table below to determine the right OpenCV Manager package:
+----------------------------+-----------------+-----------------------------------------------------+
| Hardware Platform | Android version | Package name |
+============================+=================+=====================================================+
| Intel x86 | >= 2.3 | OpenCV_2.4.3_Manager_2.0_x86.apk |
+----------------------------+-----------------+-----------------------------------------------------+
| MIPS | >= 2.3 | OpenCV_2.4.3_Manager_2.0_mips.apk |
+----------------------------+-----------------+-----------------------------------------------------+
| armeabi (arm-v5, arm-v6) | >= 2.3 | OpenCV_2.4.3_Manager_2.0_armeabi.apk |
+----------------------------+-----------------+-----------------------------------------------------+
| armeabi-v7a (arm-v7a-NEON) | >= 2.3 | OpenCV_2.4.3_Manager_2.0_armv7a-neon.apk |
+----------------------------+-----------------+-----------------------------------------------------+
| armeabi-v7a (arm-v7a-NEON) | 2.2 | OpenCV_2.4.3.1_Manager_2.3_armv7a-neon-android8.apk |
+----------------------------+-----------------+-----------------------------------------------------+
First application start First application start
----------------------- -----------------------
@ -9,10 +43,10 @@ There is no OpenCV Manager or OpenCV libraries:
.. image:: img/NoService.png .. image:: img/NoService.png
Aditional library package installation Additional library package installation
-------------------------------------- ---------------------------------------
There is an OpenCV Manager service, but there is no apropriate OpenCV library. There is an OpenCV Manager service, but it does not contain appropriate OpenCV library.
If OpenCV library installation has been approved\: If OpenCV library installation has been approved\:
.. image:: img/LibInstallAproved.png .. image:: img/LibInstallAproved.png

@ -1,8 +1,8 @@
<?xml version="1.0" encoding="utf-8"?> <?xml version="1.0" encoding="utf-8"?>
<manifest xmlns:android="http://schemas.android.com/apk/res/android" <manifest xmlns:android="http://schemas.android.com/apk/res/android"
package="org.opencv.engine" package="org.opencv.engine"
android:versionCode="23@ANDROID_PLATFORM_VERSION_CODE@" android:versionCode="24@ANDROID_PLATFORM_VERSION_CODE@"
android:versionName="2.3" > android:versionName="2.4" >
<uses-sdk android:minSdkVersion="@ANDROID_NATIVE_API_LEVEL@" /> <uses-sdk android:minSdkVersion="@ANDROID_NATIVE_API_LEVEL@" />
<uses-feature android:name="android.hardware.touchscreen" android:required="false"/> <uses-feature android:name="android.hardware.touchscreen" android:required="false"/>

@ -130,7 +130,7 @@ android::String16 OpenCVEngine::GetLibraryList(android::String16 version)
LOGD("Trying to load info library \"%s\"", tmp.c_str()); LOGD("Trying to load info library \"%s\"", tmp.c_str());
void* handle; void* handle;
const char* (*info_func)(); InfoFunctionType info_func;
handle = dlopen(tmp.c_str(), RTLD_LAZY); handle = dlopen(tmp.c_str(), RTLD_LAZY);
if (handle) if (handle)
@ -138,7 +138,7 @@ android::String16 OpenCVEngine::GetLibraryList(android::String16 version)
const char* error; const char* error;
dlerror(); dlerror();
*(void **) (&info_func) = dlsym(handle, "GetLibraryList"); info_func = (InfoFunctionType)dlsym(handle, "GetLibraryList");
if ((error = dlerror()) == NULL) if ((error = dlerror()) == NULL)
{ {
result = String16((*info_func)()); result = String16((*info_func)());

@ -24,12 +24,12 @@ JNIEXPORT jlong JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_open
JNIEXPORT jstring JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_getPackageName JNIEXPORT jstring JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_getPackageName
(JNIEnv* env, jobject, jlong handle) (JNIEnv* env, jobject, jlong handle)
{ {
const char* (*info_func)(); InfoFunctionType info_func;
const char* result; const char* result;
const char* error; const char* error;
dlerror(); dlerror();
*(void **) (&info_func) = dlsym((void*)handle, "GetPackageName"); info_func = (InfoFunctionType)dlsym((void*)handle, "GetPackageName");
if ((error = dlerror()) == NULL) if ((error = dlerror()) == NULL)
result = (*info_func)(); result = (*info_func)();
else else
@ -44,12 +44,12 @@ JNIEXPORT jstring JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_getPackageNam
JNIEXPORT jstring JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_getLibraryList JNIEXPORT jstring JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_getLibraryList
(JNIEnv* env, jobject, jlong handle) (JNIEnv* env, jobject, jlong handle)
{ {
const char* (*info_func)(); InfoFunctionType info_func;
const char* result; const char* result;
const char* error; const char* error;
dlerror(); dlerror();
*(void **) (&info_func) = dlsym((void*)handle, "GetLibraryList"); info_func = (InfoFunctionType)dlsym((void*)handle, "GetLibraryList");
if ((error = dlerror()) == NULL) if ((error = dlerror()) == NULL)
result = (*info_func)(); result = (*info_func)();
else else
@ -64,12 +64,12 @@ JNIEXPORT jstring JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_getLibraryLis
JNIEXPORT jstring JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_getVersionName JNIEXPORT jstring JNICALL Java_org_opencv_engine_OpenCVLibraryInfo_getVersionName
(JNIEnv* env, jobject, jlong handle) (JNIEnv* env, jobject, jlong handle)
{ {
const char* (*info_func)(); InfoFunctionType info_func;
const char* result; const char* result;
const char* error; const char* error;
dlerror(); dlerror();
*(void **) (&info_func) = dlsym((void*)handle, "GetRevision"); info_func = (InfoFunctionType)dlsym((void*)handle, "GetRevision");
if ((error = dlerror()) == NULL) if ((error = dlerror()) == NULL)
result = (*info_func)(); result = (*info_func)();
else else

@ -144,6 +144,13 @@ int CommonPackageManager::GetHardwareRating(int platform, int cpu_id, const std:
{ {
int result = -1; int result = -1;
if ((cpu_id & ARCH_X86) || (cpu_id & ARCH_X64) || (cpu_id & ARCH_MIPS))
// Note: No rating for x86, x64 and MIPS
// only one package is used
result = 0;
else
{
// Calculate rating for Arm
for (size_t i = 0; i < group.size(); i++) for (size_t i = 0; i < group.size(); i++)
{ {
if (group[i] == std::pair<int, int>(platform, cpu_id)) if (group[i] == std::pair<int, int>(platform, cpu_id))
@ -152,6 +159,7 @@ int CommonPackageManager::GetHardwareRating(int platform, int cpu_id, const std:
break; break;
} }
} }
}
return result; return result;
} }

@ -342,8 +342,8 @@ InstallPath(install_path)
LOGD("Trying to load info library \"%s\"", tmp.c_str()); LOGD("Trying to load info library \"%s\"", tmp.c_str());
void* handle; void* handle;
const char* (*name_func)(); InfoFunctionType name_func;
const char* (*revision_func)(); InfoFunctionType revision_func;
handle = dlopen(tmp.c_str(), RTLD_LAZY); handle = dlopen(tmp.c_str(), RTLD_LAZY);
if (handle) if (handle)
@ -351,8 +351,8 @@ InstallPath(install_path)
const char* error; const char* error;
dlerror(); dlerror();
*(void **) (&name_func) = dlsym(handle, "GetPackageName"); name_func = (InfoFunctionType)dlsym(handle, "GetPackageName");
*(void **) (&revision_func) = dlsym(handle, "GetRevision"); revision_func = (InfoFunctionType)dlsym(handle, "GetRevision");
error = dlerror(); error = dlerror();
if (!error && revision_func && name_func) if (!error && revision_func && name_func)

@ -17,4 +17,6 @@
// Class name of OpenCV engine binder object. Is needned for connection to service // Class name of OpenCV engine binder object. Is needned for connection to service
#define OPECV_ENGINE_CLASSNAME "org.opencv.engine.OpenCVEngineInterface" #define OPECV_ENGINE_CLASSNAME "org.opencv.engine.OpenCVEngineInterface"
typedef const char* (*InfoFunctionType)();
#endif #endif

@ -358,6 +358,8 @@ public class ManagerActivity extends Activity
else else
{ {
temp.put("Activity", "n"); temp.put("Activity", "n");
if (!PublicName.equals("Built-in OpenCV library"))
Tags = "safe to remove";
} }
} }
else else

@ -0,0 +1,28 @@
OpenCV Manager selection
========================
Since version 1.7 several packages of OpenCV Manager are built. Every package includes the OpenCV library
for the package's target platform. The internal library is used in most cases, except the rare one when
an arm-v7a processor without the NEON instruction set is detected. In this case an additional binary package
for arm-v7a is used. The new package selection logic in most cases simplifies OpenCV installation
on end-user devices. In most cases OpenCV Manager may be installed automatically from Google Play.
When Google Play is not available (e.g. on an emulator or a developer board), you can
install it manually using the adb tool:
adb install OpenCV-2.4.3-android-sdk/apk/OpenCV_2.4.3.2_Manager_2.4_<platform_name>.apk
Use the table below to determine the right OpenCV Manager package:
+----------------------------+-----------------+-----------------------------------------------------+
| Hardware Platform | Android version | Package name |
+============================+=================+=====================================================+
| Intel x86 | >= 2.3 | OpenCV_2.4.3.2_Manager_2.4_x86.apk |
+----------------------------+-----------------+-----------------------------------------------------+
| MIPS | >= 2.3 | OpenCV_2.4.3.2_Manager_2.4_mips.apk |
+----------------------------+-----------------+-----------------------------------------------------+
| armeabi (arm-v5, arm-v6) | >= 2.3 | OpenCV_2.4.3.2_Manager_2.4_armeabi.apk |
+----------------------------+-----------------+-----------------------------------------------------+
| armeabi-v7a (arm-v7a-NEON) | >= 2.3 | OpenCV_2.4.3.2_Manager_2.4_armv7a-neon.apk |
+----------------------------+-----------------+-----------------------------------------------------+
| armeabi-v7a (arm-v7a-NEON) | 2.2 | OpenCV_2.4.3.2_Manager_2.4_armv7a-neon-android8.apk |
+----------------------------+-----------------+-----------------------------------------------------+

@ -61,7 +61,7 @@ if(CMAKE_COMPILER_IS_GNUCXX)
add_extra_compiler_option(-W) add_extra_compiler_option(-W)
add_extra_compiler_option(-Wall) add_extra_compiler_option(-Wall)
add_extra_compiler_option(-Werror=return-type) add_extra_compiler_option(-Werror=return-type)
#add_extra_compiler_option(-Werror=non-virtual-dtor) add_extra_compiler_option(-Werror=non-virtual-dtor)
add_extra_compiler_option(-Werror=address) add_extra_compiler_option(-Werror=address)
add_extra_compiler_option(-Werror=sequence-point) add_extra_compiler_option(-Werror=sequence-point)
add_extra_compiler_option(-Wformat) add_extra_compiler_option(-Wformat)

@ -13,7 +13,7 @@ if (CMAKE_COMPILER_IS_GNUCXX AND NOT APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL "C
return() return()
endif() endif()
find_package(CUDA 4.1) find_package(CUDA 4.2)
if(CUDA_FOUND) if(CUDA_FOUND)
set(HAVE_CUDA 1) set(HAVE_CUDA 1)
@ -26,16 +26,21 @@ if(CUDA_FOUND)
set(HAVE_CUBLAS 1) set(HAVE_CUBLAS 1)
endif() endif()
if(WITH_NVCUVID)
find_cuda_helper_libs(nvcuvid)
set(HAVE_NVCUVID 1)
endif()
message(STATUS "CUDA detected: " ${CUDA_VERSION}) message(STATUS "CUDA detected: " ${CUDA_VERSION})
if(${CUDA_VERSION_STRING} VERSION_GREATER "4.1") if (CARMA)
set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0) 3.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") set(CUDA_ARCH_BIN "3.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
set(CUDA_ARCH_PTX "3.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
else() else()
set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0)" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0) 3.0" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
set(CUDA_ARCH_PTX "2.0 3.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
endif() endif()
set(CUDA_ARCH_PTX "2.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
string(REGEX REPLACE "\\." "" ARCH_BIN_NO_POINTS "${CUDA_ARCH_BIN}") string(REGEX REPLACE "\\." "" ARCH_BIN_NO_POINTS "${CUDA_ARCH_BIN}")
string(REGEX REPLACE "\\." "" ARCH_PTX_NO_POINTS "${CUDA_ARCH_PTX}") string(REGEX REPLACE "\\." "" ARCH_PTX_NO_POINTS "${CUDA_ARCH_PTX}")
@ -78,6 +83,15 @@ if(CUDA_FOUND)
set(OPENCV_CUDA_ARCH_FEATURES "${OPENCV_CUDA_ARCH_FEATURES} ${ARCH}") set(OPENCV_CUDA_ARCH_FEATURES "${OPENCV_CUDA_ARCH_FEATURES} ${ARCH}")
endforeach() endforeach()
if(CARMA)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --target-cpu-architecture=ARM" )
if (CMAKE_VERSION VERSION_LESS 2.8.10)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -ccbin=${CMAKE_CXX_COMPILER}" )
endif()
endif()
# These vars will be processed in other scripts # These vars will be processed in other scripts
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS_EXTRA}) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS_EXTRA})
set(OpenCV_CUDA_CC "${NVCC_FLAGS_EXTRA}") set(OpenCV_CUDA_CC "${NVCC_FLAGS_EXTRA}")
@ -92,7 +106,6 @@ if(CUDA_FOUND)
mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD CUDA_SDK_ROOT_DIR) mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD CUDA_SDK_ROOT_DIR)
unset(CUDA_npp_LIBRARY CACHE)
find_cuda_helper_libs(npp) find_cuda_helper_libs(npp)
macro(ocv_cuda_compile VAR) macro(ocv_cuda_compile VAR)

@ -19,7 +19,7 @@ IF(CMAKE_COMPILER_IS_GNUCXX)
ARGS ${CMAKE_CXX_COMPILER_ARG1} -dumpversion ARGS ${CMAKE_CXX_COMPILER_ARG1} -dumpversion
OUTPUT_VARIABLE gcc_compiler_version) OUTPUT_VARIABLE gcc_compiler_version)
#MESSAGE("GCC Version: ${gcc_compiler_version}") #MESSAGE("GCC Version: ${gcc_compiler_version}")
IF(gcc_compiler_version MATCHES "4\\.[0,2-9]\\.[0-9x]") IF(gcc_compiler_version VERSION_GREATER "4.2.-1")
SET(PCHSupport_FOUND TRUE) SET(PCHSupport_FOUND TRUE)
ENDIF() ENDIF()

@ -172,21 +172,15 @@
/* NVidia Cuda Runtime API*/ /* NVidia Cuda Runtime API*/
#cmakedefine HAVE_CUDA #cmakedefine HAVE_CUDA
/* OpenCL Support */
#cmakedefine HAVE_OPENCL
/* AMD's OpenCL Fast Fourier Transform Library*/
#cmakedefine HAVE_CLAMDFFT
/* AMD's Basic Linear Algebra Subprograms Library*/
#cmakedefine HAVE_CLAMDBLAS
/* NVidia Cuda Fast Fourier Transform (FFT) API*/ /* NVidia Cuda Fast Fourier Transform (FFT) API*/
#cmakedefine HAVE_CUFFT #cmakedefine HAVE_CUFFT
/* NVidia Cuda Basic Linear Algebra Subprograms (BLAS) API*/ /* NVidia Cuda Basic Linear Algebra Subprograms (BLAS) API*/
#cmakedefine HAVE_CUBLAS #cmakedefine HAVE_CUBLAS
/* NVidia Video Decoding API*/
#cmakedefine HAVE_NVCUVID
/* Compile for 'real' NVIDIA GPU architectures */ /* Compile for 'real' NVIDIA GPU architectures */
#define CUDA_ARCH_BIN "${OPENCV_CUDA_ARCH_BIN}" #define CUDA_ARCH_BIN "${OPENCV_CUDA_ARCH_BIN}"
@ -199,6 +193,15 @@
/* Create PTX or BIN for 1.0 compute capability */ /* Create PTX or BIN for 1.0 compute capability */
#cmakedefine CUDA_ARCH_BIN_OR_PTX_10 #cmakedefine CUDA_ARCH_BIN_OR_PTX_10
/* OpenCL Support */
#cmakedefine HAVE_OPENCL
/* AMD's OpenCL Fast Fourier Transform Library*/
#cmakedefine HAVE_CLAMDFFT
/* AMD's Basic Linear Algebra Subprograms Library*/
#cmakedefine HAVE_CLAMDBLAS
/* VideoInput library */ /* VideoInput library */
#cmakedefine HAVE_VIDEOINPUT #cmakedefine HAVE_VIDEOINPUT

@ -2,8 +2,6 @@
# CMake file for OpenCV docs # CMake file for OpenCV docs
# #
file(GLOB FILES_DOC *.htm *.txt *.jpg *.png *.pdf)
file(GLOB FILES_DOC_VS vidsurv/*.doc)
file(GLOB FILES_TEX *.tex *.sty *.bib) file(GLOB FILES_TEX *.tex *.sty *.bib)
file(GLOB FILES_TEX_PICS pics/*.png pics/*.jpg) file(GLOB FILES_TEX_PICS pics/*.png pics/*.jpg)
@ -11,6 +9,14 @@ if(BUILD_DOCS AND HAVE_SPHINX)
project(opencv_docs) project(opencv_docs)
set(DOC_LIST "${OpenCV_SOURCE_DIR}/doc/opencv-logo.png" "${OpenCV_SOURCE_DIR}/doc/opencv-logo2.png"
"${OpenCV_SOURCE_DIR}/doc/opencv-logo-white.png" "${OpenCV_SOURCE_DIR}/doc/opencv.ico"
"${OpenCV_SOURCE_DIR}/doc/haartraining.htm" "${OpenCV_SOURCE_DIR}/doc/license.txt"
"${OpenCV_SOURCE_DIR}/doc/pattern.png" "${OpenCV_SOURCE_DIR}/doc/acircles_pattern.png")
set(OPTIONAL_DOC_LIST "")
set(OPENCV2_BASE_MODULES core imgproc highgui video calib3d features2d objdetect ml flann gpu photo stitching nonfree contrib legacy) set(OPENCV2_BASE_MODULES core imgproc highgui video calib3d features2d objdetect ml flann gpu photo stitching nonfree contrib legacy)
# build lists of modules to be documented # build lists of modules to be documented
@ -81,6 +87,9 @@ if(BUILD_DOCS AND HAVE_SPHINX)
COMMENT "Generating the PDF Manuals" COMMENT "Generating the PDF Manuals"
) )
LIST(APPEND OPTIONAL_DOC_LIST "${CMAKE_BINARY_DIR}/doc/opencv2refman.pdf" "${CMAKE_BINARY_DIR}/doc/opencv2manager.pdf"
"${CMAKE_BINARY_DIR}/doc/opencv_user.pdf" "${CMAKE_BINARY_DIR}/doc/opencv_tutorials.pdf" "${CMAKE_BINARY_DIR}/doc/opencv_cheatsheet.pdf")
if(ENABLE_SOLUTION_FOLDERS) if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(docs PROPERTIES FOLDER "documentation") set_target_properties(docs PROPERTIES FOLDER "documentation")
endif() endif()
@ -97,7 +106,13 @@ if(BUILD_DOCS AND HAVE_SPHINX)
if(ENABLE_SOLUTION_FOLDERS) if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(html_docs PROPERTIES FOLDER "documentation") set_target_properties(html_docs PROPERTIES FOLDER "documentation")
endif() endif()
endif()
install(FILES ${FILES_DOC} DESTINATION "${OPENCV_DOC_INSTALL_PATH}" COMPONENT main) foreach(f ${DOC_LIST})
install(FILES ${FILES_DOC_VS} DESTINATION "${OPENCV_DOC_INSTALL_PATH}/vidsurv" COMPONENT main) install(FILES "${f}" DESTINATION "${OPENCV_DOC_INSTALL_PATH}" COMPONENT main)
endforeach()
foreach(f ${OPTIONAL_DOC_LIST})
install(FILES "${f}" DESTINATION "${OPENCV_DOC_INSTALL_PATH}" OPTIONAL)
endforeach()
endif()

@ -85,7 +85,7 @@ This tutorial code's is shown lines below. You can also download it from `here <
for( int i = 0; i < contours.size(); i++ ) for( int i = 0; i < contours.size(); i++ )
{ approxPolyDP( Mat(contours[i]), contours_poly[i], 3, true ); { approxPolyDP( Mat(contours[i]), contours_poly[i], 3, true );
boundRect[i] = boundingRect( Mat(contours_poly[i]) ); boundRect[i] = boundingRect( Mat(contours_poly[i]) );
minEnclosingCircle( contours_poly[i], center[i], radius[i] ); minEnclosingCircle( (Mat)contours_poly[i], center[i], radius[i] );
} }

@ -50,8 +50,8 @@ The structure of package contents looks as follows:
OpenCV-2.4.3-android-sdk OpenCV-2.4.3-android-sdk
|_ apk |_ apk
| |_ OpenCV_2.4.3_binary_pack_XXX.apk | |_ OpenCV_2.4.3_binary_pack_armv7a.apk
| |_ OpenCV_2.4.3_Manager.apk | |_ OpenCV_2.4.3_Manager_2.0_XXX.apk
| |
|_ doc |_ doc
|_ samples |_ samples
@ -85,8 +85,8 @@ The structure of package contents looks as follows:
On production devices that have access to Google Play Market (and Internet) these packages will be On production devices that have access to Google Play Market (and Internet) these packages will be
installed from Market on the first start of an application using OpenCV Manager API. installed from Market on the first start of an application using OpenCV Manager API.
But devkits without Market or Internet connection require these packages to be installed manually. But devkits without Market or Internet connection require these packages to be installed manually.
Install the `Manager.apk` and the corresponding `binary_pack.apk` depending on the device CPU, Install the `Manager.apk` and the optional `binary_pack.apk` if it is needed.
the Manager GUI provides this info. Below you'll see exact commands on how to do this. See :ref:`manager_selection` for details.
.. note:: Installation from Internet is the preferable way since OpenCV team may publish updated .. note:: Installation from Internet is the preferable way since OpenCV team may publish updated
versions of these packages on the Market. versions of these packages on the Market.
@ -280,21 +280,7 @@ Well, running samples from Eclipse is very simple:
To get rid of the message you will need to install `OpenCV Manager` and the appropriate `OpenCV binary pack`. To get rid of the message you will need to install `OpenCV Manager` and the appropriate `OpenCV binary pack`.
Simply tap :menuselection:`Yes` if you have *Google Play Market* installed on your device/emulator. It will redirect you to the corresponding page on *Google Play Market*. Simply tap :menuselection:`Yes` if you have *Google Play Market* installed on your device/emulator. It will redirect you to the corresponding page on *Google Play Market*.
If you have no access to the *Market*, which is often the case with emulators - you will need to install the packages from OpenCV4Android SDK folder manually. Open the console/terminal and type in the following two commands: If you have no access to the *Market*, which is often the case with emulators - you will need to install the packages from OpenCV4Android SDK folder manually. See :ref:`manager_selection` for details.
.. code-block:: sh
:linenos:
<Android SDK path>/platform-tools/adb install <OpenCV4Android SDK path>/apk/OpenCV_2.4.3_Manager.apk
<Android SDK path>/platform-tools/adb install <OpenCV4Android SDK path>/apk/OpenCV_2.4.3_binary_pack_armv7a.apk
If you're running Windows, that will probably look like this:
.. image:: images/install_opencv_manager_with_adb.png
:alt: Run these commands in the console to install OpenCV Manager
:align: center
When done, you will be able to run OpenCV samples on your device/emulator seamlessly.
* Here is ``Tutorial 2 - Use OpenCV Camera`` sample, running on top of stock camera-preview of the emulator. * Here is ``Tutorial 2 - Use OpenCV Camera`` sample, running on top of stock camera-preview of the emulator.

@ -54,20 +54,8 @@ Using async initialization is a **recommended** way for application development.
:alt: Add dependency from OpenCV library :alt: Add dependency from OpenCV library
:align: center :align: center
To run OpenCV Manager-based application for the first time you need to install package with the `OpenCV Manager` for your platform. Armeabi, Armeabi-v7a with NEON, x86 and MIPS achitectures supported. In most cases OpenCV Manager may be installed automatically from Google Play. For such case, when Google Play is not available, i.e. emulator, developer board, etc, you can
You can do it using Google Play Market or manually with ``adb`` tool: install it manually using adb tool. See :ref:`manager_selection` for details.
.. code-block:: sh
:linenos:
<Android SDK path>/platform-tools/adb install <OpenCV4Android SDK path>/apk/OpenCV_2.4.3_Manager.apk
In rare cases when the NEON instruction set is not supported, you need to install an additional OpenCV Library package:
.. code-block:: sh
:linenos:
<Android SDK path>/platform-tools/adb install <OpenCV4Android SDK path>/apk/OpenCV_2.4.3_binary_pack_armv7a.apk
There is a very basic code snippet implementing the async initialization. It shows basic principles. See the "15-puzzle" OpenCV sample for details. There is a very basic code snippet implementing the async initialization. It shows basic principles. See the "15-puzzle" OpenCV sample for details.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 16 KiB

@ -71,7 +71,9 @@ There are functions in OpenCV, especially from calib3d module, such as ``project
//... fill the array //... fill the array
Mat pointsMat = Mat(points); Mat pointsMat = Mat(points);
One can access a point in this matrix using the same method \texttt{Mat::at}: :: One can access a point in this matrix using the same method ``Mat::at`` :
::
Point2f point = pointsMat.at<Point2f>(i, 0); Point2f point = pointsMat.at<Point2f>(i, 0);
@ -109,7 +111,7 @@ Selecting a region of interest: ::
Rect r(10, 10, 100, 100); Rect r(10, 10, 100, 100);
Mat smallImg = img(r); Mat smallImg = img(r);
A conversion from \texttt{Mat} to C API data structures: :: A conversion from ``Mat`` to C API data structures: ::
Mat img = imread("image.jpg"); Mat img = imread("image.jpg");
IplImage img1 = img; IplImage img1 = img;
@ -150,7 +152,7 @@ A call to ``waitKey()`` starts a message passing cycle that waits for a key stro
double minVal, maxVal; double minVal, maxVal;
minMaxLoc(sobelx, &minVal, &maxVal); //find minimum and maximum intensities minMaxLoc(sobelx, &minVal, &maxVal); //find minimum and maximum intensities
Mat draw; Mat draw;
sobelx.convertTo(draw, CV_8U, 255.0/(maxVal - minVal), -minVal); sobelx.convertTo(draw, CV_8U, 255.0/(maxVal - minVal), -minVal * 255.0/(maxVal - minVal));
namedWindow("image", CV_WINDOW_AUTOSIZE); namedWindow("image", CV_WINDOW_AUTOSIZE);
imshow("image", draw); imshow("image", draw);

@ -3,6 +3,7 @@
#include <sys/stat.h> #include <sys/stat.h>
#include <dirent.h> #include <dirent.h>
#include <android/log.h> #include <android/log.h>
#include <cctype>
#include <string> #include <string>
#include <vector> #include <vector>
#include <algorithm> #include <algorithm>

@ -16,7 +16,7 @@ typedef perf::TestBaseWithParam<int> PointsNum;
PERF_TEST_P(PointsNum_Algo, solvePnP, PERF_TEST_P(PointsNum_Algo, solvePnP,
testing::Combine( testing::Combine(
testing::Values(4, 3*9, 7*13), testing::Values(/*4,*/ 3*9, 7*13), //TODO: find why results on 4 points are too unstable
testing::Values((int)CV_ITERATIVE, (int)CV_EPNP) testing::Values((int)CV_ITERATIVE, (int)CV_EPNP)
) )
) )

@ -2778,17 +2778,13 @@ CV_IMPL int cvStereoRectifyUncalibrated(
cvPerspectiveTransform( _m1, _m1, &H0 ); cvPerspectiveTransform( _m1, _m1, &H0 );
cvPerspectiveTransform( _m2, _m2, &H2 ); cvPerspectiveTransform( _m2, _m2, &H2 );
CvMat A = cvMat( 1, npoints, CV_64FC3, lines1 ), BxBy, B; CvMat A = cvMat( 1, npoints, CV_64FC3, lines1 ), BxBy, B;
double a[9], atb[3], x[3]; double x[3];
CvMat AtA = cvMat( 3, 3, CV_64F, a );
CvMat AtB = cvMat( 3, 1, CV_64F, atb );
CvMat X = cvMat( 3, 1, CV_64F, x ); CvMat X = cvMat( 3, 1, CV_64F, x );
cvConvertPointsHomogeneous( _m1, &A ); cvConvertPointsHomogeneous( _m1, &A );
cvReshape( &A, &A, 1, npoints ); cvReshape( &A, &A, 1, npoints );
cvReshape( _m2, &BxBy, 1, npoints ); cvReshape( _m2, &BxBy, 1, npoints );
cvGetCol( &BxBy, &B, 0 ); cvGetCol( &BxBy, &B, 0 );
cvGEMM( &A, &A, 1, 0, 0, &AtA, CV_GEMM_A_T ); cvSolve( &A, &B, &X, CV_SVD );
cvGEMM( &A, &B, 1, 0, 0, &AtB, CV_GEMM_A_T );
cvSolve( &AtA, &AtB, &X, CV_SVD_SYM );
double ha[] = double ha[] =
{ {

@ -767,8 +767,8 @@ void ChamferMatcher::Matching::findContourOrientations(const template_coords_t&
} }
// get the middle two angles // get the middle two angles
nth_element(angles.begin(), angles.begin()+M-1, angles.end()); std::nth_element(angles.begin(), angles.begin()+M-1, angles.end());
nth_element(angles.begin()+M-1, angles.begin()+M, angles.end()); std::nth_element(angles.begin()+M-1, angles.begin()+M, angles.end());
// sort(angles.begin(), angles.end()); // sort(angles.begin(), angles.end());
// average them to compute tangent // average them to compute tangent

@ -85,7 +85,7 @@ namespace
}; };
size_t colors_mum = sizeof(colors)/sizeof(colors[0]); size_t colors_mum = sizeof(colors)/sizeof(colors[0]);
#if defined __cplusplus && __cplusplus > 199711L #if (defined __cplusplus && __cplusplus > 199711L) || defined _STLPORT_MAJOR
#else #else
template<class FwIt, class T> void iota(FwIt first, FwIt last, T value) { while(first != last) *first++ = value++; } template<class FwIt, class T> void iota(FwIt first, FwIt last, T value) { while(first != last) *first++ = value++; }
#endif #endif

@ -10,7 +10,6 @@ if(HAVE_CUDA)
file(GLOB lib_cuda "src/cuda/*.cu") file(GLOB lib_cuda "src/cuda/*.cu")
ocv_cuda_compile(cuda_objs ${lib_cuda}) ocv_cuda_compile(cuda_objs ${lib_cuda})
set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
else() else()
set(lib_cuda "") set(lib_cuda "")

@ -91,7 +91,7 @@ class SparseMat;
typedef Mat MatND; typedef Mat MatND;
class GlBuffer; class GlBuffer;
class GlTexture; class GlTexture2D;
class GlArrays; class GlArrays;
class GlCamera; class GlCamera;
@ -109,13 +109,6 @@ template<typename _Tp> class CV_EXPORTS MatIterator_;
template<typename _Tp> class CV_EXPORTS MatConstIterator_; template<typename _Tp> class CV_EXPORTS MatConstIterator_;
template<typename _Tp> class CV_EXPORTS MatCommaInitializer_; template<typename _Tp> class CV_EXPORTS MatCommaInitializer_;
#if !defined(ANDROID) || (defined(_GLIBCXX_USE_WCHAR_T) && _GLIBCXX_USE_WCHAR_T)
typedef std::basic_string<wchar_t> WString;
CV_EXPORTS string fromUtf16(const WString& str);
CV_EXPORTS WString toUtf16(const string& str);
#endif
CV_EXPORTS string format( const char* fmt, ... ); CV_EXPORTS string format( const char* fmt, ... );
CV_EXPORTS string tempfile( const char* suffix CV_DEFAULT(0)); CV_EXPORTS string tempfile( const char* suffix CV_DEFAULT(0));
@ -1284,6 +1277,8 @@ public:
operator _Tp* (); operator _Tp* ();
operator const _Tp*() const; operator const _Tp*() const;
bool operator==(const Ptr<_Tp>& ptr) const;
_Tp* obj; //< the object pointer. _Tp* obj; //< the object pointer.
int* refcount; //< the associated reference counter int* refcount; //< the associated reference counter
}; };
@ -1311,7 +1306,7 @@ public:
STD_VECTOR_MAT = 5 << KIND_SHIFT, STD_VECTOR_MAT = 5 << KIND_SHIFT,
EXPR = 6 << KIND_SHIFT, EXPR = 6 << KIND_SHIFT,
OPENGL_BUFFER = 7 << KIND_SHIFT, OPENGL_BUFFER = 7 << KIND_SHIFT,
OPENGL_TEXTURE = 8 << KIND_SHIFT, OPENGL_TEXTURE2D = 8 << KIND_SHIFT,
GPU_MAT = 9 << KIND_SHIFT GPU_MAT = 9 << KIND_SHIFT
}; };
_InputArray(); _InputArray();
@ -1328,13 +1323,13 @@ public:
_InputArray(const Scalar& s); _InputArray(const Scalar& s);
_InputArray(const double& val); _InputArray(const double& val);
_InputArray(const GlBuffer& buf); _InputArray(const GlBuffer& buf);
_InputArray(const GlTexture& tex); _InputArray(const GlTexture2D& tex);
_InputArray(const gpu::GpuMat& d_mat); _InputArray(const gpu::GpuMat& d_mat);
virtual Mat getMat(int i=-1) const; virtual Mat getMat(int i=-1) const;
virtual void getMatVector(vector<Mat>& mv) const; virtual void getMatVector(vector<Mat>& mv) const;
virtual GlBuffer getGlBuffer() const; virtual GlBuffer getGlBuffer() const;
virtual GlTexture getGlTexture() const; virtual GlTexture2D getGlTexture2D() const;
virtual gpu::GpuMat getGpuMat() const; virtual gpu::GpuMat getGpuMat() const;
virtual int kind() const; virtual int kind() const;
@ -1345,7 +1340,7 @@ public:
virtual int channels(int i=-1) const; virtual int channels(int i=-1) const;
virtual bool empty() const; virtual bool empty() const;
/*virtual*/ ~_InputArray(); virtual ~_InputArray();
int flags; int flags;
void* obj; void* obj;
@ -1385,6 +1380,8 @@ public:
template<typename _Tp, int m, int n> _OutputArray(Matx<_Tp, m, n>& matx); template<typename _Tp, int m, int n> _OutputArray(Matx<_Tp, m, n>& matx);
template<typename _Tp> _OutputArray(_Tp* vec, int n); template<typename _Tp> _OutputArray(_Tp* vec, int n);
_OutputArray(gpu::GpuMat& d_mat); _OutputArray(gpu::GpuMat& d_mat);
_OutputArray(GlBuffer& buf);
_OutputArray(GlTexture2D& tex);
_OutputArray(const Mat& m); _OutputArray(const Mat& m);
template<typename _Tp> _OutputArray(const vector<_Tp>& vec); template<typename _Tp> _OutputArray(const vector<_Tp>& vec);
@ -1395,19 +1392,23 @@ public:
template<typename _Tp, int m, int n> _OutputArray(const Matx<_Tp, m, n>& matx); template<typename _Tp, int m, int n> _OutputArray(const Matx<_Tp, m, n>& matx);
template<typename _Tp> _OutputArray(const _Tp* vec, int n); template<typename _Tp> _OutputArray(const _Tp* vec, int n);
_OutputArray(const gpu::GpuMat& d_mat); _OutputArray(const gpu::GpuMat& d_mat);
_OutputArray(const GlBuffer& buf);
_OutputArray(const GlTexture2D& tex);
virtual bool fixedSize() const; virtual bool fixedSize() const;
virtual bool fixedType() const; virtual bool fixedType() const;
virtual bool needed() const; virtual bool needed() const;
virtual Mat& getMatRef(int i=-1) const; virtual Mat& getMatRef(int i=-1) const;
virtual gpu::GpuMat& getGpuMatRef() const; virtual gpu::GpuMat& getGpuMatRef() const;
virtual GlBuffer& getGlBufferRef() const;
virtual GlTexture2D& getGlTexture2DRef() const;
virtual void create(Size sz, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const; virtual void create(Size sz, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
virtual void create(int rows, int cols, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const; virtual void create(int rows, int cols, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
virtual void create(int dims, const int* size, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const; virtual void create(int dims, const int* size, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
virtual void release() const; virtual void release() const;
virtual void clear() const; virtual void clear() const;
/*virtual*/ ~_OutputArray(); virtual ~_OutputArray();
}; };
typedef const _InputArray& InputArray; typedef const _InputArray& InputArray;

@ -152,6 +152,20 @@ namespace cv
//#undef __CV_GPU_DEPR_BEFORE__ //#undef __CV_GPU_DEPR_BEFORE__
//#undef __CV_GPU_DEPR_AFTER__ //#undef __CV_GPU_DEPR_AFTER__
namespace device
{
using cv::gpu::PtrSz;
using cv::gpu::PtrStep;
using cv::gpu::PtrStepSz;
using cv::gpu::PtrStepSzb;
using cv::gpu::PtrStepSzf;
using cv::gpu::PtrStepSzi;
using cv::gpu::PtrStepb;
using cv::gpu::PtrStepf;
using cv::gpu::PtrStepi;
}
} }
} }

@ -79,6 +79,8 @@ namespace cv { namespace gpu
WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30 WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30
}; };
CV_EXPORTS bool deviceSupports(FeatureSet feature_set);
// Gives information about what GPU archs this OpenCV GPU module was // Gives information about what GPU archs this OpenCV GPU module was
// compiled for // compiled for
class CV_EXPORTS TargetArchs class CV_EXPORTS TargetArchs
@ -545,22 +547,6 @@ namespace cv { namespace gpu
ensureSizeIsEnough(size.height, size.width, type, m); ensureSizeIsEnough(size.height, size.width, type, m);
} }
inline void createContinuous(int rows, int cols, int type, GpuMat& m)
{
int area = rows * cols;
if (!m.isContinuous() || m.type() != type || m.size().area() != area)
ensureSizeIsEnough(1, area, type, m);
m = m.reshape(0, rows);
}
inline void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m)
{
if (m.type() == type && m.rows >= rows && m.cols >= cols)
m = m(Rect(0, 0, cols, rows));
else
m.create(rows, cols, type);
}
inline GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat &mat) inline GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat &mat)
{ {
if (!mat.empty() && mat.type() == type && mat.rows >= rows && mat.cols >= cols) if (!mat.empty() && mat.type() == type && mat.rows >= rows && mat.cols >= cols)

@ -750,39 +750,4 @@ typedef struct CvBigFuncTable
(tab).fn_2d[CV_32F] = (void*)FUNCNAME##_32f##FLAG; \ (tab).fn_2d[CV_32F] = (void*)FUNCNAME##_32f##FLAG; \
(tab).fn_2d[CV_64F] = (void*)FUNCNAME##_64f##FLAG (tab).fn_2d[CV_64F] = (void*)FUNCNAME##_64f##FLAG
#ifdef __cplusplus
//! OpenGL extension table
class CV_EXPORTS CvOpenGlFuncTab
{
public:
virtual ~CvOpenGlFuncTab();
virtual void genBuffers(int n, unsigned int* buffers) const = 0;
virtual void deleteBuffers(int n, const unsigned int* buffers) const = 0;
virtual void bufferData(unsigned int target, ptrdiff_t size, const void* data, unsigned int usage) const = 0;
virtual void bufferSubData(unsigned int target, ptrdiff_t offset, ptrdiff_t size, const void* data) const = 0;
virtual void bindBuffer(unsigned int target, unsigned int buffer) const = 0;
virtual void* mapBuffer(unsigned int target, unsigned int access) const = 0;
virtual void unmapBuffer(unsigned int target) const = 0;
virtual void generateBitmapFont(const std::string& family, int height, int weight, bool italic, bool underline, int start, int count, int base) const = 0;
virtual bool isGlContextInitialized() const = 0;
};
CV_EXPORTS void icvSetOpenGlFuncTab(const CvOpenGlFuncTab* tab);
CV_EXPORTS bool icvCheckGlError(const char* file, const int line, const char* func = "");
#if defined(__GNUC__)
#define CV_CheckGlError() CV_DbgAssert( (::icvCheckGlError(__FILE__, __LINE__, __func__)) )
#else
#define CV_CheckGlError() CV_DbgAssert( (::icvCheckGlError(__FILE__, __LINE__)) )
#endif
#endif //__cplusplus
#endif // __OPENCV_CORE_INTERNAL_HPP__ #endif // __OPENCV_CORE_INTERNAL_HPP__

@ -47,205 +47,212 @@
#include "opencv2/core/core.hpp" #include "opencv2/core/core.hpp"
namespace cv namespace cv {
{
CV_EXPORTS bool checkGlError(const char* file, const int line, const char* func = "");
#if defined(__GNUC__)
#define CV_CheckGlError() CV_DbgAssert( (cv::checkGlError(__FILE__, __LINE__, __func__)) )
#else
#define CV_CheckGlError() CV_DbgAssert( (cv::checkGlError(__FILE__, __LINE__)) )
#endif
/////////////////// OpenGL Objects ///////////////////
//! Smart pointer for OpenGL buffer memory with reference counting. //! Smart pointer for OpenGL buffer memory with reference counting.
class CV_EXPORTS GlBuffer class CV_EXPORTS GlBuffer
{ {
public: public:
enum Usage enum Target
{ {
ARRAY_BUFFER = 0x8892, // buffer will use for OpenGL arrays (vertices, colors, normals, etc) ARRAY_BUFFER = 0x8892, //!< The buffer will be used as a source for vertex data
TEXTURE_BUFFER = 0x88EC // buffer will ise for OpenGL textures ELEMENT_ARRAY_BUFFER = 0x8893, //!< The buffer will be used for indices (in glDrawElements, for example)
PIXEL_PACK_BUFFER = 0x88EB, //!< The buffer will be used for reading from OpenGL textures
PIXEL_UNPACK_BUFFER = 0x88EC //!< The buffer will be used for writing to OpenGL textures
};
enum Access
{
READ_ONLY = 0x88B8,
WRITE_ONLY = 0x88B9,
READ_WRITE = 0x88BA
}; };
//! create empty buffer //! create empty buffer
explicit GlBuffer(Usage usage); GlBuffer();
//! create buffer from existed buffer id
GlBuffer(int arows, int acols, int atype, unsigned int abufId, bool autoRelease = false);
GlBuffer(Size asize, int atype, unsigned int abufId, bool autoRelease = false);
//! create buffer //! create buffer
GlBuffer(int rows, int cols, int type, Usage usage); GlBuffer(int arows, int acols, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false);
GlBuffer(Size size, int type, Usage usage); GlBuffer(Size asize, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false);
//! copy from host/device memory //! copy from host/device memory
GlBuffer(InputArray mat, Usage usage); explicit GlBuffer(InputArray arr, Target target = ARRAY_BUFFER, bool autoRelease = false);
void create(int rows, int cols, int type, Usage usage); //! create buffer
void create(Size size, int type, Usage usage); void create(int arows, int acols, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false);
void create(int rows, int cols, int type); void create(Size asize, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false) { create(asize.height, asize.width, atype, target, autoRelease); }
void create(Size size, int type);
//! release memory and delete buffer object
void release(); void release();
//! set auto release mode (if true, release will be called in object's destructor)
void setAutoRelease(bool flag);
//! copy from host/device memory //! copy from host/device memory
void copyFrom(InputArray mat); void copyFrom(InputArray arr, Target target = ARRAY_BUFFER, bool autoRelease = false);
void bind() const; //! copy to host/device memory
void unbind() const; void copyTo(OutputArray arr, Target target = ARRAY_BUFFER, bool autoRelease = false) const;
//! create copy of current buffer
GlBuffer clone(Target target = ARRAY_BUFFER, bool autoRelease = false) const;
//! bind buffer for specified target
void bind(Target target) const;
//! unbind any buffers from specified target
static void unbind(Target target);
//! map to host memory //! map to host memory
Mat mapHost(); Mat mapHost(Access access);
void unmapHost(); void unmapHost();
//! map to device memory //! map to device memory
gpu::GpuMat mapDevice(); gpu::GpuMat mapDevice();
void unmapDevice(); void unmapDevice();
inline int rows() const { return rows_; } int rows() const { return rows_; }
inline int cols() const { return cols_; } int cols() const { return cols_; }
inline Size size() const { return Size(cols_, rows_); } Size size() const { return Size(cols_, rows_); }
inline bool empty() const { return rows_ == 0 || cols_ == 0; } bool empty() const { return rows_ == 0 || cols_ == 0; }
inline int type() const { return type_; } int type() const { return type_; }
inline int depth() const { return CV_MAT_DEPTH(type_); } int depth() const { return CV_MAT_DEPTH(type_); }
inline int channels() const { return CV_MAT_CN(type_); } int channels() const { return CV_MAT_CN(type_); }
inline int elemSize() const { return CV_ELEM_SIZE(type_); } int elemSize() const { return CV_ELEM_SIZE(type_); }
inline int elemSize1() const { return CV_ELEM_SIZE1(type_); } int elemSize1() const { return CV_ELEM_SIZE1(type_); }
inline Usage usage() const { return usage_; } unsigned int bufId() const;
class Impl; class Impl;
private: private:
Ptr<Impl> impl_;
int rows_; int rows_;
int cols_; int cols_;
int type_; int type_;
Usage usage_;
Ptr<Impl> impl_;
}; };
template <> CV_EXPORTS void Ptr<GlBuffer::Impl>::delete_obj(); template <> CV_EXPORTS void Ptr<GlBuffer::Impl>::delete_obj();
//! Smart pointer for OpenGL 2d texture memory with reference counting. //! Smart pointer for OpenGL 2D texture memory with reference counting.
class CV_EXPORTS GlTexture class CV_EXPORTS GlTexture2D
{ {
public: public:
enum Format
{
NONE = 0,
DEPTH_COMPONENT = 0x1902, //!< Depth
RGB = 0x1907, //!< Red, Green, Blue
RGBA = 0x1908 //!< Red, Green, Blue, Alpha
};
//! create empty texture //! create empty texture
GlTexture(); GlTexture2D();
//! create texture from existed texture id
GlTexture2D(int arows, int acols, Format aformat, unsigned int atexId, bool autoRelease = false);
GlTexture2D(Size asize, Format aformat, unsigned int atexId, bool autoRelease = false);
//! create texture //! create texture
GlTexture(int rows, int cols, int type); GlTexture2D(int arows, int acols, Format aformat, bool autoRelease = false);
GlTexture(Size size, int type); GlTexture2D(Size asize, Format aformat, bool autoRelease = false);
//! copy from host/device memory //! copy from host/device memory
explicit GlTexture(InputArray mat, bool bgra = true); explicit GlTexture2D(InputArray arr, bool autoRelease = false);
//! create texture
void create(int arows, int acols, Format aformat, bool autoRelease = false);
void create(Size asize, Format aformat, bool autoRelease = false) { create(asize.height, asize.width, aformat, autoRelease); }
void create(int rows, int cols, int type); //! release memory and delete texture object
void create(Size size, int type);
void release(); void release();
//! set auto release mode (if true, release will be called in object's destructor)
void setAutoRelease(bool flag);
//! copy from host/device memory //! copy from host/device memory
void copyFrom(InputArray mat, bool bgra = true); void copyFrom(InputArray arr, bool autoRelease = false);
//! copy to host/device memory
void copyTo(OutputArray arr, int ddepth = CV_32F, bool autoRelease = false) const;
//! bind texture to current active texture unit for GL_TEXTURE_2D target
void bind() const; void bind() const;
void unbind() const;
inline int rows() const { return rows_; } int rows() const { return rows_; }
inline int cols() const { return cols_; } int cols() const { return cols_; }
inline Size size() const { return Size(cols_, rows_); } Size size() const { return Size(cols_, rows_); }
inline bool empty() const { return rows_ == 0 || cols_ == 0; } bool empty() const { return rows_ == 0 || cols_ == 0; }
Format format() const { return format_; }
inline int type() const { return type_; } unsigned int texId() const;
inline int depth() const { return CV_MAT_DEPTH(type_); }
inline int channels() const { return CV_MAT_CN(type_); }
inline int elemSize() const { return CV_ELEM_SIZE(type_); }
inline int elemSize1() const { return CV_ELEM_SIZE1(type_); }
class Impl; class Impl;
private: private:
Ptr<Impl> impl_;
int rows_; int rows_;
int cols_; int cols_;
int type_; Format format_;
Ptr<Impl> impl_;
GlBuffer buf_;
}; };
template <> CV_EXPORTS void Ptr<GlTexture::Impl>::delete_obj(); template <> CV_EXPORTS void Ptr<GlTexture2D::Impl>::delete_obj();
//! OpenGL Arrays //! OpenGL Arrays
class CV_EXPORTS GlArrays class CV_EXPORTS GlArrays
{ {
public: public:
inline GlArrays() GlArrays();
: vertex_(GlBuffer::ARRAY_BUFFER), color_(GlBuffer::ARRAY_BUFFER), bgra_(true), normal_(GlBuffer::ARRAY_BUFFER), texCoord_(GlBuffer::ARRAY_BUFFER)
{
}
void setVertexArray(InputArray vertex); void setVertexArray(InputArray vertex);
inline void resetVertexArray() { vertex_.release(); } void resetVertexArray();
void setColorArray(InputArray color, bool bgra = true); void setColorArray(InputArray color);
inline void resetColorArray() { color_.release(); } void resetColorArray();
void setNormalArray(InputArray normal); void setNormalArray(InputArray normal);
inline void resetNormalArray() { normal_.release(); } void resetNormalArray();
void setTexCoordArray(InputArray texCoord); void setTexCoordArray(InputArray texCoord);
inline void resetTexCoordArray() { texCoord_.release(); } void resetTexCoordArray();
void release();
void setAutoRelease(bool flag);
void bind() const; void bind() const;
void unbind() const;
inline int rows() const { return vertex_.rows(); } int size() const { return size_; }
inline int cols() const { return vertex_.cols(); } bool empty() const { return size_ == 0; }
inline Size size() const { return vertex_.size(); }
inline bool empty() const { return vertex_.empty(); }
private: private:
int size_;
GlBuffer vertex_; GlBuffer vertex_;
GlBuffer color_; GlBuffer color_;
bool bgra_;
GlBuffer normal_; GlBuffer normal_;
GlBuffer texCoord_; GlBuffer texCoord_;
}; };
//! OpenGL Font /////////////////// Render Functions ///////////////////
class CV_EXPORTS GlFont
{
public:
enum Weight
{
WEIGHT_LIGHT = 300,
WEIGHT_NORMAL = 400,
WEIGHT_SEMIBOLD = 600,
WEIGHT_BOLD = 700,
WEIGHT_BLACK = 900
};
enum Style
{
STYLE_NORMAL = 0,
STYLE_ITALIC = 1,
STYLE_UNDERLINE = 2
};
static Ptr<GlFont> get(const std::string& family, int height = 12, Weight weight = WEIGHT_NORMAL, Style style = STYLE_NORMAL);
void draw(const char* str, size_t len) const;
inline const std::string& family() const { return family_; }
inline int height() const { return height_; }
inline Weight weight() const { return weight_; }
inline Style style() const { return style_; }
private:
GlFont(const std::string& family, int height, Weight weight, Style style);
std::string family_;
int height_;
Weight weight_;
Style style_;
unsigned int base_;
GlFont(const GlFont&);
GlFont& operator =(const GlFont&);
};
//! render functions
//! render texture rectangle in window //! render texture rectangle in window
CV_EXPORTS void render(const GlTexture& tex, CV_EXPORTS void render(const GlTexture2D& tex,
Rect_<double> wndRect = Rect_<double>(0.0, 0.0, 1.0, 1.0), Rect_<double> wndRect = Rect_<double>(0.0, 0.0, 1.0, 1.0),
Rect_<double> texRect = Rect_<double>(0.0, 0.0, 1.0, 1.0)); Rect_<double> texRect = Rect_<double>(0.0, 0.0, 1.0, 1.0));
@ -267,67 +274,13 @@ namespace RenderMode {
//! render OpenGL arrays //! render OpenGL arrays
CV_EXPORTS void render(const GlArrays& arr, int mode = RenderMode::POINTS, Scalar color = Scalar::all(255)); CV_EXPORTS void render(const GlArrays& arr, int mode = RenderMode::POINTS, Scalar color = Scalar::all(255));
CV_EXPORTS void render(const GlArrays& arr, InputArray indices, int mode = RenderMode::POINTS, Scalar color = Scalar::all(255));
CV_EXPORTS void render(const std::string& str, const Ptr<GlFont>& font, Scalar color, Point2d pos); namespace gpu {
//! OpenGL camera
class CV_EXPORTS GlCamera
{
public:
GlCamera();
void lookAt(Point3d eye, Point3d center, Point3d up);
void setCameraPos(Point3d pos, double yaw, double pitch, double roll);
void setScale(Point3d scale);
void setProjectionMatrix(const Mat& projectionMatrix, bool transpose = true);
void setPerspectiveProjection(double fov, double aspect, double zNear, double zFar);
void setOrthoProjection(double left, double right, double bottom, double top, double zNear, double zFar);
void setupProjectionMatrix() const;
void setupModelViewMatrix() const;
private:
Point3d eye_;
Point3d center_;
Point3d up_;
Point3d pos_;
double yaw_;
double pitch_;
double roll_;
bool useLookAtParams_;
Point3d scale_;
Mat projectionMatrix_;
double fov_;
double aspect_;
double left_;
double right_;
double bottom_;
double top_;
double zNear_;
double zFar_;
bool perspectiveProjection_;
};
inline void GlBuffer::create(Size _size, int _type, Usage _usage) { create(_size.height, _size.width, _type, _usage); }
inline void GlBuffer::create(int _rows, int _cols, int _type) { create(_rows, _cols, _type, usage()); }
inline void GlBuffer::create(Size _size, int _type) { create(_size.height, _size.width, _type, usage()); }
inline void GlTexture::create(Size _size, int _type) { create(_size.height, _size.width, _type); }
namespace gpu
{
//! set a CUDA device to use OpenGL interoperability //! set a CUDA device to use OpenGL interoperability
CV_EXPORTS void setGlDevice(int device = 0); CV_EXPORTS void setGlDevice(int device = 0);
} }
} // namespace cv } // namespace cv
#endif // __cplusplus #endif // __cplusplus

@ -64,8 +64,9 @@
#endif #endif
#elif __GNUC__*10 + __GNUC_MINOR__ >= 42 #elif __GNUC__*10 + __GNUC_MINOR__ >= 42
#if !defined WIN32 && (defined __i486__ || defined __i586__ || \ #if !(defined WIN32 || defined _WIN32) && (defined __i486__ || defined __i586__ || \
defined __i686__ || defined __MMX__ || defined __SSE__ || defined __ppc__) defined __i686__ || defined __MMX__ || defined __SSE__ || defined __ppc__) || \
(defined __GNUC__ && defined _STLPORT_MAJOR)
#define CV_XADD __sync_fetch_and_add #define CV_XADD __sync_fetch_and_add
#else #else
#include <ext/atomicity.h> #include <ext/atomicity.h>
@ -2690,6 +2691,11 @@ template<typename _Tp> template<typename _Tp2> inline const Ptr<_Tp2> Ptr<_Tp>::
return p; return p;
} }
template<typename _Tp> inline bool Ptr<_Tp>::operator==(const Ptr<_Tp>& _ptr) const
{
return refcount == _ptr.refcount;
}
//// specializied implementations of Ptr::delete_obj() for classic OpenCV types //// specializied implementations of Ptr::delete_obj() for classic OpenCV types
template<> CV_EXPORTS void Ptr<CvMat>::delete_obj(); template<> CV_EXPORTS void Ptr<CvMat>::delete_obj();

@ -295,7 +295,7 @@ void CommandLineParser::Impl::sort_params()
sort(data[i].keys.begin(), data[i].keys.end()); sort(data[i].keys.begin(), data[i].keys.end());
} }
sort (data.begin(), data.end(), cmp_params); std::sort (data.begin(), data.end(), cmp_params);
} }
string CommandLineParser::Impl::cat_string(const string& str) const string CommandLineParser::Impl::cat_string(const string& str) const

@ -44,6 +44,7 @@
#include "opencv2/gpu/device/saturate_cast.hpp" #include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp" #include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/type_traits.hpp"
namespace cv { namespace gpu { namespace device namespace cv { namespace gpu { namespace device
{ {
@ -54,6 +55,7 @@ namespace cv { namespace gpu { namespace device
void writeScalar(const int*); void writeScalar(const int*);
void writeScalar(const float*); void writeScalar(const float*);
void writeScalar(const double*); void writeScalar(const double*);
void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
void convert_gpu(PtrStepSzb, int, PtrStepSzb, int, double, double, cudaStream_t); void convert_gpu(PtrStepSzb, int, PtrStepSzb, int, double, double, cudaStream_t);
}}} }}}
@ -226,16 +228,16 @@ namespace cv { namespace gpu { namespace device
//////////////////////////////// ConvertTo //////////////////////////////// //////////////////////////////// ConvertTo ////////////////////////////////
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
template <typename T, typename D> struct Convertor : unary_function<T, D> template <typename T, typename D, typename S> struct Convertor : unary_function<T, D>
{ {
Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {} Convertor(S alpha_, S beta_) : alpha(alpha_), beta(beta_) {}
__device__ __forceinline__ D operator()(const T& src) const __device__ __forceinline__ D operator()(typename TypeTraits<T>::ParameterType src) const
{ {
return saturate_cast<D>(alpha * src + beta); return saturate_cast<D>(alpha * src + beta);
} }
double alpha, beta; S alpha, beta;
}; };
namespace detail namespace detail
@ -282,16 +284,16 @@ namespace cv { namespace gpu { namespace device
}; };
} }
template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> > template <typename T, typename D, typename S> struct TransformFunctorTraits< Convertor<T, D, S> > : detail::ConvertTraits< Convertor<T, D, S> >
{ {
}; };
template<typename T, typename D> template<typename T, typename D, typename S>
void cvt_(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream) void cvt_(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream)
{ {
cudaSafeCall( cudaSetDoubleForDevice(&alpha) ); cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
cudaSafeCall( cudaSetDoubleForDevice(&beta) ); cudaSafeCall( cudaSetDoubleForDevice(&beta) );
Convertor<T, D> op(alpha, beta); Convertor<T, D, S> op(static_cast<S>(alpha), static_cast<S>(beta));
cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<D>)dst, op, WithOutMask(), stream); cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
} }
@ -304,36 +306,74 @@ namespace cv { namespace gpu { namespace device
{ {
typedef void (*caller_t)(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream); typedef void (*caller_t)(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream);
static const caller_t tab[8][8] = static const caller_t tab[7][7] =
{ {
{cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>, {
cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0}, cvt_<uchar, uchar, float>,
cvt_<uchar, schar, float>,
{cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>, cvt_<uchar, ushort, float>,
cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0}, cvt_<uchar, short, float>,
cvt_<uchar, int, float>,
{cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>, cvt_<uchar, float, float>,
cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0}, cvt_<uchar, double, double>
},
{cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>, {
cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0}, cvt_<schar, uchar, float>,
cvt_<schar, schar, float>,
{cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>, cvt_<schar, ushort, float>,
cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0}, cvt_<schar, short, float>,
cvt_<schar, int, float>,
{cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>, cvt_<schar, float, float>,
cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0}, cvt_<schar, double, double>
},
{cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>, {
cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0}, cvt_<ushort, uchar, float>,
cvt_<ushort, schar, float>,
{0,0,0,0,0,0,0,0} cvt_<ushort, ushort, float>,
cvt_<ushort, short, float>,
cvt_<ushort, int, float>,
cvt_<ushort, float, float>,
cvt_<ushort, double, double>
},
{
cvt_<short, uchar, float>,
cvt_<short, schar, float>,
cvt_<short, ushort, float>,
cvt_<short, short, float>,
cvt_<short, int, float>,
cvt_<short, float, float>,
cvt_<short, double, double>
},
{
cvt_<int, uchar, float>,
cvt_<int, schar, float>,
cvt_<int, ushort, float>,
cvt_<int, short, float>,
cvt_<int, int, double>,
cvt_<int, float, double>,
cvt_<int, double, double>
},
{
cvt_<float, uchar, float>,
cvt_<float, schar, float>,
cvt_<float, ushort, float>,
cvt_<float, short, float>,
cvt_<float, int, float>,
cvt_<float, float, float>,
cvt_<float, double, double>
},
{
cvt_<double, uchar, double>,
cvt_<double, schar, double>,
cvt_<double, ushort, double>,
cvt_<double, short, double>,
cvt_<double, int, double>,
cvt_<double, float, double>,
cvt_<double, double, double>
}
}; };
caller_t func = tab[sdepth][ddepth]; caller_t func = tab[sdepth][ddepth];
if (!func)
cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__, "convert_gpu");
func(src, dst, alpha, beta, stream); func(src, dst, alpha, beta, stream);
} }

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -45,8 +45,7 @@
#include <iostream> #include <iostream>
#ifdef HAVE_CUDA #ifdef HAVE_CUDA
#include <cuda.h> #include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <npp.h> #include <npp.h>
#define CUDART_MINIMUM_REQUIRED_VERSION 4010 #define CUDART_MINIMUM_REQUIRED_VERSION 4010
@ -69,33 +68,89 @@ using namespace cv::gpu;
namespace namespace
{ {
// Compares value to set using the given comparator. Returns true if class CudaArch
// there is at least one element x in the set satisfying to: x cmp value {
// predicate. public:
template <typename Comparer> CudaArch();
bool compareToSet(const std::string& set_as_str, int value, Comparer cmp)
bool builtWith(FeatureSet feature_set) const;
bool hasPtx(int major, int minor) const;
bool hasBin(int major, int minor) const;
bool hasEqualOrLessPtx(int major, int minor) const;
bool hasEqualOrGreaterPtx(int major, int minor) const;
bool hasEqualOrGreaterBin(int major, int minor) const;
private:
static void fromStr(const string& set_as_str, vector<int>& arr);
vector<int> bin;
vector<int> ptx;
vector<int> features;
};
const CudaArch cudaArch;
CudaArch::CudaArch()
{
#ifdef HAVE_CUDA
fromStr(CUDA_ARCH_BIN, bin);
fromStr(CUDA_ARCH_PTX, ptx);
fromStr(CUDA_ARCH_FEATURES, features);
#endif
}
bool CudaArch::builtWith(FeatureSet feature_set) const
{
return !features.empty() && (features.back() >= feature_set);
}
bool CudaArch::hasPtx(int major, int minor) const
{
return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end();
}
bool CudaArch::hasBin(int major, int minor) const
{
return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end();
}
bool CudaArch::hasEqualOrLessPtx(int major, int minor) const
{
return !ptx.empty() && (ptx.front() <= major * 10 + minor);
}
bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const
{
return !ptx.empty() && (ptx.back() >= major * 10 + minor);
}
bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const
{
return !bin.empty() && (bin.back() >= major * 10 + minor);
}
void CudaArch::fromStr(const string& set_as_str, vector<int>& arr)
{ {
if (set_as_str.find_first_not_of(" ") == string::npos) if (set_as_str.find_first_not_of(" ") == string::npos)
return false; return;
std::stringstream stream(set_as_str); istringstream stream(set_as_str);
int cur_value; int cur_value;
while (!stream.eof()) while (!stream.eof())
{ {
stream >> cur_value; stream >> cur_value;
if (cmp(cur_value, value)) arr.push_back(cur_value);
return true;
} }
return false; sort(arr.begin(), arr.end());
} }
} }
bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set) bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set)
{ {
#if defined (HAVE_CUDA) #if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_FEATURES, feature_set, std::greater_equal<int>()); return cudaArch.builtWith(feature_set);
#else #else
(void)feature_set; (void)feature_set;
return false; return false;
@ -110,7 +165,7 @@ bool cv::gpu::TargetArchs::has(int major, int minor)
bool cv::gpu::TargetArchs::hasPtx(int major, int minor) bool cv::gpu::TargetArchs::hasPtx(int major, int minor)
{ {
#if defined (HAVE_CUDA) #if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::equal_to<int>()); return cudaArch.hasPtx(major, minor);
#else #else
(void)major; (void)major;
(void)minor; (void)minor;
@ -121,7 +176,7 @@ bool cv::gpu::TargetArchs::hasPtx(int major, int minor)
bool cv::gpu::TargetArchs::hasBin(int major, int minor) bool cv::gpu::TargetArchs::hasBin(int major, int minor)
{ {
#if defined (HAVE_CUDA) #if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor, std::equal_to<int>()); return cudaArch.hasBin(major, minor);
#else #else
(void)major; (void)major;
(void)minor; (void)minor;
@ -132,8 +187,7 @@ bool cv::gpu::TargetArchs::hasBin(int major, int minor)
bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)
{ {
#if defined (HAVE_CUDA) #if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, return cudaArch.hasEqualOrLessPtx(major, minor);
std::less_equal<int>());
#else #else
(void)major; (void)major;
(void)minor; (void)minor;
@ -143,14 +197,13 @@ bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor)
bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor)
{ {
return hasEqualOrGreaterPtx(major, minor) || return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor);
hasEqualOrGreaterBin(major, minor);
} }
bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
{ {
#if defined (HAVE_CUDA) #if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::greater_equal<int>()); return cudaArch.hasEqualOrGreaterPtx(major, minor);
#else #else
(void)major; (void)major;
(void)minor; (void)minor;
@ -161,8 +214,7 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor)
bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
{ {
#if defined (HAVE_CUDA) #if defined (HAVE_CUDA)
return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor, return cudaArch.hasEqualOrGreaterBin(major, minor);
std::greater_equal<int>());
#else #else
(void)major; (void)major;
(void)minor; (void)minor;
@ -170,6 +222,31 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor)
#endif #endif
} }
bool cv::gpu::deviceSupports(FeatureSet feature_set)
{
static int versions[] =
{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
static const int cache_size = static_cast<int>(sizeof(versions) / sizeof(versions[0]));
const int devId = getDevice();
int version;
if (devId < cache_size && versions[devId] >= 0)
version = versions[devId];
else
{
DeviceInfo dev(devId);
version = dev.majorVersion() * 10 + dev.minorVersion();
if (devId < cache_size)
versions[devId] = version;
}
return TargetArchs::builtWith(feature_set) && (version >= feature_set);
}
#if !defined (HAVE_CUDA) #if !defined (HAVE_CUDA)
#define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support") #define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support")
@ -316,18 +393,6 @@ void cv::gpu::DeviceInfo::queryMemory(size_t& free_memory, size_t& total_memory)
namespace namespace
{ {
template <class T> void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
*attribute = T();
//CUresult error = CUDA_SUCCESS;// = cuDeviceGetAttribute( attribute, device_attribute, device ); why link erros under ubuntu??
CUresult error = cuDeviceGetAttribute( attribute, device_attribute, device );
if( CUDA_SUCCESS == error )
return;
printf("Driver API error = %04d\n", error);
cv::gpu::error("driver API error", __FILE__, __LINE__);
}
int convertSMVer2Cores(int major, int minor) int convertSMVer2Cores(int major, int minor)
{ {
// Defines for GPU Architecture types (using the SM version to determine the # of cores per SM // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
@ -336,7 +401,7 @@ namespace
int Cores; int Cores;
} SMtoCores; } SMtoCores;
SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, { -1, -1 } }; SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 } };
int index = 0; int index = 0;
while (gpuArchCoresPerSM[index].SM != -1) while (gpuArchCoresPerSM[index].SM != -1)
@ -345,7 +410,7 @@ namespace
return gpuArchCoresPerSM[index].Cores; return gpuArchCoresPerSM[index].Cores;
index++; index++;
} }
printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor);
return -1; return -1;
} }
} }
@ -383,21 +448,12 @@ void cv::gpu::printCudaDeviceInfo(int device)
printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor);
printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem);
printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n",
prop.multiProcessorCount, convertSMVer2Cores(prop.major, prop.minor),
convertSMVer2Cores(prop.major, prop.minor) * prop.multiProcessorCount);
printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f);
// This is not available in the CUDA Runtime API, so we make the necessary calls the driver API to support this for output int cores = convertSMVer2Cores(prop.major, prop.minor);
int memoryClock, memBusWidth, L2CacheSize; if (cores > 0)
getCudaAttribute<int>( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev ); printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount);
getCudaAttribute<int>( &memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev );
getCudaAttribute<int>( &L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev );
printf(" Memory Clock rate: %.2f Mhz\n", memoryClock * 1e-3f); printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f);
printf(" Memory Bus Width: %d-bit\n", memBusWidth);
if (L2CacheSize)
printf(" L2 Cache Size: %d bytes\n", L2CacheSize);
printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n",
prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1],
@ -458,7 +514,12 @@ void cv::gpu::printShortCudaDeviceInfo(int device)
const char *arch_str = prop.major < 2 ? " (not Fermi)" : ""; const char *arch_str = prop.major < 2 ? " (not Fermi)" : "";
printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f);
printf(", sm_%d%d%s, %d cores", prop.major, prop.minor, arch_str, convertSMVer2Cores(prop.major, prop.minor) * prop.multiProcessorCount); printf(", sm_%d%d%s", prop.major, prop.minor, arch_str);
int cores = convertSMVer2Cores(prop.major, prop.minor);
if (cores > 0)
printf(", %d cores", cores * prop.multiProcessorCount);
printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
} }
fflush(stdout); fflush(stdout);
@ -704,6 +765,43 @@ cv::Mat::Mat(const GpuMat& m) : flags(0), dims(0), rows(0), cols(0), data(0), re
m.download(*this); m.download(*this);
} }
void cv::gpu::createContinuous(int rows, int cols, int type, GpuMat& m)
{
int area = rows * cols;
if (m.empty() || m.type() != type || !m.isContinuous() || m.size().area() < area)
m.create(1, area, type);
m.cols = cols;
m.rows = rows;
m.step = m.elemSize() * cols;
m.flags |= Mat::CONTINUOUS_FLAG;
}
void cv::gpu::ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m)
{
if (m.empty() || m.type() != type || m.data != m.datastart)
m.create(rows, cols, type);
else
{
const size_t esz = m.elemSize();
const ptrdiff_t delta2 = m.dataend - m.datastart;
const size_t minstep = m.cols * esz;
Size wholeSize;
wholeSize.height = std::max(static_cast<int>((delta2 - minstep) / m.step + 1), m.rows);
wholeSize.width = std::max(static_cast<int>((delta2 - m.step * (wholeSize.height - 1)) / esz), m.cols);
if (wholeSize.height < rows || wholeSize.width < cols)
m.create(rows, cols, type);
else
{
m.cols = cols;
m.rows = rows;
}
}
}
namespace namespace
{ {
class GpuFuncTable class GpuFuncTable

@ -922,8 +922,8 @@ _InputArray::_InputArray(const Mat& m) : flags(MAT), obj((void*)&m) {}
_InputArray::_InputArray(const vector<Mat>& vec) : flags(STD_VECTOR_MAT), obj((void*)&vec) {} _InputArray::_InputArray(const vector<Mat>& vec) : flags(STD_VECTOR_MAT), obj((void*)&vec) {}
_InputArray::_InputArray(const double& val) : flags(FIXED_TYPE + FIXED_SIZE + MATX + CV_64F), obj((void*)&val), sz(Size(1,1)) {} _InputArray::_InputArray(const double& val) : flags(FIXED_TYPE + FIXED_SIZE + MATX + CV_64F), obj((void*)&val), sz(Size(1,1)) {}
_InputArray::_InputArray(const MatExpr& expr) : flags(FIXED_TYPE + FIXED_SIZE + EXPR), obj((void*)&expr) {} _InputArray::_InputArray(const MatExpr& expr) : flags(FIXED_TYPE + FIXED_SIZE + EXPR), obj((void*)&expr) {}
_InputArray::_InputArray(const GlBuffer& buf) : flags(FIXED_TYPE + FIXED_SIZE + OPENGL_BUFFER), obj((void*)&buf) {} _InputArray::_InputArray(const GlBuffer& buf) : flags(OPENGL_BUFFER), obj((void*)&buf) {}
_InputArray::_InputArray(const GlTexture& tex) : flags(FIXED_TYPE + FIXED_SIZE + OPENGL_TEXTURE), obj((void*)&tex) {} _InputArray::_InputArray(const GlTexture2D &tex) : flags(OPENGL_TEXTURE2D), obj((void*)&tex) {}
_InputArray::_InputArray(const gpu::GpuMat& d_mat) : flags(GPU_MAT), obj((void*)&d_mat) {} _InputArray::_InputArray(const gpu::GpuMat& d_mat) : flags(GPU_MAT), obj((void*)&d_mat) {}
Mat _InputArray::getMat(int i) const Mat _InputArray::getMat(int i) const
@ -1076,14 +1076,14 @@ GlBuffer _InputArray::getGlBuffer() const
} }
} }
GlTexture _InputArray::getGlTexture() const GlTexture2D _InputArray::getGlTexture2D() const
{ {
int k = kind(); int k = kind();
CV_Assert(k == OPENGL_TEXTURE); CV_Assert(k == OPENGL_TEXTURE2D);
//if( k == OPENGL_TEXTURE ) //if( k == OPENGL_TEXTURE )
{ {
const GlTexture* tex = (const GlTexture*)obj; const GlTexture2D* tex = (const GlTexture2D*)obj;
return *tex; return *tex;
} }
} }
@ -1168,10 +1168,10 @@ Size _InputArray::size(int i) const
return buf->size(); return buf->size();
} }
if( k == OPENGL_TEXTURE ) if( k == OPENGL_TEXTURE2D )
{ {
CV_Assert( i < 0 ); CV_Assert( i < 0 );
const GlTexture* tex = (const GlTexture*)obj; const GlTexture2D* tex = (const GlTexture2D*)obj;
return tex->size(); return tex->size();
} }
@ -1186,6 +1186,24 @@ Size _InputArray::size(int i) const
size_t _InputArray::total(int i) const size_t _InputArray::total(int i) const
{ {
int k = kind();
if( k == MAT )
{
CV_Assert( i < 0 );
return ((const Mat*)obj)->total();
}
if( k == STD_VECTOR_MAT )
{
const vector<Mat>& vv = *(const vector<Mat>*)obj;
if( i < 0 )
return vv.size();
CV_Assert( i < (int)vv.size() );
return vv[i].total();
}
return size(i).area(); return size(i).area();
} }
@ -1216,9 +1234,6 @@ int _InputArray::type(int i) const
if( k == OPENGL_BUFFER ) if( k == OPENGL_BUFFER )
return ((const GlBuffer*)obj)->type(); return ((const GlBuffer*)obj)->type();
if( k == OPENGL_TEXTURE )
return ((const GlTexture*)obj)->type();
CV_Assert( k == GPU_MAT ); CV_Assert( k == GPU_MAT );
//if( k == GPU_MAT ) //if( k == GPU_MAT )
return ((const gpu::GpuMat*)obj)->type(); return ((const gpu::GpuMat*)obj)->type();
@ -1271,8 +1286,8 @@ bool _InputArray::empty() const
if( k == OPENGL_BUFFER ) if( k == OPENGL_BUFFER )
return ((const GlBuffer*)obj)->empty(); return ((const GlBuffer*)obj)->empty();
if( k == OPENGL_TEXTURE ) if( k == OPENGL_TEXTURE2D )
return ((const GlTexture*)obj)->empty(); return ((const GlTexture2D*)obj)->empty();
CV_Assert( k == GPU_MAT ); CV_Assert( k == GPU_MAT );
//if( k == GPU_MAT ) //if( k == GPU_MAT )
@ -1285,10 +1300,14 @@ _OutputArray::~_OutputArray() {}
_OutputArray::_OutputArray(Mat& m) : _InputArray(m) {} _OutputArray::_OutputArray(Mat& m) : _InputArray(m) {}
_OutputArray::_OutputArray(vector<Mat>& vec) : _InputArray(vec) {} _OutputArray::_OutputArray(vector<Mat>& vec) : _InputArray(vec) {}
_OutputArray::_OutputArray(gpu::GpuMat& d_mat) : _InputArray(d_mat) {} _OutputArray::_OutputArray(gpu::GpuMat& d_mat) : _InputArray(d_mat) {}
_OutputArray::_OutputArray(GlBuffer& buf) : _InputArray(buf) {}
_OutputArray::_OutputArray(GlTexture2D& tex) : _InputArray(tex) {}
_OutputArray::_OutputArray(const Mat& m) : _InputArray(m) {flags |= FIXED_SIZE|FIXED_TYPE;} _OutputArray::_OutputArray(const Mat& m) : _InputArray(m) {flags |= FIXED_SIZE|FIXED_TYPE;}
_OutputArray::_OutputArray(const vector<Mat>& vec) : _InputArray(vec) {flags |= FIXED_SIZE;} _OutputArray::_OutputArray(const vector<Mat>& vec) : _InputArray(vec) {flags |= FIXED_SIZE;}
_OutputArray::_OutputArray(const gpu::GpuMat& d_mat) : _InputArray(d_mat) {flags |= FIXED_SIZE|FIXED_TYPE;} _OutputArray::_OutputArray(const gpu::GpuMat& d_mat) : _InputArray(d_mat) {flags |= FIXED_SIZE|FIXED_TYPE;}
_OutputArray::_OutputArray(const GlBuffer& buf) : _InputArray(buf) {flags |= FIXED_SIZE|FIXED_TYPE;}
_OutputArray::_OutputArray(const GlTexture2D& tex) : _InputArray(tex) {flags |= FIXED_SIZE|FIXED_TYPE;}
bool _OutputArray::fixedSize() const bool _OutputArray::fixedSize() const
@ -1318,6 +1337,13 @@ void _OutputArray::create(Size _sz, int mtype, int i, bool allowTransposed, int
((gpu::GpuMat*)obj)->create(_sz, mtype); ((gpu::GpuMat*)obj)->create(_sz, mtype);
return; return;
} }
if( k == OPENGL_BUFFER && i < 0 && !allowTransposed && fixedDepthMask == 0 )
{
CV_Assert(!fixedSize() || ((GlBuffer*)obj)->size() == _sz);
CV_Assert(!fixedType() || ((GlBuffer*)obj)->type() == mtype);
((GlBuffer*)obj)->create(_sz, mtype);
return;
}
int sizes[] = {_sz.height, _sz.width}; int sizes[] = {_sz.height, _sz.width};
create(2, sizes, mtype, i, allowTransposed, fixedDepthMask); create(2, sizes, mtype, i, allowTransposed, fixedDepthMask);
} }
@ -1339,6 +1365,13 @@ void _OutputArray::create(int rows, int cols, int mtype, int i, bool allowTransp
((gpu::GpuMat*)obj)->create(rows, cols, mtype); ((gpu::GpuMat*)obj)->create(rows, cols, mtype);
return; return;
} }
if( k == OPENGL_BUFFER && i < 0 && !allowTransposed && fixedDepthMask == 0 )
{
CV_Assert(!fixedSize() || ((GlBuffer*)obj)->size() == Size(cols, rows));
CV_Assert(!fixedType() || ((GlBuffer*)obj)->type() == mtype);
((GlBuffer*)obj)->create(rows, cols, mtype);
return;
}
int sizes[] = {rows, cols}; int sizes[] = {rows, cols};
create(2, sizes, mtype, i, allowTransposed, fixedDepthMask); create(2, sizes, mtype, i, allowTransposed, fixedDepthMask);
} }
@ -1558,6 +1591,18 @@ void _OutputArray::release() const
return; return;
} }
if( k == OPENGL_BUFFER )
{
((GlBuffer*)obj)->release();
return;
}
if( k == OPENGL_TEXTURE2D )
{
((GlTexture2D*)obj)->release();
return;
}
if( k == NONE ) if( k == NONE )
return; return;
@ -1623,6 +1668,20 @@ gpu::GpuMat& _OutputArray::getGpuMatRef() const
return *(gpu::GpuMat*)obj; return *(gpu::GpuMat*)obj;
} }
GlBuffer& _OutputArray::getGlBufferRef() const
{
int k = kind();
CV_Assert( k == OPENGL_BUFFER );
return *(GlBuffer*)obj;
}
GlTexture2D& _OutputArray::getGlTexture2DRef() const
{
int k = kind();
CV_Assert( k == OPENGL_TEXTURE2D );
return *(GlTexture2D*)obj;
}
static _OutputArray _none; static _OutputArray _none;
OutputArray noArray() { return _none; } OutputArray noArray() { return _none; }

File diff suppressed because it is too large Load Diff

@ -45,7 +45,6 @@
#include <ctype.h> #include <ctype.h>
#include <deque> #include <deque>
#include <iterator> #include <iterator>
#include <wchar.h>
#define USE_ZLIB 1 #define USE_ZLIB 1
@ -156,35 +155,6 @@ cv::string cv::FileStorage::getDefaultObjectName(const string& _filename)
return cv::string(name); return cv::string(name);
} }
namespace cv
{
#if !defined(ANDROID) || (defined(_GLIBCXX_USE_WCHAR_T) && _GLIBCXX_USE_WCHAR_T)
// Converts a wide string to a multibyte string using the current C locale (wcstombs).
// Returns an empty string when the input contains a character that cannot be converted.
string fromUtf16(const WString& str)
{
    // Reserve up to 4 bytes per wide character plus the terminator.
    const size_t maxBytes = str.size()*4;
    cv::AutoBuffer<char> _buf(maxBytes + 1);
    char* buf = _buf;

    // The byte limit must match the buffer capacity, not the number of wide
    // characters: limiting it to str.size() truncated every string that holds
    // a character encoded with more than one byte.
    size_t sz = wcstombs(buf, str.c_str(), maxBytes);
    if( sz == (size_t)-1 )
        return string();
    buf[sz] = '\0';
    return string(buf);
}

// Converts a multibyte string to a wide string using the current C locale (mbstowcs).
// Returns an empty wide string when the input holds an invalid multibyte sequence.
WString toUtf16(const string& str)
{
    // A multibyte string never yields more wide characters than it has bytes,
    // so str.size() elements plus the terminator are sufficient.
    cv::AutoBuffer<wchar_t> _buf(str.size() + 1);
    wchar_t* buf = _buf;

    size_t sz = mbstowcs(buf, str.c_str(), str.size());
    if( sz == (size_t)-1 )
        return WString();
    buf[sz] = '\0';
    return WString(buf);
}
#endif
}
typedef struct CvGenericHash typedef struct CvGenericHash
{ {
CV_SET_FIELDS() CV_SET_FIELDS()
@ -5200,6 +5170,7 @@ void FileStorage::release()
string FileStorage::releaseAndGetString() string FileStorage::releaseAndGetString()
{ {
string buf; string buf;
buf.reserve(16); // HACK: Work around for compiler bug
if( fs.obj && fs.obj->outbuf ) if( fs.obj && fs.obj->outbuf )
icvClose(fs.obj, &buf); icvClose(fs.obj, &buf);

@ -359,26 +359,24 @@ string format( const char* fmt, ... )
string tempfile( const char* suffix ) string tempfile( const char* suffix )
{ {
const char *temp_dir = getenv("OPENCV_TEMP_PATH");
string fname;
#if defined WIN32 || defined _WIN32 #if defined WIN32 || defined _WIN32
char temp_dir[MAX_PATH + 1] = { 0 }; char temp_dir2[MAX_PATH + 1] = { 0 };
char temp_file[MAX_PATH + 1] = { 0 }; char temp_file[MAX_PATH + 1] = { 0 };
::GetTempPathA(sizeof(temp_dir), temp_dir); if (temp_dir == 0 || temp_dir[0] == 0)
{
::GetTempPathA(sizeof(temp_dir2), temp_dir2);
temp_dir = temp_dir2;
}
if(0 == ::GetTempFileNameA(temp_dir, "ocv", 0, temp_file)) if(0 == ::GetTempFileNameA(temp_dir, "ocv", 0, temp_file))
return string(); return string();
DeleteFileA(temp_file); DeleteFileA(temp_file);
string name = temp_file; fname = temp_file;
if(suffix)
{
if (suffix[0] != '.')
return name + "." + suffix;
else
return name + suffix;
}
else
return name;
# else # else
# ifdef ANDROID # ifdef ANDROID
//char defaultTemplate[] = "/mnt/sdcard/__opencv_temp.XXXXXX"; //char defaultTemplate[] = "/mnt/sdcard/__opencv_temp.XXXXXX";
@ -387,8 +385,6 @@ string tempfile( const char* suffix )
char defaultTemplate[] = "/tmp/__opencv_temp.XXXXXX"; char defaultTemplate[] = "/tmp/__opencv_temp.XXXXXX";
# endif # endif
string fname;
const char *temp_dir = getenv("OPENCV_TEMP_PATH");
if (temp_dir == 0 || temp_dir[0] == 0) if (temp_dir == 0 || temp_dir[0] == 0)
fname = defaultTemplate; fname = defaultTemplate;
else else
@ -401,19 +397,20 @@ string tempfile( const char* suffix )
} }
const int fd = mkstemp((char*)fname.c_str()); const int fd = mkstemp((char*)fname.c_str());
if(fd == -1) return ""; if (fd == -1) return string();
close(fd); close(fd);
remove(fname.c_str()); remove(fname.c_str());
# endif
if (suffix) if (suffix)
{ {
if (suffix[0] != '.') if (suffix[0] != '.')
fname = fname + "." + suffix; return fname + "." + suffix;
else else
fname += suffix; return fname + suffix;
} }
return fname; return fname;
# endif
} }
static CvErrorCallback customErrorCallback = 0; static CvErrorCallback customErrorCallback = 0;

@ -150,7 +150,7 @@ void generateData( Mat& query, Mat& train, const int sourceType )
// in ascending order. General boundaries of the perturbation // in ascending order. General boundaries of the perturbation
// are (0.f, 1.f). // are (0.f, 1.f).
train.create( query.rows*countFactor, query.cols, sourceType ); train.create( query.rows*countFactor, query.cols, sourceType );
float step = 1.f / countFactor; float step = (sourceType == CV_8U ? 256.f : 1.f) / countFactor;
for( int qIdx = 0; qIdx < query.rows; qIdx++ ) for( int qIdx = 0; qIdx < query.rows; qIdx++ )
{ {
Mat queryDescriptor = query.row(qIdx); Mat queryDescriptor = query.row(qIdx);
@ -161,7 +161,7 @@ void generateData( Mat& query, Mat& train, const int sourceType )
queryDescriptor.copyTo( trainDescriptor ); queryDescriptor.copyTo( trainDescriptor );
int elem = rng(dim); int elem = rng(dim);
float diff = rng.uniform( step*c, step*(c+1) ); float diff = rng.uniform( step*c, step*(c+1) );
trainDescriptor.at<float>(0, elem) += diff; trainDescriptor.col(elem) += diff;
} }
} }
} }

@ -31,7 +31,7 @@ PERF_TEST_P(fast, detect, testing::Combine(
declare.in(frame); declare.in(frame);
Ptr<FeatureDetector> fd = Algorithm::create<FeatureDetector>("Feature2D.FAST"); Ptr<FeatureDetector> fd = Algorithm::create<FeatureDetector>("Feature2D.FAST");
ASSERT_FALSE( fd == 0 ); ASSERT_FALSE( fd.empty() );
fd->set("threshold", 20); fd->set("threshold", 20);
fd->set("nonmaxSuppression", true); fd->set("nonmaxSuppression", true);
fd->set("type", type); fd->set("type", type);

@ -232,7 +232,7 @@ void KeyPointsFilter::runByImageBorder( vector<KeyPoint>& keypoints, Size imageS
if (imageSize.height <= borderSize * 2 || imageSize.width <= borderSize * 2) if (imageSize.height <= borderSize * 2 || imageSize.width <= borderSize * 2)
keypoints.clear(); keypoints.clear();
else else
keypoints.erase( remove_if(keypoints.begin(), keypoints.end(), keypoints.erase( std::remove_if(keypoints.begin(), keypoints.end(),
RoiPredicate(Rect(Point(borderSize, borderSize), RoiPredicate(Rect(Point(borderSize, borderSize),
Point(imageSize.width - borderSize, imageSize.height - borderSize)))), Point(imageSize.width - borderSize, imageSize.height - borderSize)))),
keypoints.end() ); keypoints.end() );
@ -259,7 +259,7 @@ void KeyPointsFilter::runByKeypointSize( vector<KeyPoint>& keypoints, float minS
CV_Assert( maxSize >= 0); CV_Assert( maxSize >= 0);
CV_Assert( minSize <= maxSize ); CV_Assert( minSize <= maxSize );
keypoints.erase( remove_if(keypoints.begin(), keypoints.end(), SizePredicate(minSize, maxSize)), keypoints.erase( std::remove_if(keypoints.begin(), keypoints.end(), SizePredicate(minSize, maxSize)),
keypoints.end() ); keypoints.end() );
} }
@ -282,7 +282,7 @@ void KeyPointsFilter::runByPixelsMask( vector<KeyPoint>& keypoints, const Mat& m
if( mask.empty() ) if( mask.empty() )
return; return;
keypoints.erase(remove_if(keypoints.begin(), keypoints.end(), MaskPredicate(mask)), keypoints.end()); keypoints.erase(std::remove_if(keypoints.begin(), keypoints.end(), MaskPredicate(mask)), keypoints.end());
} }
struct KeyPoint_LessThan struct KeyPoint_LessThan

@ -77,7 +77,7 @@ DescriptorMatcher::DescriptorCollection::DescriptorCollection()
DescriptorMatcher::DescriptorCollection::DescriptorCollection( const DescriptorCollection& collection ) DescriptorMatcher::DescriptorCollection::DescriptorCollection( const DescriptorCollection& collection )
{ {
mergedDescriptors = collection.mergedDescriptors.clone(); mergedDescriptors = collection.mergedDescriptors.clone();
copy( collection.startIdxs.begin(), collection.startIdxs.begin(), startIdxs.begin() ); std::copy( collection.startIdxs.begin(), collection.startIdxs.begin(), startIdxs.begin() );
} }
DescriptorMatcher::DescriptorCollection::~DescriptorCollection() DescriptorMatcher::DescriptorCollection::~DescriptorCollection()
@ -531,7 +531,7 @@ void FlannBasedMatcher::train()
void FlannBasedMatcher::read( const FileNode& fn) void FlannBasedMatcher::read( const FileNode& fn)
{ {
if (indexParams == 0) if (indexParams.empty())
indexParams = new flann::IndexParams(); indexParams = new flann::IndexParams();
FileNode ip = fn["indexParams"]; FileNode ip = fn["indexParams"];
@ -570,7 +570,7 @@ void FlannBasedMatcher::read( const FileNode& fn)
}; };
} }
if (searchParams == 0) if (searchParams.empty())
searchParams = new flann::SearchParams(); searchParams = new flann::SearchParams();
FileNode sp = fn["searchParams"]; FileNode sp = fn["searchParams"];
@ -807,9 +807,9 @@ GenericDescriptorMatcher::KeyPointCollection::KeyPointCollection( const KeyPoint
keypoints.resize( collection.keypoints.size() ); keypoints.resize( collection.keypoints.size() );
for( size_t i = 0; i < keypoints.size(); i++ ) for( size_t i = 0; i < keypoints.size(); i++ )
copy( collection.keypoints[i].begin(), collection.keypoints[i].end(), keypoints[i].begin() ); std::copy( collection.keypoints[i].begin(), collection.keypoints[i].end(), keypoints[i].begin() );
copy( collection.startIndices.begin(), collection.startIndices.end(), startIndices.begin() ); std::copy( collection.startIndices.begin(), collection.startIndices.end(), startIndices.begin() );
} }
void GenericDescriptorMatcher::KeyPointCollection::add( const vector<Mat>& _images, void GenericDescriptorMatcher::KeyPointCollection::add( const vector<Mat>& _images,

@ -52,6 +52,8 @@
#include "opencv2/imgproc/imgproc_c.h" #include "opencv2/imgproc/imgproc_c.h"
#include "opencv2/core/internal.hpp" #include "opencv2/core/internal.hpp"
#include <algorithm>
#ifdef HAVE_TEGRA_OPTIMIZATION #ifdef HAVE_TEGRA_OPTIMIZATION
#include "opencv2/features2d/features2d_tegra.hpp" #include "opencv2/features2d/features2d_tegra.hpp"
#endif #endif

@ -22,17 +22,14 @@ source_group("Device" FILES ${lib_device_hdrs})
source_group("Device\\Detail" FILES ${lib_device_hdrs_detail}) source_group("Device\\Detail" FILES ${lib_device_hdrs_detail})
if (HAVE_CUDA) if (HAVE_CUDA)
file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp") file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp" "src/nvidia/*.h*")
file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu") file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu")
file(GLOB_RECURSE ncv_hdrs "src/nvidia/*.hpp" "src/nvidia/*.h") set(ncv_files ${ncv_srcs} ${ncv_cuda})
set(ncv_files ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda})
source_group("Src\\NVidia" FILES ${ncv_files}) source_group("Src\\NVidia" FILES ${ncv_files})
ocv_include_directories("src/nvidia" "src/nvidia/core" "src/nvidia/NPP_staging" ${CUDA_INCLUDE_DIRS}) ocv_include_directories("src/nvidia" "src/nvidia/core" "src/nvidia/NPP_staging" ${CUDA_INCLUDE_DIRS})
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations /wd4211 /wd4201 /wd4100 /wd4505 /wd4408) ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations /wd4211 /wd4201 /wd4100 /wd4505 /wd4408)
string(REPLACE "-Wsign-promo" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REPLACE "-Wsign-promo" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-keep")
#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/EHsc-;") #set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/EHsc-;")
if(MSVC) if(MSVC)
@ -47,23 +44,18 @@ if (HAVE_CUDA)
ocv_cuda_compile(cuda_objs ${lib_cuda} ${ncv_cuda}) ocv_cuda_compile(cuda_objs ${lib_cuda} ${ncv_cuda})
#CUDA_BUILD_CLEAN_TARGET()
set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY}) set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
if(NOT APPLE) if(WITH_NVCUVID)
unset(CUDA_nvcuvid_LIBRARY CACHE)
find_cuda_helper_libs(nvcuvid)
set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvid_LIBRARY}) set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvid_LIBRARY})
endif() endif()
if(WIN32) if(WIN32)
unset(CUDA_nvcuvenc_LIBRARY CACHE)
find_cuda_helper_libs(nvcuvenc) find_cuda_helper_libs(nvcuvenc)
set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvenc_LIBRARY}) set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvenc_LIBRARY})
endif() endif()
if(NOT APPLE AND WITH_FFMPEG) if(WITH_FFMPEG)
set(cuda_link_libs ${cuda_link_libs} ${HIGHGUI_LIBRARIES}) set(cuda_link_libs ${cuda_link_libs} ${HIGHGUI_LIBRARIES})
endif() endif()
else() else()

@ -0,0 +1,10 @@
# Standalone build script for the nv_perf_test benchmark executable.
cmake_minimum_required(VERSION 2.8.3)

project(nv_perf_test)

# Locate an installed OpenCV; the package config exports the include and library variables used below.
find_package(OpenCV REQUIRED)

# The package config exports the header search path as OpenCV_INCLUDE_DIRS (plural);
# the singular OpenCV_INCLUDE_DIR is not defined by it and would expand to nothing.
include_directories(${OpenCV_INCLUDE_DIRS})

add_executable(${PROJECT_NAME} main.cpp)

target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS})

Binary file not shown.

After

Width:  |  Height:  |  Size: 140 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 140 KiB

@ -0,0 +1,489 @@
#include <cstdio>
#define HAVE_CUDA 1
#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/video/video.hpp>
#include <opencv2/legacy/legacy.hpp>
#include <opencv2/ts/ts.hpp>
#include <opencv2/ts/ts_perf.hpp>
// Prints a banner naming the operating system family and bitness the binary was built for.
// The choice is made entirely at compile time; nothing is printed when none of
// _WIN32, linux or __APPLE__ is defined.
static void printOsInfo()
{
#if defined _WIN32 && defined _WIN64
    printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x64.\n[----------]\n"); fflush(stdout);
#elif defined _WIN32
    printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x32.\n[----------]\n"); fflush(stdout);
#elif defined linux && defined _LP64
    printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x64.\n[----------]\n"); fflush(stdout);
#elif defined linux
    printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x32.\n[----------]\n"); fflush(stdout);
#elif defined __APPLE__ && defined _LP64
    printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x64.\n[----------]\n"); fflush(stdout);
#elif defined __APPLE__
    printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x32.\n[----------]\n"); fflush(stdout);
#endif
}
// Prints the number of CUDA devices reported by the gpu module, followed by one
// summary section per device (name, compute capability, multiprocessor count,
// total and free memory, and a warning when the device is not compatible with this build).
static void printCudaInfo()
{
    const int deviceCount = cv::gpu::getCudaEnabledDeviceCount();

    printf("[----------]\n"); fflush(stdout);
    printf("[ GPU INFO ] \tCUDA device count:: %d.\n", deviceCount); fflush(stdout);
    printf("[----------]\n"); fflush(stdout);

    // Keep the "<" comparison: the loop must not run for a non-positive count.
    for (int dev = 0; dev < deviceCount; ++dev)
    {
        cv::gpu::DeviceInfo info(dev);

        printf("[----------]\n"); fflush(stdout);
        printf("[ DEVICE ] \t# %d %s.\n", dev, info.name().c_str()); fflush(stdout);
        printf("[ ] \tCompute capability: %d.%d\n", info.majorVersion(), info.minorVersion()); fflush(stdout);
        printf("[ ] \tMulti Processor Count: %d\n", info.multiProcessorCount()); fflush(stdout);

        // Bytes -> whole kilobytes -> whole megabytes, truncating at each step.
        const int totalKb = static_cast<int>(info.totalMemory() / 1024.0);
        const int totalMb = static_cast<int>(totalKb / 1024.0);
        printf("[ ] \tTotal memory: %d Mb\n", totalMb); fflush(stdout);

        const int freeKb = static_cast<int>(info.freeMemory() / 1024.0);
        const int freeMb = static_cast<int>(freeKb / 1024.0);
        printf("[ ] \tFree memory: %d Mb\n", freeMb); fflush(stdout);

        if (!info.isCompatible())
        {
            printf("[ GPU INFO ] \tThis device is NOT compatible with current GPU module build\n");
        }

        printf("[----------]\n"); fflush(stdout);
    }
}
// Entry point of the benchmark: prints the environment banners, initialises the
// perf regression storage, the perf test base and Google Test, then runs every test.
// Returns the value produced by RUN_ALL_TESTS().
int main(int argc, char* argv[])
{
    // Environment report first, so it precedes any test output.
    printOsInfo();
    printCudaInfo();

    // Framework initialisation; the order of these calls is kept as-is.
    perf::Regression::Init("nv_perf_test");
    perf::TestBase::Init(argc, argv);
    testing::InitGoogleTest(&argc, argv);

    const int exitCode = RUN_ALL_TESTS();
    return exitCode;
}
// Declares `name` as a perf test fixture type parameterised by a tuple of the listed types.
#define DEF_PARAM_TEST(name, ...) typedef ::perf::TestBaseWithParam< std::tr1::tuple< __VA_ARGS__ > > name
// Declares `name` as a perf test fixture type parameterised by the single type `param_type`.
#define DEF_PARAM_TEST_1(name, param_type) typedef ::perf::TestBaseWithParam< param_type > name
//////////////////////////////////////////////////////////
// HoughLinesP
// Fixture parameterised by the path of the input image.
DEF_PARAM_TEST_1(Image, std::string);

// Measures probabilistic Hough line detection on one image, on the GPU when
// PERF_RUN_GPU() is true and on the CPU otherwise. Each implementation is
// called once before the timed TEST_CYCLE() loop.
PERF_TEST_P(Image, HoughLinesP,
            testing::Values(std::string("im1_1280x800.jpg")))
{
    declare.time(30.0);

    std::string fileName = GetParam();

    const double rho = 1.0;
    const double theta = 1.0;
    const int threshold = 40;   // used by the CPU implementation only
    const int minLineLength = 20;
    const int maxLineGap = 5;

    cv::Mat image = cv::imread(fileName, cv::IMREAD_GRAYSCALE);
    // Fail with a clear message instead of passing an empty matrix further,
    // consistent with the other tests in this file.
    if (image.empty())
        FAIL() << "Unable to load source image [" << fileName << "]";

    if (PERF_RUN_GPU())
    {
        // NOTE(review): the GPU path is fed the grayscale image as loaded, while the
        // CPU path below runs on a Canny edge mask - confirm the two inputs are
        // intended to differ before comparing the timings.
        cv::gpu::GpuMat d_image(image);
        cv::gpu::GpuMat d_lines;
        cv::gpu::HoughLinesBuf d_buf;

        cv::gpu::HoughLinesP(d_image, d_lines, d_buf, rho, theta, minLineLength, maxLineGap);

        TEST_CYCLE()
        {
            cv::gpu::HoughLinesP(d_image, d_lines, d_buf, rho, theta, minLineLength, maxLineGap);
        }
    }
    else
    {
        cv::Mat mask;
        cv::Canny(image, mask, 50, 100);

        std::vector<cv::Vec4i> lines;
        cv::HoughLinesP(mask, lines, rho, theta, threshold, minLineLength, maxLineGap);

        TEST_CYCLE()
        {
            cv::HoughLinesP(mask, lines, rho, theta, threshold, minLineLength, maxLineGap);
        }
    }

    SANITY_CHECK(0);
}
//////////////////////////////////////////////////////////
// GoodFeaturesToTrack
// Fixture parameterised by (image path, matrix depth).
DEF_PARAM_TEST(Image_Depth, std::string, perf::MatDepth);

// Measures corner detection with a mask that zeroes the top-left 100x100 region.
// The GPU detector is run for every depth; the CPU path fails for any depth other than CV_8U.
PERF_TEST_P(Image_Depth, GoodFeaturesToTrack,
            testing::Combine(
                testing::Values(std::string("im1_1280x800.jpg")),
                testing::Values(CV_8U, CV_16U)
            ))
{
    declare.time(60);

    const std::string imagePath = std::tr1::get<0>(GetParam());
    const int imageDepth = std::tr1::get<1>(GetParam());

    // Detector settings shared by both implementations.
    const int maxCorners = 5000;
    const double qualityLevel = 0.05;
    const int minDistance = 5;
    const int blockSize = 3;
    const bool useHarrisDetector = true;
    const double k = 0.05;

    cv::Mat frame = cv::imread(imagePath, cv::IMREAD_GRAYSCALE);
    if (frame.empty())
    {
        FAIL() << "Unable to load source image [" << imagePath << "]";
    }

    if (imageDepth != CV_8U)
    {
        frame.convertTo(frame, imageDepth);
    }

    // All-ones mask with the top-left corner excluded from detection.
    cv::Mat roiMask(frame.size(), CV_8UC1, cv::Scalar::all(1));
    roiMask(cv::Rect(0, 0, 100, 100)).setTo(cv::Scalar::all(0));

    if (PERF_RUN_GPU())
    {
        cv::gpu::GoodFeaturesToTrackDetector_GPU gpuDetector(maxCorners, qualityLevel, minDistance, blockSize, useHarrisDetector, k);

        cv::gpu::GpuMat gpuFrame(frame);
        cv::gpu::GpuMat gpuMask(roiMask);
        cv::gpu::GpuMat gpuCorners;

        // Untimed call before the measured loop.
        gpuDetector(gpuFrame, gpuCorners, gpuMask);

        TEST_CYCLE()
        {
            gpuDetector(gpuFrame, gpuCorners, gpuMask);
        }
    }
    else
    {
        if (imageDepth != CV_8U)
        {
            FAIL() << "Unsupported depth";
        }

        cv::Mat corners;

        // Untimed call before the measured loop.
        cv::goodFeaturesToTrack(frame, corners, maxCorners, qualityLevel, minDistance, roiMask, blockSize, useHarrisDetector, k);

        TEST_CYCLE()
        {
            cv::goodFeaturesToTrack(frame, corners, maxCorners, qualityLevel, minDistance, roiMask, blockSize, useHarrisDetector, k);
        }
    }

    SANITY_CHECK(0);
}
//////////////////////////////////////////////////////////
// OpticalFlowPyrLKSparse
// Pair of image paths: first frame, second frame.
typedef std::pair<std::string, std::string> string_pair;

// Fixture parameterised by (image pair, matrix depth, load-as-grayscale flag).
DEF_PARAM_TEST(ImagePair_Depth_GraySource, string_pair, perf::MatDepth, bool);

// Measures sparse pyramidal Lucas-Kanade optical flow between two frames.
// The points to track are detected on a grayscale version of the first frame
// before the frames are converted to the requested depth.
// The CPU path fails for any depth other than CV_8U.
PERF_TEST_P(ImagePair_Depth_GraySource, OpticalFlowPyrLKSparse,
            testing::Combine(
                testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")),
                testing::Values(CV_8U, CV_16U),
                testing::Bool()
            ))
{
    declare.time(60);

    const string_pair imagePaths = std::tr1::get<0>(GetParam());
    const int imageDepth = std::tr1::get<1>(GetParam());
    const bool loadAsGray = std::tr1::get<2>(GetParam());

    // PyrLK params
    const cv::Size lkWindow(15, 15);
    const int lkMaxLevel = 5;
    const cv::TermCriteria lkCriteria(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, 30, 0.01);

    // GoodFeaturesToTrack params
    const int maxCorners = 5000;
    const double qualityLevel = 0.05;
    const int minDistance = 5;
    const int blockSize = 3;
    const bool useHarrisDetector = true;
    const double k = 0.05;

    const int readMode = loadAsGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR;

    cv::Mat frame0 = cv::imread(imagePaths.first, readMode);
    if (frame0.empty())
    {
        FAIL() << "Unable to load source image [" << imagePaths.first << "]";
    }

    cv::Mat frame1 = cv::imread(imagePaths.second, readMode);
    if (frame1.empty())
    {
        FAIL() << "Unable to load source image [" << imagePaths.second << "]";
    }

    // Corner detection needs a single-channel image.
    cv::Mat grayFrame0;
    if (!loadAsGray)
        cv::cvtColor(frame0, grayFrame0, cv::COLOR_BGR2GRAY);
    else
        grayFrame0 = frame0;

    cv::Mat prevPts;
    cv::goodFeaturesToTrack(grayFrame0, prevPts, maxCorners, qualityLevel, minDistance, cv::noArray(), blockSize, useHarrisDetector, k);

    if (imageDepth != CV_8U)
    {
        frame0.convertTo(frame0, imageDepth);
        frame1.convertTo(frame1, imageDepth);
    }

    if (PERF_RUN_GPU())
    {
        cv::gpu::GpuMat gpuFrame0(frame0);
        cv::gpu::GpuMat gpuFrame1(frame1);
        // The points are uploaded as a single-row, two-channel matrix.
        cv::gpu::GpuMat gpuPrevPts(prevPts.reshape(2, 1));
        cv::gpu::GpuMat gpuNextPts;
        cv::gpu::GpuMat gpuStatus;

        cv::gpu::PyrLKOpticalFlow gpuTracker;
        gpuTracker.winSize = lkWindow;
        gpuTracker.maxLevel = lkMaxLevel;
        gpuTracker.iters = lkCriteria.maxCount;
        gpuTracker.useInitialFlow = false;

        // Untimed call before the measured loop.
        gpuTracker.sparse(gpuFrame0, gpuFrame1, gpuPrevPts, gpuNextPts, gpuStatus);

        TEST_CYCLE()
        {
            gpuTracker.sparse(gpuFrame0, gpuFrame1, gpuPrevPts, gpuNextPts, gpuStatus);
        }
    }
    else
    {
        if (imageDepth != CV_8U)
        {
            FAIL() << "Unsupported depth";
        }

        cv::Mat nextPts;
        cv::Mat trackStatus;

        // Untimed call before the measured loop.
        cv::calcOpticalFlowPyrLK(frame0, frame1, prevPts, nextPts, trackStatus, cv::noArray(), lkWindow, lkMaxLevel, lkCriteria);

        TEST_CYCLE()
        {
            cv::calcOpticalFlowPyrLK(frame0, frame1, prevPts, nextPts, trackStatus, cv::noArray(), lkWindow, lkMaxLevel, lkCriteria);
        }
    }

    SANITY_CHECK(0);
}
//////////////////////////////////////////////////////////
// OpticalFlowFarneback
// Fixture parameterised by (image pair, matrix depth).
DEF_PARAM_TEST(ImagePair_Depth, string_pair, perf::MatDepth);

// Measures Farneback dense optical flow between two grayscale frames.
// Both paths start from zero-filled flow fields and run with OPTFLOW_USE_INITIAL_FLOW.
// Ten timed iterations per path; the CPU path fails for any depth other than CV_8U.
PERF_TEST_P(ImagePair_Depth, OpticalFlowFarneback,
            testing::Combine(
                testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")),
                testing::Values(CV_8U, CV_16U)
            ))
{
    declare.time(500);

    const string_pair imagePaths = std::tr1::get<0>(GetParam());
    const int imageDepth = std::tr1::get<1>(GetParam());

    // Algorithm settings shared by both implementations.
    const double pyramidScale = 0.5;
    const int levelCount = 6;
    const int windowSize = 7;
    const int iterationCount = 15;
    const int polyNeighborhood = 7;
    const double polyStdDev = 1.5;
    const int flowFlags = cv::OPTFLOW_USE_INITIAL_FLOW;

    cv::Mat frame0 = cv::imread(imagePaths.first, cv::IMREAD_GRAYSCALE);
    if (frame0.empty())
    {
        FAIL() << "Unable to load source image [" << imagePaths.first << "]";
    }

    cv::Mat frame1 = cv::imread(imagePaths.second, cv::IMREAD_GRAYSCALE);
    if (frame1.empty())
    {
        FAIL() << "Unable to load source image [" << imagePaths.second << "]";
    }

    if (imageDepth != CV_8U)
    {
        frame0.convertTo(frame0, imageDepth);
        frame1.convertTo(frame1, imageDepth);
    }

    if (PERF_RUN_GPU())
    {
        cv::gpu::GpuMat gpuFrame0(frame0);
        cv::gpu::GpuMat gpuFrame1(frame1);
        // Zero-filled flow components, one single-channel float matrix each.
        cv::gpu::GpuMat gpuFlowX(frame0.size(), CV_32FC1, cv::Scalar::all(0));
        cv::gpu::GpuMat gpuFlowY(frame0.size(), CV_32FC1, cv::Scalar::all(0));

        cv::gpu::FarnebackOpticalFlow gpuFarneback;
        gpuFarneback.pyrScale = pyramidScale;
        gpuFarneback.numLevels = levelCount;
        gpuFarneback.winSize = windowSize;
        gpuFarneback.numIters = iterationCount;
        gpuFarneback.polyN = polyNeighborhood;
        gpuFarneback.polySigma = polyStdDev;
        gpuFarneback.flags = flowFlags;

        // Untimed call before the measured loop.
        gpuFarneback(gpuFrame0, gpuFrame1, gpuFlowX, gpuFlowY);

        TEST_CYCLE_N(10)
        {
            gpuFarneback(gpuFrame0, gpuFrame1, gpuFlowX, gpuFlowY);
        }
    }
    else
    {
        if (imageDepth != CV_8U)
        {
            FAIL() << "Unsupported depth";
        }

        // Zero-filled two-channel float flow field.
        cv::Mat flowField(frame0.size(), CV_32FC2, cv::Scalar::all(0));

        // Untimed call before the measured loop.
        cv::calcOpticalFlowFarneback(frame0, frame1, flowField, pyramidScale, levelCount, windowSize, iterationCount, polyNeighborhood, polyStdDev, flowFlags);

        TEST_CYCLE_N(10)
        {
            cv::calcOpticalFlowFarneback(frame0, frame1, flowField, pyramidScale, levelCount, windowSize, iterationCount, polyNeighborhood, polyStdDev, flowFlags);
        }
    }

    SANITY_CHECK(0);
}
//////////////////////////////////////////////////////////
// OpticalFlowBM
// C++ convenience wrapper around the legacy C function cvCalcOpticalFlowBM.
// Allocates velx and vely as single-channel float matrices of size
//   ((curr.cols - bSize.width  + shiftSize.width)  / shiftSize.width,
//    (curr.rows - bSize.height + shiftSize.height) / shiftSize.height)
// and forwards all arguments to the C implementation.
void calcOpticalFlowBM(const cv::Mat& prev, const cv::Mat& curr,
                       cv::Size bSize, cv::Size shiftSize, cv::Size maxRange, int usePrevious,
                       cv::Mat& velx, cv::Mat& vely)
{
    const int gridCols = (curr.cols - bSize.width + shiftSize.width)/shiftSize.width;
    const int gridRows = (curr.rows - bSize.height + shiftSize.height)/shiftSize.height;
    const cv::Size gridSize(gridCols, gridRows);

    velx.create(gridSize, CV_32FC1);
    vely.create(gridSize, CV_32FC1);

    // C-API headers for the four matrices.
    CvMat prevHeader = prev;
    CvMat currHeader = curr;
    CvMat velxHeader = velx;
    CvMat velyHeader = vely;

    cvCalcOpticalFlowBM(&prevHeader, &currHeader, bSize, shiftSize, maxRange, usePrevious, &velxHeader, &velyHeader);
}
// Fixture parameterised by (image pair, block size, shift size, maximum search range).
DEF_PARAM_TEST(ImagePair_BlockSize_ShiftSize_MaxRange, string_pair, cv::Size, cv::Size, cv::Size);

// Measures block-matching optical flow between two grayscale frames:
// cv::gpu::calcOpticalFlowBM on the GPU, the local calcOpticalFlowBM wrapper on the CPU.
// Ten timed iterations per path.
PERF_TEST_P(ImagePair_BlockSize_ShiftSize_MaxRange, OpticalFlowBM,
            testing::Combine(
                testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")),
                testing::Values(cv::Size(16, 16)),
                testing::Values(cv::Size(2, 2)),
                testing::Values(cv::Size(16, 16))
            ))
{
    declare.time(1000);

    const string_pair imagePaths = std::tr1::get<0>(GetParam());
    const cv::Size blockSize = std::tr1::get<1>(GetParam());
    const cv::Size shiftSize = std::tr1::get<2>(GetParam());
    const cv::Size maxRange = std::tr1::get<3>(GetParam());

    cv::Mat frame0 = cv::imread(imagePaths.first, cv::IMREAD_GRAYSCALE);
    if (frame0.empty())
    {
        FAIL() << "Unable to load source image [" << imagePaths.first << "]";
    }

    cv::Mat frame1 = cv::imread(imagePaths.second, cv::IMREAD_GRAYSCALE);
    if (frame1.empty())
    {
        FAIL() << "Unable to load source image [" << imagePaths.second << "]";
    }

    if (PERF_RUN_GPU())
    {
        cv::gpu::GpuMat gpuFrame0(frame0);
        cv::gpu::GpuMat gpuFrame1(frame1);
        cv::gpu::GpuMat gpuVelX, gpuVelY, gpuScratch;

        // Untimed call before the measured loop.
        cv::gpu::calcOpticalFlowBM(gpuFrame0, gpuFrame1, blockSize, shiftSize, maxRange, false, gpuVelX, gpuVelY, gpuScratch);

        TEST_CYCLE_N(10)
        {
            cv::gpu::calcOpticalFlowBM(gpuFrame0, gpuFrame1, blockSize, shiftSize, maxRange, false, gpuVelX, gpuVelY, gpuScratch);
        }
    }
    else
    {
        cv::Mat velX, velY;

        // Untimed call before the measured loop.
        calcOpticalFlowBM(frame0, frame1, blockSize, shiftSize, maxRange, false, velX, velY);

        TEST_CYCLE_N(10)
        {
            calcOpticalFlowBM(frame0, frame1, blockSize, shiftSize, maxRange, false, velX, velY);
        }
    }

    SANITY_CHECK(0);
}
// Measures the GPU FastOpticalFlowBM functor against the CPU block-matching wrapper
// calcOpticalFlowBM on two grayscale frames. The GPU functor receives only the widths
// of the search range and block size; the shift size is used by the CPU path alone.
// Ten timed iterations per path.
PERF_TEST_P(ImagePair_BlockSize_ShiftSize_MaxRange, FastOpticalFlowBM,
            testing::Combine(
                testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")),
                testing::Values(cv::Size(16, 16)),
                testing::Values(cv::Size(1, 1)),
                testing::Values(cv::Size(16, 16))
            ))
{
    declare.time(1000);

    const string_pair imagePaths = std::tr1::get<0>(GetParam());
    const cv::Size blockSize = std::tr1::get<1>(GetParam());
    const cv::Size shiftSize = std::tr1::get<2>(GetParam());
    const cv::Size maxRange = std::tr1::get<3>(GetParam());

    cv::Mat frame0 = cv::imread(imagePaths.first, cv::IMREAD_GRAYSCALE);
    if (frame0.empty())
    {
        FAIL() << "Unable to load source image [" << imagePaths.first << "]";
    }

    cv::Mat frame1 = cv::imread(imagePaths.second, cv::IMREAD_GRAYSCALE);
    if (frame1.empty())
    {
        FAIL() << "Unable to load source image [" << imagePaths.second << "]";
    }

    if (PERF_RUN_GPU())
    {
        cv::gpu::GpuMat gpuFrame0(frame0);
        cv::gpu::GpuMat gpuFrame1(frame1);
        cv::gpu::GpuMat gpuVelX, gpuVelY;

        cv::gpu::FastOpticalFlowBM gpuFastBM;

        // Untimed call before the measured loop.
        gpuFastBM(gpuFrame0, gpuFrame1, gpuVelX, gpuVelY, maxRange.width, blockSize.width);

        TEST_CYCLE_N(10)
        {
            gpuFastBM(gpuFrame0, gpuFrame1, gpuVelX, gpuVelY, maxRange.width, blockSize.width);
        }
    }
    else
    {
        cv::Mat velX, velY;

        // Untimed call before the measured loop.
        calcOpticalFlowBM(frame0, frame1, blockSize, shiftSize, maxRange, false, velX, velY);

        TEST_CYCLE_N(10)
        {
            calcOpticalFlowBM(frame0, frame1, blockSize, shiftSize, maxRange, false, velX, velY);
        }
    }

    SANITY_CHECK(0);
}

@ -199,6 +199,91 @@ Returns block descriptors computed for the whole image.
The function is mainly used to learn the classifier. The function is mainly used to learn the classifier.
Soft Cascade Classifier
==========================
Soft Cascade Classifier for Object Detection
----------------------------------------------------------
Cascade detectors have been shown to operate extremely rapidly and with high accuracy, and they have important applications in many different fields. The initial goal of this cascade implementation was a fast and accurate pedestrian detector, but it is also useful in general. The soft cascade is trained with AdaBoost, but instead of training a sequence of stages, it is trained as one long stage of T weak classifiers. The soft cascade is formulated as follows:
.. math::
\texttt{H}(x) = \sum _{\texttt{t}=1..\texttt{T}} {\texttt{s}_t(x)}
where :math:`\texttt{s}_t(x) = \alpha_t\texttt{h}_t(x)` are the set of thresholded weak classifiers selected during AdaBoost training scaled by the associated weights. Let
.. math::
\texttt{H}_t(x) = \sum _{\texttt{i}=1..\texttt{t}} {\texttt{s}_i(x)}
be the partial sum of the sample responses up to and including the :math:`t`-th weak classifier. The function :math:`\texttt{H}_t(x)` of :math:`t` for a sample :math:`x` is called the *sample trace*.
After each weak classifier evaluation, the sample trace at the point :math:`t` is compared with the rejection threshold :math:`r_t`. The sequence of :math:`r_t` is called the *rejection trace*.
The sample is rejected if its trace falls below the rejection threshold, so the stageless cascade allows a non-object sample to be rejected as early as possible. The sample trace can also be interpreted as the confidence with which the sample is recognized as the desired object. At each :math:`t` that confidence depends on all previous weak classifiers. This feature of the soft cascade results in more accurate detection. The original formulation of the soft cascade can be found in [BJ05]_.
.. [BJ05] Lubomir Bourdev and Jonathan Brandt. Robust Object Detection Via Soft Cascade. IEEE CVPR, 2005.
.. [BMTG12] Rodrigo Benenson, Markus Mathias, Radu Timofte and Luc Van Gool. Pedestrian detection at 100 frames per second. IEEE CVPR, 2012.
gpu::SCascade
-----------------------------------------------
.. ocv:class:: gpu::SCascade : public Algorithm
Implementation of soft (stageless) cascaded detector. ::
class CV_EXPORTS SCascade : public Algorithm
{
struct CV_EXPORTS Detection
{
ushort x;
ushort y;
ushort w;
ushort h;
float confidence;
int kind;
enum {PEDESTRIAN = 0};
};
SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, const int rejfactor = 1);
virtual ~SCascade();
virtual bool load(const FileNode& fn);
virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const;
virtual void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const;
};
gpu::SCascade::~SCascade
---------------------------
Destructor for SCascade.
.. ocv:function:: gpu::SCascade::~SCascade()
gpu::SCascade::load
--------------------------
Load cascade from FileNode.
.. ocv:function:: bool gpu::SCascade::load(const FileNode& fn)
:param fn: File node from which the soft cascade is read.
gpu::SCascade::detect
--------------------------
Applies the cascade to an input frame and returns the vector of Detection objects.
.. ocv:function:: void gpu::SCascade::detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const
:param image: a frame on which detector will be applied.
:param rois: a mask of regions of interest generated by genRoi. Only the objects that fall into one of the regions will be returned.
:param objects: an output array of Detections represented as GpuMat of detections (SCascade::Detection). The first element of the matrix is actually a count of detections.
:param stream: a high-level CUDA stream abstraction used for asynchronous execution.
gpu::CascadeClassifier_GPU gpu::CascadeClassifier_GPU
-------------------------- --------------------------

@ -85,8 +85,6 @@ static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int
cv::gpu::error(cudaGetErrorString(err), file, line, func); cv::gpu::error(cudaGetErrorString(err), file, line, func);
} }
#ifdef __CUDACC__
namespace cv { namespace gpu namespace cv { namespace gpu
{ {
__host__ __device__ __forceinline__ int divUp(int total, int grain) __host__ __device__ __forceinline__ int divUp(int total, int grain)
@ -96,19 +94,25 @@ namespace cv { namespace gpu
namespace device namespace device
{ {
using cv::gpu::divUp;
#ifdef __CUDACC__
typedef unsigned char uchar; typedef unsigned char uchar;
typedef unsigned short ushort; typedef unsigned short ushort;
typedef signed char schar; typedef signed char schar;
#ifdef WIN32
typedef unsigned int uint; typedef unsigned int uint;
#endif
template<class T> inline void bindTexture(const textureReference* tex, const PtrStepSz<T>& img) template<class T> inline void bindTexture(const textureReference* tex, const PtrStepSz<T>& img)
{ {
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>(); cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) ); cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
} }
#endif // __CUDACC__
} }
}} }}
#endif // __CUDACC__
#endif // __OPENCV_GPU_COMMON_HPP__ #endif // __OPENCV_GPU_COMMON_HPP__

@ -807,9 +807,9 @@ namespace cv { namespace gpu { namespace device
template <int bidx, typename T, typename D> static __device__ __forceinline__ void RGB2XYZConvert(const T* src, D& dst) template <int bidx, typename T, typename D> static __device__ __forceinline__ void RGB2XYZConvert(const T* src, D& dst)
{ {
dst.z = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[6] + src[1] * c_RGB2XYZ_D65i[7] + src[bidx] * c_RGB2XYZ_D65i[8], xyz_shift));
dst.x = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[0] + src[1] * c_RGB2XYZ_D65i[1] + src[bidx] * c_RGB2XYZ_D65i[2], xyz_shift)); dst.x = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[0] + src[1] * c_RGB2XYZ_D65i[1] + src[bidx] * c_RGB2XYZ_D65i[2], xyz_shift));
dst.y = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[3] + src[1] * c_RGB2XYZ_D65i[4] + src[bidx] * c_RGB2XYZ_D65i[5], xyz_shift)); dst.y = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[3] + src[1] * c_RGB2XYZ_D65i[4] + src[bidx] * c_RGB2XYZ_D65i[5], xyz_shift));
dst.z = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[6] + src[1] * c_RGB2XYZ_D65i[7] + src[bidx] * c_RGB2XYZ_D65i[8], xyz_shift));
} }
template <int bidx> static __device__ __forceinline__ uint RGB2XYZConvert(uint src) template <int bidx> static __device__ __forceinline__ uint RGB2XYZConvert(uint src)

@ -0,0 +1,361 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_REDUCE_DETAIL_HPP__
#define __OPENCV_GPU_REDUCE_DETAIL_HPP__
#include <thrust/tuple.h>
#include "../warp.hpp"
#include "../warp_shuffle.hpp"
namespace cv { namespace gpu { namespace device
{
namespace reduce_detail
{
// Maps a pointer or reference type onto the element type it designates.
// Only the three forms specialised below are supported; any other argument
// leaves GetType incomplete and is rejected at compile time.
template <typename Source> struct GetType;

// Plain pointer: Elem* -> Elem.
template <typename Elem> struct GetType<Elem*>
{
    typedef Elem type;
};

// Pointer to volatile: volatile Elem* -> Elem (the qualifier is dropped).
template <typename Elem> struct GetType<volatile Elem*>
{
    typedef Elem type;
};

// Reference: Elem& -> Elem.
template <typename Elem> struct GetType<Elem&>
{
    typedef Elem type;
};
// Compile-time loop over tuple slots I, I + 1, ..., N - 1.
// Every member handles slot I and then recurses into For<I + 1, N>;
// the For<N, N> specialisation ends the recursion.
template <unsigned int I, unsigned int N>
struct For
{
    // Writes slot I of val to index tid of the buffer held in slot I of smem.
    template <class PointerTuple, class ValTuple>
    static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
    {
        typedef For<I + 1, N> Next;

        thrust::get<I>(smem)[tid] = thrust::get<I>(val);
        Next::loadToSmem(smem, val, tid);
    }

    // Reads index tid of the buffer held in slot I of smem back into slot I of val.
    template <class PointerTuple, class ValTuple>
    static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
    {
        typedef For<I + 1, N> Next;

        thrust::get<I>(val) = thrust::get<I>(smem)[tid];
        Next::loadFromSmem(smem, val, tid);
    }

    // Combines slot I of val with the buffer element at tid + delta using slot I of op,
    // then stores the result both in val and at index tid of the buffer.
    template <class PointerTuple, class ValTuple, class OpTuple>
    static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op)
    {
        typedef typename thrust::tuple_element<I, PointerTuple>::type pointer_type;
        typedef typename GetType<pointer_type>::type value_type;
        typedef For<I + 1, N> Next;

        value_type reg = thrust::get<I>(smem)[tid + delta];
        thrust::get<I>(smem)[tid] = thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);

        Next::merge(smem, val, tid, delta, op);
    }

    // Combines slot I of val with the value obtained through shfl_down(..., delta, width)
    // using slot I of op; the result is kept in val only.
    template <class ValTuple, class OpTuple>
    static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op)
    {
        typedef typename thrust::tuple_element<I, ValTuple>::type reference_type;
        typedef typename GetType<reference_type>::type value_type;
        typedef For<I + 1, N> Next;

        value_type reg = shfl_down(thrust::get<I>(val), delta, width);
        thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);

        Next::mergeShfl(val, delta, width, op);
    }
};
// End of the compile-time loop: all N slots have been visited, so every
// member is an empty function that simply stops the recursion.
template <unsigned int N>
struct For<N, N>
{
    template <class PointerTuple, class ValTuple>
    static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int) {}

    template <class PointerTuple, class ValTuple>
    static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int) {}

    template <class PointerTuple, class ValTuple, class OpTuple>
    static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&) {}

    template <class ValTuple, class OpTuple>
    static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&) {}
};
// Single-value form: copies val into element tid of the buffer smem.
template <typename T>
__device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid)
{
    volatile T* const slot = smem + tid;
    *slot = val;
}

// Single-value form: copies element tid of the buffer smem into val.
template <typename T>
__device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid)
{
    volatile T* const slot = smem + tid;
    val = *slot;
}
// Tuple form: for every slot of the pointer tuple smem, stores the matching
// slot of val at index tid. The slot count is taken from the pointer tuple.
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
          typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
__device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                           const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                           unsigned int tid)
{
    typedef thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> pointer_tuple;
    typedef For<0, thrust::tuple_size<pointer_tuple>::value> Slots;

    Slots::loadToSmem(smem, val, tid);
}

// Tuple form: for every slot of the pointer tuple smem, reads index tid back
// into the matching slot of val. The slot count is taken from the pointer tuple.
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
          typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
__device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                             const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                             unsigned int tid)
{
    typedef thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> pointer_tuple;
    typedef For<0, thrust::tuple_size<pointer_tuple>::value> Slots;

    Slots::loadFromSmem(smem, val, tid);
}
// Single-value form: combines val with the buffer element at tid + delta and
// stores the result both in val and at index tid of the buffer.
template <typename T, class Op>
__device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op)
{
    T partner = smem[tid + delta];

    val = op(val, partner);
    smem[tid] = val;
}

// Single-value form: combines val with the value returned by
// shfl_down(val, delta, width); the result is kept in val only.
template <typename T, class Op>
__device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op)
{
    T partner = shfl_down(val, delta, width);

    val = op(val, partner);
}
// Tuple form of merge: applies the per-slot merge to every slot of the
// pointer tuple smem. The slot count is taken from the pointer tuple.
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
          typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
          class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
__device__ __forceinline__ void merge(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                      const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                      unsigned int tid,
                                      unsigned int delta,
                                      const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
{
    typedef thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> pointer_tuple;
    typedef For<0, thrust::tuple_size<pointer_tuple>::value> Slots;

    Slots::merge(smem, val, tid, delta, op);
}

// Tuple form of mergeShfl: applies the per-slot shuffle merge to every slot
// of val. The slot count is taken from the value tuple.
template <typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
          class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
__device__ __forceinline__ void mergeShfl(const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                          unsigned int delta,
                                          unsigned int width,
                                          const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
{
    typedef thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> value_tuple;
    typedef For<0, thrust::tuple_size<value_tuple>::value> Slots;

    Slots::mergeShfl(val, delta, width, op);
}
// Tree reduction through the buffer smem for the values held by the calling threads.
// N is a compile-time element count; it selects which halving stages are compiled in.
// NOTE(review): the stages only use power-of-two strides and the last stage always
// reads smem[tid + 16], so N is presumably expected to be a power of two >= 32 (or
// smem padded accordingly), although Dispatcher routes non-power-of-two N here - confirm.
template <unsigned int N> struct Generic
{
// smem - buffer (pointer or tuple of pointers) indexed by tid.
// val  - the calling thread's value(s); overwritten with each partial result.
// tid  - index of the calling thread into smem.
// op   - binary combining functor (or tuple of functors).
template <typename Pointer, typename Reference, class Op>
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
{
// Each thread first stores its own value at smem[tid].
loadToSmem(smem, val, tid);
if (N >= 32)
__syncthreads();
// Halving stages with strides 1024 .. 64; each one ends with a block barrier.
if (N >= 2048)
{
if (tid < 1024)
merge(smem, val, tid, 1024, op);
__syncthreads();
}
if (N >= 1024)
{
if (tid < 512)
merge(smem, val, tid, 512, op);
__syncthreads();
}
if (N >= 512)
{
if (tid < 256)
merge(smem, val, tid, 256, op);
__syncthreads();
}
if (N >= 256)
{
if (tid < 128)
merge(smem, val, tid, 128, op);
__syncthreads();
}
if (N >= 128)
{
if (tid < 64)
merge(smem, val, tid, 64, op);
__syncthreads();
}
// From stride 32 downwards no barrier is issued.
// NOTE(review): presumably relies on threads 0..31 running in lock-step within
// one warp and on smem being volatile - confirm for the targeted architectures.
if (N >= 64)
{
if (tid < 32)
merge(smem, val, tid, 32, op);
}
// Final strides 16 .. 1 run for every N, regardless of its value.
if (tid < 16)
{
merge(smem, val, tid, 16, op);
merge(smem, val, tid, 8, op);
merge(smem, val, tid, 4, op);
merge(smem, val, tid, 2, op);
merge(smem, val, tid, 1, op);
}
}
};
// Compile-time unrolled sequence of merge steps with strides I, I / 2, ..., 1.
// The Unroll<0, ...> specialisation terminates the sequence.
template <unsigned int I, typename Pointer, typename Reference, class Op>
struct Unroll
{
    // Shuffle-based sequence: N is forwarded as the width argument of every step.
    static __device__ void loopShfl(Reference val, Op op, unsigned int N)
    {
        typedef Unroll<I / 2, Pointer, Reference, Op> Next;

        mergeShfl(val, I, N, op);
        Next::loopShfl(val, op, N);
    }

    // Buffer-based sequence: every step merges smem[tid + stride] into val and smem[tid].
    static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op)
    {
        typedef Unroll<I / 2, Pointer, Reference, Op> Next;

        merge(smem, val, tid, I, op);
        Next::loop(smem, val, tid, op);
    }
};

// Stride 0: nothing left to merge.
template <typename Pointer, typename Reference, class Op>
struct Unroll<0, Pointer, Reference, Op>
{
    static __device__ void loopShfl(Reference, Op, unsigned int) {}

    static __device__ void loop(Pointer, Reference, unsigned int, Op) {}
};
// Reduction of N values without any block barrier (Dispatcher selects it for
// power-of-two N <= 32).
template <unsigned int N> struct WarpOptimized
{
    // When __CUDA_ARCH__ >= 300 the values are combined with shuffle steps only and
    // smem / tid are unused; otherwise the values go through the buffer smem and
    // threads with tid < N / 2 run the unrolled merge steps.
    template <typename Pointer, typename Reference, class Op>
    static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
    {
        typedef Unroll<N / 2, Pointer, Reference, Op> Steps;

    #if __CUDA_ARCH__ >= 300
        (void) smem;
        (void) tid;

        Steps::loopShfl(val, op, N);
    #else
        loadToSmem(smem, val, tid);

        if (tid < N / 2)
            Steps::loop(smem, val, tid, op);
    #endif
    }
};
// Two-level reduction: groups of 32 threads are reduced first, then the M = N / 32
// per-group partial results are reduced by the threads with tid < 32.
template <unsigned int N> struct GenericOptimized32
{
// Number of 32-thread groups, i.e. of partial results produced by the first level.
enum { M = N / 32 };
// smem - buffer (pointer or tuple of pointers) indexed by tid.
// val  - the calling thread's value(s); overwritten with partial results.
// tid  - index of the calling thread.
// op   - binary combining functor (or tuple of functors).
template <typename Pointer, typename Reference, class Op>
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
{
const unsigned int laneId = Warp::laneId();
#if __CUDA_ARCH__ >= 300
// First level through shuffles (strides 16 .. 1, width warpSize);
// lane 0 then stores the group's partial result at smem[tid / 32].
Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize);
if (laneId == 0)
loadToSmem(smem, val, tid / 32);
#else
// First level through the buffer: every thread stores at smem[tid], lanes 0..15
// run the merge steps with strides 16 .. 1 without a barrier in between.
loadToSmem(smem, val, tid);
if (laneId < 16)
Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);
// Barrier before the partial results are compacted to smem[tid / 32],
// which overwrites the front of the same buffer.
__syncthreads();
if (laneId == 0)
loadToSmem(smem, val, tid / 32);
#endif
__syncthreads();
// Every thread reloads val from smem[tid].
// NOTE(review): only smem[0 .. M-1] were written as partial results above; entries
// read by threads with tid >= M presumably do not influence thread 0 - confirm.
loadFromSmem(smem, val, tid);
if (tid < 32)
{
#if __CUDA_ARCH__ >= 300
Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
#else
Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
#endif
}
}
};
// Compile-time type selection: StaticIf<Cond, Then, Else>::type is Then when
// Cond is true and Else when it is false.
template <bool Cond, class Then, class Else> struct StaticIf;

template <class Then, class Else> struct StaticIf<true, Then, Else>
{
    typedef Then type;
};

template <class Then, class Else> struct StaticIf<false, Then, Else>
{
    typedef Else type;
};
// value is non-zero exactly when N is a power of two; N == 0 yields 0.
template <unsigned int N> struct IsPowerOf2
{
    // A power of two has a single bit set, so clearing its lowest set bit gives zero.
    enum { value = (N != 0) && ((N & (N - 1)) == 0) };
};
// Selects the reduction implementation for a compile-time element count N:
//   power of two, N <= 32    -> WarpOptimized<N>
//   power of two, N <= 1024  -> GenericOptimized32<N>
//   anything else            -> Generic<N>
template <unsigned int N> struct Dispatcher
{
private:
    // Choice used when the warp-level implementation does not apply.
    typedef typename StaticIf<
        (N <= 1024) && IsPowerOf2<N>::value,
        GenericOptimized32<N>,
        Generic<N>
    >::type block_reductor;

public:
    typedef typename StaticIf<
        (N <= 32) && IsPowerOf2<N>::value,
        WarpOptimized<N>,
        block_reductor
    >::type reductor;
};
}
}}}
#endif // __OPENCV_GPU_REDUCE_DETAIL_HPP__

@ -0,0 +1,498 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
#define __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__
#include <thrust/tuple.h>
#include "../warp.hpp"
#include "../warp_shuffle.hpp"
namespace cv { namespace gpu { namespace device
{
namespace reduce_key_val_detail
{
// Maps a pointer or reference type onto the element type it designates.
// Only the three forms specialised below are supported; any other argument
// leaves GetType incomplete and is rejected at compile time.
template <typename Source> struct GetType;

// Plain pointer: Elem* -> Elem.
template <typename Elem> struct GetType<Elem*>
{
    typedef Elem type;
};

// Pointer to volatile: volatile Elem* -> Elem (the qualifier is dropped).
template <typename Elem> struct GetType<volatile Elem*>
{
    typedef Elem type;
};

// Reference: Elem& -> Elem.
template <typename Elem> struct GetType<Elem&>
{
    typedef Elem type;
};
// Compile-time loop over tuple slots I, I + 1, ..., N - 1.
// Every member handles slot I and then recurses into For<I + 1, N>;
// the For<N, N> specialisation ends the recursion.
template <unsigned int I, unsigned int N>
struct For
{
    // Writes slot I of data to index tid of the buffer held in slot I of smem.
    template <class PointerTuple, class ReferenceTuple>
    static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
    {
        thrust::get<I>(smem)[tid] = thrust::get<I>(data);

        For<I + 1, N>::loadToSmem(smem, data, tid);
    }

    // Reads index tid of the buffer held in slot I of smem back into slot I of data.
    template <class PointerTuple, class ReferenceTuple>
    static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
    {
        thrust::get<I>(data) = thrust::get<I>(smem)[tid];

        For<I + 1, N>::loadFromSmem(smem, data, tid);
    }

    // Replaces slot I of val with the value returned by shfl_down(..., delta, width).
    template <class ReferenceTuple>
    static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width)
    {
        thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);

        For<I + 1, N>::copyShfl(val, delta, width);
    }

    // Copies the buffer element at tid + delta into slot I of val and to index tid.
    template <class PointerTuple, class ReferenceTuple>
    static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta)
    {
        thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];

        For<I + 1, N>::copy(svals, val, tid, delta);
    }

    // Shuffle-based key/value merge for slot I: when cmp(other key, own key) holds,
    // both the key and the value of the other lane are adopted.
    template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
    static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val, const CmpTuple& cmp, unsigned int delta, int width)
    {
        typedef typename GetType<typename thrust::tuple_element<I, KeyReferenceTuple>::type>::type key_type;
        typedef typename GetType<typename thrust::tuple_element<I, ValReferenceTuple>::type>::type val_type;

        // Both shuffles are issued before the comparison so that every calling lane
        // takes part in them. Shuffling the value inside the branch would read from
        // lanes that skipped the branch, and a shuffle from a non-participating
        // lane returns an undefined value.
        key_type reg = shfl_down(thrust::get<I>(key), delta, width);
        val_type regVal = shfl_down(thrust::get<I>(val), delta, width);

        if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
        {
            thrust::get<I>(key) = reg;
            thrust::get<I>(val) = regVal;
        }

        For<I + 1, N>::mergeShfl(key, val, cmp, delta, width);
    }

    // Buffer-based key/value merge for slot I: when cmp(key at tid + delta, own key)
    // holds, the key and value at tid + delta are adopted and written back at tid.
    template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
    static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key,
                                 const ValPointerTuple& svals, const ValReferenceTuple& val,
                                 const CmpTuple& cmp,
                                 unsigned int tid, unsigned int delta)
    {
        typename GetType<typename thrust::tuple_element<I, KeyPointerTuple>::type>::type reg = thrust::get<I>(skeys)[tid + delta];

        if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
        {
            thrust::get<I>(skeys)[tid] = thrust::get<I>(key) = reg;
            thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
        }

        For<I + 1, N>::merge(skeys, key, svals, val, cmp, tid, delta);
    }
};
// End of the compile-time loop: all N slots have been visited, so every
// member is an empty function that simply stops the recursion.
template <unsigned int N>
struct For<N, N>
{
    template <class PointerTuple, class ReferenceTuple>
    static __device__ void loadToSmem(const PointerTuple&, const ReferenceTuple&, unsigned int) {}

    template <class PointerTuple, class ReferenceTuple>
    static __device__ void loadFromSmem(const PointerTuple&, const ReferenceTuple&, unsigned int) {}

    template <class ReferenceTuple>
    static __device__ void copyShfl(const ReferenceTuple&, unsigned int, int) {}

    template <class PointerTuple, class ReferenceTuple>
    static __device__ void copy(const PointerTuple&, const ReferenceTuple&, unsigned int, unsigned int) {}

    template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
    static __device__ void mergeShfl(const KeyReferenceTuple&, const ValReferenceTuple&, const CmpTuple&, unsigned int, int) {}

    template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
    static __device__ void merge(const KeyPointerTuple&, const KeyReferenceTuple&,
                                 const ValPointerTuple&, const ValReferenceTuple&,
                                 const CmpTuple&,
                                 unsigned int, unsigned int) {}
};
//////////////////////////////////////////////////////
// loadToSmem
// Single-value form: copies data into element tid of the buffer smem.
template <typename T>
__device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid)
{
    volatile T* const slot = smem + tid;
    *slot = data;
}

// Single-value form: copies element tid of the buffer smem into data.
template <typename T>
__device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid)
{
    volatile T* const slot = smem + tid;
    data = *slot;
}
// Tuple form: for every slot of the pointer tuple smem, stores the matching
// slot of data at index tid. The slot count is taken from the pointer tuple.
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
__device__ __forceinline__ void loadToSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
                                           const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
                                           unsigned int tid)
{
    typedef thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> pointer_tuple;
    typedef For<0, thrust::tuple_size<pointer_tuple>::value> Slots;

    Slots::loadToSmem(smem, data, tid);
}

// Tuple form: for every slot of the pointer tuple smem, reads index tid back
// into the matching slot of data. The slot count is taken from the pointer tuple.
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
__device__ __forceinline__ void loadFromSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
                                             const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
                                             unsigned int tid)
{
    typedef thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> pointer_tuple;
    typedef For<0, thrust::tuple_size<pointer_tuple>::value> Slots;

    Slots::loadFromSmem(smem, data, tid);
}
//////////////////////////////////////////////////////
// copyVals
// Single-value form: replaces val with the value returned by shfl_down(val, delta, width).
template <typename V>
__device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width)
{
    V incoming = shfl_down(val, delta, width);
    val = incoming;
}

// Single-value form: copies the buffer element at tid + delta into val and to index tid.
template <typename V>
__device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta)
{
    val = svals[tid + delta];
    svals[tid] = val;
}
// Tuple form of copyValsShfl: applies the per-slot shuffle copy to every slot of val.
// The slot count is taken from the value tuple.
template <typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
__device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                             unsigned int delta,
                                             int width)
{
    typedef thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9> value_tuple;
    typedef For<0, thrust::tuple_size<value_tuple>::value> Slots;

    Slots::copyShfl(val, delta, width);
}

// Tuple form of copyVals: applies the per-slot buffer copy to every slot of svals.
// The slot count is taken from the pointer tuple.
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
__device__ __forceinline__ void copyVals(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                         const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                         unsigned int tid, unsigned int delta)
{
    typedef thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> pointer_tuple;
    typedef For<0, thrust::tuple_size<pointer_tuple>::value> Slots;

    Slots::copy(svals, val, tid, delta);
}
//////////////////////////////////////////////////////
// merge
// Shuffle-based key/value merge for a single key and a single value.
// key, val - the calling lane's pair; replaced by the other lane's pair when
//            cmp(other key, own key) holds.
// delta    - lane offset passed to shfl_down.
// width    - width argument passed to shfl_down.
template <typename K, typename V, class Cmp>
__device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width)
{
    // Fetch key and value from the other lane before branching. A shuffle issued
    // inside the conditional would read from lanes that did not enter it, and a
    // shuffle from a non-participating lane returns an undefined value.
    K reg = shfl_down(key, delta, width);
    V regVal = shfl_down(val, delta, width);

    if (cmp(reg, key))
    {
        key = reg;
        val = regVal;
    }
}
// Buffer-based key/value merge for a single key and a single value: when
// cmp(key at tid + delta, own key) holds, that key and its value are adopted
// and written back at index tid; otherwise nothing changes.
template <typename K, typename V, class Cmp>
__device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, unsigned int tid, unsigned int delta)
{
    K candidate = skeys[tid + delta];

    if (!cmp(candidate, key))
        return;

    key = candidate;
    skeys[tid] = key;

    copyVals(svals, val, tid, delta);
}
// Shuffle-based key/value merge for a single key and a tuple of values.
// The key of the lane selected by shfl_down(key, delta, width) is compared with the
// own key; when cmp(other key, own key) holds, the key is adopted and every slot of
// val is replaced through copyValsShfl.
// NOTE(review): copyValsShfl shuffles inside the conditional, so lanes that do not
// take the branch do not execute those shuffles. CUDA documents the value read from
// a non-participating lane as undefined - presumably the reason the callers keep the
// shuffle path disabled with "#if 0". Fetch the values before branching if this path
// is ever enabled.
template <typename K,
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
class Cmp>
__device__ __forceinline__ void mergeShfl(K& key,
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
const Cmp& cmp,
unsigned int delta, int width)
{
K reg = shfl_down(key, delta, width);
if (cmp(reg, key))
{
key = reg;
copyValsShfl(val, delta, width);
}
}
// Buffer-based key/value merge for a single key and a tuple of values: when
// cmp(key at tid + delta, own key) holds, that key and all of its values are
// adopted and written back at index tid; otherwise nothing changes.
template <typename K,
          typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
          class Cmp>
__device__ __forceinline__ void merge(volatile K* skeys, K& key,
                                      const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                      const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                      const Cmp& cmp, unsigned int tid, unsigned int delta)
{
    K candidate = skeys[tid + delta];

    if (!cmp(candidate, key))
        return;

    key = candidate;
    skeys[tid] = key;

    copyVals(svals, val, tid, delta);
}
// Tuple/tuple form of mergeShfl: slot i of key is merged together with slot i of val
// using slot i of cmp. The slot count is taken from the key tuple.
template <typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
          class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
__device__ __forceinline__ void mergeShfl(const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
                                          const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                          const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
                                          unsigned int delta, int width)
{
    typedef thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9> key_tuple;
    typedef For<0, thrust::tuple_size<key_tuple>::value> Slots;

    Slots::mergeShfl(key, val, cmp, delta, width);
}

// Tuple/tuple form of merge: slot i of skeys/key is merged together with slot i of
// svals/val using slot i of cmp. The slot count is taken from the value-pointer tuple.
template <typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
          typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
          typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
          class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
__device__ __forceinline__ void merge(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
                                      const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
                                      const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                      const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                      const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
                                      unsigned int tid, unsigned int delta)
{
    typedef thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> value_pointer_tuple;
    typedef For<0, thrust::tuple_size<value_pointer_tuple>::value> Slots;

    Slots::merge(skeys, key, svals, val, cmp, tid, delta);
}
//////////////////////////////////////////////////////
// Generic
// Key/value tree reduction through the buffers skeys / svals.
// N is a compile-time element count; it selects which halving stages are compiled in.
// NOTE(review): the stages only use power-of-two strides and the last stage always
// reads index tid + 16, so N is presumably expected to be a power of two >= 32 (or
// the buffers padded accordingly), although Dispatcher routes non-power-of-two N
// here - confirm.
template <unsigned int N> struct Generic
{
    // skeys, svals - key / value buffers (pointer or tuple of pointers) indexed by tid.
    // key, val     - the calling thread's pair; overwritten with each partial result.
    // tid          - index of the calling thread into the buffers.
    // cmp          - predicate (or tuple of predicates); when cmp(other key, own key)
    //                holds, the other pair replaces the own one.
    template <class KP, class KR, class VP, class VR, class Cmp>
    static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
    {
        // Each thread first stores its own pair at index tid. Values go through
        // loadToSmem like the keys do: this header declares no loadValsToSmem, so the
        // previous call could not be resolved when the template was instantiated.
        loadToSmem(skeys, key, tid);
        loadToSmem(svals, val, tid);
        if (N >= 32)
            __syncthreads();

        // Halving stages with strides 1024 .. 64; each one ends with a block barrier.
        if (N >= 2048)
        {
            if (tid < 1024)
                merge(skeys, key, svals, val, cmp, tid, 1024);

            __syncthreads();
        }
        if (N >= 1024)
        {
            if (tid < 512)
                merge(skeys, key, svals, val, cmp, tid, 512);

            __syncthreads();
        }
        if (N >= 512)
        {
            if (tid < 256)
                merge(skeys, key, svals, val, cmp, tid, 256);

            __syncthreads();
        }
        if (N >= 256)
        {
            if (tid < 128)
                merge(skeys, key, svals, val, cmp, tid, 128);

            __syncthreads();
        }
        if (N >= 128)
        {
            if (tid < 64)
                merge(skeys, key, svals, val, cmp, tid, 64);

            __syncthreads();
        }

        // From stride 32 downwards no barrier is issued.
        // NOTE(review): presumably relies on threads 0..31 running in lock-step within
        // one warp and on the buffers being volatile - confirm.
        if (N >= 64)
        {
            if (tid < 32)
                merge(skeys, key, svals, val, cmp, tid, 32);
        }

        // Final strides 16 .. 1 run for every N, regardless of its value.
        if (tid < 16)
        {
            merge(skeys, key, svals, val, cmp, tid, 16);
            merge(skeys, key, svals, val, cmp, tid, 8);
            merge(skeys, key, svals, val, cmp, tid, 4);
            merge(skeys, key, svals, val, cmp, tid, 2);
            merge(skeys, key, svals, val, cmp, tid, 1);
        }
    }
};
// Compile-time unrolled sequence of key/value merge steps with strides I, I / 2, ..., 1.
// The Unroll<0, ...> specialisation terminates the sequence.
template <unsigned int I, class KP, class KR, class VP, class VR, class Cmp>
struct Unroll
{
    // Shuffle-based sequence: N is forwarded as the width argument of every step.
    static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N)
    {
        typedef Unroll<I / 2, KP, KR, VP, VR, Cmp> Next;

        mergeShfl(key, val, cmp, I, N);
        Next::loopShfl(key, val, cmp, N);
    }

    // Buffer-based sequence: every step compares with the pair stored at tid + stride.
    static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
    {
        typedef Unroll<I / 2, KP, KR, VP, VR, Cmp> Next;

        merge(skeys, key, svals, val, cmp, tid, I);
        Next::loop(skeys, key, svals, val, tid, cmp);
    }
};

// Stride 0: nothing left to merge.
template <class KP, class KR, class VP, class VR, class Cmp>
struct Unroll<0, KP, KR, VP, VR, Cmp>
{
    static __device__ void loopShfl(KR, VR, Cmp, unsigned int) {}

    static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp) {}
};
// Key/value reduction of N pairs without any block barrier (Dispatcher selects it
// for power-of-two N <= 32).
template <unsigned int N> struct WarpOptimized
{
    // The shuffle variant is compiled out ("#if 0"); the active path stores the pair
    // at index tid of skeys / svals and lets threads with tid < N / 2 run the
    // unrolled merge steps through those buffers.
    template <class KP, class KR, class VP, class VR, class Cmp>
    static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
    {
        typedef Unroll<N / 2, KP, KR, VP, VR, Cmp> Steps;

    #if 0 // __CUDA_ARCH__ >= 300
        (void) skeys;
        (void) svals;
        (void) tid;

        Steps::loopShfl(key, val, cmp, N);
    #else
        loadToSmem(skeys, key, tid);
        loadToSmem(svals, val, tid);

        if (tid < N / 2)
            Steps::loop(skeys, key, svals, val, tid, cmp);
    #endif
    }
};
// Two-level key/value reduction: groups of 32 threads are reduced first, then the
// M = N / 32 per-group partial results are reduced by the threads with tid < 32.
template <unsigned int N> struct GenericOptimized32
{
// Number of 32-thread groups, i.e. of partial results produced by the first level.
enum { M = N / 32 };
// skeys, svals - key / value buffers (pointer or tuple of pointers) indexed by tid.
// key, val     - the calling thread's pair; overwritten with partial results.
// tid          - index of the calling thread.
// cmp          - predicate (or tuple of predicates) deciding whether the other pair wins.
template <class KP, class KR, class VP, class VR, class Cmp>
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
{
const unsigned int laneId = Warp::laneId();
// The shuffle variant below is compiled out by the literal 0.
#if 0 // __CUDA_ARCH__ >= 300
Unroll<16, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, warpSize);
if (laneId == 0)
{
loadToSmem(skeys, key, tid / 32);
loadToSmem(svals, val, tid / 32);
}
#else
// First level through the buffers: every thread stores its pair at index tid,
// lanes 0..15 run the merge steps with strides 16 .. 1 without a barrier in between.
loadToSmem(skeys, key, tid);
loadToSmem(svals, val, tid);
if (laneId < 16)
Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
// Barrier before the partial results are compacted to index tid / 32,
// which overwrites the front of the same buffers.
__syncthreads();
if (laneId == 0)
{
loadToSmem(skeys, key, tid / 32);
loadToSmem(svals, val, tid / 32);
}
#endif
__syncthreads();
// Every thread reloads its key from skeys[tid]; val is not reloaded on the active path.
// NOTE(review): only indices 0 .. M-1 were written as partial results above; entries
// read by threads with tid >= M presumably do not influence thread 0 - confirm.
loadFromSmem(skeys, key, tid);
if (tid < 32)
{
#if 0 // __CUDA_ARCH__ >= 300
loadFromSmem(svals, val, tid);
Unroll<M / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, M);
#else
Unroll<M / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
#endif
}
}
};
// Compile-time type selection: StaticIf<Cond, Then, Else>::type is Then when
// Cond is true and Else when it is false.
template <bool Cond, class Then, class Else> struct StaticIf;

template <class Then, class Else> struct StaticIf<true, Then, Else>
{
    typedef Then type;
};

template <class Then, class Else> struct StaticIf<false, Then, Else>
{
    typedef Else type;
};
// value is non-zero exactly when N is a power of two; N == 0 yields 0.
template <unsigned int N> struct IsPowerOf2
{
    // A power of two has a single bit set, so clearing its lowest set bit gives zero.
    enum { value = (N != 0) && ((N & (N - 1)) == 0) };
};
// Selects the key/value reduction implementation for a compile-time element count N:
//   power of two, N <= 32    -> WarpOptimized<N>
//   power of two, N <= 1024  -> GenericOptimized32<N>
//   anything else            -> Generic<N>
template <unsigned int N> struct Dispatcher
{
private:
    // Choice used when the warp-level implementation does not apply.
    typedef typename StaticIf<
        (N <= 1024) && IsPowerOf2<N>::value,
        GenericOptimized32<N>,
        Generic<N>
    >::type block_reductor;

public:
    typedef typename StaticIf<
        (N <= 32) && IsPowerOf2<N>::value,
        WarpOptimized<N>,
        block_reductor
    >::type reductor;
};
}
}}}
#endif // __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__

@ -1,841 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_REDUCTION_DETAIL_HPP__
#define __OPENCV_GPU_REDUCTION_DETAIL_HPP__
namespace cv { namespace gpu { namespace device
{
namespace utility_detail
{
///////////////////////////////////////////////////////////////////////////////
// Reductor
// Reduction of n values through the buffer data for arbitrary n (explicit
// specialisations exist for some power-of-two sizes).
// NOTE(review): for n > 64 only indices below 64 reach the final steps and the
// "tid < n - 32" step runs without a barrier, so this looks designed for n <= 64 - confirm.
template <int n> struct WarpReductor
{
// data              - buffer indexed by tid; must be readable up to the largest
//                     tid + stride used by the selected branch.
// partial_reduction - the calling thread's value; overwritten with each partial result.
// tid               - index of the calling thread.
// op                - binary combining functor.
template <typename T, typename Op> static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
{
// Only the first n threads publish their value.
if (tid < n)
data[tid] = partial_reduction;
// A block barrier is issued only when n exceeds 32.
if (n > 32) __syncthreads();
// Exactly one branch is taken, chosen by n at compile time. Each first folds the
// "tail" elements (those above the largest power of two below n) onto the front and
// then runs the remaining power-of-two steps; no barrier separates the steps.
if (n > 32)
{
if (tid < n - 32)
data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);
if (tid < 16)
{
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
}
}
else if (n > 16)
{
if (tid < n - 16)
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
if (tid < 8)
{
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
}
}
else if (n > 8)
{
if (tid < n - 8)
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);
if (tid < 4)
{
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
}
}
else if (n > 4)
{
if (tid < n - 4)
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
if (tid < 2)
{
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
}
}
else if (n > 2)
{
if (tid < n - 2)
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
if (tid < 2)
{
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
}
}
// For n <= 2 no merge step is performed after the initial store.
}
};
// Reduction over 64 elements: one block barrier after the store, then the
// first 32 threads fold offsets 32, 16, 8, 4, 2 and 1 without further
// barriers.
template <> struct WarpReductor<64>
{
// Every calling thread stores its value to data[tid] (there is no tid
// guard), so data must have a slot for each caller. Thread 0 ends with the
// combined value in partial_reduction and data[0].
// NOTE(review): the unrolled steps are not separated by barriers and
// presumably rely on lockstep execution of the 32 active threads - confirm.
template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
{
data[tid] = partial_reduction;
// Make all 64 stored values visible before the upper half is read.
__syncthreads();
if (tid < 32)
{
data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
}
}
};
// Reduction over 32 elements: every caller stores its value, then the first
// 16 threads fold offsets 16, 8, 4, 2 and 1.
template <> struct WarpReductor<32>
{
// No barrier is issued at all in this specialization.
// NOTE(review): this presumably relies on all 32 participants running in
// lockstep within one warp - confirm for the target architecture.
// Thread 0 ends with the combined value in partial_reduction and data[0].
template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
{
data[tid] = partial_reduction;
if (tid < 16)
{
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
}
}
};
// Reduction over 16 elements: every caller stores its value, then the first
// 8 threads fold offsets 8, 4, 2 and 1.
template <> struct WarpReductor<16>
{
// No barrier is issued.
// NOTE(review): presumably relies on lockstep execution of the
// participating threads - confirm for the target architecture.
// Thread 0 ends with the combined value in partial_reduction and data[0].
template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
{
data[tid] = partial_reduction;
if (tid < 8)
{
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
}
}
};
// Reduction over 8 elements: every caller stores its value, then the first
// 4 threads fold offsets 4, 2 and 1.
template <> struct WarpReductor<8>
{
// No barrier is issued.
// NOTE(review): presumably relies on lockstep execution of the
// participating threads - confirm for the target architecture.
// Thread 0 ends with the combined value in partial_reduction and data[0].
template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
{
data[tid] = partial_reduction;
if (tid < 4)
{
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
}
}
};
// Selects the reduction strategy at compile time; only the two explicit
// specializations (true / false) are defined.
template <bool warp> struct ReductionDispatcher;

// warp == true: hand the whole reduction over to WarpReductor<n>.
template <> struct ReductionDispatcher<true>
{
    // Forwards all arguments unchanged to WarpReductor<n>::reduce.
    template <int n, typename T, typename Op>
    static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
    {
        typedef WarpReductor<n> reductor_type;
        reductor_type::reduce(data, partial_reduction, tid, op);
    }
};
// warp == false: block-level reduction. Barrier-separated halving steps
// bring the data down to 64 elements, then the first 32 threads finish
// without barriers.
template <> struct ReductionDispatcher<false>
{
// Combines the per-thread values with op, using data as scratch storage.
// Every barrier sits outside the tid tests, so all threads of the block
// reach it. The final stage reads data[tid + 32] unconditionally, so data
// must hold at least 64 elements. Thread 0 ends with the combined value in
// partial_reduction and data[0].
// NOTE(review): the first step is taken only for n == 512 exactly, while
// the following ones use n >= ...; larger n is not handled here - confirm
// the supported sizes against the callers.
// NOTE(review): the last six steps have no barriers between them and
// presumably rely on lockstep execution of the first 32 threads - confirm.
template <int n, typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
{
// Publish this thread's value, then wait until every value is stored.
if (tid < n)
data[tid] = partial_reduction;
__syncthreads();
if (n == 512) { if (tid < 256) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 256]); } __syncthreads(); }
if (n >= 256) { if (tid < 128) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 128]); } __syncthreads(); }
if (n >= 128) { if (tid < 64) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 64]); } __syncthreads(); }
// Final stage: 64 -> 1 elements, executed by the first 32 threads only.
if (tid < 32)
{
data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
}
}
};
///////////////////////////////////////////////////////////////////////////////
// PredValWarpReductor
// Key/value reduction: the key selected by pred is kept together with the
// value stored at the same index. Only the explicit specializations of this
// template are defined.
template <int n> struct PredValWarpReductor;
// 64 elements: the first 32 threads compare against offsets 32, 16, 8, 4, 2
// and 1.
template <> struct PredValWarpReductor<64>
{
// myData / myVal - out: overwritten from sdata[tid] / sval[tid], then
//                  updated on every step where the candidate wins
// sdata / sval   - key and value arrays indexed by tid; they are read here
//                  before being written, so the caller must have filled them
// pred           - pred(candidate, current) == true makes the candidate key
//                  and its value replace the current pair
// Threads with tid >= 32 leave myData and myVal untouched. Thread 0 ends
// with the selected pair in myData / myVal and in sdata[0] / sval[0].
// NOTE(review): no barriers separate the steps and sval is not volatile;
// this presumably relies on lockstep execution of the 32 active threads -
// confirm for the target architecture.
template <typename T, typename V, typename Pred>
static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
{
if (tid < 32)
{
myData = sdata[tid];
myVal = sval[tid];
// reg holds the candidate key fetched from the partner slot.
T reg = sdata[tid + 32];
if (pred(reg, myData))
{
sdata[tid] = myData = reg;
sval[tid] = myVal = sval[tid + 32];
}
reg = sdata[tid + 16];
if (pred(reg, myData))
{
sdata[tid] = myData = reg;
sval[tid] = myVal = sval[tid + 16];
}
reg = sdata[tid + 8];
if (pred(reg, myData))
{
sdata[tid] = myData = reg;
sval[tid] = myVal = sval[tid + 8];
}
reg = sdata[tid + 4];
if (pred(reg, myData))
{
sdata[tid] = myData = reg;
sval[tid] = myVal = sval[tid + 4];
}
reg = sdata[tid + 2];
if (pred(reg, myData))
{
sdata[tid] = myData = reg;
sval[tid] = myVal = sval[tid + 2];
}
reg = sdata[tid + 1];
if (pred(reg, myData))
{
sdata[tid] = myData = reg;
sval[tid] = myVal = sval[tid + 1];
}
}
}
};
// Key/value reduction over 32 elements: the first 16 threads compare
// against offsets 16, 8, 4, 2 and 1.
template <> struct PredValWarpReductor<32>
{
// sdata / sval are read before being written, so the caller must have
// filled them. pred(candidate, current) == true makes the candidate key and
// its value replace the current pair. Threads with tid >= 16 leave myData
// and myVal untouched; thread 0 ends with the selected pair.
// NOTE(review): no barriers separate the steps and sval is not volatile;
// this presumably relies on lockstep execution of the active threads -
// confirm for the target architecture.
template <typename T, typename V, typename Pred>
static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
{
if (tid < 16)
{
myData = sdata[tid];
myVal = sval[tid];
// reg holds the candidate key fetched from the partner slot.
T reg = sdata[tid + 16];
if (pred(reg, myData))
{
sdata[tid] = myData = reg;
sval[tid] = myVal = sval[tid + 16];
}
reg = sdata[tid + 8];
if (pred(reg, myData))
{
sdata[tid] = myData = reg;
sval[tid] = myVal = sval[tid + 8];
}
reg = sdata[tid + 4];
if (pred(reg, myData))
{
sdata[tid] = myData = reg;
sval[tid] = myVal = sval[tid + 4];
}
reg = sdata[tid + 2];
if (pred(reg, myData))
{
sdata[tid] = myData = reg;
sval[tid] = myVal = sval[tid + 2];
}
reg = sdata[tid + 1];
if (pred(reg, myData))
{
sdata[tid] = myData = reg;
sval[tid] = myVal = sval[tid + 1];
}
}
}
};
// Key/value reduction over 16 elements: the first 8 threads compare against
// offsets 8, 4, 2 and 1.
template <> struct PredValWarpReductor<16>
{
    // myData / myVal - out: overwritten from sdata[tid] / sval[tid], then
    //                  updated on every step where the candidate wins
    // sdata / sval   - key and value arrays indexed by tid; they are read
    //                  before being written, so the caller must fill them
    // pred           - pred(candidate, current) == true makes the candidate
    //                  key and its value replace the current pair
    // Threads with tid >= 8 leave myData and myVal untouched; thread 0 ends
    // with the selected pair.
    // NOTE(review): no barriers separate the steps and sval is not volatile;
    // this presumably relies on lockstep execution of the active threads -
    // confirm for the target architecture.
    template <typename T, typename V, typename Pred>
    static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
    {
        if (tid < 8)
        {
            myData = sdata[tid];
            myVal = sval[tid];

            // Candidate key from the partner slot. It is initialized directly
            // from shared memory; reg must not appear in its own initializer.
            T reg = sdata[tid + 8];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 8];
            }

            reg = sdata[tid + 4];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 4];
            }

            reg = sdata[tid + 2];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 2];
            }

            reg = sdata[tid + 1];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 1];
            }
        }
    }
};
// Key/value reduction over 8 elements: the first 4 threads compare against
// offsets 4, 2 and 1.
template <> struct PredValWarpReductor<8>
{
    // myData / myVal - out: overwritten from sdata[tid] / sval[tid], then
    //                  updated on every step where the candidate wins
    // sdata / sval   - key and value arrays indexed by tid; they are read
    //                  before being written, so the caller must fill them
    // pred           - pred(candidate, current) == true makes the candidate
    //                  key and its value replace the current pair
    // Threads with tid >= 4 leave myData and myVal untouched; thread 0 ends
    // with the selected pair.
    // NOTE(review): no barriers separate the steps and sval is not volatile;
    // this presumably relies on lockstep execution of the active threads -
    // confirm for the target architecture.
    template <typename T, typename V, typename Pred>
    static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
    {
        if (tid < 4)
        {
            myData = sdata[tid];
            myVal = sval[tid];

            // Candidate key from the partner slot. It is initialized directly
            // from shared memory; reg must not appear in its own initializer.
            T reg = sdata[tid + 4];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 4];
            }

            reg = sdata[tid + 2];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 2];
            }

            reg = sdata[tid + 1];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval[tid] = myVal = sval[tid + 1];
            }
        }
    }
};
// Selects the key/value reduction strategy at compile time; only the two
// explicit specializations (true / false) are defined.
template <bool warp> struct PredValReductionDispatcher;

// warp == true: hand the whole reduction over to PredValWarpReductor<n>.
template <> struct PredValReductionDispatcher<true>
{
    // Forwards all arguments unchanged to PredValWarpReductor<n>::reduce.
    template <int n, typename T, typename V, typename Pred>
    static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
    {
        typedef PredValWarpReductor<n> reductor_type;
        reductor_type::reduce(myData, myVal, sdata, sval, tid, pred);
    }
};
// warp == false: block-level key/value reduction. Barrier-separated halving
// steps run while more than 64 elements remain, then the first 32 threads
// finish without barriers.
template <> struct PredValReductionDispatcher<false>
{
    // myData / myVal - out: overwritten from sdata[tid] / sval[tid], then
    //                  updated on every step where the candidate wins
    // sdata / sval   - key and value arrays indexed by tid; they are read
    //                  before being written, so the caller must fill them
    // pred           - pred(candidate, current) == true makes the candidate
    //                  key and its value replace the current pair
    // Thread 0 ends with the selected pair in myData / myVal and in
    // sdata[0] / sval[0].
    //
    // Every __syncthreads() is placed outside the tid test: a barrier must
    // be reached by all threads of the block, not only by the lower half
    // that does the comparison. The call must therefore be made by every
    // thread of the block.
    // NOTE(review): the last stage has no barriers and sval is not
    // volatile; it presumably relies on lockstep execution of the first 32
    // threads - confirm for the target architecture.
    template <int n, typename T, typename V, typename Pred>
    static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
    {
        myData = sdata[tid];
        myVal = sval[tid];

        if (n >= 512)
        {
            if (tid < 256)
            {
                T reg = sdata[tid + 256];
                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval[tid] = myVal = sval[tid + 256];
                }
            }
            __syncthreads();
        }

        if (n >= 256)
        {
            if (tid < 128)
            {
                T reg = sdata[tid + 128];
                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval[tid] = myVal = sval[tid + 128];
                }
            }
            __syncthreads();
        }

        if (n >= 128)
        {
            if (tid < 64)
            {
                T reg = sdata[tid + 64];
                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval[tid] = myVal = sval[tid + 64];
                }
            }
            __syncthreads();
        }

        // Final stage: executed by the first 32 threads only; each step is
        // enabled by a compile-time test on n.
        if (tid < 32)
        {
            if (n >= 64)
            {
                T reg = sdata[tid + 32];
                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval[tid] = myVal = sval[tid + 32];
                }
            }
            if (n >= 32)
            {
                T reg = sdata[tid + 16];
                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval[tid] = myVal = sval[tid + 16];
                }
            }
            if (n >= 16)
            {
                T reg = sdata[tid + 8];
                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval[tid] = myVal = sval[tid + 8];
                }
            }
            if (n >= 8)
            {
                T reg = sdata[tid + 4];
                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval[tid] = myVal = sval[tid + 4];
                }
            }
            if (n >= 4)
            {
                T reg = sdata[tid + 2];
                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval[tid] = myVal = sval[tid + 2];
                }
            }
            if (n >= 2)
            {
                T reg = sdata[tid + 1];
                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval[tid] = myVal = sval[tid + 1];
                }
            }
        }
    }
};
///////////////////////////////////////////////////////////////////////////////
// PredVal2WarpReductor
// Key/two-value reduction: the key selected by pred is kept together with
// the two values stored at the same index. Only the explicit
// specializations of this template are defined.
template <int n> struct PredVal2WarpReductor;
// 64 elements: the first 32 threads compare against offsets 32, 16, 8, 4, 2
// and 1.
template <> struct PredVal2WarpReductor<64>
{
// myData / myVal1 / myVal2 - out: overwritten from sdata[tid], sval1[tid]
//                            and sval2[tid], then updated on every step
//                            where the candidate wins
// sdata / sval1 / sval2    - key and value arrays indexed by tid; they are
//                            read before being written, so the caller must
//                            have filled them
// pred                     - pred(candidate, current) == true makes the
//                            candidate key and both of its values replace
//                            the current triple
// Threads with tid >= 32 leave the outputs untouched; thread 0 ends with the
// selected triple.
// NOTE(review): no barriers separate the steps and sval1 / sval2 are not
// volatile; this presumably relies on lockstep execution of the 32 active
// threads - confirm for the target architecture.
template <typename T, typename V1, typename V2, typename Pred>
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
{
if (tid < 32)
{
myData = sdata[tid];
myVal1 = sval1[tid];
myVal2 = sval2[tid];
// reg holds the candidate key fetched from the partner slot.
T reg = sdata[tid + 32];
if (pred(reg, myData))
{
sdata[tid] = myData = reg;
sval1[tid] = myVal1 = sval1[tid + 32];
sval2[tid] = myVal2 = sval2[tid + 32];
}
reg = sdata[tid + 16];
if (pred(reg, myData))
{
sdata[tid] = myData = reg;
sval1[tid] = myVal1 = sval1[tid + 16];
sval2[tid] = myVal2 = sval2[tid + 16];
}
reg = sdata[tid + 8];
if (pred(reg, myData))
{
sdata[tid] = myData = reg;
sval1[tid] = myVal1 = sval1[tid + 8];
sval2[tid] = myVal2 = sval2[tid + 8];
}
reg = sdata[tid + 4];
if (pred(reg, myData))
{
sdata[tid] = myData = reg;
sval1[tid] = myVal1 = sval1[tid + 4];
sval2[tid] = myVal2 = sval2[tid + 4];
}
reg = sdata[tid + 2];
if (pred(reg, myData))
{
sdata[tid] = myData = reg;
sval1[tid] = myVal1 = sval1[tid + 2];
sval2[tid] = myVal2 = sval2[tid + 2];
}
reg = sdata[tid + 1];
if (pred(reg, myData))
{
sdata[tid] = myData = reg;
sval1[tid] = myVal1 = sval1[tid + 1];
sval2[tid] = myVal2 = sval2[tid + 1];
}
}
}
};
// Key/two-value reduction over 32 elements: the first 16 threads compare
// against offsets 16, 8, 4, 2 and 1.
template <> struct PredVal2WarpReductor<32>
{
// sdata / sval1 / sval2 are read before being written, so the caller must
// have filled them. pred(candidate, current) == true makes the candidate
// key and both of its values replace the current triple. Threads with
// tid >= 16 leave the outputs untouched; thread 0 ends with the selected
// triple.
// NOTE(review): no barriers separate the steps and sval1 / sval2 are not
// volatile; this presumably relies on lockstep execution of the active
// threads - confirm for the target architecture.
template <typename T, typename V1, typename V2, typename Pred>
static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
{
if (tid < 16)
{
myData = sdata[tid];
myVal1 = sval1[tid];
myVal2 = sval2[tid];
// reg holds the candidate key fetched from the partner slot.
T reg = sdata[tid + 16];
if (pred(reg, myData))
{
sdata[tid] = myData = reg;
sval1[tid] = myVal1 = sval1[tid + 16];
sval2[tid] = myVal2 = sval2[tid + 16];
}
reg = sdata[tid + 8];
if (pred(reg, myData))
{
sdata[tid] = myData = reg;
sval1[tid] = myVal1 = sval1[tid + 8];
sval2[tid] = myVal2 = sval2[tid + 8];
}
reg = sdata[tid + 4];
if (pred(reg, myData))
{
sdata[tid] = myData = reg;
sval1[tid] = myVal1 = sval1[tid + 4];
sval2[tid] = myVal2 = sval2[tid + 4];
}
reg = sdata[tid + 2];
if (pred(reg, myData))
{
sdata[tid] = myData = reg;
sval1[tid] = myVal1 = sval1[tid + 2];
sval2[tid] = myVal2 = sval2[tid + 2];
}
reg = sdata[tid + 1];
if (pred(reg, myData))
{
sdata[tid] = myData = reg;
sval1[tid] = myVal1 = sval1[tid + 1];
sval2[tid] = myVal2 = sval2[tid + 1];
}
}
}
};
// Key/two-value reduction over 16 elements: the first 8 threads compare
// against offsets 8, 4, 2 and 1.
template <> struct PredVal2WarpReductor<16>
{
    // myData / myVal1 / myVal2 - out: overwritten from sdata[tid],
    //                            sval1[tid] and sval2[tid], then updated on
    //                            every step where the candidate wins
    // sdata / sval1 / sval2    - key and value arrays indexed by tid; they
    //                            are read before being written, so the
    //                            caller must fill them
    // pred                     - pred(candidate, current) == true makes the
    //                            candidate key and both of its values
    //                            replace the current triple
    // Threads with tid >= 8 leave the outputs untouched; thread 0 ends with
    // the selected triple.
    // NOTE(review): no barriers separate the steps and sval1 / sval2 are not
    // volatile; this presumably relies on lockstep execution of the active
    // threads - confirm for the target architecture.
    template <typename T, typename V1, typename V2, typename Pred>
    static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
    {
        if (tid < 8)
        {
            myData = sdata[tid];
            myVal1 = sval1[tid];
            myVal2 = sval2[tid];

            // Candidate key from the partner slot. It is initialized directly
            // from shared memory; reg must not appear in its own initializer.
            T reg = sdata[tid + 8];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 8];
                sval2[tid] = myVal2 = sval2[tid + 8];
            }

            reg = sdata[tid + 4];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 4];
                sval2[tid] = myVal2 = sval2[tid + 4];
            }

            reg = sdata[tid + 2];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 2];
                sval2[tid] = myVal2 = sval2[tid + 2];
            }

            reg = sdata[tid + 1];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 1];
                sval2[tid] = myVal2 = sval2[tid + 1];
            }
        }
    }
};
// Key/two-value reduction over 8 elements: the first 4 threads compare
// against offsets 4, 2 and 1.
template <> struct PredVal2WarpReductor<8>
{
    // myData / myVal1 / myVal2 - out: overwritten from sdata[tid],
    //                            sval1[tid] and sval2[tid], then updated on
    //                            every step where the candidate wins
    // sdata / sval1 / sval2    - key and value arrays indexed by tid; they
    //                            are read before being written, so the
    //                            caller must fill them
    // pred                     - pred(candidate, current) == true makes the
    //                            candidate key and both of its values
    //                            replace the current triple
    // Threads with tid >= 4 leave the outputs untouched; thread 0 ends with
    // the selected triple.
    // NOTE(review): no barriers separate the steps and sval1 / sval2 are not
    // volatile; this presumably relies on lockstep execution of the active
    // threads - confirm for the target architecture.
    template <typename T, typename V1, typename V2, typename Pred>
    static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
    {
        if (tid < 4)
        {
            myData = sdata[tid];
            myVal1 = sval1[tid];
            myVal2 = sval2[tid];

            // Candidate key from the partner slot. It is initialized directly
            // from shared memory; reg must not appear in its own initializer.
            T reg = sdata[tid + 4];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 4];
                sval2[tid] = myVal2 = sval2[tid + 4];
            }

            reg = sdata[tid + 2];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 2];
                sval2[tid] = myVal2 = sval2[tid + 2];
            }

            reg = sdata[tid + 1];
            if (pred(reg, myData))
            {
                sdata[tid] = myData = reg;
                sval1[tid] = myVal1 = sval1[tid + 1];
                sval2[tid] = myVal2 = sval2[tid + 1];
            }
        }
    }
};
// Selects the key/two-value reduction strategy at compile time; only the
// two explicit specializations (true / false) are defined.
template <bool warp> struct PredVal2ReductionDispatcher;

// warp == true: hand the whole reduction over to PredVal2WarpReductor<n>.
template <> struct PredVal2ReductionDispatcher<true>
{
    // Forwards all arguments unchanged to PredVal2WarpReductor<n>::reduce.
    template <int n, typename T, typename V1, typename V2, typename Pred>
    static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
    {
        typedef PredVal2WarpReductor<n> reductor_type;
        reductor_type::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
    }
};
// warp == false: block-level key/two-value reduction. Barrier-separated
// halving steps run while more than 64 elements remain, then the first 32
// threads finish without barriers.
template <> struct PredVal2ReductionDispatcher<false>
{
    // myData / myVal1 / myVal2 - out: overwritten from sdata[tid],
    //                            sval1[tid] and sval2[tid], then updated on
    //                            every step where the candidate wins
    // sdata / sval1 / sval2    - key and value arrays indexed by tid; they
    //                            are read before being written, so the
    //                            caller must fill them
    // pred                     - pred(candidate, current) == true makes the
    //                            candidate key and both of its values
    //                            replace the current triple
    // Thread 0 ends with the selected triple in the outputs and in slot 0 of
    // the three arrays.
    //
    // Every __syncthreads() is placed outside the tid test: a barrier must
    // be reached by all threads of the block, not only by the lower half
    // that does the comparison. The call must therefore be made by every
    // thread of the block.
    // NOTE(review): the last stage has no barriers and sval1 / sval2 are not
    // volatile; it presumably relies on lockstep execution of the first 32
    // threads - confirm for the target architecture.
    template <int n, typename T, typename V1, typename V2, typename Pred>
    static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
    {
        myData = sdata[tid];
        myVal1 = sval1[tid];
        myVal2 = sval2[tid];

        if (n >= 512)
        {
            if (tid < 256)
            {
                T reg = sdata[tid + 256];
                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval1[tid] = myVal1 = sval1[tid + 256];
                    sval2[tid] = myVal2 = sval2[tid + 256];
                }
            }
            __syncthreads();
        }

        if (n >= 256)
        {
            if (tid < 128)
            {
                T reg = sdata[tid + 128];
                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval1[tid] = myVal1 = sval1[tid + 128];
                    sval2[tid] = myVal2 = sval2[tid + 128];
                }
            }
            __syncthreads();
        }

        if (n >= 128)
        {
            if (tid < 64)
            {
                T reg = sdata[tid + 64];
                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval1[tid] = myVal1 = sval1[tid + 64];
                    sval2[tid] = myVal2 = sval2[tid + 64];
                }
            }
            __syncthreads();
        }

        // Final stage: executed by the first 32 threads only; each step is
        // enabled by a compile-time test on n.
        if (tid < 32)
        {
            if (n >= 64)
            {
                T reg = sdata[tid + 32];
                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval1[tid] = myVal1 = sval1[tid + 32];
                    sval2[tid] = myVal2 = sval2[tid + 32];
                }
            }
            if (n >= 32)
            {
                T reg = sdata[tid + 16];
                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval1[tid] = myVal1 = sval1[tid + 16];
                    sval2[tid] = myVal2 = sval2[tid + 16];
                }
            }
            if (n >= 16)
            {
                T reg = sdata[tid + 8];
                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval1[tid] = myVal1 = sval1[tid + 8];
                    sval2[tid] = myVal2 = sval2[tid + 8];
                }
            }
            if (n >= 8)
            {
                T reg = sdata[tid + 4];
                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval1[tid] = myVal1 = sval1[tid + 4];
                    sval2[tid] = myVal2 = sval2[tid + 4];
                }
            }
            if (n >= 4)
            {
                T reg = sdata[tid + 2];
                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval1[tid] = myVal1 = sval1[tid + 2];
                    sval2[tid] = myVal2 = sval2[tid + 2];
                }
            }
            if (n >= 2)
            {
                T reg = sdata[tid + 1];
                if (pred(reg, myData))
                {
                    sdata[tid] = myData = reg;
                    sval1[tid] = myVal1 = sval1[tid + 1];
                    sval2[tid] = myVal2 = sval2[tid + 1];
                }
            }
        }
    }
};
} // namespace utility_detail
}}} // namespace cv { namespace gpu { namespace device
#endif // __OPENCV_GPU_REDUCTION_DETAIL_HPP__

@ -44,7 +44,6 @@
#define OPENCV_GPU_EMULATION_HPP_ #define OPENCV_GPU_EMULATION_HPP_
#include "warp_reduce.hpp" #include "warp_reduce.hpp"
#include <stdio.h>
namespace cv { namespace gpu { namespace device namespace cv { namespace gpu { namespace device
{ {

@ -302,18 +302,18 @@ namespace cv { namespace gpu { namespace device
template <> struct name<type> : binary_function<type, type, type> \ template <> struct name<type> : binary_function<type, type, type> \
{ \ { \
__device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \ __device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \
__device__ __forceinline__ name(const name& other):binary_function<type, type, type>(){}\ __device__ __forceinline__ name() {}\
__device__ __forceinline__ name():binary_function<type, type, type>(){}\ __device__ __forceinline__ name(const name&) {}\
}; };
template <typename T> struct maximum : binary_function<T, T, T> template <typename T> struct maximum : binary_function<T, T, T>
{ {
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
{ {
return lhs < rhs ? rhs : lhs; return max(lhs, rhs);
} }
__device__ __forceinline__ maximum(const maximum& other):binary_function<T, T, T>(){} __device__ __forceinline__ maximum() {}
__device__ __forceinline__ maximum():binary_function<T, T, T>(){} __device__ __forceinline__ maximum(const maximum&) {}
}; };
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, ::max) OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, ::max)
@ -330,10 +330,10 @@ namespace cv { namespace gpu { namespace device
{ {
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
{ {
return lhs < rhs ? lhs : rhs; return min(lhs, rhs);
} }
__device__ __forceinline__ minimum(const minimum& other):binary_function<T, T, T>(){} __device__ __forceinline__ minimum() {}
__device__ __forceinline__ minimum():binary_function<T, T, T>(){} __device__ __forceinline__ minimum(const minimum&) {}
}; };
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, ::min) OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, ::min)
@ -350,6 +350,108 @@ namespace cv { namespace gpu { namespace device
// Math functions // Math functions
///bound========================================= ///bound=========================================
// Unary functor returning abs(x) for a value of type T. The call to abs is
// unqualified, so the overload is picked by the argument type.
template <typename T> struct abs_func : unary_function<T, T>
{
    // Explicit device-side default and copy constructors (both empty).
    __device__ __forceinline__ abs_func() {}
    __device__ __forceinline__ abs_func(const abs_func&) {}

    __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType x) const
    {
        return abs(x);
    }
};
// abs_func for unsigned char: the value is returned unchanged.
template <> struct abs_func<unsigned char> : unary_function<unsigned char, unsigned char>
{
    // Explicit device-side default and copy constructors (both empty).
    __device__ __forceinline__ abs_func() {}
    __device__ __forceinline__ abs_func(const abs_func&) {}

    __device__ __forceinline__ unsigned char operator ()(unsigned char x) const
    {
        return x;
    }
};
// abs_func for signed char: the value is widened to int, passed to ::abs,
// and the int result is converted back to signed char on return.
template <> struct abs_func<signed char> : unary_function<signed char, signed char>
{
    // Explicit device-side default and copy constructors (both empty).
    __device__ __forceinline__ abs_func() {}
    __device__ __forceinline__ abs_func(const abs_func&) {}

    __device__ __forceinline__ signed char operator ()(signed char x) const
    {
        const int widened = x;
        return ::abs(widened);
    }
};
// abs_func for plain char: the value is widened to int, passed to ::abs,
// and the int result is converted back to char on return.
template <> struct abs_func<char> : unary_function<char, char>
{
    // Explicit device-side default and copy constructors (both empty).
    __device__ __forceinline__ abs_func() {}
    __device__ __forceinline__ abs_func(const abs_func&) {}

    __device__ __forceinline__ char operator ()(char x) const
    {
        const int widened = x;
        return ::abs(widened);
    }
};
// abs_func for unsigned short: the value is returned unchanged.
template <> struct abs_func<unsigned short> : unary_function<unsigned short, unsigned short>
{
    // Explicit device-side default and copy constructors (both empty).
    __device__ __forceinline__ abs_func() {}
    __device__ __forceinline__ abs_func(const abs_func&) {}

    __device__ __forceinline__ unsigned short operator ()(unsigned short x) const
    {
        return x;
    }
};
// abs_func for short: the value is widened to int, passed to ::abs, and the
// int result is converted back to short on return.
template <> struct abs_func<short> : unary_function<short, short>
{
    // Explicit device-side default and copy constructors (both empty).
    __device__ __forceinline__ abs_func() {}
    __device__ __forceinline__ abs_func(const abs_func&) {}

    __device__ __forceinline__ short operator ()(short x) const
    {
        const int widened = x;
        return ::abs(widened);
    }
};
// abs_func for unsigned int: the value is returned unchanged.
template <> struct abs_func<unsigned int> : unary_function<unsigned int, unsigned int>
{
    // Explicit device-side default and copy constructors (both empty).
    __device__ __forceinline__ abs_func() {}
    __device__ __forceinline__ abs_func(const abs_func&) {}

    __device__ __forceinline__ unsigned int operator ()(unsigned int x) const
    {
        return x;
    }
};
// abs_func for int: delegates to the global ::abs.
template <> struct abs_func<int> : unary_function<int, int>
{
    // Explicit device-side default and copy constructors (both empty).
    __device__ __forceinline__ abs_func() {}
    __device__ __forceinline__ abs_func(const abs_func&) {}

    __device__ __forceinline__ int operator ()(int x) const
    {
        return ::abs(x);
    }
};
// abs_func for float: delegates to the single-precision ::fabsf.
template <> struct abs_func<float> : unary_function<float, float>
{
    // Explicit device-side default and copy constructors (both empty).
    __device__ __forceinline__ abs_func() {}
    __device__ __forceinline__ abs_func(const abs_func&) {}

    __device__ __forceinline__ float operator ()(float x) const
    {
        return ::fabsf(x);
    }
};
// abs_func for double: delegates to the double-precision ::fabs.
template <> struct abs_func<double> : unary_function<double, double>
{
    // Explicit device-side default and copy constructors (both empty).
    __device__ __forceinline__ abs_func() {}
    __device__ __forceinline__ abs_func(const abs_func&) {}

    __device__ __forceinline__ double operator ()(double x) const
    {
        return ::fabs(x);
    }
};
#define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(name, func) \ #define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(name, func) \
template <typename T> struct name ## _func : unary_function<T, float> \ template <typename T> struct name ## _func : unary_function<T, float> \
{ \ { \
@ -357,6 +459,8 @@ namespace cv { namespace gpu { namespace device
{ \ { \
return func ## f(v); \ return func ## f(v); \
} \ } \
__device__ __forceinline__ name ## _func() {} \
__device__ __forceinline__ name ## _func(const name ## _func&) {} \
}; \ }; \
template <> struct name ## _func<double> : unary_function<double, double> \ template <> struct name ## _func<double> : unary_function<double, double> \
{ \ { \
@ -364,6 +468,8 @@ namespace cv { namespace gpu { namespace device
{ \ { \
return func(v); \ return func(v); \
} \ } \
__device__ __forceinline__ name ## _func() {} \
__device__ __forceinline__ name ## _func(const name ## _func&) {} \
}; };
#define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(name, func) \ #define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(name, func) \
@ -382,7 +488,6 @@ namespace cv { namespace gpu { namespace device
} \ } \
}; };
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(fabs, ::fabs)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp, ::exp) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp, ::exp)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp2, ::exp2) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp2, ::exp2)

@ -0,0 +1,197 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_REDUCE_HPP__
#define __OPENCV_GPU_REDUCE_HPP__
#include <thrust/tuple.h>
#include "detail/reduce.hpp"
#include "detail/reduce_key_val.hpp"
namespace cv { namespace gpu { namespace device
{
// Single-value reduction entry point: forwards to the reductor chosen by
// reduce_detail::Dispatcher<N>.
//   smem - scratch array passed through as volatile T*
//   val  - this thread's value, passed by reference
//   tid  - index of the calling thread
//   op   - binary functor, passed by const reference
template <int N, typename T, class Op>
__device__ __forceinline__ void reduce(volatile T* smem, T& val, unsigned int tid, const Op& op)
{
    typedef typename reduce_detail::Dispatcher<N>::reductor reductor_type;

    reductor_type::template reduce<volatile T*, T&, const Op&>(smem, val, tid, op);
}
// Multi-value reduction entry point: several reductions are carried out in
// one call, each described by one element of the three tuples. Forwards to
// the reductor chosen by reduce_detail::Dispatcher<N>.
//   smem - tuple P0..P9 (presumably shared-memory pointers, e.g. as built by
//          smem_tuple - confirm against the callers)
//   val  - tuple R0..R9 (presumably references to the per-thread values)
//   tid  - index of the calling thread
//   op   - tuple of functors, one per reduced value
template <int N,
          typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
          typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
          class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
__device__ __forceinline__ void reduce(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                       const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                       unsigned int tid,
                                       const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
{
    typedef thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> pointer_tuple;
    typedef thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> reference_tuple;
    typedef thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9> op_tuple;
    typedef typename reduce_detail::Dispatcher<N>::reductor reductor_type;

    reductor_type::template reduce<const pointer_tuple&, const reference_tuple&, const op_tuple&>(smem, val, tid, op);
}
// Key/value reduction entry point for one key and one value: forwards to
// the reductor chosen by reduce_key_val_detail::Dispatcher<N>.
//   skeys / key - scratch array and per-thread value of the key
//   svals / val - scratch array and per-thread value of the payload
//   tid         - index of the calling thread
//   cmp         - comparison functor applied to keys
template <unsigned int N, typename K, typename V, class Cmp>
__device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, unsigned int tid, const Cmp& cmp)
{
    typedef typename reduce_key_val_detail::Dispatcher<N>::reductor reductor_type;

    reductor_type::template reduce<volatile K*, K&, volatile V*, V&, const Cmp&>(skeys, key, svals, val, tid, cmp);
}
// Key/value reduction entry point for one key and a tuple of values:
// forwards to the reductor chosen by reduce_key_val_detail::Dispatcher<N>.
//   skeys / key - scratch array and per-thread value of the key
//   svals       - tuple VP0..VP9 (presumably shared-memory pointers, one per
//                 value - confirm against the callers)
//   val         - tuple VR0..VR9 (presumably references to the per-thread
//                 values)
//   tid         - index of the calling thread
//   cmp         - comparison functor applied to keys
template <unsigned int N,
          typename K,
          typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
          class Cmp>
__device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key,
                                             const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                             const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                             unsigned int tid, const Cmp& cmp)
{
    typedef thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> value_pointer_tuple;
    typedef thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9> value_reference_tuple;
    typedef typename reduce_key_val_detail::Dispatcher<N>::reductor reductor_type;

    reductor_type::template reduce<volatile K*, K&,
                                   const value_pointer_tuple&,
                                   const value_reference_tuple&,
                                   const Cmp&>(skeys, key, svals, val, tid, cmp);
}
// Key/value reduction entry point for a tuple of keys and a tuple of
// values: forwards to the reductor chosen by
// reduce_key_val_detail::Dispatcher<N>.
//   skeys / svals - tuples KP0..KP9 / VP0..VP9 (presumably shared-memory
//                   pointers - confirm against the callers)
//   key / val     - tuples KR0..KR9 / VR0..VR9 (presumably references to the
//                   per-thread values)
//   tid           - index of the calling thread
//   cmp           - tuple of comparison functors Cmp0..Cmp9
template <unsigned int N,
          typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
          typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
          typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
          typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
          class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
__device__ __forceinline__ void reduceKeyVal(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
                                             const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
                                             const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                             const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                             unsigned int tid,
                                             const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp)
{
    typedef thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9> key_pointer_tuple;
    typedef thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9> key_reference_tuple;
    typedef thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> value_pointer_tuple;
    typedef thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9> value_reference_tuple;
    typedef thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9> cmp_tuple;
    typedef typename reduce_key_val_detail::Dispatcher<N>::reductor reductor_type;

    reductor_type::template reduce<const key_pointer_tuple&,
                                   const key_reference_tuple&,
                                   const value_pointer_tuple&,
                                   const value_reference_tuple&,
                                   const cmp_tuple&>(skeys, key, svals, val, tid, cmp);
}
// smem_tuple
template <typename T0>
__device__ __forceinline__
thrust::tuple<volatile T0*>
smem_tuple(T0* t0)
{
return thrust::make_tuple((volatile T0*) t0);
}
template <typename T0, typename T1>
__device__ __forceinline__
thrust::tuple<volatile T0*, volatile T1*>
smem_tuple(T0* t0, T1* t1)
{
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1);
}
template <typename T0, typename T1, typename T2>
__device__ __forceinline__
thrust::tuple<volatile T0*, volatile T1*, volatile T2*>
smem_tuple(T0* t0, T1* t1, T2* t2)
{
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2);
}
template <typename T0, typename T1, typename T2, typename T3>
__device__ __forceinline__
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*>
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3)
{
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3);
}
template <typename T0, typename T1, typename T2, typename T3, typename T4>
__device__ __forceinline__
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*>
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4)
{
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4);
}
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5>
__device__ __forceinline__
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*>
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5)
{
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5);
}
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
__device__ __forceinline__
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*>
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6)
{
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6);
}
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7>
__device__ __forceinline__
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*>
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7)
{
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7);
}
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8>
__device__ __forceinline__
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*, volatile T8*>
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8)
{
return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8);
}
// Bundles ten pointers into a thrust::tuple, qualifying every pointee as volatile.
template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9>
__device__ __forceinline__
thrust::tuple<volatile T0*, volatile T1*, volatile T2*, volatile T3*, volatile T4*, volatile T5*, volatile T6*, volatile T7*, volatile T8*, volatile T9*>
smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9)
{
    // Adding the volatile qualifier is an implicit conversion; typed locals make
    // make_tuple deduce the volatile-qualified pointer types.
    volatile T0* p0 = t0;
    volatile T1* p1 = t1;
    volatile T2* p2 = t2;
    volatile T3* p3 = t3;
    volatile T4* p4 = t4;
    volatile T5* p5 = t5;
    volatile T6* p6 = t6;
    volatile T7* p7 = t7;
    volatile T8* p8 = t8;
    volatile T9* p9 = t9;

    return thrust::make_tuple(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9);
}
}}}
#endif // __OPENCV_GPU_UTILITY_HPP__

@ -58,35 +58,47 @@ namespace cv { namespace gpu { namespace device
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v) template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
{ {
return (uchar) ::max((int)v, 0); uint res = 0;
int vi = v;
asm("cvt.sat.u8.s8 %0, %1;" : "=r"(res) : "r"(vi));
return res;
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
{
uint res = 0;
asm("cvt.sat.u8.s16 %0, %1;" : "=r"(res) : "h"(v));
return res;
} }
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v) template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
{ {
return (uchar) ::min((uint)v, (uint)UCHAR_MAX); uint res = 0;
asm("cvt.sat.u8.u16 %0, %1;" : "=r"(res) : "h"(v));
return res;
} }
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v) template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
{ {
return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); uint res = 0;
asm("cvt.sat.u8.s32 %0, %1;" : "=r"(res) : "r"(v));
return res;
} }
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v) template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
{ {
return (uchar) ::min(v, (uint)UCHAR_MAX); uint res = 0;
asm("cvt.sat.u8.u32 %0, %1;" : "=r"(res) : "r"(v));
return res;
} }
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
{
return saturate_cast<uchar>((uint)v);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v) template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
{ {
int iv = __float2int_rn(v); uint res = 0;
return saturate_cast<uchar>(iv); asm("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(res) : "f"(v));
return res;
} }
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v) template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
{ {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 #if __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v); uint res = 0;
return saturate_cast<uchar>(iv); asm("cvt.rni.sat.u8.f64 %0, %1;" : "=r"(res) : "d"(v));
return res;
#else #else
return saturate_cast<uchar>((float)v); return saturate_cast<uchar>((float)v);
#endif #endif
@ -94,35 +106,47 @@ namespace cv { namespace gpu { namespace device
template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v) template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
{ {
return (schar) ::min((int)v, SCHAR_MAX); uint res = 0;
uint vi = v;
asm("cvt.sat.s8.u8 %0, %1;" : "=r"(res) : "r"(vi));
return res;
} }
template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v) template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
{ {
return (schar) ::min((uint)v, (uint)SCHAR_MAX); uint res = 0;
asm("cvt.sat.s8.s16 %0, %1;" : "=r"(res) : "h"(v));
return res;
} }
template<> __device__ __forceinline__ schar saturate_cast<schar>(int v) template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
{ {
return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); uint res = 0;
asm("cvt.sat.s8.u16 %0, %1;" : "=r"(res) : "h"(v));
return res;
} }
template<> __device__ __forceinline__ schar saturate_cast<schar>(short v) template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
{ {
return saturate_cast<schar>((int)v); uint res = 0;
asm("cvt.sat.s8.s32 %0, %1;" : "=r"(res) : "r"(v));
return res;
} }
template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v) template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)
{ {
return (schar) ::min(v, (uint)SCHAR_MAX); uint res = 0;
asm("cvt.sat.s8.u32 %0, %1;" : "=r"(res) : "r"(v));
return res;
} }
template<> __device__ __forceinline__ schar saturate_cast<schar>(float v) template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)
{ {
int iv = __float2int_rn(v); uint res = 0;
return saturate_cast<schar>(iv); asm("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(res) : "f"(v));
return res;
} }
template<> __device__ __forceinline__ schar saturate_cast<schar>(double v) template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)
{ {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 #if __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v); uint res = 0;
return saturate_cast<schar>(iv); asm("cvt.rni.sat.s8.f64 %0, %1;" : "=r"(res) : "d"(v));
return res;
#else #else
return saturate_cast<schar>((float)v); return saturate_cast<schar>((float)v);
#endif #endif
@ -130,30 +154,41 @@ namespace cv { namespace gpu { namespace device
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v) template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
{ {
return (ushort) ::max((int)v, 0); ushort res = 0;
int vi = v;
asm("cvt.sat.u16.s8 %0, %1;" : "=h"(res) : "r"(vi));
return res;
} }
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v) template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
{ {
return (ushort) ::max((int)v, 0); ushort res = 0;
asm("cvt.sat.u16.s16 %0, %1;" : "=h"(res) : "h"(v));
return res;
} }
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v) template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
{ {
return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); ushort res = 0;
asm("cvt.sat.u16.s32 %0, %1;" : "=h"(res) : "r"(v));
return res;
} }
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v) template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
{ {
return (ushort) ::min(v, (uint)USHRT_MAX); ushort res = 0;
asm("cvt.sat.u16.u32 %0, %1;" : "=h"(res) : "r"(v));
return res;
} }
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v) template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
{ {
int iv = __float2int_rn(v); ushort res = 0;
return saturate_cast<ushort>(iv); asm("cvt.rni.sat.u16.f32 %0, %1;" : "=h"(res) : "f"(v));
return res;
} }
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v) template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
{ {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 #if __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v); ushort res = 0;
return saturate_cast<ushort>(iv); asm("cvt.rni.sat.u16.f64 %0, %1;" : "=h"(res) : "d"(v));
return res;
#else #else
return saturate_cast<ushort>((float)v); return saturate_cast<ushort>((float)v);
#endif #endif
@ -161,31 +196,45 @@ namespace cv { namespace gpu { namespace device
template<> __device__ __forceinline__ short saturate_cast<short>(ushort v) template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)
{ {
return (short) ::min((int)v, SHRT_MAX); short res = 0;
asm("cvt.sat.s16.u16 %0, %1;" : "=h"(res) : "h"(v));
return res;
} }
template<> __device__ __forceinline__ short saturate_cast<short>(int v) template<> __device__ __forceinline__ short saturate_cast<short>(int v)
{ {
return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); short res = 0;
asm("cvt.sat.s16.s32 %0, %1;" : "=h"(res) : "r"(v));
return res;
} }
template<> __device__ __forceinline__ short saturate_cast<short>(uint v) template<> __device__ __forceinline__ short saturate_cast<short>(uint v)
{ {
return (short) ::min(v, (uint)SHRT_MAX); short res = 0;
asm("cvt.sat.s16.u32 %0, %1;" : "=h"(res) : "r"(v));
return res;
} }
template<> __device__ __forceinline__ short saturate_cast<short>(float v) template<> __device__ __forceinline__ short saturate_cast<short>(float v)
{ {
int iv = __float2int_rn(v); short res = 0;
return saturate_cast<short>(iv); asm("cvt.rni.sat.s16.f32 %0, %1;" : "=h"(res) : "f"(v));
return res;
} }
template<> __device__ __forceinline__ short saturate_cast<short>(double v) template<> __device__ __forceinline__ short saturate_cast<short>(double v)
{ {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 #if __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v); short res = 0;
return saturate_cast<short>(iv); asm("cvt.rni.sat.s16.f64 %0, %1;" : "=h"(res) : "d"(v));
return res;
#else #else
return saturate_cast<short>((float)v); return saturate_cast<short>((float)v);
#endif #endif
} }
// Saturating uint -> int conversion, delegated to the PTX cvt.sat.s32.u32 instruction.
template<> __device__ __forceinline__ int saturate_cast<int>(uint v)
{
    int saturated = 0;
    asm("cvt.sat.s32.u32 %0, %1;"
        : "=r"(saturated)   // 32-bit register output
        : "r"(v));          // 32-bit register input
    return saturated;
}
template<> __device__ __forceinline__ int saturate_cast<int>(float v) template<> __device__ __forceinline__ int saturate_cast<int>(float v)
{ {
return __float2int_rn(v); return __float2int_rn(v);
@ -199,6 +248,25 @@ namespace cv { namespace gpu { namespace device
#endif #endif
} }
// Saturating schar -> uint conversion, delegated to the PTX cvt.sat.u32.s8 instruction.
template<> __device__ __forceinline__ uint saturate_cast<uint>(schar v)
{
    // The value is widened to int first because the asm operand uses the
    // 32-bit register constraint ("r").
    int widened = v;
    uint saturated = 0;
    asm("cvt.sat.u32.s8 %0, %1;"
        : "=r"(saturated)
        : "r"(widened));
    return saturated;
}
// Saturating short -> uint conversion, delegated to the PTX cvt.sat.u32.s16 instruction.
template<> __device__ __forceinline__ uint saturate_cast<uint>(short v)
{
    uint saturated = 0;
    asm("cvt.sat.u32.s16 %0, %1;"
        : "=r"(saturated)   // 32-bit register output
        : "h"(v));          // 16-bit register input
    return saturated;
}
// Saturating int -> uint conversion, delegated to the PTX cvt.sat.u32.s32 instruction.
template<> __device__ __forceinline__ uint saturate_cast<uint>(int v)
{
    uint saturated = 0;
    asm("cvt.sat.u32.s32 %0, %1;"
        : "=r"(saturated)   // 32-bit register output
        : "r"(v));          // 32-bit register input
    return saturated;
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(float v) template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)
{ {
return __float2uint_rn(v); return __float2uint_rn(v);

@ -45,7 +45,6 @@
#include "saturate_cast.hpp" #include "saturate_cast.hpp"
#include "datamov_utils.hpp" #include "datamov_utils.hpp"
#include "detail/reduction_detail.hpp"
namespace cv { namespace gpu { namespace device namespace cv { namespace gpu { namespace device
{ {
@ -156,29 +155,6 @@ namespace cv { namespace gpu { namespace device
} }
}; };
///////////////////////////////////////////////////////////////////////////////
// Reduction
// Reduces n elements with the binary functor op by forwarding to
// utility_detail::ReductionDispatcher.
//   n                 - compile-time element count; must lie in [8, 512].
//   data              - volatile buffer handed to the dispatcher.
//   partial_reduction - per-thread value, passed by reference to the dispatcher.
//   tid               - thread index forwarded to the dispatcher.
//   op                - reduction functor.
template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
{
    StaticAssert<n >= 8 && n <= 512>::check();

    // The dispatcher specialization is selected at compile time by (n <= 64).
    // The dispatcher type depends on n, so its member template must be named with
    // the `template` disambiguator to be well-formed standard C++.
    utility_detail::ReductionDispatcher<n <= 64>::template reduce<n>(data, partial_reduction, tid, op);
}
// Predicate-driven reduction over n elements that carries one associated value,
// forwarded to utility_detail::PredValReductionDispatcher.
//   n            - compile-time element count; must lie in [8, 512].
//   sdata/myData - key buffer and the calling thread's key.
//   sval/myVal   - value buffer and the calling thread's value.
//   tid          - thread index forwarded to the dispatcher.
//   pred         - predicate functor forwarded to the dispatcher.
template <int n, typename T, typename V, typename Pred>
__device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)
{
    StaticAssert<n >= 8 && n <= 512>::check();

    // The dispatcher specialization is selected at compile time by (n <= 64).
    // The dispatcher type depends on n, so its member template must be named with
    // the `template` disambiguator to be well-formed standard C++.
    utility_detail::PredValReductionDispatcher<n <= 64>::template reduce<n>(myData, myVal, sdata, sval, tid, pred);
}
// Predicate-driven reduction over n elements that carries two associated values,
// forwarded to utility_detail::PredVal2ReductionDispatcher.
//   n              - compile-time element count; must lie in [8, 512].
//   sdata/myData   - key buffer and the calling thread's key.
//   sval1/myVal1   - first value buffer and the calling thread's first value.
//   sval2/myVal2   - second value buffer and the calling thread's second value.
//   tid            - thread index forwarded to the dispatcher.
//   pred           - predicate functor forwarded to the dispatcher.
template <int n, typename T, typename V1, typename V2, typename Pred>
__device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
{
    StaticAssert<n >= 8 && n <= 512>::check();

    // The dispatcher specialization is selected at compile time by (n <= 64).
    // The dispatcher type depends on n, so its member template must be named with
    // the `template` disambiguator to be well-formed standard C++.
    utility_detail::PredVal2ReductionDispatcher<n <= 64>::template reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
}
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Solve linear system // Solve linear system

@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__ #ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_HPP__ #define __OPENCV_GPU_VEC_DISTANCE_HPP__
#include "utility.hpp" #include "reduce.hpp"
#include "functional.hpp" #include "functional.hpp"
#include "detail/vec_distance_detail.hpp" #include "detail/vec_distance_detail.hpp"
@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid) template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
{ {
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>()); reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
} }
__device__ __forceinline__ operator int() const __device__ __forceinline__ operator int() const
@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid) template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
{ {
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>()); reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
} }
__device__ __forceinline__ operator float() const __device__ __forceinline__ operator float() const
@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid) template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
{ {
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>()); reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
} }
__device__ __forceinline__ operator float() const __device__ __forceinline__ operator float() const
@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid) template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
{ {
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>()); reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
} }
__device__ __forceinline__ operator int() const __device__ __forceinline__ operator int() const

@ -280,7 +280,7 @@ namespace cv { namespace gpu { namespace device
OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ! , logical_not) \ OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ! , logical_not) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, max, maximum) \ OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, max, maximum) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, min, minimum) \ OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, min, minimum) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, fabs, fabs_func) \ OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, abs, abs_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sqrt, sqrt_func) \ OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sqrt, sqrt_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp, exp_func) \ OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp, exp_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp2, exp2_func) \ OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp2, exp2_func) \

@ -0,0 +1,145 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_WARP_SHUFFLE_HPP__
#define __OPENCV_GPU_WARP_SHUFFLE_HPP__
namespace cv { namespace gpu { namespace device
{
    // Thin wrappers around the __shfl / __shfl_down / __shfl_up intrinsics.
    // Every wrapper calls the intrinsic only when compiled for __CUDA_ARCH__ >= 300;
    // otherwise it returns a value-initialized result (T(), 0 or 0.0) and ignores
    // its arguments.

    // Generic shfl: forwards val, srcLane and width straight to __shfl.
    template <typename T>
    __device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize)
    {
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
        return __shfl(val, srcLane, width);
    #else
        return T();
    #endif
    }

    // unsigned int overload: the value goes through __shfl as an int and is cast back.
    __device__ __forceinline__ unsigned int shfl(unsigned int val, int srcLane, int width = warpSize)
    {
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
        const int asSigned = (int) val;
        return (unsigned int) __shfl(asSigned, srcLane, width);
    #else
        return 0;
    #endif
    }

    // double overload: the low and high 32-bit halves are shuffled separately
    // and then recombined.
    __device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize)
    {
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
        const int loWord = __shfl(__double2loint(val), srcLane, width);
        const int hiWord = __shfl(__double2hiint(val), srcLane, width);

        return __hiloint2double(hiWord, loWord);
    #else
        return 0.0;
    #endif
    }

    // Generic shfl_down: forwards val, delta and width straight to __shfl_down.
    template <typename T>
    __device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize)
    {
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
        return __shfl_down(val, delta, width);
    #else
        return T();
    #endif
    }

    // unsigned int overload: the value goes through __shfl_down as an int and is cast back.
    __device__ __forceinline__ unsigned int shfl_down(unsigned int val, unsigned int delta, int width = warpSize)
    {
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
        const int asSigned = (int) val;
        return (unsigned int) __shfl_down(asSigned, delta, width);
    #else
        return 0;
    #endif
    }

    // double overload: the low and high 32-bit halves are shuffled separately
    // and then recombined.
    __device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize)
    {
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
        const int loWord = __shfl_down(__double2loint(val), delta, width);
        const int hiWord = __shfl_down(__double2hiint(val), delta, width);

        return __hiloint2double(hiWord, loWord);
    #else
        return 0.0;
    #endif
    }

    // Generic shfl_up: forwards val, delta and width straight to __shfl_up.
    template <typename T>
    __device__ __forceinline__ T shfl_up(T val, unsigned int delta, int width = warpSize)
    {
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
        return __shfl_up(val, delta, width);
    #else
        return T();
    #endif
    }

    // unsigned int overload: the value goes through __shfl_up as an int and is cast back.
    __device__ __forceinline__ unsigned int shfl_up(unsigned int val, unsigned int delta, int width = warpSize)
    {
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
        const int asSigned = (int) val;
        return (unsigned int) __shfl_up(asSigned, delta, width);
    #else
        return 0;
    #endif
    }

    // double overload: the low and high 32-bit halves are shuffled separately
    // and then recombined.
    __device__ __forceinline__ double shfl_up(double val, unsigned int delta, int width = warpSize)
    {
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
        const int loWord = __shfl_up(__double2loint(val), delta, width);
        const int hiWord = __shfl_up(__double2hiint(val), delta, width);

        return __hiloint2double(hiWord, loWord);
    #else
        return 0.0;
    #endif
    }
}}}
#endif // __OPENCV_GPU_WARP_SHUFFLE_HPP__

@ -792,31 +792,23 @@ private:
GpuMat lab, l, ab; GpuMat lab, l, ab;
}; };
struct CV_EXPORTS CannyBuf;
CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
CV_EXPORTS void Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
struct CV_EXPORTS CannyBuf struct CV_EXPORTS CannyBuf
{ {
CannyBuf() {}
explicit CannyBuf(const Size& image_size, int apperture_size = 3) {create(image_size, apperture_size);}
CannyBuf(const GpuMat& dx_, const GpuMat& dy_);
void create(const Size& image_size, int apperture_size = 3); void create(const Size& image_size, int apperture_size = 3);
void release(); void release();
GpuMat dx, dy; GpuMat dx, dy;
GpuMat dx_buf, dy_buf; GpuMat mag;
GpuMat edgeBuf; GpuMat map;
GpuMat trackBuf1, trackBuf2; GpuMat st1, st2;
Ptr<FilterEngine_GPU> filterDX, filterDY; Ptr<FilterEngine_GPU> filterDX, filterDY;
}; };
CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
CV_EXPORTS void Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false);
class CV_EXPORTS ImagePyramid class CV_EXPORTS ImagePyramid
{ {
public: public:
@ -855,6 +847,11 @@ CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, float rho, float th
CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, HoughLinesBuf& buf, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096); CV_EXPORTS void HoughLines(const GpuMat& src, GpuMat& lines, HoughLinesBuf& buf, float rho, float theta, int threshold, bool doSort = false, int maxLines = 4096);
CV_EXPORTS void HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines, OutputArray h_votes = noArray()); CV_EXPORTS void HoughLinesDownload(const GpuMat& d_lines, OutputArray h_lines, OutputArray h_votes = noArray());
//! HoughLinesP
//! finds line segments in the black-n-white image using probabilistic Hough transform
CV_EXPORTS void HoughLinesP(const GpuMat& image, GpuMat& lines, HoughLinesBuf& buf, float rho, float theta, int minLineLength, int maxLineGap, int maxLines = 4096);
//! HoughCircles //! HoughCircles
struct HoughCirclesBuf struct HoughCirclesBuf
@ -1036,11 +1033,9 @@ CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels
//! Calculates histogram for 8u one channel image //! Calculates histogram for 8u one channel image
//! Output hist will have one row, 256 cols and CV32SC1 type. //! Output hist will have one row, 256 cols and CV32SC1 type.
CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, Stream& stream = Stream::Null()); CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, Stream& stream = Stream::Null());
CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null());
//! normalizes the grayscale image brightness and contrast by normalizing its histogram //! normalizes the grayscale image brightness and contrast by normalizing its histogram
CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()); CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream& stream = Stream::Null());
CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null()); CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null());
//////////////////////////////// StereoBM_GPU //////////////////////////////// //////////////////////////////// StereoBM_GPU ////////////////////////////////
@ -1532,6 +1527,97 @@ public:
int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4); int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4);
}; };
// ======================== GPU version for soft cascade ===================== //
// Abstract interface of a channels preprocessor: apply() is pure virtual and
// instances are obtained through the static create() factory (the constructor
// is protected).
class CV_EXPORTS ChannelsProcessor
{
public:
// Values accepted by the flags parameter of create().
// They sit above the low four bits (1 << 4 == 16, 2 << 4 == 32).
enum
{
GENERIC = 1 << 4,
SEPARABLE = 2 << 4
};
// Appends the specified number of HOG first-order feature integrals to the given output.
// Param frame is an input 3-channel BGR image.
// Param channels is a GPU matrix of optionally shrunk channels.
// Param stream is a high-level CUDA stream abstraction used for asynchronous execution.
virtual void apply(InputArray frame, OutputArray channels, Stream& stream = Stream::Null()) = 0;
// Creates a specific preprocessor implementation.
// Param shrinkage is a resizing factor. The resize is applied before computing the integral sum.
// Param bins is the number of HOG-like channels.
// Param flags holds extra channel-computing flags (GENERIC by default).
static cv::Ptr<ChannelsProcessor> create(const int shrinkage, const int bins, const int flags = GENERIC);
virtual ~ChannelsProcessor();
protected:
// Protected: direct construction is reserved for derived classes.
ChannelsProcessor();
};
// Implementation of soft (stageless) cascaded detector.
class CV_EXPORTS SCascade : public Algorithm
{
public:
    // Representation of a single detection result.
    struct CV_EXPORTS Detection
    {
        ushort x;
        ushort y;
        ushort w;
        ushort h;
        float confidence;
        int kind;

        enum {PEDESTRIAN = 0};
    };

    // Flags stored in the low four bits of the flags value (NMS_MASK selects them).
    enum { NO_REJECT = 1, DOLLAR = 2, /*PASCAL = 4,*/ DEFAULT = NO_REJECT, NMS_MASK = 0xF};

    // An empty cascade will be created.
    // Param minScale is a minimum scale relative to the original size of the image on which the cascade will be applied.
    // Param maxScale is a maximum scale relative to the original size of the image on which the cascade will be applied.
    // Param scales is a number of scales from minScale to maxScale.
    // Param flags is a set of extra tuning flags: one of the enum values above combined
    // bitwise with a ChannelsProcessor flag.
    // NOTE: the flags are combined with bitwise OR. The previous logical OR (||)
    // collapsed the default to 1 and silently dropped ChannelsProcessor::GENERIC.
    SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55,
        const int flags = NO_REJECT | ChannelsProcessor::GENERIC);

    virtual ~SCascade();

    cv::AlgorithmInfo* info() const;

    // Load cascade from FileNode.
    // Param fn is a root node for cascade. Should be <cascade>.
    virtual bool load(const FileNode& fn);

    // Load cascade config.
    virtual void read(const FileNode& fn);

    // Returns the matrix of detected objects.
    // Param image is a frame on which the detector will be applied.
    // Param rois is a regions-of-interest mask generated by genRoi.
    // Only the objects that fall into one of the regions will be returned.
    // Param objects is an output array of detections represented as a GpuMat of SCascade::Detection.
    // The first element of the matrix is actually a count of detections.
    // Param stream is a high-level CUDA stream abstraction used for asynchronous execution.
    virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const;

private:
    struct Fields;
    Fields* fields;

    double minScale;
    double maxScale;
    int scales;
    int flags;
};
CV_EXPORTS bool initModule_gpu(void);
////////////////////////////////// SURF ////////////////////////////////////////// ////////////////////////////////// SURF //////////////////////////////////////////
class CV_EXPORTS SURF_GPU class CV_EXPORTS SURF_GPU
@ -1877,8 +1963,6 @@ private:
GpuMat uPyr_[2]; GpuMat uPyr_[2];
GpuMat vPyr_[2]; GpuMat vPyr_[2];
bool isDeviceArch11_;
}; };
@ -1895,7 +1979,6 @@ public:
polyN = 5; polyN = 5;
polySigma = 1.1; polySigma = 1.1;
flags = 0; flags = 0;
isDeviceArch11_ = !DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
} }
int numLevels; int numLevels;
@ -1943,8 +2026,113 @@ private:
GpuMat frames_[2]; GpuMat frames_[2];
GpuMat pyrLevel_[2], M_, bufM_, R_[2], blurredFrame_[2]; GpuMat pyrLevel_[2], M_, bufM_, R_[2], blurredFrame_[2];
std::vector<GpuMat> pyramid0_, pyramid1_; std::vector<GpuMat> pyramid0_, pyramid1_;
};
// Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method
//
// see reference:
// [1] C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
// [2] Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
class CV_EXPORTS OpticalFlowDual_TVL1_GPU
{
public:
OpticalFlowDual_TVL1_GPU();
void operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy);
void collectGarbage();
/**
* Time step of the numerical scheme.
*/
double tau;
/**
* Weight parameter for the data term, attachment parameter.
* This is the most relevant parameter, which determines the smoothness of the output.
* The smaller this parameter is, the smoother the solutions we obtain.
* It depends on the range of motions of the images, so its value should be adapted to each image sequence.
*/
double lambda;
/**
* Weight parameter for (u - v)^2, tightness parameter.
* It serves as a link between the attachment and the regularization terms.
* In theory, it should have a small value in order to maintain both parts in correspondence.
* The method is stable for a large range of values of this parameter.
*/
double theta;
/**
* Number of scales used to create the pyramid of images.
*/
int nscales;
/**
* Number of warpings per scale.
* Represents the number of times that I1(x+u0) and grad( I1(x+u0) ) are computed per scale.
* This is a parameter that assures the stability of the method.
* It also affects the running time, so it is a compromise between speed and accuracy.
*/
int warps;
bool isDeviceArch11_; /**
* Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time.
* A small value will yield more accurate solutions at the expense of a slower convergence.
*/
double epsilon;
/**
* Stopping criterion iterations number used in the numerical scheme.
*/
int iterations;
bool useInitialFlow;
private:
void procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2);
std::vector<GpuMat> I0s;
std::vector<GpuMat> I1s;
std::vector<GpuMat> u1s;
std::vector<GpuMat> u2s;
GpuMat I1x_buf;
GpuMat I1y_buf;
GpuMat I1w_buf;
GpuMat I1wx_buf;
GpuMat I1wy_buf;
GpuMat grad_buf;
GpuMat rho_c_buf;
GpuMat p11_buf;
GpuMat p12_buf;
GpuMat p21_buf;
GpuMat p22_buf;
GpuMat diff_buf;
GpuMat norm_buf;
};
//! Calculates optical flow for 2 images using block matching algorithm
CV_EXPORTS void calcOpticalFlowBM(const GpuMat& prev, const GpuMat& curr,
Size block_size, Size shift_size, Size max_range, bool use_previous,
GpuMat& velx, GpuMat& vely, GpuMat& buf,
Stream& stream = Stream::Null());
class CV_EXPORTS FastOpticalFlowBM
{
public:
void operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy, int search_window = 21, int block_window = 7, Stream& s = Stream::Null());
private:
GpuMat buffer;
GpuMat extended_I0;
GpuMat extended_I1;
}; };

@ -0,0 +1,26 @@
# Toolchain settings for cross-compiling to Linux on ARM.
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_VERSION 1)
set(CMAKE_SYSTEM_PROCESSOR arm)
# Cross compilers: the arm-linux-gnueabi GCC 4.5 C and C++ front-ends.
set(CMAKE_C_COMPILER arm-linux-gnueabi-gcc-4.5)
set(CMAKE_CXX_COMPILER arm-linux-gnueabi-g++-4.5)
# Suppress a compiler warning (-Wno-psabi).
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-psabi" )
set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-psabi" )
# Root of the target environment; it can be any other place.
set(__arm_linux_eabi_root /usr/arm-linux-gnueabi)
set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${__arm_linux_eabi_root})
# Also search the CUDA toolkit directory when that path exists.
if(EXISTS ${CUDA_TOOLKIT_ROOT_DIR})
set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${CUDA_TOOLKIT_ROOT_DIR})
endif()
# Restrict header, library and program lookups to the directories in CMAKE_FIND_ROOT_PATH.
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
# Expose CARMA both as a CMake variable and as a preprocessor definition.
set(CARMA 1)
add_definitions(-DCARMA)

File diff suppressed because it is too large Load Diff

@ -581,13 +581,12 @@ PERF_TEST_P(Sz, ImgProc_CalcHist, GPU_TYPICAL_MAT_SIZES)
{ {
cv::gpu::GpuMat d_src(src); cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_hist; cv::gpu::GpuMat d_hist;
cv::gpu::GpuMat d_buf;
cv::gpu::calcHist(d_src, d_hist, d_buf); cv::gpu::calcHist(d_src, d_hist);
TEST_CYCLE() TEST_CYCLE()
{ {
cv::gpu::calcHist(d_src, d_hist, d_buf); cv::gpu::calcHist(d_src, d_hist);
} }
GPU_SANITY_CHECK(d_hist); GPU_SANITY_CHECK(d_hist);
@ -1706,10 +1705,40 @@ PERF_TEST_P(Sz_Depth_Cn, ImgProc_ImagePyramidGetLayer, Combine(GPU_TYPICAL_MAT_S
} }
} }
namespace {
    // "Less than" functors giving a lexicographic order over the vector
    // components. They are passed to std::sort before SANITY_CHECK in the Hough
    // tests of this file. Both operands are taken by const reference so that no
    // vector is copied per comparison.

    // Orders cv::Vec4i by component 0, then 1, then 2, then 3.
    struct Vec4iComparator
    {
        bool operator()(const cv::Vec4i& a, const cv::Vec4i& b) const
        {
            if (a[0] != b[0]) return a[0] < b[0];
            else if (a[1] != b[1]) return a[1] < b[1];
            else if (a[2] != b[2]) return a[2] < b[2];
            else return a[3] < b[3];
        }
    };

    // Orders cv::Vec3f by component 0, then 1, then 2.
    struct Vec3fComparator
    {
        bool operator()(const cv::Vec3f& a, const cv::Vec3f& b) const
        {
            if (a[0] != b[0]) return a[0] < b[0];
            else if (a[1] != b[1]) return a[1] < b[1];
            else return a[2] < b[2];
        }
    };

    // Orders cv::Vec2f by component 0, then 1.
    struct Vec2fComparator
    {
        bool operator()(const cv::Vec2f& a, const cv::Vec2f& b) const
        {
            if (a[0] != b[0]) return a[0] < b[0];
            else return a[1] < b[1];
        }
    };
}
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
// HoughLines // HoughLines
PERF_TEST_P(Sz, DISABLED_ImgProc_HoughLines, GPU_TYPICAL_MAT_SIZES) PERF_TEST_P(Sz, ImgProc_HoughLines, GPU_TYPICAL_MAT_SIZES)
{ {
declare.time(30.0); declare.time(30.0);
@ -1744,7 +1773,11 @@ PERF_TEST_P(Sz, DISABLED_ImgProc_HoughLines, GPU_TYPICAL_MAT_SIZES)
cv::gpu::HoughLines(d_src, d_lines, d_buf, rho, theta, threshold); cv::gpu::HoughLines(d_src, d_lines, d_buf, rho, theta, threshold);
} }
GPU_SANITY_CHECK(d_lines); cv::Mat h_lines(d_lines);
cv::Vec2f* begin = (cv::Vec2f*)(h_lines.ptr<char>(0));
cv::Vec2f* end = (cv::Vec2f*)(h_lines.ptr<char>(0) + (h_lines.cols) * 2 * sizeof(float));
std::sort(begin, end, Vec2fComparator());
SANITY_CHECK(h_lines);
} }
else else
{ {
@ -1756,7 +1789,64 @@ PERF_TEST_P(Sz, DISABLED_ImgProc_HoughLines, GPU_TYPICAL_MAT_SIZES)
cv::HoughLines(src, lines, rho, theta, threshold); cv::HoughLines(src, lines, rho, theta, threshold);
} }
CPU_SANITY_CHECK(lines); std::sort(lines.begin(), lines.end(), Vec2fComparator());
SANITY_CHECK(lines);
}
}
//////////////////////////////////////////////////////////////////////
// HoughLinesP
DEF_PARAM_TEST_1(Image, std::string);
// Perf test for probabilistic Hough line detection on a Canny edge mask,
// run either through cv::gpu::HoughLinesP or cv::HoughLinesP.
PERF_TEST_P(Image, ImgProc_HoughLinesP, testing::Values("cv/shared/pic5.png", "stitching/a1.png"))
{
declare.time(30.0);
std::string fileName = getDataPath(GetParam());
const float rho = 1.f;
const float theta = float(CV_PI) / 180.f;
// NOTE(review): threshold is only passed to the CPU call below; the GPU call does not take it.
const int threshold = 100;
const int minLineLenght = 50;
const int maxLineGap = 5;
// NOTE(review): image is not checked with ASSERT_FALSE(image.empty()) before Canny,
// unlike the readImage-based tests elsewhere in this commit -- consider adding it.
cv::Mat image = cv::imread(fileName, cv::IMREAD_GRAYSCALE);
cv::Mat mask;
cv::Canny(image, mask, 50, 100);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_mask(mask);
cv::gpu::GpuMat d_lines;
cv::gpu::HoughLinesBuf d_buf;
// One call before the measured TEST_CYCLE loop.
cv::gpu::HoughLinesP(d_mask, d_lines, d_buf, rho, theta, minLineLenght, maxLineGap);
TEST_CYCLE()
{
cv::gpu::HoughLinesP(d_mask, d_lines, d_buf, rho, theta, minLineLenght, maxLineGap);
}
// Copy the result into a cv::Mat and sort its first row as h_lines.cols Vec4i
// entries before the sanity check.
cv::Mat h_lines(d_lines);
cv::Vec4i* begin = h_lines.ptr<cv::Vec4i>();
cv::Vec4i* end = h_lines.ptr<cv::Vec4i>() + h_lines.cols;
std::sort(begin, end, Vec4iComparator());
SANITY_CHECK(h_lines);
}
else
{
std::vector<cv::Vec4i> lines;
// One call before the measured TEST_CYCLE loop.
cv::HoughLinesP(mask, lines, rho, theta, threshold, minLineLenght, maxLineGap);
TEST_CYCLE()
{
cv::HoughLinesP(mask, lines, rho, theta, threshold, minLineLenght, maxLineGap);
}
// Sort with the same comparator as the GPU path before the sanity check.
std::sort(lines.begin(), lines.end(), Vec4iComparator());
SANITY_CHECK(lines);
} }
} }
@ -1804,7 +1894,11 @@ PERF_TEST_P(Sz_Dp_MinDist, ImgProc_HoughCircles, Combine(GPU_TYPICAL_MAT_SIZES,
cv::gpu::HoughCircles(d_src, d_circles, d_buf, CV_HOUGH_GRADIENT, dp, minDist, cannyThreshold, votesThreshold, minRadius, maxRadius); cv::gpu::HoughCircles(d_src, d_circles, d_buf, CV_HOUGH_GRADIENT, dp, minDist, cannyThreshold, votesThreshold, minRadius, maxRadius);
} }
GPU_SANITY_CHECK(d_circles); cv::Mat h_circles(d_circles);
cv::Vec3f* begin = (cv::Vec3f*)(h_circles.ptr<char>(0));
cv::Vec3f* end = (cv::Vec3f*)(h_circles.ptr<char>(0) + (h_circles.cols) * 3 * sizeof(float));
std::sort(begin, end, Vec3fComparator());
SANITY_CHECK(h_circles);
} }
else else
{ {
@ -1817,7 +1911,8 @@ PERF_TEST_P(Sz_Dp_MinDist, ImgProc_HoughCircles, Combine(GPU_TYPICAL_MAT_SIZES,
cv::HoughCircles(src, circles, CV_HOUGH_GRADIENT, dp, minDist, cannyThreshold, votesThreshold, minRadius, maxRadius); cv::HoughCircles(src, circles, CV_HOUGH_GRADIENT, dp, minDist, cannyThreshold, votesThreshold, minRadius, maxRadius);
} }
CPU_SANITY_CHECK(circles); std::sort(circles.begin(), circles.end(), Vec3fComparator());
SANITY_CHECK(circles);
} }
} }

@ -89,7 +89,6 @@ PERF_TEST_P(HOG, CalTech, Values<string>("gpu/caltech/image_00000009_0.png", "gp
SANITY_CHECK(found_locations); SANITY_CHECK(found_locations);
} }
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// HaarClassifier // HaarClassifier

@ -0,0 +1,279 @@
#include "perf_precomp.hpp"
// GPU_PERF_TEST_P(fixture, name, params)
//   Declares class fixture_name derived from `fixture`, registers it through
//   TEST_P / INSTANTIATE_TEST_CASE_P with `params`, and defines PerfTestBody()
//   to call runGpuImpl() when PERF_RUN_GPU() is true and runCpuImpl() otherwise.
//   The two implementations are supplied with RUN_GPU / RUN_CPU (or NO_CPU).
//   The member names deliberately avoid double underscores, which are reserved
//   for the C++ implementation.
#define GPU_PERF_TEST_P(fixture, name, params) \
    class fixture##_##name : public fixture {\
    public:\
        fixture##_##name() {}\
    protected:\
        virtual void runCpuImpl();\
        virtual void runGpuImpl();\
        virtual void PerfTestBody();\
    };\
    TEST_P(fixture##_##name, name /*perf*/){ RunPerfTestBody(); }\
    INSTANTIATE_TEST_CASE_P(/*none*/, fixture##_##name, params);\
    void fixture##_##name::PerfTestBody() { if (PERF_RUN_GPU()) runGpuImpl(); else runCpuImpl(); }

// Opens the definition of the CPU implementation; must be followed by a body.
#define RUN_CPU(fixture, name)\
    void fixture##_##name::runCpuImpl()

// Opens the definition of the GPU implementation; must be followed by a body.
#define RUN_GPU(fixture, name)\
    void fixture##_##name::runGpuImpl()

// Defines a CPU implementation that fails the test.
#define NO_CPU(fixture, name)\
    void fixture##_##name::runCpuImpl() { FAIL() << "No such CPU implementation analogy";}
namespace {
    // Strict "less than" over detections: compares x, then y, then w, then h.
    struct DetectionLess
    {
        bool operator()(const cv::gpu::SCascade::Detection& a,
                        const cv::gpu::SCascade::Detection& b) const
        {
            return (a.x != b.x) ? (a.x < b.x)
                 : (a.y != b.y) ? (a.y < b.y)
                 : (a.w != b.w) ? (a.w < b.w)
                 :                (a.h < b.h);
        }
    };

    // Copies `objects` into a cv::Mat, sorts its first row in place as an array
    // of Detection records using DetectionLess, and returns the matrix.
    // detections.cols is used as a byte count, matching the CV_8UC1 buffers the
    // tests in this file allocate.
    cv::Mat sortDetections(cv::gpu::GpuMat& objects)
    {
        typedef cv::gpu::SCascade::Detection Detection;

        cv::Mat detections(objects);

        char* const rowBytes = detections.ptr<char>(0);
        Detection* const first = (Detection*)(rowBytes);
        Detection* const last = (Detection*)(rowBytes + detections.cols);

        std::sort(first, last, DetectionLess());
        return detections;
    }
}
typedef std::tr1::tuple<std::string, std::string> fixture_t;
typedef perf::TestBaseWithParam<fixture_t> SCascadeTest;
GPU_PERF_TEST_P(SCascadeTest, detect,
    testing::Combine(
        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
        testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png"))))

// GPU implementation: detection with every byte of the ROI mask set to 1.
// Parameter 0 is the cascade file, parameter 1 the input image.
RUN_GPU(SCascadeTest, detect)
{
    cv::Mat hostImage = readImage (GET_PARAM(1));
    ASSERT_FALSE(hostImage.empty());

    cv::gpu::GpuMat deviceImage(hostImage);

    cv::gpu::SCascade cascade;
    cv::FileStorage cascadeStorage(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
    ASSERT_TRUE(cascadeStorage.isOpened());
    ASSERT_TRUE(cascade.load(cascadeStorage.getFirstTopLevelNode()));

    // Output buffer sized for 10000 Detection records, stored as raw bytes.
    cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1);
    cv::gpu::GpuMat rois(deviceImage.size(), CV_8UC1);
    rois.setTo(1);

    // One call before the measured loop.
    cascade.detect(deviceImage, rois, objectBoxes);

    TEST_CYCLE()
    {
        cascade.detect(deviceImage, rois, objectBoxes);
    }

    // The argument text is kept verbatim: SANITY_CHECK receives it as written.
    SANITY_CHECK(sortDetections(objectBoxes));
}

NO_CPU(SCascadeTest, detect)
// Returns the idx-th entry of a fixed table of ten rectangles. Each table row
// holds (x, y, width, height) base values that are multiplied by 4.
// idx is not range-checked; callers in this file pass values in [0, 10).
static cv::Rect getFromTable(int idx)
{
    static const int base[][4] =
    {
        { 65, 20, 35, 80},
        { 95, 35, 45, 40},
        { 45, 35, 45, 40},
        { 25, 27, 50, 45},
        {100, 50, 45, 40},
        { 60, 30, 45, 40},
        { 40, 55, 50, 40},
        { 48, 37, 72, 80},
        { 48, 32, 85, 58},
        { 48,  0, 32, 27}
    };

    const int* const entry = base[idx];
    return cv::Rect(entry[0] * 4, entry[1] * 4, entry[2] * 4, entry[3] * 4);
}
typedef std::tr1::tuple<std::string, std::string, int> roi_fixture_t;
typedef perf::TestBaseWithParam<roi_fixture_t> SCascadeTestRoi;
GPU_PERF_TEST_P(SCascadeTestRoi, detectInRoi,
    testing::Combine(
        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
        testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")),
        testing::Range(0, 5)))

// GPU implementation: detection restricted to a mask built from GET_PARAM(2)
// rectangles, each picked from getFromTable with an index drawn from cv::RNG.
RUN_GPU(SCascadeTestRoi, detectInRoi)
{
    cv::Mat hostImage = readImage (GET_PARAM(1));
    ASSERT_FALSE(hostImage.empty());

    cv::gpu::GpuMat deviceImage(hostImage);

    cv::gpu::SCascade cascade;
    cv::FileStorage cascadeStorage(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
    ASSERT_TRUE(cascadeStorage.isOpened());
    ASSERT_TRUE(cascade.load(cascadeStorage.getFirstTopLevelNode()));

    cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1);
    cv::gpu::GpuMat rois(deviceImage.size(), CV_8UC1);

    // Start from an all-zero mask and set the selected rectangles to 1.
    rois.setTo(0);

    const int roiCount = GET_PARAM(2);
    cv::RNG rng;
    for (int k = 0; k < roiCount; ++k)
    {
        cv::Rect picked = getFromTable(rng(10));
        cv::gpu::GpuMat maskRegion(rois, picked);
        maskRegion.setTo(1);
    }

    // One call before the measured loop.
    cascade.detect(deviceImage, rois, objectBoxes);

    TEST_CYCLE()
    {
        cascade.detect(deviceImage, rois, objectBoxes);
    }

    // The argument text is kept verbatim: SANITY_CHECK receives it as written.
    SANITY_CHECK(sortDetections(objectBoxes));
}

NO_CPU(SCascadeTestRoi, detectInRoi)
GPU_PERF_TEST_P(SCascadeTestRoi, detectEachRoi,
    testing::Combine(
        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
        testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")),
        testing::Range(0, 10)))

// GPU implementation: detection restricted to the single table rectangle whose
// index is GET_PARAM(2).
RUN_GPU(SCascadeTestRoi, detectEachRoi)
{
    cv::Mat hostImage = readImage (GET_PARAM(1));
    ASSERT_FALSE(hostImage.empty());

    cv::gpu::GpuMat deviceImage(hostImage);

    cv::gpu::SCascade cascade;
    cv::FileStorage cascadeStorage(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
    ASSERT_TRUE(cascadeStorage.isOpened());
    ASSERT_TRUE(cascade.load(cascadeStorage.getFirstTopLevelNode()));

    cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1);
    cv::gpu::GpuMat rois(deviceImage.size(), CV_8UC1);

    // Zero the mask, then set only the chosen rectangle to 1.
    rois.setTo(0);

    const int tableIndex = GET_PARAM(2);
    cv::Rect chosen = getFromTable(tableIndex);
    cv::gpu::GpuMat maskRegion(rois, chosen);
    maskRegion.setTo(1);

    // One call before the measured loop.
    cascade.detect(deviceImage, rois, objectBoxes);

    TEST_CYCLE()
    {
        cascade.detect(deviceImage, rois, objectBoxes);
    }

    // The argument text is kept verbatim: SANITY_CHECK receives it as written.
    SANITY_CHECK(sortDetections(objectBoxes));
}

NO_CPU(SCascadeTestRoi, detectEachRoi)
GPU_PERF_TEST_P(SCascadeTest, detectOnIntegral,
testing::Combine(
testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
testing::Values(std::string("cv/cascadeandhog/integrals.xml"))))
// Returns the decimal text of `i` (formatted with "%ld").
static std::string itoa(long i)
{
    // A local buffer (instead of a function-static one) keeps the helper
    // reentrant. 65 bytes is more than enough for the sign, the digits of a
    // 64-bit long and the terminating NUL.
    char s[65];
    sprintf(s, "%ld", i);
    return std::string(s);
}
// GPU implementation: detection on matrices "channel0".."channel9" read from
// the file given by parameter 1 and stacked vertically into one CV_32SC1
// GpuMat of 10 slabs, each 121 rows by 161 columns.
RUN_GPU(SCascadeTest, detectOnIntegral)
{
    cv::FileStorage integralsStorage(perf::TestBase::getDataPath(GET_PARAM(1)), cv::FileStorage::READ);
    ASSERT_TRUE(integralsStorage.isOpened());

    const int channelCount = 10;
    const int slabRows = 121;
    const int slabCols = 161;

    cv::gpu::GpuMat hogluv(slabRows * channelCount, slabCols, CV_32SC1);
    for (int c = 0; c < channelCount; ++c)
    {
        cv::Mat hostChannel;
        integralsStorage[std::string("channel") + itoa(c)] >> hostChannel;

        // Upload into the c-th slab of hogluv.
        cv::gpu::GpuMat slab(hogluv, cv::Rect(0, slabRows * c, slabCols, slabRows));
        slab.upload(hostChannel);
    }

    cv::gpu::SCascade cascade;
    cv::FileStorage cascadeStorage(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
    ASSERT_TRUE(cascadeStorage.isOpened());
    ASSERT_TRUE(cascade.load(cascadeStorage.getFirstTopLevelNode()));

    cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1);
    cv::gpu::GpuMat rois(cv::Size(640, 480), CV_8UC1);
    rois.setTo(1);

    // One call before the measured loop.
    cascade.detect(hogluv, rois, objectBoxes);

    TEST_CYCLE()
    {
        cascade.detect(hogluv, rois, objectBoxes);
    }

    // The argument text is kept verbatim: SANITY_CHECK receives it as written.
    SANITY_CHECK(sortDetections(objectBoxes));
}

NO_CPU(SCascadeTest, detectOnIntegral)
GPU_PERF_TEST_P(SCascadeTest, detectStream,
    testing::Combine(
        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
        testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png"))))

// GPU implementation: same as the `detect` test, but every detect call is
// issued on an explicit cv::gpu::Stream. Parameter 0 is the cascade file,
// parameter 1 the input image.
RUN_GPU(SCascadeTest, detectStream)
{
    cv::Mat cpu = readImage (GET_PARAM(1));
    ASSERT_FALSE(cpu.empty());
    cv::gpu::GpuMat colored(cpu);

    cv::gpu::SCascade cascade;
    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
    ASSERT_TRUE(fs.isOpened());
    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));

    // Output buffer sized for 10000 Detection records; the mask enables every pixel.
    cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1);
    rois.setTo(1);

    cv::gpu::Stream s;

    // One call before the measured loop.
    cascade.detect(colored, rois, objectBoxes, s);

    TEST_CYCLE()
    {
        cascade.detect(colored, rois, objectBoxes, s);
    }

#ifdef HAVE_CUDA
    // detect was issued on stream s, so wait for the device before reading the
    // results. The return code is checked so that a failure reported by the
    // runtime fails the test instead of being silently dropped.
    ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize());
#endif

    SANITY_CHECK(sortDetections(objectBoxes));
}

NO_CPU(SCascadeTest, detectStream)

@ -394,6 +394,173 @@ PERF_TEST_P(ImagePair, Video_FarnebackOpticalFlow,
} }
} }
//////////////////////////////////////////////////////
// OpticalFlowDual_TVL1
// Perf test comparing cv::gpu::OpticalFlowDual_TVL1_GPU with
// cv::OpticalFlowDual_TVL1 on one pair of grayscale frames.
PERF_TEST_P(ImagePair, Video_OpticalFlowDual_TVL1,
            Values<pair_string>(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png")))
{
    declare.time(20);

    cv::Mat firstFrame = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(firstFrame.empty());

    cv::Mat secondFrame = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(secondFrame.empty());

    if (!PERF_RUN_GPU())
    {
        // CPU path: a single flow matrix is produced.
        cv::Mat flow;
        cv::OpticalFlowDual_TVL1 cpuAlg;

        // One call before the measured loop.
        cpuAlg(firstFrame, secondFrame, flow);

        TEST_CYCLE()
        {
            cpuAlg(firstFrame, secondFrame, flow);
        }

        CPU_SANITY_CHECK(flow);
    }
    else
    {
        // GPU path: the flow is produced as two separate matrices.
        cv::gpu::GpuMat d_first(firstFrame);
        cv::gpu::GpuMat d_second(secondFrame);
        cv::gpu::GpuMat d_flowx;
        cv::gpu::GpuMat d_flowy;
        cv::gpu::OpticalFlowDual_TVL1_GPU gpuAlg;

        // One call before the measured loop.
        gpuAlg(d_first, d_second, d_flowx, d_flowy);

        TEST_CYCLE()
        {
            gpuAlg(d_first, d_second, d_flowx, d_flowy);
        }

        GPU_SANITY_CHECK(d_flowx);
        GPU_SANITY_CHECK(d_flowy);
    }
}
//////////////////////////////////////////////////////
// OpticalFlowBM
// Thin cv::Mat wrapper around the C function cvCalcOpticalFlowBM.
// Allocates velx and vely as CV_32FC1 with
//   (curr size - block size + shift size) / shift size
// elements along each axis (integer division), then forwards all arguments.
void calcOpticalFlowBM(const cv::Mat& prev, const cv::Mat& curr,
                       cv::Size bSize, cv::Size shiftSize, cv::Size maxRange, int usePrevious,
                       cv::Mat& velx, cv::Mat& vely)
{
    const int outCols = (curr.cols - bSize.width + shiftSize.width) / shiftSize.width;
    const int outRows = (curr.rows - bSize.height + shiftSize.height) / shiftSize.height;
    const cv::Size outSize(outCols, outRows);

    velx.create(outSize, CV_32FC1);
    vely.create(outSize, CV_32FC1);

    // CvMat headers for the C API; the output headers are taken after create().
    CvMat prevHeader = prev;
    CvMat currHeader = curr;
    CvMat velxHeader = velx;
    CvMat velyHeader = vely;

    cvCalcOpticalFlowBM(&prevHeader, &currHeader, bSize, shiftSize, maxRange, usePrevious,
                        &velxHeader, &velyHeader);
}
// Perf test comparing cv::gpu::calcOpticalFlowBM with the CPU wrapper
// calcOpticalFlowBM on one pair of grayscale frames
// (16x16 blocks, 1x1 shift, 16x16 maximum range, use-previous disabled).
PERF_TEST_P(ImagePair, Video_OpticalFlowBM,
            Values<pair_string>(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png")))
{
    declare.time(400);

    cv::Mat firstFrame = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(firstFrame.empty());

    cv::Mat secondFrame = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(secondFrame.empty());

    cv::Size blockSize(16, 16);
    cv::Size shiftSize(1, 1);
    cv::Size maxRange(16, 16);

    if (!PERF_RUN_GPU())
    {
        cv::Mat velx, vely;

        // One call before the measured loop.
        calcOpticalFlowBM(firstFrame, secondFrame, blockSize, shiftSize, maxRange, false, velx, vely);

        TEST_CYCLE()
        {
            calcOpticalFlowBM(firstFrame, secondFrame, blockSize, shiftSize, maxRange, false, velx, vely);
        }

        CPU_SANITY_CHECK(velx);
        CPU_SANITY_CHECK(vely);
    }
    else
    {
        cv::gpu::GpuMat d_first(firstFrame);
        cv::gpu::GpuMat d_second(secondFrame);
        cv::gpu::GpuMat d_velx, d_vely, d_scratch;

        // One call before the measured loop.
        cv::gpu::calcOpticalFlowBM(d_first, d_second, blockSize, shiftSize, maxRange, false, d_velx, d_vely, d_scratch);

        TEST_CYCLE()
        {
            cv::gpu::calcOpticalFlowBM(d_first, d_second, blockSize, shiftSize, maxRange, false, d_velx, d_vely, d_scratch);
        }

        GPU_SANITY_CHECK(d_velx);
        GPU_SANITY_CHECK(d_vely);
    }
}
// Perf test comparing cv::gpu::FastOpticalFlowBM with the CPU wrapper
// calcOpticalFlowBM on one pair of grayscale frames. The GPU functor receives
// only the widths of the maximum range and of the block; the shift size is
// used by the CPU path alone.
PERF_TEST_P(ImagePair, Video_FastOpticalFlowBM,
            Values<pair_string>(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png")))
{
    declare.time(400);

    cv::Mat firstFrame = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(firstFrame.empty());

    cv::Mat secondFrame = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(secondFrame.empty());

    cv::Size blockSize(16, 16);
    cv::Size shiftSize(1, 1);
    cv::Size maxRange(16, 16);

    if (!PERF_RUN_GPU())
    {
        cv::Mat velx, vely;

        // One call before the measured loop.
        calcOpticalFlowBM(firstFrame, secondFrame, blockSize, shiftSize, maxRange, false, velx, vely);

        TEST_CYCLE()
        {
            calcOpticalFlowBM(firstFrame, secondFrame, blockSize, shiftSize, maxRange, false, velx, vely);
        }

        CPU_SANITY_CHECK(velx);
        CPU_SANITY_CHECK(vely);
    }
    else
    {
        cv::gpu::GpuMat d_first(firstFrame);
        cv::gpu::GpuMat d_second(secondFrame);
        cv::gpu::GpuMat d_velx, d_vely;
        cv::gpu::FastOpticalFlowBM matcher;

        // One call before the measured loop.
        matcher(d_first, d_second, d_velx, d_vely, maxRange.width, blockSize.width);

        TEST_CYCLE()
        {
            matcher(d_first, d_second, d_velx, d_vely, maxRange.width, blockSize.width);
        }

        GPU_SANITY_CHECK(d_velx);
        GPU_SANITY_CHECK(d_vely);
    }
}
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
// FGDStatModel // FGDStatModel

@ -68,11 +68,16 @@ void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool,
void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream) void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream)
{ {
#ifndef HAVE_CUBLAS #ifndef HAVE_CUBLAS
(void)src1; (void)src2; (void)alpha; (void)src3; (void)beta; (void)dst; (void)flags; (void)stream; (void)src1;
(void)src2;
(void)alpha;
(void)src3;
(void)beta;
(void)dst;
(void)flags;
(void)stream;
CV_Error(CV_StsNotImplemented, "The library was build without CUBLAS"); CV_Error(CV_StsNotImplemented, "The library was build without CUBLAS");
#else #else
// CUBLAS works with column-major matrices // CUBLAS works with column-major matrices
CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2); CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2);
@ -80,7 +85,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
if (src1.depth() == CV_64F) if (src1.depth() == CV_64F)
{ {
if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
} }
@ -188,7 +193,6 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
} }
cublasSafeCall( cublasDestroy_v2(handle) ); cublasSafeCall( cublasDestroy_v2(handle) );
#endif #endif
} }
@ -227,7 +231,7 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
} }
else // if (src.elemSize() == 8) else // if (src.elemSize() == 8)
{ {
if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
NppStStreamHandler h(stream); NppStStreamHandler h(stream);

@ -88,71 +88,71 @@ namespace cv { namespace gpu { namespace device
{ {
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream); cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream); cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream); cudaStream_t stream);
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream); cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream); cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream); cudaStream_t stream);
} }
namespace bf_knnmatch namespace bf_knnmatch
{ {
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
int cc, cudaStream_t stream); cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
int cc, cudaStream_t stream); cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
int cc, cudaStream_t stream); cudaStream_t stream);
template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
int cc, cudaStream_t stream); cudaStream_t stream);
template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
int cc, cudaStream_t stream); cudaStream_t stream);
template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
int cc, cudaStream_t stream); cudaStream_t stream);
} }
namespace bf_radius_match namespace bf_radius_match
{ {
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream); cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream); cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream); cudaStream_t stream);
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream); cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream); cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream); cudaStream_t stream);
} }
}}} }}}
@ -202,7 +202,7 @@ void cv::gpu::BFMatcher_GPU::matchSingle(const GpuMat& query, const GpuMat& trai
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream); cudaStream_t stream);
static const caller_t callersL1[] = static const caller_t callersL1[] =
{ {
@ -238,10 +238,7 @@ void cv::gpu::BFMatcher_GPU::matchSingle(const GpuMat& query, const GpuMat& trai
caller_t func = callers[query.depth()]; caller_t func = callers[query.depth()];
CV_Assert(func != 0); CV_Assert(func != 0);
DeviceInfo info; func(query, train, mask, trainIdx, distance, StreamAccessor::getStream(stream));
int cc = info.majorVersion() * 10 + info.minorVersion();
func(query, train, mask, trainIdx, distance, cc, StreamAccessor::getStream(stream));
} }
void cv::gpu::BFMatcher_GPU::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, vector<DMatch>& matches) void cv::gpu::BFMatcher_GPU::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, vector<DMatch>& matches)
@ -348,7 +345,7 @@ void cv::gpu::BFMatcher_GPU::matchCollection(const GpuMat& query, const GpuMat&
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream); cudaStream_t stream);
static const caller_t callersL1[] = static const caller_t callersL1[] =
{ {
@ -383,10 +380,7 @@ void cv::gpu::BFMatcher_GPU::matchCollection(const GpuMat& query, const GpuMat&
caller_t func = callers[query.depth()]; caller_t func = callers[query.depth()];
CV_Assert(func != 0); CV_Assert(func != 0);
DeviceInfo info; func(query, trainCollection, masks, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream));
int cc = info.majorVersion() * 10 + info.minorVersion();
func(query, trainCollection, masks, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream));
} }
void cv::gpu::BFMatcher_GPU::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, vector<DMatch>& matches) void cv::gpu::BFMatcher_GPU::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, vector<DMatch>& matches)
@ -462,7 +456,7 @@ void cv::gpu::BFMatcher_GPU::knnMatchSingle(const GpuMat& query, const GpuMat& t
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
int cc, cudaStream_t stream); cudaStream_t stream);
static const caller_t callersL1[] = static const caller_t callersL1[] =
{ {
@ -512,10 +506,7 @@ void cv::gpu::BFMatcher_GPU::knnMatchSingle(const GpuMat& query, const GpuMat& t
caller_t func = callers[query.depth()]; caller_t func = callers[query.depth()];
CV_Assert(func != 0); CV_Assert(func != 0);
DeviceInfo info; func(query, train, k, mask, trainIdx, distance, allDist, StreamAccessor::getStream(stream));
int cc = info.majorVersion() * 10 + info.minorVersion();
func(query, train, k, mask, trainIdx, distance, allDist, cc, StreamAccessor::getStream(stream));
} }
void cv::gpu::BFMatcher_GPU::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, void cv::gpu::BFMatcher_GPU::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,
@ -594,7 +585,7 @@ void cv::gpu::BFMatcher_GPU::knnMatch2Collection(const GpuMat& query, const GpuM
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
int cc, cudaStream_t stream); cudaStream_t stream);
static const caller_t callersL1[] = static const caller_t callersL1[] =
{ {
@ -634,10 +625,7 @@ void cv::gpu::BFMatcher_GPU::knnMatch2Collection(const GpuMat& query, const GpuM
caller_t func = callers[query.depth()]; caller_t func = callers[query.depth()];
CV_Assert(func != 0); CV_Assert(func != 0);
DeviceInfo info; func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream));
int cc = info.majorVersion() * 10 + info.minorVersion();
func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream));
} }
void cv::gpu::BFMatcher_GPU::knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, void cv::gpu::BFMatcher_GPU::knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,
@ -778,7 +766,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchSingle(const GpuMat& query, const GpuMat
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream); cudaStream_t stream);
static const caller_t callersL1[] = static const caller_t callersL1[] =
{ {
@ -799,12 +787,6 @@ void cv::gpu::BFMatcher_GPU::radiusMatchSingle(const GpuMat& query, const GpuMat
matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/ matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
}; };
DeviceInfo info;
int cc = info.majorVersion() * 10 + info.minorVersion();
if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");
const int nQuery = query.rows; const int nQuery = query.rows;
const int nTrain = train.rows; const int nTrain = train.rows;
@ -830,7 +812,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchSingle(const GpuMat& query, const GpuMat
caller_t func = callers[query.depth()]; caller_t func = callers[query.depth()];
CV_Assert(func != 0); CV_Assert(func != 0);
func(query, train, maxDistance, mask, trainIdx, distance, nMatches, cc, StreamAccessor::getStream(stream)); func(query, train, maxDistance, mask, trainIdx, distance, nMatches, StreamAccessor::getStream(stream));
} }
void cv::gpu::BFMatcher_GPU::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches, void cv::gpu::BFMatcher_GPU::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches,
@ -913,7 +895,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchCollection(const GpuMat& query, GpuMat&
typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream); cudaStream_t stream);
static const caller_t callersL1[] = static const caller_t callersL1[] =
{ {
@ -934,12 +916,6 @@ void cv::gpu::BFMatcher_GPU::radiusMatchCollection(const GpuMat& query, GpuMat&
matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/ matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
}; };
DeviceInfo info;
int cc = info.majorVersion() * 10 + info.minorVersion();
if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");
const int nQuery = query.rows; const int nQuery = query.rows;
CV_Assert(query.channels() == 1 && query.depth() < CV_64F); CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
@ -968,7 +944,7 @@ void cv::gpu::BFMatcher_GPU::radiusMatchCollection(const GpuMat& query, GpuMat&
vector<PtrStepSzb> masks_(masks.begin(), masks.end()); vector<PtrStepSzb> masks_(masks.begin(), masks.end());
func(query, &trains_[0], static_cast<int>(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0], func(query, &trains_[0], static_cast<int>(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0],
trainIdx, imgIdx, distance, nMatches, cc, StreamAccessor::getStream(stream)); trainIdx, imgIdx, distance, nMatches, StreamAccessor::getStream(stream));
} }
void cv::gpu::BFMatcher_GPU::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches, void cv::gpu::BFMatcher_GPU::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches,

@ -623,7 +623,7 @@ private:
} }
// copy data structures on gpu // copy data structures on gpu
stage_mat.upload(cv::Mat(1, stages.size() * sizeof(Stage), CV_8UC1, (uchar*)&(stages[0]) )); stage_mat.upload(cv::Mat(1, (int) (stages.size() * sizeof(Stage)), CV_8UC1, (uchar*)&(stages[0]) ));
trees_mat.upload(cv::Mat(cl_trees).reshape(1,1)); trees_mat.upload(cv::Mat(cl_trees).reshape(1,1));
nodes_mat.upload(cv::Mat(cl_nodes).reshape(1,1)); nodes_mat.upload(cv::Mat(cl_nodes).reshape(1,1));
leaves_mat.upload(cv::Mat(cl_leaves).reshape(1,1)); leaves_mat.upload(cv::Mat(cl_leaves).reshape(1,1));

@ -42,10 +42,13 @@
#if !defined CUDA_DISABLER #if !defined CUDA_DISABLER
#include "internal_shared.hpp" #include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp" #include "opencv2/gpu/device/datamov_utils.hpp"
#include "opencv2/gpu/device/warp_shuffle.hpp"
namespace cv { namespace gpu { namespace device namespace cv { namespace gpu { namespace device
{ {
@ -59,6 +62,45 @@ namespace cv { namespace gpu { namespace device
int& bestTrainIdx1, int& bestTrainIdx2, int& bestTrainIdx1, int& bestTrainIdx2,
float* s_distance, int* s_trainIdx) float* s_distance, int* s_trainIdx)
{ {
#if __CUDA_ARCH__ >= 300
(void) s_distance;
(void) s_trainIdx;
float d1, d2;
int i1, i2;
#pragma unroll
for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
{
d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
if (bestDistance1 < d1)
{
if (d1 < bestDistance2)
{
bestDistance2 = d1;
bestTrainIdx2 = i1;
}
}
else
{
bestDistance2 = bestDistance1;
bestTrainIdx2 = bestTrainIdx1;
bestDistance1 = d1;
bestTrainIdx1 = i1;
if (d2 < bestDistance2)
{
bestDistance2 = d2;
bestTrainIdx2 = i2;
}
}
}
#else
float myBestDistance1 = numeric_limits<float>::max(); float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max(); float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1; int myBestTrainIdx1 = -1;
@ -122,6 +164,7 @@ namespace cv { namespace gpu { namespace device
bestTrainIdx1 = myBestTrainIdx1; bestTrainIdx1 = myBestTrainIdx1;
bestTrainIdx2 = myBestTrainIdx2; bestTrainIdx2 = myBestTrainIdx2;
#endif
} }
template <int BLOCK_SIZE> template <int BLOCK_SIZE>
@ -130,6 +173,53 @@ namespace cv { namespace gpu { namespace device
int& bestImgIdx1, int& bestImgIdx2, int& bestImgIdx1, int& bestImgIdx2,
float* s_distance, int* s_trainIdx, int* s_imgIdx) float* s_distance, int* s_trainIdx, int* s_imgIdx)
{ {
#if __CUDA_ARCH__ >= 300
(void) s_distance;
(void) s_trainIdx;
(void) s_imgIdx;
float d1, d2;
int i1, i2;
int j1, j2;
#pragma unroll
for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
{
d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
j1 = shfl_down(bestImgIdx1, i, BLOCK_SIZE);
j2 = shfl_down(bestImgIdx2, i, BLOCK_SIZE);
if (bestDistance1 < d1)
{
if (d1 < bestDistance2)
{
bestDistance2 = d1;
bestTrainIdx2 = i1;
bestImgIdx2 = j1;
}
}
else
{
bestDistance2 = bestDistance1;
bestTrainIdx2 = bestTrainIdx1;
bestImgIdx2 = bestImgIdx1;
bestDistance1 = d1;
bestTrainIdx1 = i1;
bestImgIdx1 = j1;
if (d2 < bestDistance2)
{
bestDistance2 = d2;
bestTrainIdx2 = i2;
bestImgIdx2 = j2;
}
}
}
#else
float myBestDistance1 = numeric_limits<float>::max(); float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max(); float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1; int myBestTrainIdx1 = -1;
@ -205,6 +295,7 @@ namespace cv { namespace gpu { namespace device
bestImgIdx1 = myBestImgIdx1; bestImgIdx1 = myBestImgIdx1;
bestImgIdx2 = myBestImgIdx2; bestImgIdx2 = myBestImgIdx2;
#endif
} }
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
@ -748,9 +839,8 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T, typename Mask> template <typename Dist, typename T, typename Mask>
void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzb& trainIdx, const PtrStepSzb& distance,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
(void)cc;
if (query.cols <= 64) if (query.cols <= 64)
{ {
matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream); matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
@ -780,9 +870,8 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T, typename Mask> template <typename Dist, typename T, typename Mask>
void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask, void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
(void)cc;
if (query.cols <= 64) if (query.cols <= 64)
{ {
matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream); matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
@ -945,9 +1034,8 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T, typename Mask> template <typename Dist, typename T, typename Mask>
void calcDistanceDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, void calcDistanceDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
const PtrStepSzf& allDist, const PtrStepSzf& allDist,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
(void)cc;
if (query.cols <= 64) if (query.cols <= 64)
{ {
calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream); calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);
@ -1005,7 +1093,7 @@ namespace cv { namespace gpu { namespace device
s_trainIdx[threadIdx.x] = bestIdx; s_trainIdx[threadIdx.x] = bestIdx;
__syncthreads(); __syncthreads();
reducePredVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<volatile float>()); reduceKeyVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<float>());
if (threadIdx.x == 0) if (threadIdx.x == 0)
{ {
@ -1034,7 +1122,7 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream) void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream)
{ {
findKnnMatch<256>(k, static_cast<PtrStepSzi>(trainIdx), static_cast<PtrStepSzf>(distance), allDist, stream); findKnnMatch<256>(k, static_cast<PtrStepSzi>(trainIdx), static_cast<PtrStepSzf>(distance), allDist, stream);
} }
@ -1045,16 +1133,16 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T, typename Mask> template <typename Dist, typename T, typename Mask>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, int k, const Mask& mask, void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, int k, const Mask& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
if (k == 2) if (k == 2)
{ {
match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, cc, stream); match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, stream);
} }
else else
{ {
calcDistanceDispatcher<Dist>(query, train, mask, allDist, cc, stream); calcDistanceDispatcher<Dist>(query, train, mask, allDist, stream);
findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream); findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream);
} }
} }
@ -1063,103 +1151,103 @@ namespace cv { namespace gpu { namespace device
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
if (mask.data) if (mask.data)
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
else else
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
} }
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
if (mask.data) if (mask.data)
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
else else
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
} }
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); //template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
if (mask.data) if (mask.data)
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
else else
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
} }
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
if (masks.data) if (masks.data)
match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
else else
match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
} }
template void match2L1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); template void match2L1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2L1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); //template void match2L1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2L1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); template void match2L1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2L1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); template void match2L1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2L1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); template void match2L1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2L1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); template void match2L1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
if (masks.data) if (masks.data)
match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
else else
match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
} }
//template void match2L2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); //template void match2L2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2L2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); //template void match2L2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2L2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); //template void match2L2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2L2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); //template void match2L2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2L2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); //template void match2L2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2L2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); template void match2L2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
if (masks.data) if (masks.data)
match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
else else
match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
} }
template void match2Hamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); template void match2Hamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2Hamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); //template void match2Hamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2Hamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); template void match2Hamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); //template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2Hamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); template void match2Hamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
} // namespace bf_knnmatch } // namespace bf_knnmatch
}}} // namespace cv { namespace gpu { namespace device { }}} // namespace cv { namespace gpu { namespace device {

@ -42,7 +42,9 @@
#if !defined CUDA_DISABLER #if !defined CUDA_DISABLER
#include "internal_shared.hpp" #include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp" #include "opencv2/gpu/device/datamov_utils.hpp"
@ -60,12 +62,7 @@ namespace cv { namespace gpu { namespace device
s_distance += threadIdx.y * BLOCK_SIZE; s_distance += threadIdx.y * BLOCK_SIZE;
s_trainIdx += threadIdx.y * BLOCK_SIZE; s_trainIdx += threadIdx.y * BLOCK_SIZE;
s_distance[threadIdx.x] = bestDistance; reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<float>());
s_trainIdx[threadIdx.x] = bestTrainIdx;
__syncthreads();
reducePredVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<volatile float>());
} }
template <int BLOCK_SIZE> template <int BLOCK_SIZE>
@ -75,13 +72,7 @@ namespace cv { namespace gpu { namespace device
s_trainIdx += threadIdx.y * BLOCK_SIZE; s_trainIdx += threadIdx.y * BLOCK_SIZE;
s_imgIdx += threadIdx.y * BLOCK_SIZE; s_imgIdx += threadIdx.y * BLOCK_SIZE;
s_distance[threadIdx.x] = bestDistance; reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, smem_tuple(s_trainIdx, s_imgIdx), thrust::tie(bestTrainIdx, bestImgIdx), threadIdx.x, less<float>());
s_trainIdx[threadIdx.x] = bestTrainIdx;
s_imgIdx [threadIdx.x] = bestImgIdx;
__syncthreads();
reducePredVal2<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less<volatile float>());
} }
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
@ -567,9 +558,8 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T, typename Mask> template <typename Dist, typename T, typename Mask>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
(void)cc;
if (query.cols <= 64) if (query.cols <= 64)
{ {
matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream); matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream);
@ -599,9 +589,8 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T, typename Mask> template <typename Dist, typename T, typename Mask>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask, void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
(void)cc;
if (query.cols <= 64) if (query.cols <= 64)
{ {
matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
@ -633,151 +622,151 @@ namespace cv { namespace gpu { namespace device
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
if (mask.data) if (mask.data)
{ {
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask), matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
trainIdx, distance, trainIdx, distance,
cc, stream); stream);
} }
else else
{ {
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(), matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
trainIdx, distance, trainIdx, distance,
cc, stream); stream);
} }
} }
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
if (mask.data) if (mask.data)
{ {
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask), matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
trainIdx, distance, trainIdx, distance,
cc, stream); stream);
} }
else else
{ {
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(), matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
trainIdx, distance, trainIdx, distance,
cc, stream); stream);
} }
} }
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
if (mask.data) if (mask.data)
{ {
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask), matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
trainIdx, distance, trainIdx, distance,
cc, stream); stream);
} }
else else
{ {
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(), matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
trainIdx, distance, trainIdx, distance,
cc, stream); stream);
} }
} }
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
if (masks.data) if (masks.data)
{ {
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
trainIdx, imgIdx, distance, trainIdx, imgIdx, distance,
cc, stream); stream);
} }
else else
{ {
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
trainIdx, imgIdx, distance, trainIdx, imgIdx, distance,
cc, stream); stream);
} }
} }
template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
if (masks.data) if (masks.data)
{ {
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
trainIdx, imgIdx, distance, trainIdx, imgIdx, distance,
cc, stream); stream);
} }
else else
{ {
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
trainIdx, imgIdx, distance, trainIdx, imgIdx, distance,
cc, stream); stream);
} }
} }
//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
if (masks.data) if (masks.data)
{ {
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
trainIdx, imgIdx, distance, trainIdx, imgIdx, distance,
cc, stream); stream);
} }
else else
{ {
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
trainIdx, imgIdx, distance, trainIdx, imgIdx, distance,
cc, stream); stream);
} }
} }
template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); //template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
} // namespace bf_match } // namespace bf_match
}}} // namespace cv { namespace gpu { namespace device { }}} // namespace cv { namespace gpu { namespace device {

@ -42,7 +42,8 @@
#if !defined CUDA_DISABLER #if !defined CUDA_DISABLER
#include "internal_shared.hpp" #include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp" #include "opencv2/gpu/device/datamov_utils.hpp"
@ -58,8 +59,6 @@ namespace cv { namespace gpu { namespace device
__global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask, __global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{ {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
extern __shared__ int smem[]; extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
@ -110,8 +109,6 @@ namespace cv { namespace gpu { namespace device
bestDistance.ptr(queryIdx)[ind] = distVal; bestDistance.ptr(queryIdx)[ind] = distVal;
} }
} }
#endif
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
@ -170,8 +167,6 @@ namespace cv { namespace gpu { namespace device
__global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask, __global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{ {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
extern __shared__ int smem[]; extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
@ -221,8 +216,6 @@ namespace cv { namespace gpu { namespace device
bestDistance.ptr(queryIdx)[ind] = distVal; bestDistance.ptr(queryIdx)[ind] = distVal;
} }
} }
#endif
} }
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
@ -281,9 +274,8 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T, typename Mask> template <typename Dist, typename T, typename Mask>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask, void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
(void)cc;
if (query.cols <= 64) if (query.cols <= 64)
{ {
matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
@ -313,9 +305,8 @@ namespace cv { namespace gpu { namespace device
template <typename Dist, typename T> template <typename Dist, typename T>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks, void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
(void)cc;
if (query.cols <= 64) if (query.cols <= 64)
{ {
matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
@ -347,124 +338,124 @@ namespace cv { namespace gpu { namespace device
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
if (mask.data) if (mask.data)
{ {
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask), matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches, trainIdx, distance, nMatches,
cc, stream); stream);
} }
else else
{ {
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(), matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches, trainIdx, distance, nMatches,
cc, stream); stream);
} }
} }
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
if (mask.data) if (mask.data)
{ {
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask), matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches, trainIdx, distance, nMatches,
cc, stream); stream);
} }
else else
{ {
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(), matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches, trainIdx, distance, nMatches,
cc, stream); stream);
} }
} }
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); //template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
if (mask.data) if (mask.data)
{ {
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask), matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches, trainIdx, distance, nMatches,
cc, stream); stream);
} }
else else
{ {
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(), matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches, trainIdx, distance, nMatches,
cc, stream); stream);
} }
} }
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks, matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches, trainIdx, imgIdx, distance, nMatches,
cc, stream); stream);
} }
template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); //template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks, matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches, trainIdx, imgIdx, distance, nMatches,
cc, stream); stream);
} }
//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); //template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); //template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); //template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); //template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); //template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream) cudaStream_t stream)
{ {
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks, matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches, trainIdx, imgIdx, distance, nMatches,
cc, stream); stream);
} }
template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); //template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); //template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream); template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
} // namespace bf_radius_match } // namespace bf_radius_match
}}} // namespace cv { namespace gpu { namespace device }}} // namespace cv { namespace gpu { namespace device

@ -42,9 +42,10 @@
#if !defined CUDA_DISABLER #if !defined CUDA_DISABLER
#include "internal_shared.hpp" #include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp" #include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/reduce.hpp"
namespace cv { namespace gpu { namespace device namespace cv { namespace gpu { namespace device
{ {
@ -66,6 +67,8 @@ namespace cv { namespace gpu { namespace device
crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y, crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z); crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
} }
__device__ __forceinline__ TransformOp() {}
__device__ __forceinline__ TransformOp(const TransformOp&) {}
}; };
void call(const PtrStepSz<float3> src, const float* rot, void call(const PtrStepSz<float3> src, const float* rot,
@ -103,6 +106,8 @@ namespace cv { namespace gpu { namespace device
(cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z, (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
(cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z); (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
} }
__device__ __forceinline__ ProjectOp() {}
__device__ __forceinline__ ProjectOp(const ProjectOp&) {}
}; };
void call(const PtrStepSz<float3> src, const float* rot, void call(const PtrStepSz<float3> src, const float* rot,
@ -134,6 +139,7 @@ namespace cv { namespace gpu { namespace device
return x * x; return x * x;
} }
template <int BLOCK_SIZE>
__global__ void computeHypothesisScoresKernel( __global__ void computeHypothesisScoresKernel(
const int num_points, const float3* object, const float2* image, const int num_points, const float3* object, const float2* image,
const float dist_threshold, int* g_num_inliers) const float dist_threshold, int* g_num_inliers)
@ -156,19 +162,11 @@ namespace cv { namespace gpu { namespace device
++num_inliers; ++num_inliers;
} }
extern __shared__ float s_num_inliers[]; __shared__ int s_num_inliers[BLOCK_SIZE];
s_num_inliers[threadIdx.x] = num_inliers; reduce<BLOCK_SIZE>(s_num_inliers, num_inliers, threadIdx.x, plus<int>());
__syncthreads();
for (int step = blockDim.x / 2; step > 0; step >>= 1)
{
if (threadIdx.x < step)
s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step];
__syncthreads();
}
if (threadIdx.x == 0) if (threadIdx.x == 0)
g_num_inliers[blockIdx.x] = s_num_inliers[0]; g_num_inliers[blockIdx.x] = num_inliers;
} }
void computeHypothesisScores( void computeHypothesisScores(
@ -181,9 +179,8 @@ namespace cv { namespace gpu { namespace device
dim3 threads(256); dim3 threads(256);
dim3 grid(num_hypotheses); dim3 grid(num_hypotheses);
int smem_size = threads.x * sizeof(float);
computeHypothesisScoresKernel<<<grid, threads, smem_size>>>( computeHypothesisScoresKernel<256><<<grid, threads>>>(
num_points, object, image, dist_threshold, hypothesis_scores); num_points, object, image, dist_threshold, hypothesis_scores);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );

@ -43,172 +43,148 @@
#if !defined CUDA_DISABLER #if !defined CUDA_DISABLER
#include <utility> #include <utility>
#include <algorithm> #include "opencv2/gpu/device/common.hpp"
#include "internal_shared.hpp" #include "opencv2/gpu/device/emulation.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/utility.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace device
{
namespace canny namespace canny
{ {
__global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols) struct L1 : binary_function<int, int, float>
{
__shared__ int smem[16][18];
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
if (i < rows)
{ {
smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j]; __device__ __forceinline__ float operator ()(int x, int y) const
if (threadIdx.x == 0)
{ {
smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)]; return ::abs(x) + ::abs(y);
smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)];
} }
__syncthreads();
if (j < cols) __device__ __forceinline__ L1() {}
__device__ __forceinline__ L1(const L1&) {}
};
struct L2 : binary_function<int, int, float>
{ {
dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2]; __device__ __forceinline__ float operator ()(int x, int y) const
dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
}
}
}
void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
{ {
dim3 block(16, 16, 1); return ::sqrtf(x * x + y * y);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); }
calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() ); __device__ __forceinline__ L2() {}
__device__ __forceinline__ L2(const L2&) {}
};
} }
struct L1 namespace cv { namespace gpu { namespace device
{ {
static __device__ __forceinline__ float calc(int x, int y) template <> struct TransformFunctorTraits<canny::L1> : DefaultTransformFunctorTraits<canny::L1>
{ {
return ::abs(x) + ::abs(y); enum { smart_shift = 4 };
}
}; };
struct L2 template <> struct TransformFunctorTraits<canny::L2> : DefaultTransformFunctorTraits<canny::L2>
{
static __device__ __forceinline__ float calc(int x, int y)
{ {
return ::sqrtf(x * x + y * y); enum { smart_shift = 4 };
}
}; };
}}}
template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf, namespace canny
PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
{
__shared__ int sdx[18][16];
__shared__ int sdy[18][16];
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
if (j < cols)
{ {
sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j]; texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j]; struct SrcTex
if (threadIdx.y == 0)
{ {
sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j]; const int xoff;
sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j]; const int yoff;
__host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j]; __device__ __forceinline__ int operator ()(int y, int x) const
sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j]; {
return tex2D(tex_src, x + xoff, y + yoff);
} }
__syncthreads(); };
if (i < rows) template <class Norm> __global__
void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
{ {
int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x]; const int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x]; const int y = blockIdx.y * blockDim.y + threadIdx.y;
dx.ptr(i)[j] = x; if (y >= mag.rows || x >= mag.cols)
dy.ptr(i)[j] = y; return;
mag.ptr(i + 1)[j + 1] = Norm::calc(x, y); int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
} int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
}
dx(y, x) = dxVal;
dy(y, x) = dyVal;
mag(y, x) = norm(dxVal, dyVal);
} }
void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad) void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
{ {
dim3 block(16, 16, 1); const dim3 block(16, 16);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
bindTexture(&tex_src, srcWhole);
SrcTex src(xoff, yoff);
if (L2Grad) if (L2Grad)
calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols); {
L2 norm;
calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
}
else else
calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols); {
L1 norm;
calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
}
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize()); cudaSafeCall(cudaThreadSynchronize());
} }
template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols) void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
{
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
if (i < rows && j < cols)
mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);
}
void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
{ {
dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
if (L2Grad) if (L2Grad)
calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols); {
L2 norm;
transform(dx, dy, mag, norm, WithOutMask(), 0);
}
else else
calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols); {
L1 norm;
cudaSafeCall( cudaGetLastError() ); transform(dx, dy, mag, norm, WithOutMask(), 0);
}
cudaSafeCall( cudaDeviceSynchronize() ); }
} }
////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////
#define CANNY_SHIFT 15 namespace canny
#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5) {
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);
__global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh) __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
{ {
__shared__ float smem[18][18]; const int CANNY_SHIFT = 15;
const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
const int j = blockIdx.x * 16 + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * 16 + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int tid = threadIdx.y * 16 + threadIdx.x; if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
const int lx = tid % 18; return;
const int ly = tid / 18;
if (ly < 14) int dxVal = dx(y, x);
smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx]; int dyVal = dy(y, x);
if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols) const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx]; const float m = tex2D(tex_mag, x, y);
__syncthreads(); dxVal = ::abs(dxVal);
dyVal = ::abs(dyVal);
if (i < rows && j < cols)
{
int x = dx.ptr(i)[j];
int y = dy.ptr(i)[j];
const int s = (x ^ y) < 0 ? -1 : 1;
const float m = smem[threadIdx.y + 1][threadIdx.x + 1];
x = ::abs(x);
y = ::abs(y);
// 0 - the pixel can not belong to an edge // 0 - the pixel can not belong to an edge
// 1 - the pixel might belong to an edge // 1 - the pixel might belong to an edge
@ -217,73 +193,81 @@ namespace cv { namespace gpu { namespace device
if (m > low_thresh) if (m > low_thresh)
{ {
const int tg22x = x * TG22; const int tg22x = dxVal * TG22;
const int tg67x = tg22x + ((x + x) << CANNY_SHIFT); const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);
y <<= CANNY_SHIFT; dyVal <<= CANNY_SHIFT;
if (y < tg22x) if (dyVal < tg22x)
{ {
if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2]) if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
edge_type = 1 + (int)(m > high_thresh); edge_type = 1 + (int)(m > high_thresh);
} }
else if( y > tg67x ) else if(dyVal > tg67x)
{ {
if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1]) if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
edge_type = 1 + (int)(m > high_thresh); edge_type = 1 + (int)(m > high_thresh);
} }
else else
{ {
if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s]) if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
edge_type = 1 + (int)(m > high_thresh); edge_type = 1 + (int)(m > high_thresh);
} }
} }
map.ptr(i + 1)[j + 1] = edge_type; map(y, x) = edge_type;
} }
}
#undef CANNY_SHIFT
#undef TG22
void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh) void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh)
{ {
dim3 block(16, 16, 1); const dim3 block(16, 16);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));
calcMap<<<grid, block>>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh); bindTexture(&tex_mag, mag);
calcMapKernel<<<grid, block>>>(dx, dy, map, low_thresh, high_thresh);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
}
////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////
__device__ unsigned int counter = 0; namespace canny
__global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
{ {
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120) __device__ int counter = 0;
__shared__ int smem[18][18]; __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, ushort2* st)
{
const int j = blockIdx.x * 16 + threadIdx.x; __shared__ volatile int smem[18][18];
const int i = blockIdx.y * 16 + threadIdx.y;
const int tid = threadIdx.y * 16 + threadIdx.x;
const int lx = tid % 18;
const int ly = tid / 18;
if (ly < 14) const int x = blockIdx.x * blockDim.x + threadIdx.x;
smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx]; const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols) smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0;
smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx]; if (threadIdx.y == 0)
smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0;
if (threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0;
if (threadIdx.x == 0)
smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1)
smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0;
if (threadIdx.x == 0 && threadIdx.y == 0)
smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0;
if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? map(y + 1, x + 1) : 0;
__syncthreads(); __syncthreads();
if (i < rows && j < cols) if (x >= map.cols || y >= map.rows)
{ return;
int n; int n;
#pragma unroll #pragma unroll
@ -311,7 +295,7 @@ namespace cv { namespace gpu { namespace device
const int e = smem[threadIdx.y + 1][threadIdx.x + 1]; const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
map.ptr(i + 1)[j + 1] = e; map(y, x) = e;
n = 0; n = 0;
@ -331,69 +315,70 @@ namespace cv { namespace gpu { namespace device
if (n > 0) if (n > 0)
{ {
const unsigned int ind = atomicInc(&counter, (unsigned int)(-1)); const int ind = ::atomicAdd(&counter, 1);
st[ind] = make_ushort2(j + 1, i + 1); st[ind] = make_ushort2(x, y);
} }
} }
#endif void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1)
}
void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)
{ {
void* counter_ptr; void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
dim3 block(16, 16, 1); const dim3 block(16, 16);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));
edgesHysteresisLocal<<<grid, block>>>(map, st1, rows, cols); edgesHysteresisLocalKernel<<<grid, block>>>(map, st1);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
}
//////////////////////////////////////////////////////////////////////////////////////////
namespace canny
{
__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; __constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
__global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count) __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count)
{ {
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 120
const int stack_size = 512; const int stack_size = 512;
__shared__ unsigned int s_counter; __shared__ int s_counter;
__shared__ unsigned int s_ind; __shared__ int s_ind;
__shared__ ushort2 s_st[stack_size]; __shared__ ushort2 s_st[stack_size];
if (threadIdx.x == 0) if (threadIdx.x == 0)
s_counter = 0; s_counter = 0;
__syncthreads(); __syncthreads();
int ind = blockIdx.y * gridDim.x + blockIdx.x; int ind = blockIdx.y * gridDim.x + blockIdx.x;
if (ind < count) if (ind >= count)
{ return;
ushort2 pos = st1[ind]; ushort2 pos = st1[ind];
if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
{
if (threadIdx.x < 8) if (threadIdx.x < 8)
{ {
pos.x += c_dx[threadIdx.x]; pos.x += c_dx[threadIdx.x];
pos.y += c_dy[threadIdx.x]; pos.y += c_dy[threadIdx.x];
if (map.ptr(pos.y)[pos.x] == 1) if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
{ {
map.ptr(pos.y)[pos.x] = 2; map(pos.y, pos.x) = 2;
ind = atomicInc(&s_counter, (unsigned int)(-1)); ind = Emulation::smem::atomicAdd(&s_counter, 1);
s_st[ind] = pos; s_st[ind] = pos;
} }
} }
__syncthreads(); __syncthreads();
while (s_counter > 0 && s_counter <= stack_size - blockDim.x) while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
@ -401,30 +386,31 @@ namespace cv { namespace gpu { namespace device
const int subTaskIdx = threadIdx.x >> 3; const int subTaskIdx = threadIdx.x >> 3;
const int portion = ::min(s_counter, blockDim.x >> 3); const int portion = ::min(s_counter, blockDim.x >> 3);
pos.x = pos.y = 0;
if (subTaskIdx < portion) if (subTaskIdx < portion)
pos = s_st[s_counter - 1 - subTaskIdx]; pos = s_st[s_counter - 1 - subTaskIdx];
__syncthreads(); __syncthreads();
if (threadIdx.x == 0) if (threadIdx.x == 0)
s_counter -= portion; s_counter -= portion;
__syncthreads(); __syncthreads();
if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) if (subTaskIdx < portion)
{ {
pos.x += c_dx[threadIdx.x & 7]; pos.x += c_dx[threadIdx.x & 7];
pos.y += c_dy[threadIdx.x & 7]; pos.y += c_dy[threadIdx.x & 7];
if (map.ptr(pos.y)[pos.x] == 1) if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
{ {
map.ptr(pos.y)[pos.x] = 2; map(pos.y, pos.x) = 2;
ind = atomicInc(&s_counter, (unsigned int)(-1)); ind = Emulation::smem::atomicAdd(&s_counter, 1);
s_st[ind] = pos; s_st[ind] = pos;
} }
} }
__syncthreads(); __syncthreads();
} }
@ -432,70 +418,76 @@ namespace cv { namespace gpu { namespace device
{ {
if (threadIdx.x == 0) if (threadIdx.x == 0)
{ {
ind = atomicAdd(&counter, s_counter); ind = ::atomicAdd(&counter, s_counter);
s_ind = ind - s_counter; s_ind = ind - s_counter;
} }
__syncthreads(); __syncthreads();
ind = s_ind; ind = s_ind;
for (int i = threadIdx.x; i < s_counter; i += blockDim.x) for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
{
st2[ind + i] = s_st[i]; st2[ind + i] = s_st[i];
} }
} }
}
}
#endif void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2)
}
void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)
{ {
void* counter_ptr; void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );
unsigned int count; int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
while (count > 0) while (count > 0)
{ {
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
dim3 block(128, 1, 1); const dim3 block(128);
dim3 grid(std::min(count, 65535u), divUp(count, 65535), 1); const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);
edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, rows, cols, count);
edgesHysteresisGlobalKernel<<<grid, block>>>(map, st1, st2, count);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
std::swap(st1, st2); std::swap(st1, st2);
} }
} }
}
__global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols) //////////////////////////////////////////////////////////////////////////////////////////
namespace canny
{ {
const int j = blockIdx.x * 16 + threadIdx.x; struct GetEdges : unary_function<int, uchar>
const int i = blockIdx.y * 16 + threadIdx.y; {
__device__ __forceinline__ uchar operator ()(int e) const
{
return (uchar)(-(e >> 1));
}
if (i < rows && j < cols) __device__ __forceinline__ GetEdges() {}
dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1)); __device__ __forceinline__ GetEdges(const GetEdges&) {}
};
} }
void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols) namespace cv { namespace gpu { namespace device
{ {
dim3 block(16, 16, 1); template <> struct TransformFunctorTraits<canny::GetEdges> : DefaultTransformFunctorTraits<canny::GetEdges>
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); {
enum { smart_shift = 4 };
getEdges<<<grid, block>>>(map, dst, rows, cols); };
cudaSafeCall( cudaGetLastError() ); }}}
cudaSafeCall( cudaDeviceSynchronize() ); namespace canny
{
void getEdges(PtrStepSzi map, PtrStepSzb dst)
{
transform(map, dst, GetEdges(), WithOutMask(), 0);
}
} }
} // namespace canny
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */ #endif /* CUDA_DISABLER */

@ -497,6 +497,7 @@ namespace cv { namespace gpu { namespace device
void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream) void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream)
{ {
(void) flags;
dim3 block(CTA_SIZE_X, CTA_SIZE_Y); dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
dim3 grid(divUp(edges.cols, TILE_COLS), divUp(edges.rows, TILE_ROWS)); dim3 grid(divUp(edges.cols, TILE_COLS), divUp(edges.rows, TILE_ROWS));

@ -0,0 +1,53 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
// Explicit instantiation of filter::linearColumn for the <float, uchar> type pair.
// The template is not declared in this file; presumably it comes from
// "column_filter.h", the only header included here — confirm.
// NOTE(review): the template arguments look like <working element type,
// destination element type>; verify the parameter order in column_filter.h.
namespace filter
{
template void linearColumn<float, uchar>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */

@ -0,0 +1,53 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
// Explicit instantiation of filter::linearColumn for the <float3, uchar3> type pair.
// The template is not declared in this file; presumably it comes from
// "column_filter.h", the only header included here — confirm.
// NOTE(review): the template arguments look like <working element type,
// destination element type>; verify the parameter order in column_filter.h.
namespace filter
{
template void linearColumn<float3, uchar3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */

@ -0,0 +1,53 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
// Explicit instantiation of filter::linearColumn for the <float, unsigned short> type pair.
// The template is not declared in this file; presumably it comes from
// "column_filter.h", the only header included here — confirm.
// NOTE(review): the template arguments look like <working element type,
// destination element type>; verify the parameter order in column_filter.h.
namespace filter
{
template void linearColumn<float, unsigned short>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */

@ -0,0 +1,53 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
// Explicit instantiation of filter::linearColumn for the <float3, ushort3> type pair.
// The template is not declared in this file; presumably it comes from
// "column_filter.h", the only header included here — confirm.
// NOTE(review): the template arguments look like <working element type,
// destination element type>; verify the parameter order in column_filter.h.
namespace filter
{
template void linearColumn<float3, ushort3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */

@ -0,0 +1,53 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
// Explicit instantiation of filter::linearColumn for the <float4, ushort4> type pair.
// The template is not declared in this file; presumably it comes from
// "column_filter.h", the only header included here — confirm.
// NOTE(review): the template arguments look like <working element type,
// destination element type>; verify the parameter order in column_filter.h.
namespace filter
{
template void linearColumn<float4, ushort4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */

@ -0,0 +1,53 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
// Explicit instantiation of filter::linearColumn for the template arguments
// <float3, int3>, so that this translation unit emits that specialization.
// NOTE(review): the template is presumably defined in column_filter.h (included
// above); the role of each type argument is not visible here -- confirm there.
namespace filter {

template void linearColumn<float3, int3>(
    PtrStepSzb src,
    PtrStepSzb dst,
    const float* kernel,
    int ksize,
    int anchor,
    int brd_type,
    int cc,
    cudaStream_t stream);

}  // namespace filter
#endif /* CUDA_DISABLER */

@ -0,0 +1,53 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
// Explicit instantiation of filter::linearColumn for the template arguments
// <float4, int4>, so that this translation unit emits that specialization.
// NOTE(review): the template is presumably defined in column_filter.h (included
// above); the role of each type argument is not visible here -- confirm there.
namespace filter {

template void linearColumn<float4, int4>(
    PtrStepSzb src,
    PtrStepSzb dst,
    const float* kernel,
    int ksize,
    int anchor,
    int brd_type,
    int cc,
    cudaStream_t stream);

}  // namespace filter
#endif /* CUDA_DISABLER */

@ -0,0 +1,53 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
// Explicit instantiation of filter::linearColumn for the template arguments
// <float4, uchar4>, so that this translation unit emits that specialization.
// NOTE(review): the template is presumably defined in column_filter.h (included
// above); the role of each type argument is not visible here -- confirm there.
namespace filter {

template void linearColumn<float4, uchar4>(
    PtrStepSzb src,
    PtrStepSzb dst,
    const float* kernel,
    int ksize,
    int anchor,
    int brd_type,
    int cc,
    cudaStream_t stream);

}  // namespace filter
#endif /* CUDA_DISABLER */

@ -0,0 +1,53 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
// Explicit instantiation of filter::linearColumn for the template arguments
// <float3, short3>, so that this translation unit emits that specialization.
// NOTE(review): the template is presumably defined in column_filter.h (included
// above); the role of each type argument is not visible here -- confirm there.
namespace filter {

template void linearColumn<float3, short3>(
    PtrStepSzb src,
    PtrStepSzb dst,
    const float* kernel,
    int ksize,
    int anchor,
    int brd_type,
    int cc,
    cudaStream_t stream);

}  // namespace filter
#endif /* CUDA_DISABLER */

@ -0,0 +1,53 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
// Explicit instantiation of filter::linearColumn for the template arguments
// <float, int>, so that this translation unit emits that specialization.
// NOTE(review): the template is presumably defined in column_filter.h (included
// above); the role of each type argument is not visible here -- confirm there.
namespace filter {

template void linearColumn<float, int>(
    PtrStepSzb src,
    PtrStepSzb dst,
    const float* kernel,
    int ksize,
    int anchor,
    int brd_type,
    int cc,
    cudaStream_t stream);

}  // namespace filter
#endif /* CUDA_DISABLER */

@ -0,0 +1,53 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
// Explicit instantiation of filter::linearColumn for the template arguments
// <float, float>, so that this translation unit emits that specialization.
// NOTE(review): the template is presumably defined in column_filter.h (included
// above); the role of each type argument is not visible here -- confirm there.
namespace filter {

template void linearColumn<float, float>(
    PtrStepSzb src,
    PtrStepSzb dst,
    const float* kernel,
    int ksize,
    int anchor,
    int brd_type,
    int cc,
    cudaStream_t stream);

}  // namespace filter
#endif /* CUDA_DISABLER */

@ -0,0 +1,53 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
// Explicit instantiation of filter::linearColumn for the template arguments
// <float3, float3>, so that this translation unit emits that specialization.
// NOTE(review): the template is presumably defined in column_filter.h (included
// above); the role of each type argument is not visible here -- confirm there.
namespace filter {

template void linearColumn<float3, float3>(
    PtrStepSzb src,
    PtrStepSzb dst,
    const float* kernel,
    int ksize,
    int anchor,
    int brd_type,
    int cc,
    cudaStream_t stream);

}  // namespace filter
#endif /* CUDA_DISABLER */

@ -0,0 +1,53 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
// Explicit instantiation of filter::linearColumn for the template arguments
// <float4, float4>, so that this translation unit emits that specialization.
// NOTE(review): the template is presumably defined in column_filter.h (included
// above); the role of each type argument is not visible here -- confirm there.
namespace filter {

template void linearColumn<float4, float4>(
    PtrStepSzb src,
    PtrStepSzb dst,
    const float* kernel,
    int ksize,
    int anchor,
    int brd_type,
    int cc,
    cudaStream_t stream);

}  // namespace filter
#endif /* CUDA_DISABLER */

@ -0,0 +1,53 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
// Explicit instantiation of filter::linearColumn for the template arguments
// <float, short>, so that this translation unit emits that specialization.
// NOTE(review): the template is presumably defined in column_filter.h (included
// above); the role of each type argument is not visible here -- confirm there.
namespace filter {

template void linearColumn<float, short>(
    PtrStepSzb src,
    PtrStepSzb dst,
    const float* kernel,
    int ksize,
    int anchor,
    int brd_type,
    int cc,
    cudaStream_t stream);

}  // namespace filter
#endif /* CUDA_DISABLER */

@ -0,0 +1,53 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
// Explicit instantiation of filter::linearColumn for the template arguments
// <float4, short4>, so that this translation unit emits that specialization.
// NOTE(review): the template is presumably defined in column_filter.h (included
// above); the role of each type argument is not visible here -- confirm there.
namespace filter {

template void linearColumn<float4, short4>(
    PtrStepSzb src,
    PtrStepSzb dst,
    const float* kernel,
    int ksize,
    int anchor,
    int brd_type,
    int cc,
    cudaStream_t stream);

}  // namespace filter
#endif /* CUDA_DISABLER */

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save