Merge pull request #907 from SpecLad:master

pull/887/merge
Roman Donchenko 12 years ago committed by OpenCV Buildbot
commit b5c013682b
  1. BIN
      3rdparty/lib/armeabi-v7a/libnative_camera_r2.2.0.so
  2. BIN
      3rdparty/lib/armeabi-v7a/libnative_camera_r2.3.3.so
  3. BIN
      3rdparty/lib/armeabi-v7a/libnative_camera_r3.0.1.so
  4. BIN
      3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.0.so
  5. BIN
      3rdparty/lib/armeabi-v7a/libnative_camera_r4.0.3.so
  6. BIN
      3rdparty/lib/armeabi-v7a/libnative_camera_r4.1.1.so
  7. BIN
      3rdparty/lib/armeabi-v7a/libnative_camera_r4.2.0.so
  8. BIN
      3rdparty/lib/armeabi/libnative_camera_r2.2.0.so
  9. BIN
      3rdparty/lib/armeabi/libnative_camera_r2.3.3.so
  10. BIN
      3rdparty/lib/armeabi/libnative_camera_r3.0.1.so
  11. BIN
      3rdparty/lib/armeabi/libnative_camera_r4.0.0.so
  12. BIN
      3rdparty/lib/armeabi/libnative_camera_r4.0.3.so
  13. BIN
      3rdparty/lib/armeabi/libnative_camera_r4.1.1.so
  14. BIN
      3rdparty/lib/armeabi/libnative_camera_r4.2.0.so
  15. BIN
      3rdparty/lib/mips/libnative_camera_r4.0.3.so
  16. BIN
      3rdparty/lib/mips/libnative_camera_r4.1.1.so
  17. BIN
      3rdparty/lib/mips/libnative_camera_r4.2.0.so
  18. BIN
      3rdparty/lib/x86/libnative_camera_r2.3.3.so
  19. BIN
      3rdparty/lib/x86/libnative_camera_r3.0.1.so
  20. BIN
      3rdparty/lib/x86/libnative_camera_r4.0.3.so
  21. BIN
      3rdparty/lib/x86/libnative_camera_r4.1.1.so
  22. BIN
      3rdparty/lib/x86/libnative_camera_r4.2.0.so
  23. 5
      3rdparty/libjasper/CMakeLists.txt
  24. 7
      CMakeLists.txt
  25. 22
      cmake/OpenCVDetectAndroidSDK.cmake
  26. 2
      cmake/OpenCVDetectCXXCompiler.cmake
  27. 12
      cmake/OpenCVDetectOpenCL.cmake
  28. 18
      cmake/OpenCVDetectPython.cmake
  29. 29
      cmake/OpenCVFindLibsGUI.cmake
  30. 73
      modules/androidcamera/camera_wrapper/camera_wrapper.cpp
  31. 2
      modules/calib3d/doc/camera_calibration_and_3d_reconstruction.rst
  32. 3
      modules/core/src/matmul.cpp
  33. 1
      modules/flann/include/opencv2/flann/ground_truth.h
  34. 21
      modules/highgui/CMakeLists.txt
  35. 2
      modules/highgui/include/opencv2/highgui/highgui_c.h
  36. 11
      modules/highgui/src/cap_libv4l.cpp
  37. 5
      modules/highgui/src/grfmt_jpeg.cpp
  38. 5
      modules/highgui/src/grfmt_png.cpp
  39. 8
      modules/highgui/src/window_QT.h
  40. 2
      modules/imgproc/doc/structural_analysis_and_shape_descriptors.rst
  41. 4
      modules/imgproc/src/floodfill.cpp
  42. 2
      modules/imgproc/src/imgwarp.cpp
  43. 3
      modules/java/android_lib/lint.xml
  44. 4
      modules/java/android_lib/res/values/attrs.xml
  45. 9
      modules/java/generator/src/java/android+CameraBridgeViewBase.java
  46. 46
      modules/java/generator/src/java/android+JavaCameraView.java
  47. 27
      modules/java/generator/src/java/android+NativeCameraView.java
  48. 2
      modules/legacy/src/blobtrackgenyml.cpp
  49. 6
      modules/legacy/src/kdtree.cpp
  50. 2
      modules/objdetect/src/cascadedetect.cpp
  51. 412
      modules/ocl/include/opencv2/ocl.hpp
  52. 2
      modules/ocl/include/opencv2/ocl/ocl.hpp
  53. 109
      modules/ocl/src/arithm.cpp
  54. 8
      modules/ocl/src/brute_force_matcher.cpp
  55. 2
      modules/ocl/src/gemm.cpp
  56. 718
      modules/ocl/src/haar.cpp
  57. 47
      modules/ocl/src/imgproc.cpp
  58. 15
      modules/ocl/src/initialization.cpp
  59. 28
      modules/ocl/src/matrix_operations.cpp
  60. 68
      modules/ocl/src/mcwutil.cpp
  61. 2
      modules/ocl/src/moments.cpp
  62. 966
      modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl
  63. 43
      modules/ocl/src/opencl/arithm_bitwise_binary.cl
  64. 501
      modules/ocl/src/opencl/arithm_bitwise_binary_mask.cl
  65. 438
      modules/ocl/src/opencl/arithm_bitwise_binary_scalar.cl
  66. 483
      modules/ocl/src/opencl/arithm_bitwise_binary_scalar_mask.cl
  67. 294
      modules/ocl/src/opencl/arithm_bitwise_or.cl
  68. 1194
      modules/ocl/src/opencl/arithm_bitwise_or_mask.cl
  69. 973
      modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl
  70. 1140
      modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl
  71. 340
      modules/ocl/src/opencl/arithm_bitwise_xor.cl
  72. 1194
      modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl
  73. 1117
      modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl
  74. 367
      modules/ocl/src/opencl/filtering_boxFilter.cl
  75. 320
      modules/ocl/src/opencl/haarobjectdetect.cl
  76. 286
      modules/ocl/src/opencl/haarobjectdetect_scaled2.cl
  77. 220
      modules/ocl/src/opencl/imgproc_integral.cl
  78. 200
      modules/ocl/src/opencl/imgproc_integral_sum.cl
  79. 114
      modules/ocl/src/opencl/moments.cl
  80. 82
      modules/ocl/src/opencl/pyr_up.cl
  81. 223
      modules/ocl/src/opencl/stereobm.cl
  82. 8
      modules/ocl/src/opencl/stereobp.cl
  83. 1402
      modules/ocl/src/opencl/stereocsbp.cl
  84. 407
      modules/ocl/src/opencl/tvl1flow.cl
  85. 756
      modules/ocl/src/stereo_csbp.cpp
  86. 97
      modules/ocl/src/stereobm.cpp
  87. 475
      modules/ocl/src/tvl1flow.cpp
  88. 120
      modules/ocl/test/interpolation.hpp
  89. 2
      modules/ocl/test/precomp.hpp
  90. 491
      modules/ocl/test/test_arithm.cpp
  91. 63
      modules/ocl/test/test_blend.cpp
  92. 14
      modules/ocl/test/test_brute_force_matcher.cpp
  93. 62
      modules/ocl/test/test_calib3d.cpp
  94. 6
      modules/ocl/test/test_color.cpp
  95. 16
      modules/ocl/test/test_columnsum.cpp
  96. 8
      modules/ocl/test/test_fft.cpp
  97. 652
      modules/ocl/test/test_filters.cpp
  98. 3
      modules/ocl/test/test_gemm.cpp
  99. 99
      modules/ocl/test/test_haar.cpp
  100. 5
      modules/ocl/test/test_hog.cpp
  101. Some files were not shown because too many files have changed in this diff Show More

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

@ -23,8 +23,8 @@ if(WIN32 AND NOT MINGW)
add_definitions(-DJAS_WIN_MSVC_BUILD)
endif(WIN32 AND NOT MINGW)
ocv_warnings_disable(CMAKE_C_FLAGS -Wno-implicit-function-declaration -Wno-uninitialized
-Wmissing-prototypes -Wmissing-declarations -Wunused -Wshadow
ocv_warnings_disable(CMAKE_C_FLAGS -Wno-implicit-function-declaration -Wno-uninitialized -Wmissing-prototypes
-Wno-unused-but-set-parameter -Wmissing-declarations -Wunused -Wshadow
-Wsign-compare -Wstrict-overflow)
ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-parameter) # clang
ocv_warnings_disable(CMAKE_C_FLAGS /wd4013 /wd4018 /wd4101 /wd4244 /wd4267 /wd4715) # vs2005
@ -49,4 +49,3 @@ endif()
if(NOT BUILD_SHARED_LIBS)
install(TARGETS ${JASPER_LIBRARY} ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT main)
endif()

@ -595,12 +595,15 @@ endif()
status("")
status(" GUI: ")
if(HAVE_QT)
if(HAVE_QT5)
status(" QT 5.x:" HAVE_QT THEN "YES (ver ${Qt5Core_VERSION_STRING})" ELSE NO)
status(" QT OpenGL support:" HAVE_QT_OPENGL THEN "YES (${Qt5OpenGL_LIBRARIES} ${Qt5OpenGL_VERSION_STRING})" ELSE NO)
elseif(HAVE_QT)
status(" QT 4.x:" HAVE_QT THEN "YES (ver ${QT_VERSION_MAJOR}.${QT_VERSION_MINOR}.${QT_VERSION_PATCH} ${QT_EDITION})" ELSE NO)
status(" QT OpenGL support:" HAVE_QT_OPENGL THEN "YES (${QT_QTOPENGL_LIBRARY})" ELSE NO)
else()
if(DEFINED WITH_QT)
status(" QT 4.x:" NO)
status(" QT:" NO)
endif()
if(DEFINED WITH_WIN32UI)
status(" Win32 UI:" HAVE_WIN32UI THEN YES ELSE NO)

@ -176,7 +176,8 @@ macro(android_get_compatible_target VAR)
endmacro()
unset(__android_project_chain CACHE)
#add_android_project(target_name ${path} NATIVE_DEPS opencv_core LIBRARY_DEPS ${OpenCV_BINARY_DIR} SDK_TARGET 11)
# add_android_project(target_name ${path} NATIVE_DEPS opencv_core LIBRARY_DEPS ${OpenCV_BINARY_DIR} SDK_TARGET 11)
macro(add_android_project target path)
# parse arguments
set(android_proj_arglist NATIVE_DEPS LIBRARY_DEPS SDK_TARGET IGNORE_JAVA IGNORE_MANIFEST)
@ -212,6 +213,16 @@ macro(add_android_project target path)
ocv_check_dependencies(${android_proj_NATIVE_DEPS} opencv_java)
endif()
if(EXISTS "${path}/jni/Android.mk" )
# find if native_app_glue is used
file(STRINGS "${path}/jni/Android.mk" NATIVE_APP_GLUE REGEX ".*(call import-module,android/native_app_glue)" )
if(NATIVE_APP_GLUE)
if(ANDROID_NATIVE_API_LEVEL LESS 9 OR NOT EXISTS "${ANDROID_NDK}/sources/android/native_app_glue")
set(OCV_DEPENDENCIES_FOUND FALSE)
endif()
endif()
endif()
if(OCV_DEPENDENCIES_FOUND AND android_proj_sdk_target AND ANDROID_EXECUTABLE AND ANT_EXECUTABLE AND ANDROID_TOOLS_Pkg_Revision GREATER 13 AND EXISTS "${path}/${ANDROID_MANIFEST_FILE}")
project(${target})
@ -268,9 +279,6 @@ macro(add_android_project target path)
file(STRINGS "${path}/jni/Android.mk" JNI_LIB_NAME REGEX "LOCAL_MODULE[ ]*:=[ ]*.*" )
string(REGEX REPLACE "LOCAL_MODULE[ ]*:=[ ]*([a-zA-Z_][a-zA-Z_0-9]*)[ ]*" "\\1" JNI_LIB_NAME "${JNI_LIB_NAME}")
# find using of native app glue to determine native activity
file(STRINGS "${path}/jni/Android.mk" NATIVE_APP_GLUE REGEX ".*(call import-module,android/native_app_glue)" )
if(JNI_LIB_NAME)
ocv_include_modules_recurse(${android_proj_NATIVE_DEPS})
ocv_include_directories("${path}/jni")
@ -291,9 +299,9 @@ macro(add_android_project target path)
)
get_target_property(android_proj_jni_location "${JNI_LIB_NAME}" LOCATION)
if (NOT (CMAKE_BUILD_TYPE MATCHES "debug"))
add_custom_command(TARGET ${JNI_LIB_NAME} POST_BUILD COMMAND ${CMAKE_STRIP} --strip-unneeded "${android_proj_jni_location}")
endif()
if (NOT (CMAKE_BUILD_TYPE MATCHES "debug"))
add_custom_command(TARGET ${JNI_LIB_NAME} POST_BUILD COMMAND ${CMAKE_STRIP} --strip-unneeded "${android_proj_jni_location}")
endif()
endif()
endif()

@ -101,7 +101,7 @@ endif()
if(MSVC64 OR MINGW64)
set(X86_64 1)
elseif(MSVC AND NOT CMAKE_CROSSCOMPILING)
elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING))
set(X86 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
set(X86_64 1)

@ -20,12 +20,6 @@ else(APPLE)
DOC "OpenCL include directory"
NO_DEFAULT_PATH)
find_path(OPENCL_INCLUDE_DIR
NAMES OpenCL/cl.h CL/cl.h
HINTS ${OPENCL_ROOT_DIR}
PATH_SUFFIXES include include/nvidia-current
DOC "OpenCL include directory")
if (X86_64)
set(OPENCL_POSSIBLE_LIB_SUFFIXES lib/Win64 lib/x86_64 lib/x64)
elseif (X86)
@ -39,12 +33,6 @@ else(APPLE)
DOC "OpenCL library"
NO_DEFAULT_PATH)
find_library(OPENCL_LIBRARY
NAMES OpenCL
HINTS ${OPENCL_ROOT_DIR}
PATH_SUFFIXES ${OPENCL_POSSIBLE_LIB_SUFFIXES}
DOC "OpenCL library")
mark_as_advanced(OPENCL_INCLUDE_DIR OPENCL_LIBRARY)
include(FindPackageHandleStandardArgs)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(OPENCL DEFAULT_MSG OPENCL_LIBRARY OPENCL_INCLUDE_DIR )

@ -102,18 +102,12 @@ if(PYTHON_EXECUTABLE)
if(BUILD_DOCS)
find_host_program(SPHINX_BUILD sphinx-build)
if(SPHINX_BUILD)
if(UNIX)
execute_process(COMMAND sh -c "${SPHINX_BUILD} -_ 2>&1 | sed -ne 1p"
RESULT_VARIABLE SPHINX_PROCESS
OUTPUT_VARIABLE SPHINX_VERSION
OUTPUT_STRIP_TRAILING_WHITESPACE)
else()
execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import sphinx; print sphinx.__version__"
RESULT_VARIABLE SPHINX_PROCESS
OUTPUT_VARIABLE SPHINX_VERSION
OUTPUT_STRIP_TRAILING_WHITESPACE)
endif()
if(SPHINX_PROCESS EQUAL 0)
execute_process(COMMAND "${SPHINX_BUILD}"
OUTPUT_QUIET
ERROR_VARIABLE SPHINX_OUTPUT
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(SPHINX_OUTPUT MATCHES "^Sphinx v([0-9][^ \n]*)")
set(SPHINX_VERSION "${CMAKE_MATCH_1}")
set(HAVE_SPHINX 1)
message(STATUS "Found Sphinx ${SPHINX_VERSION}: ${SPHINX_BUILD}")
endif()

@ -13,12 +13,31 @@ if(WITH_WIN32UI)
endif(WITH_WIN32UI)
# --- QT4 ---
ocv_clear_vars(HAVE_QT)
ocv_clear_vars(HAVE_QT HAVE_QT5)
if(WITH_QT)
find_package(Qt4)
if(QT4_FOUND)
set(HAVE_QT TRUE)
add_definitions(-DHAVE_QT) # We need to define the macro this way, using cvconfig.h does not work
if(NOT CMAKE_VERSION VERSION_LESS 2.8.3 AND NOT WITH_QT EQUAL 4)
find_package(Qt5Core)
find_package(Qt5Gui)
find_package(Qt5Widgets)
find_package(Qt5Test)
find_package(Qt5Concurrent)
if(Qt5Core_FOUND AND Qt5Gui_FOUND AND Qt5Widgets_FOUND AND Qt5Test_FOUND AND Qt5Concurrent_FOUND)
set(HAVE_QT5 ON)
set(HAVE_QT ON)
add_definitions(-DHAVE_QT)
find_package(Qt5OpenGL)
if(Qt5OpenGL_FOUND)
set(QT_QTOPENGL_FOUND ON)
endif()
endif()
endif()
if(NOT HAVE_QT)
find_package(Qt4)
if(QT4_FOUND)
set(HAVE_QT TRUE)
add_definitions(-DHAVE_QT) # We need to define the macro this way, using cvconfig.h does not work
endif()
endif()
endif()

@ -362,6 +362,9 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback,
typedef sp<Camera> (*Android23ConnectFuncType)(int);
typedef sp<Camera> (*Android3DConnectFuncType)(int, int);
const int BACK_CAMERA_INDEX = 99;
const int FRONT_CAMERA_INDEX = 98;
enum {
CAMERA_SUPPORT_MODE_2D = 0x01, /* Camera Sensor supports 2D mode. */
CAMERA_SUPPORT_MODE_3D = 0x02, /* Camera Sensor supports 3D mode. */
@ -373,7 +376,51 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback,
const char Android23ConnectName[] = "_ZN7android6Camera7connectEi";
const char Android3DConnectName[] = "_ZN7android6Camera7connectEii";
LOGD("CameraHandler::initCameraConnect(%p, %d, %p, %p)", callback, cameraId, userData, prevCameraParameters);
int localCameraIndex = cameraId;
#if !defined(ANDROID_r2_2_0)
if (cameraId == BACK_CAMERA_INDEX)
{
LOGD("Back camera selected");
for (int i = 0; i < Camera::getNumberOfCameras(); i++)
{
CameraInfo info;
Camera::getCameraInfo(i, &info);
if (info.facing == CAMERA_FACING_BACK)
{
localCameraIndex = i;
break;
}
}
}
else if (cameraId == FRONT_CAMERA_INDEX)
{
LOGD("Front camera selected");
for (int i = 0; i < Camera::getNumberOfCameras(); i++)
{
CameraInfo info;
Camera::getCameraInfo(i, &info);
if (info.facing == CAMERA_FACING_FRONT)
{
localCameraIndex = i;
break;
}
}
}
if (localCameraIndex == BACK_CAMERA_INDEX)
{
LOGE("Back camera not found!");
return NULL;
}
else if (localCameraIndex == FRONT_CAMERA_INDEX)
{
LOGE("Front camera not found!");
return NULL;
}
#endif
LOGD("CameraHandler::initCameraConnect(%p, %d, %p, %p)", callback, localCameraIndex, userData, prevCameraParameters);
sp<Camera> camera = 0;
@ -381,8 +428,8 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback,
if (!CameraHALHandle)
{
LOGE("Cannot link to \"libcamera_client.so\"");
return NULL;
LOGE("Cannot link to \"libcamera_client.so\"");
return NULL;
}
// reset errors
@ -390,24 +437,24 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback,
if (Android22ConnectFuncType Android22Connect = (Android22ConnectFuncType)dlsym(CameraHALHandle, Android22ConnectName))
{
LOGD("Connecting to CameraService v 2.2");
camera = Android22Connect();
LOGD("Connecting to CameraService v 2.2");
camera = Android22Connect();
}
else if (Android23ConnectFuncType Android23Connect = (Android23ConnectFuncType)dlsym(CameraHALHandle, Android23ConnectName))
{
LOGD("Connecting to CameraService v 2.3");
camera = Android23Connect(cameraId);
LOGD("Connecting to CameraService v 2.3");
camera = Android23Connect(localCameraIndex);
}
else if (Android3DConnectFuncType Android3DConnect = (Android3DConnectFuncType)dlsym(CameraHALHandle, Android3DConnectName))
{
LOGD("Connecting to CameraService v 3D");
camera = Android3DConnect(cameraId, CAMERA_SUPPORT_MODE_2D);
LOGD("Connecting to CameraService v 3D");
camera = Android3DConnect(localCameraIndex, CAMERA_SUPPORT_MODE_2D);
}
else
{
dlclose(CameraHALHandle);
LOGE("Cannot connect to CameraService. Connect method was not found!");
return NULL;
dlclose(CameraHALHandle);
LOGE("Cannot connect to CameraService. Connect method was not found!");
return NULL;
}
dlclose(CameraHALHandle);
@ -422,7 +469,7 @@ CameraHandler* CameraHandler::initCameraConnect(const CameraCallback& callback,
camera->setListener(handler);
handler->camera = camera;
handler->cameraId = cameraId;
handler->cameraId = localCameraIndex;
if (prevCameraParameters != 0)
{

@ -1486,6 +1486,6 @@ The function reconstructs 3-dimensional points (in homogeneous coordinates) by u
.. [SteweniusCFS] Stewénius, H., Calibrated Fivepoint solver. http://www.vis.uky.edu/~stewe/FIVEPOINT/
.. [Slabaugh] Slabaugh, G.G. Computing Euler angles from a rotation matrix. http://gregslabaugh.name/publications/euler.pdf
.. [Slabaugh] Slabaugh, G.G. Computing Euler angles from a rotation matrix. http://www.soi.city.ac.uk/~sbbh653/publications/euler.pdf (verified: 2013-04-15)
.. [Zhang2000] Z. Zhang. A Flexible New Technique for Camera Calibration. IEEE Transactions on Pattern Analysis and Machine Intelligence, 22(11):1330-1334, 2000.

@ -2850,8 +2850,9 @@ PCA& PCA::operator()(InputArray _data, InputArray __mean, int flags, int maxComp
if( _mean.data )
{
CV_Assert( _mean.size() == mean_sz );
CV_Assert( _mean.size() == mean_sz );
_mean.convertTo(mean, ctype);
covar_flags |= CV_COVAR_USE_AVG;
}
calcCovarMatrix( data, covar, mean, covar_flags, ctype );

@ -42,7 +42,6 @@ template <typename Distance>
void find_nearest(const Matrix<typename Distance::ElementType>& dataset, typename Distance::ElementType* query, int* matches, int nn,
int skip = 0, Distance distance = Distance())
{
typedef typename Distance::ElementType ElementType;
typedef typename Distance::ResultType DistanceType;
int n = nn + skip;

@ -76,7 +76,26 @@ set(highgui_srcs
file(GLOB highgui_ext_hdrs "include/opencv2/*.hpp" "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
if(HAVE_QT)
if(HAVE_QT5)
set(CMAKE_AUTOMOC ON)
set(CMAKE_INCLUDE_CURRENT_DIR ON)
QT5_ADD_RESOURCES(_RCC_OUTFILES src/window_QT.qrc)
list(APPEND highgui_srcs src/window_QT.cpp src/window_QT.h ${_RCC_OUTFILES})
foreach(dt5_dep Core Gui Widgets Test Concurrent)
add_definitions(${Qt5${dt5_dep}_DEFINITIONS})
include_directories(${Qt5${dt5_dep}_INCLUDE_DIRS})
list(APPEND HIGHGUI_LIBRARIES ${Qt5${dt5_dep}_LIBRARIES})
endforeach()
if(HAVE_QT_OPENGL)
add_definitions(${Qt5OpenGL_DEFINITIONS})
include_directories(${Qt5OpenGL_INCLUDE_DIRS})
list(APPEND HIGHGUI_LIBRARIES ${Qt5OpenGL_LIBRARIES})
endif()
elseif(HAVE_QT)
if (HAVE_QT_OPENGL)
set(QT_USE_QTOPENGL TRUE)
endif()

@ -306,6 +306,8 @@ enum
CV_CAP_OPENNI_ASUS =910, // OpenNI (for Asus Xtion)
CV_CAP_ANDROID =1000, // Android
CV_CAP_ANDROID_BACK =CV_CAP_ANDROID+99, // Android back camera
CV_CAP_ANDROID_FRONT =CV_CAP_ANDROID+98, // Android front camera
CV_CAP_XIAPI =1100, // XIMEA Camera API

@ -1665,6 +1665,17 @@ static int icvSetPropertyCAM_V4L(CvCaptureCAM_V4L* capture, int property_id, dou
width = height = 0;
}
break;
case CV_CAP_PROP_FPS:
struct v4l2_streamparm setfps;
memset (&setfps, 0, sizeof(struct v4l2_streamparm));
setfps.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
setfps.parm.capture.timeperframe.numerator = 1;
setfps.parm.capture.timeperframe.denominator = value;
if (xioctl (capture->deviceHandle, VIDIOC_S_PARM, &setfps) < 0){
fprintf(stderr, "HIGHGUI ERROR: V4L: Unable to set camera FPS\n");
retval=0;
}
break;
default:
retval = icvSetControl(capture, property_id, value);
}

@ -52,6 +52,11 @@
#include <stdio.h>
#include <setjmp.h>
// the following defines are a hack to avoid multiple problems with frame pointer handling and setjmp
// see http://gcc.gnu.org/ml/gcc/2011-10/msg00324.html for some details
#define mingw_getsp(...) 0
#define __builtin_frame_address(...) 0
#ifdef WIN32
#define XMD_H // prevent redefinition of INT32

@ -73,6 +73,11 @@
#pragma warning( disable: 4611 )
#endif
// the following defines are a hack to avoid multiple problems with frame pointer handling and setjmp
// see http://gcc.gnu.org/ml/gcc/2011-10/msg00324.html for some details
#define mingw_getsp(...) 0
#define __builtin_frame_address(...) 0
namespace cv
{

@ -48,13 +48,13 @@
#endif
#include <QAbstractEventDispatcher>
#include <QtGui/QApplication>
#include <QApplication>
#include <QFile>
#include <QPushButton>
#include <QtGui/QGraphicsView>
#include <QGraphicsView>
#include <QSizePolicy>
#include <QInputDialog>
#include <QtGui/QBoxLayout>
#include <QBoxLayout>
#include <QSettings>
#include <qtimer.h>
#include <QtConcurrentRun>
@ -78,7 +78,7 @@
#include <QRadioButton>
#include <QButtonGroup>
#include <QMenu>
#include <QtTest/QTest>
#include <QTest>
//start private enum
enum { CV_MODE_NORMAL = 0, CV_MODE_OPENGL = 1 };

@ -342,7 +342,7 @@ Finds the convex hull of a point set.
:param hull_storage: Output memory storage in the old API (``cvConvexHull2`` returns a sequence containing the convex hull points or their indices).
:param clockwise: Orientation flag. If it is true, the output convex hull is oriented clockwise. Otherwise, it is oriented counter-clockwise. The usual screen coordinate system is assumed so that the origin is at the top-left corner, x axis is oriented to the right, and y axis is oriented downwards.
:param clockwise: Orientation flag. If it is true, the output convex hull is oriented clockwise. Otherwise, it is oriented counter-clockwise. The assumed coordinate system has its X axis pointing to the right, and its Y axis pointing upwards.
:param orientation: Convex hull orientation parameter in the old API, ``CV_CLOCKWISE`` or ``CV_COUNTERCLOCKWISE``.

@ -127,7 +127,6 @@ floodFill_CnIR( Mat& image, Point seed,
_Tp newVal, ConnectedComp* region, int flags,
std::vector<FFillSegment>* buffer )
{
typedef typename DataType<_Tp>::channel_type _CTp;
_Tp* img = (_Tp*)(image.data + image.step * seed.y);
Size roi = image.size();
int i, L, R;
@ -279,7 +278,6 @@ floodFillGrad_CnIR( Mat& image, Mat& msk,
Diff diff, ConnectedComp* region, int flags,
std::vector<FFillSegment>* buffer )
{
typedef typename DataType<_Tp>::channel_type _CTp;
int step = (int)image.step, maskStep = (int)msk.step;
uchar* pImage = image.data;
_Tp* img = (_Tp*)(pImage + step*seed.y);
@ -610,7 +608,7 @@ int cv::floodFill( InputOutputArray _image, InputOutputArray _mask,
&comp, flags, &buffer);
else
CV_Error(CV_StsUnsupportedFormat, "");
if( rect )
*rect = comp.rect;
return comp.area;

@ -1219,8 +1219,6 @@ static void resizeGeneric_( const Mat& src, Mat& dst,
const int* yofs, const void* _beta,
int xmin, int xmax, int ksize )
{
typedef typename HResize::value_type T;
typedef typename HResize::buf_type WT;
typedef typename HResize::alpha_type AT;
const AT* beta = (const AT*)_beta;

@ -1,5 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<lint>
<issue id="InlinedApi">
<ignore path="src\org\opencv\android\JavaCameraView.java" />
</issue>
<issue id="NewApi">
<ignore path="src\org\opencv\android\JavaCameraView.java" />
</issue>

@ -4,8 +4,8 @@
<attr name="show_fps" format="boolean"/>
<attr name="camera_id" format="integer" >
<enum name="any" value="-1" />
<enum name="back" value="0" />
<enum name="front" value="1" />
<enum name="back" value="99" />
<enum name="front" value="98" />
</attr>
</declare-styleable>
</resources>

@ -47,10 +47,14 @@ public abstract class CameraBridgeViewBase extends SurfaceView implements Surfac
protected int mMaxWidth;
protected float mScale = 0;
protected int mPreviewFormat = Highgui.CV_CAP_ANDROID_COLOR_FRAME_RGBA;
protected int mCameraIndex = -1;
protected int mCameraIndex = CAMERA_ID_ANY;
protected boolean mEnabled;
protected FpsMeter mFpsMeter = null;
public static final int CAMERA_ID_ANY = -1;
public static final int CAMERA_ID_BACK = 99;
public static final int CAMERA_ID_FRONT = 98;
public CameraBridgeViewBase(Context context, int cameraId) {
super(context);
mCameraIndex = cameraId;
@ -74,6 +78,7 @@ public abstract class CameraBridgeViewBase extends SurfaceView implements Surfac
getHolder().addCallback(this);
mMaxWidth = MAX_UNSPECIFIED;
mMaxHeight = MAX_UNSPECIFIED;
styledAttrs.recycle();
}
public interface CvCameraViewListener {
@ -155,8 +160,6 @@ public abstract class CameraBridgeViewBase extends SurfaceView implements Surfac
mPreviewFormat = format;
}
private CvCameraViewListenerAdapter() {}
private int mPreviewFormat = Highgui.CV_CAP_ANDROID_COLOR_FRAME_RGBA;
private CvCameraViewListener mOldStyleListener;
};

@ -6,6 +6,7 @@ import android.content.Context;
import android.graphics.ImageFormat;
import android.graphics.SurfaceTexture;
import android.hardware.Camera;
import android.hardware.Camera.CameraInfo;
import android.hardware.Camera.PreviewCallback;
import android.os.Build;
import android.util.AttributeSet;
@ -68,7 +69,7 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
synchronized (this) {
mCamera = null;
if (mCameraIndex == -1) {
if (mCameraIndex == CAMERA_ID_ANY) {
Log.d(TAG, "Trying to open camera with old open()");
try {
mCamera = Camera.open();
@ -92,11 +93,39 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
}
} else {
if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.GINGERBREAD) {
Log.d(TAG, "Trying to open camera with new open(" + Integer.valueOf(mCameraIndex) + ")");
try {
mCamera = Camera.open(mCameraIndex);
} catch (RuntimeException e) {
Log.e(TAG, "Camera #" + mCameraIndex + "failed to open: " + e.getLocalizedMessage());
int localCameraIndex = mCameraIndex;
if (mCameraIndex == CAMERA_ID_BACK) {
Log.i(TAG, "Trying to open back camera");
Camera.CameraInfo cameraInfo = new Camera.CameraInfo();
for (int camIdx = 0; camIdx < Camera.getNumberOfCameras(); ++camIdx) {
Camera.getCameraInfo( camIdx, cameraInfo );
if (cameraInfo.facing == Camera.CameraInfo.CAMERA_FACING_BACK) {
localCameraIndex = camIdx;
break;
}
}
} else if (mCameraIndex == CAMERA_ID_FRONT) {
Log.i(TAG, "Trying to open front camera");
Camera.CameraInfo cameraInfo = new Camera.CameraInfo();
for (int camIdx = 0; camIdx < Camera.getNumberOfCameras(); ++camIdx) {
Camera.getCameraInfo( camIdx, cameraInfo );
if (cameraInfo.facing == Camera.CameraInfo.CAMERA_FACING_FRONT) {
localCameraIndex = camIdx;
break;
}
}
}
if (localCameraIndex == CAMERA_ID_BACK) {
Log.e(TAG, "Back camera not found!");
} else if (localCameraIndex == CAMERA_ID_FRONT) {
Log.e(TAG, "Front camera not found!");
} else {
Log.d(TAG, "Trying to open camera with new open(" + Integer.valueOf(localCameraIndex) + ")");
try {
mCamera = Camera.open(localCameraIndex);
} catch (RuntimeException e) {
Log.e(TAG, "Camera #" + localCameraIndex + "failed to open: " + e.getLocalizedMessage());
}
}
}
}
@ -179,6 +208,8 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
synchronized (this) {
if (mCamera != null) {
mCamera.stopPreview();
mCamera.setPreviewCallback(null);
mCamera.release();
}
mCamera = null;
@ -267,9 +298,6 @@ public class JavaCameraView extends CameraBridgeViewBase implements PreviewCallb
mRgba.release();
}
private JavaCameraFrame(CvCameraViewFrame obj) {
}
private Mat mYuvFrameData;
private Mat mRgba;
private int mWidth;

@ -53,14 +53,16 @@ public class NativeCameraView extends CameraBridgeViewBase {
/* 1. We need to stop the thread which is updating the frames
* 2. Stop the camera and release it
*/
try {
mStopThread = true;
mThread.join();
} catch (InterruptedException e) {
e.printStackTrace();
} finally {
mThread = null;
mStopThread = false;
if (mThread != null) {
try {
mStopThread = true;
mThread.join();
} catch (InterruptedException e) {
e.printStackTrace();
} finally {
mThread = null;
mStopThread = false;
}
}
/* Now release camera */
@ -131,17 +133,17 @@ public class NativeCameraView extends CameraBridgeViewBase {
}
}
private class NativeCameraFrame implements CvCameraViewFrame {
private static class NativeCameraFrame implements CvCameraViewFrame {
@Override
public Mat rgba() {
mCamera.retrieve(mRgba, Highgui.CV_CAP_ANDROID_COLOR_FRAME_RGBA);
mCapture.retrieve(mRgba, Highgui.CV_CAP_ANDROID_COLOR_FRAME_RGBA);
return mRgba;
}
@Override
public Mat gray() {
mCamera.retrieve(mGray, Highgui.CV_CAP_ANDROID_GREY_FRAME);
mCapture.retrieve(mGray, Highgui.CV_CAP_ANDROID_GREY_FRAME);
return mGray;
}
@ -158,9 +160,6 @@ public class NativeCameraView extends CameraBridgeViewBase {
private class CameraWorker implements Runnable {
private Mat mRgba = new Mat();
private Mat mGray = new Mat();
public void run() {
do {
if (!mCamera.grab()) {

@ -61,7 +61,7 @@ protected:
{
int ObjNum = m_TrackList.GetBlobNum();
int i;
char video_name[1024];
char video_name[1024+1];
char* struct_name = NULL;
CvFileStorage* storage = cvOpenFileStorage(m_pFileName,NULL,CV_STORAGE_WRITE_TEXT);

@ -117,10 +117,10 @@ class CvKDTreeWrap : public CvFeatureTree {
CvMat* results) {
int rn = results->rows * results->cols;
std::vector<int> inbounds;
dispatch_cvtype(mat, ((__treetype*)data)->
find_ortho_range((typename __treetype::scalar_type*)bounds_min->data.ptr,
assert(CV_MAT_DEPTH(mat->type) == CV_32F || CV_MAT_DEPTH(mat->type) == CV_64F);
((__treetype*)data)->find_ortho_range((typename __treetype::scalar_type*)bounds_min->data.ptr,
(typename __treetype::scalar_type*)bounds_max->data.ptr,
inbounds));
inbounds);
std::copy(inbounds.begin(),
inbounds.begin() + std::min((int)inbounds.size(), rn),
(int*) results->data.ptr);

@ -1140,7 +1140,7 @@ void CascadeClassifier::detectMultiScale( const Mat& image, std::vector<Rect>& o
Size windowSize( cvRound(originalWindowSize.width*factor), cvRound(originalWindowSize.height*factor) );
Size scaledImageSize( cvRound( grayImage.cols/factor ), cvRound( grayImage.rows/factor ) );
Size processingRectSize( scaledImageSize.width - originalWindowSize.width + 1, scaledImageSize.height - originalWindowSize.height + 1 );
Size processingRectSize( scaledImageSize.width - originalWindowSize.width, scaledImageSize.height - originalWindowSize.height );
if( processingRectSize.width <= 0 || processingRectSize.height <= 0 )
break;

@ -151,7 +151,7 @@ namespace cv
static Context *getContext();
static void setContext(Info &oclinfo);
enum {CL_DOUBLE, CL_UNIFIED_MEM};
enum {CL_DOUBLE, CL_UNIFIED_MEM, CL_VER_1_2};
bool supportsFeature(int ftype);
size_t computeUnits();
size_t maxWorkGroupSize();
@ -264,9 +264,9 @@ namespace cv
void create(Size size, int type);
//! allocates new oclMatrix with specified device memory type.
void createEx(int rows, int cols, int type,
void createEx(int rows, int cols, int type,
DevMemRW rw_type, DevMemType mem_type, void* hptr = 0);
void createEx(Size size, int type, DevMemRW rw_type,
void createEx(Size size, int type, DevMemRW rw_type,
DevMemType mem_type, void* hptr = 0);
//! decreases reference counter;
@ -406,6 +406,9 @@ namespace cv
//! computes element-wise product of the two arrays (c = a * b)
// supports all types except CV_8SC1, CV_8SC2, CV_8SC3 and CV_8SC4
CV_EXPORTS void multiply(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1);
//! multiplies a matrix by a number (dst = scalar * src)
// supports CV_32FC1 only
CV_EXPORTS void multiply(double scalar, const oclMat &src, oclMat &dst);
//! computes element-wise quotient of the two arrays (c = a / b)
// supports all types except CV_8SC1, CV_8SC2, CV_8SC3 and CV_8SC4
CV_EXPORTS void divide(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1);
@ -823,7 +826,44 @@ namespace cv
};
#endif
//! Cascade classifier for oclMat input, declared as a subclass of cv::CascadeClassifier.
// NOTE(review): the "Buf" suffix, the `initialized` flag and the opaque `buffers` pointer suggest
// that working buffers are kept between detectMultiScale() calls - confirm against the implementation.
class CV_EXPORTS OclCascadeClassifierBuf : public cv::CascadeClassifier
{
public:
// Only m_flags, initialized, m_scaleFactor and buffers are set here; m_rows, m_cols,
// m_loopcount, m_nodenum and findBiggestObject are left uninitialized by this constructor.
OclCascadeClassifierBuf() :
m_flags(0), initialized(false), m_scaleFactor(0), buffers(NULL) {}
// The destructor body is empty: it does not call release().
// NOTE(review): presumably callers must call release() explicitly before destruction - confirm.
~OclCascadeClassifierBuf() {}
//! Multi-scale detection on an oclMat image; `faces` is the output vector (marked CV_OUT).
// Defaults: scaleFactor = 1.1, minNeighbors = 3, flags = 0, minSize/maxSize = empty Size().
void detectMultiScale(oclMat &image, CV_OUT std::vector<cv::Rect>& faces,
double scaleFactor = 1.1, int minNeighbors = 3, int flags = 0,
Size minSize = Size(), Size maxSize = Size());
// NOTE(review): presumably frees whatever `buffers` points to - definition not visible here.
void release();
private:
// Private helpers; their definitions are not part of this header.
// NOTE(review): judging by the names, Init() sets up state for a given image size / scale factor
// and delegates buffer creation to the two Create*Bufs() helpers - confirm.
void Init(const int rows, const int cols, double scaleFactor, int flags,
const int outputsz, const size_t localThreads[],
Size minSize, Size maxSize);
void CreateBaseBufs(const int datasize, const int totalclassifier, const int flags, const int outputsz);
void CreateFactorRelatedBufs(const int rows, const int cols, const int flags,
const double scaleFactor, const size_t localThreads[],
Size minSize, Size maxSize);
// Takes a rectangle list plus per-rectangle weights and fills `faces` (CV_OUT).
void GenResult(CV_OUT std::vector<cv::Rect>& faces, const std::vector<cv::Rect> &rectList, const std::vector<int> &rweights);
// NOTE(review): the members below look like parameters remembered from a previous call
// (image size, flags, scale factor, size limits) - confirm how they are used.
int m_rows;
int m_cols;
int m_flags;
int m_loopcount;
int m_nodenum;
bool findBiggestObject;
bool initialized; // false after construction
double m_scaleFactor;
Size m_minSize;
Size m_maxSize;
std::vector<Size> sizev;
std::vector<float> scalev;
oclMat gimg1, gsum, gsqsum;
// Opaque pointer, NULL after construction; the pointed-to type is not visible in this header.
void * buffers;
};
/////////////////////////////// Pyramid /////////////////////////////////////
CV_EXPORTS void pyrDown(const oclMat &src, oclMat &dst);
@ -849,7 +889,6 @@ namespace cv
std::vector<oclMat> image_sqsums;
};
//! computes the proximity map for the raster template and the image where the template is searched for
// Supports TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED for type 8UC1 and 8UC4
// Supports TM_SQDIFF, TM_CCORR for type 32FC1 and 32FC4
@ -1093,13 +1132,11 @@ namespace cv
/****************************************************************************************\
* Distance *
\****************************************************************************************/
// Type trait that names the accumulator type to use for elements of type T.
// The primary template maps T to itself; specializations may choose a different
// type (the unsigned char specialization that follows uses float).
template<typename T>
struct CV_EXPORTS Accumulator
{
typedef T Type;
};
template<> struct Accumulator<unsigned char>
{
typedef float Type;
@ -1173,469 +1210,244 @@ namespace cv
{
public:
enum DistType {L1Dist = 0, L2Dist, HammingDist};
explicit BruteForceMatcher_OCL_base(DistType distType = L2Dist);
// Add descriptors to train descriptor collection
void add(const std::vector<oclMat> &descCollection);
// Get train descriptors collection
const std::vector<oclMat> &getTrainDescriptors() const;
// Clear train descriptors collection
void clear();
// Return true if there are no train descriptors in the collection
bool empty() const;
// Return true if the matcher supports mask in match methods
bool isMaskSupported() const;
// Find one best match for each query descriptor
void matchSingle(const oclMat &query, const oclMat &train,
oclMat &trainIdx, oclMat &distance,
const oclMat &mask = oclMat());
// Download trainIdx and distance and convert it to CPU vector with DMatch
static void matchDownload(const oclMat &trainIdx, const oclMat &distance, std::vector<DMatch> &matches);
// Convert trainIdx and distance to vector with DMatch
static void matchConvert(const Mat &trainIdx, const Mat &distance, std::vector<DMatch> &matches);
// Find one best match for each query descriptor
void match(const oclMat &query, const oclMat &train, std::vector<DMatch> &matches, const oclMat &mask = oclMat());
// Make gpu collection of trains and masks in suitable format for matchCollection function
void makeGpuCollection(oclMat &trainCollection, oclMat &maskCollection, const std::vector<oclMat> &masks = std::vector<oclMat>());
// Find one best match from train collection for each query descriptor
void matchCollection(const oclMat &query, const oclMat &trainCollection,
oclMat &trainIdx, oclMat &imgIdx, oclMat &distance,
const oclMat &masks = oclMat());
// Download trainIdx, imgIdx and distance and convert it to vector with DMatch
static void matchDownload(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, std::vector<DMatch> &matches);
// Convert trainIdx, imgIdx and distance to vector with DMatch
static void matchConvert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance, std::vector<DMatch> &matches);
// Find one best match from train collection for each query descriptor.
void match(const oclMat &query, std::vector<DMatch> &matches, const std::vector<oclMat> &masks = std::vector<oclMat>());
// Find k best matches for each query descriptor (in increasing order of distances)
void knnMatchSingle(const oclMat &query, const oclMat &train,
oclMat &trainIdx, oclMat &distance, oclMat &allDist, int k,
const oclMat &mask = oclMat());
// Download trainIdx and distance and convert it to vector with DMatch
// compactResult is used when mask is not empty. If compactResult is false matches
// vector will have the same size as queryDescriptors rows. If compactResult is true
// matches vector will not contain matches for fully masked out query descriptors.
static void knnMatchDownload(const oclMat &trainIdx, const oclMat &distance,
std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
// Convert trainIdx and distance to vector with DMatch
static void knnMatchConvert(const Mat &trainIdx, const Mat &distance,
std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
// Find k best matches for each query descriptor (in increasing order of distances).
// compactResult is used when mask is not empty. If compactResult is false matches
// vector will have the same size as queryDescriptors rows. If compactResult is true
// matches vector will not contain matches for fully masked out query descriptors.
void knnMatch(const oclMat &query, const oclMat &train,
std::vector< std::vector<DMatch> > &matches, int k, const oclMat &mask = oclMat(),
bool compactResult = false);
// Find k best matches from train collection for each query descriptor (in increasing order of distances)
void knnMatch2Collection(const oclMat &query, const oclMat &trainCollection,
oclMat &trainIdx, oclMat &imgIdx, oclMat &distance,
const oclMat &maskCollection = oclMat());
// Download trainIdx and distance and convert it to vector with DMatch
// compactResult is used when mask is not empty. If compactResult is false matches
// vector will have the same size as queryDescriptors rows. If compactResult is true
// matches vector will not contain matches for fully masked out query descriptors.
static void knnMatch2Download(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance,
std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
// Convert trainIdx and distance to vector with DMatch
static void knnMatch2Convert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance,
std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
// Find k best matches for each query descriptor (in increasing order of distances).
// compactResult is used when mask is not empty. If compactResult is false matches
// vector will have the same size as queryDescriptors rows. If compactResult is true
// matches vector will not contain matches for fully masked out query descriptors.
void knnMatch(const oclMat &query, std::vector< std::vector<DMatch> > &matches, int k,
const std::vector<oclMat> &masks = std::vector<oclMat>(), bool compactResult = false);
// Find best matches for each query descriptor which have distance less than maxDistance.
// nMatches.at<int>(0, queryIdx) will contain matches count for queryIdx.
// Note that nMatches can be greater than trainIdx.cols - it means that the matcher didn't find all matches,
// because it didn't have enough memory.
// If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nTrain / 100), 10),
// otherwise the user can pass their own allocated trainIdx and distance with size nQuery x nMaxMatches.
// Matches are not sorted.
void radiusMatchSingle(const oclMat &query, const oclMat &train,
oclMat &trainIdx, oclMat &distance, oclMat &nMatches, float maxDistance,
const oclMat &mask = oclMat());
// Download trainIdx, nMatches and distance and convert it to vector with DMatch.
// matches will be sorted in increasing order of distances.
// compactResult is used when mask is not empty. If compactResult is false matches
// vector will have the same size as queryDescriptors rows. If compactResult is true
// matches vector will not contain matches for fully masked out query descriptors.
static void radiusMatchDownload(const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches,
std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
// Convert trainIdx, nMatches and distance to vector with DMatch.
static void radiusMatchConvert(const Mat &trainIdx, const Mat &distance, const Mat &nMatches,
std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
// Find best matches for each query descriptor which have distance less than maxDistance
// (in increasing order of distances).
void radiusMatch(const oclMat &query, const oclMat &train,
std::vector< std::vector<DMatch> > &matches, float maxDistance,
const oclMat &mask = oclMat(), bool compactResult = false);
// Find best matches for each query descriptor which have distance less than maxDistance.
// If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nQuery / 100), 10),
// otherwise the user can pass their own allocated trainIdx and distance with size nQuery x nMaxMatches.
// Matches are not sorted.
void radiusMatchCollection(const oclMat &query, oclMat &trainIdx, oclMat &imgIdx, oclMat &distance, oclMat &nMatches, float maxDistance,
const std::vector<oclMat> &masks = std::vector<oclMat>());
// Download trainIdx, imgIdx, nMatches and distance and convert it to vector with DMatch.
// matches will be sorted in increasing order of distances.
// compactResult is used when mask is not empty. If compactResult is false matches
// vector will have the same size as queryDescriptors rows. If compactResult is true
// matches vector will not contain matches for fully masked out query descriptors.
static void radiusMatchDownload(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, const oclMat &nMatches,
std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
// Convert trainIdx, nMatches and distance to vector with DMatch.
static void radiusMatchConvert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance, const Mat &nMatches,
std::vector< std::vector<DMatch> > &matches, bool compactResult = false);
// Find best matches from train collection for each query descriptor which have distance less than
// maxDistance (in increasing order of distances).
void radiusMatch(const oclMat &query, std::vector< std::vector<DMatch> > &matches, float maxDistance,
const std::vector<oclMat> &masks = std::vector<oclMat>(), bool compactResult = false);
DistType distType;
private:
std::vector<oclMat> trainDescCollection;
};
// Brute-force matcher whose distance type is selected at compile time by the Distance
// template argument. Only the specializations below are defined; each one simply forwards
// the matching DistType enumerator to BruteForceMatcher_OCL_base.
template <class Distance>
class CV_EXPORTS BruteForceMatcher_OCL;
// L1<T> distance -> L1Dist.
template <typename T>
class CV_EXPORTS BruteForceMatcher_OCL< L1<T> > : public BruteForceMatcher_OCL_base
{
public:
explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(L1Dist) {}
// The distance object itself is unused (unnamed parameter); only its type matters.
explicit BruteForceMatcher_OCL(L1<T> /*d*/) : BruteForceMatcher_OCL_base(L1Dist) {}
};
// L2<T> distance -> L2Dist.
template <typename T>
class CV_EXPORTS BruteForceMatcher_OCL< L2<T> > : public BruteForceMatcher_OCL_base
{
public:
explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(L2Dist) {}
// The distance object itself is unused (unnamed parameter); only its type matters.
explicit BruteForceMatcher_OCL(L2<T> /*d*/) : BruteForceMatcher_OCL_base(L2Dist) {}
};
// Hamming distance -> HammingDist.
template <> class CV_EXPORTS BruteForceMatcher_OCL< Hamming > : public BruteForceMatcher_OCL_base
{
public:
explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(HammingDist) {}
// The distance object itself is unused (unnamed parameter); only its type matters.
explicit BruteForceMatcher_OCL(Hamming /*d*/) : BruteForceMatcher_OCL_base(HammingDist) {}
};
//! Brute-force matcher whose distance type is chosen at run time from a norm constant.
class CV_EXPORTS BFMatcher_OCL : public BruteForceMatcher_OCL_base
{
public:
    // Mapping: NORM_L1 -> L1Dist, NORM_L2 -> L2Dist, any other value -> HammingDist.
    explicit BFMatcher_OCL(int norm = NORM_L2)
        : BruteForceMatcher_OCL_base((norm == NORM_L1)
                                     ? L1Dist
                                     : ((norm == NORM_L2) ? L2Dist : HammingDist))
    {
    }
};
/////////////////////////////// PyrLKOpticalFlow /////////////////////////////////////
class CV_EXPORTS PyrLKOpticalFlow
{
public:
PyrLKOpticalFlow()
{
winSize = Size(21, 21);
maxLevel = 3;
iters = 30;
derivLambda = 0.5;
useInitialFlow = false;
minEigThreshold = 1e-4f;
getMinEigenVals = false;
isDeviceArch11_ = false;
}
void sparse(const oclMat &prevImg, const oclMat &nextImg, const oclMat &prevPts, oclMat &nextPts,
oclMat &status, oclMat *err = 0);
void dense(const oclMat &prevImg, const oclMat &nextImg, oclMat &u, oclMat &v, oclMat *err = 0);
Size winSize;
int maxLevel;
int iters;
double derivLambda;
bool useInitialFlow;
float minEigThreshold;
bool getMinEigenVals;
void releaseMemory()
{
dx_calcBuf_.release();
dy_calcBuf_.release();
prevPyr_.clear();
nextPyr_.clear();
dx_buf_.release();
dy_buf_.release();
}
private:
void calcSharrDeriv(const oclMat &src, oclMat &dx, oclMat &dy);
void buildImagePyramid(const oclMat &img0, std::vector<oclMat> &pyr, bool withBorder);
oclMat dx_calcBuf_;
oclMat dy_calcBuf_;
std::vector<oclMat> prevPyr_;
std::vector<oclMat> nextPyr_;
oclMat dx_buf_;
oclMat dy_buf_;
oclMat uPyr_[2];
oclMat vPyr_[2];
bool isDeviceArch11_;
};
//////////////// build warping maps ////////////////////
//! builds plane warping maps
@ -1706,6 +1518,7 @@ namespace cv
private:
oclMat minSSD, leBuf, riBuf;
};
class CV_EXPORTS StereoBeliefPropagation
{
public:
@ -1736,6 +1549,133 @@ namespace cv
std::vector<oclMat> datas;
oclMat out;
};
// Stereo matcher taking a left/right oclMat pair and producing a disparity oclMat.
// NOTE(review): by its name this is a constant-space belief propagation matcher,
// presumably the OpenCL counterpart of the gpu module class of the same name - confirm.
class CV_EXPORTS StereoConstantSpaceBP
{
public:
// Default values used by the first constructor below.
enum { DEFAULT_NDISP = 128 };
enum { DEFAULT_ITERS = 8 };
enum { DEFAULT_LEVELS = 4 };
enum { DEFAULT_NR_PLANE = 4 };
// Writes suggested ndisp/iters/levels/nr_plane values for an image of the given size
// into the reference parameters.
static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels, int &nr_plane);
// Constructor with defaults for every parameter (message type defaults to CV_32F).
explicit StereoConstantSpaceBP(
int ndisp = DEFAULT_NDISP,
int iters = DEFAULT_ITERS,
int levels = DEFAULT_LEVELS,
int nr_plane = DEFAULT_NR_PLANE,
int msg_type = CV_32F);
// Constructor that additionally takes the data/discontinuity cost parameters explicitly.
StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane,
float max_data_term, float data_weight, float max_disc_term, float disc_single_jump,
int min_disp_th = 0,
int msg_type = CV_32F);
// Runs the matcher on the left/right pair and writes the result to disparity.
void operator()(const oclMat &left, const oclMat &right, oclMat &disparity);
// Public parameters; the names mirror the constructor arguments.
int ndisp;
int iters;
int levels;
int nr_plane;
float max_data_term;
float data_weight;
float max_disc_term;
float disc_single_jump;
int min_disp_th;
int msg_type;
// Not settable through either constructor's parameter list; how it is initialized
// is not visible here.
bool use_local_init_data_cost;
private:
// Internal working buffers (their roles are defined by the implementation file).
oclMat u[2], d[2], l[2], r[2];
oclMat disp_selected_pyr[2];
oclMat data_cost;
oclMat data_cost_selected;
oclMat temp;
oclMat out;
};
// Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method
//
// see reference:
// [1] C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
// [2] Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
// Dual TV-L1 optical flow on oclMat images: operator() takes two images and writes
// the two flow components to flowx and flowy. The public fields are the algorithm's
// tuning parameters; their defaults are set by the constructor (not visible here).
class CV_EXPORTS OpticalFlowDual_TVL1_OCL
{
public:
OpticalFlowDual_TVL1_OCL();
// Computes the flow from I0 to I1 into flowx / flowy.
void operator ()(const oclMat& I0, const oclMat& I1, oclMat& flowx, oclMat& flowy);
// NOTE(review): presumably releases the internal buffers listed below - confirm
// against the implementation.
void collectGarbage();
/**
* Time step of the numerical scheme.
*/
double tau;
/**
* Weight parameter for the data term, attachment parameter.
* This is the most relevant parameter, which determines the smoothness of the output.
* The smaller this parameter is, the smoother the solutions we obtain.
* It depends on the range of motions of the images, so its value should be adapted to each image sequence.
*/
double lambda;
/**
* Weight parameter for (u - v)^2, tightness parameter.
* It serves as a link between the attachment and the regularization terms.
* In theory, it should have a small value in order to maintain both parts in correspondence.
* The method is stable for a large range of values of this parameter.
*/
double theta;
/**
* Number of scales used to create the pyramid of images.
*/
int nscales;
/**
* Number of warpings per scale.
* Represents the number of times that I1(x+u0) and grad( I1(x+u0) ) are computed per scale.
* This is a parameter that assures the stability of the method.
* It also affects the running time, so it is a compromise between speed and accuracy.
*/
int warps;
/**
* Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time.
* A small value will yield more accurate solutions at the expense of a slower convergence.
*/
double epsilon;
/**
* Stopping criterion iterations number used in the numerical scheme.
*/
int iterations;
// NOTE(review): presumably, when true, the flowx/flowy passed to operator() are used
// as the initial flow estimate - confirm against the implementation.
bool useInitialFlow;
private:
// Processes a single scale, updating the flow components u1 / u2 for that scale.
void procOneScale(const oclMat& I0, const oclMat& I1, oclMat& u1, oclMat& u2);
// One entry per scale for the two images and the two flow components
// (assumed from the names and nscales - TODO confirm).
std::vector<oclMat> I0s;
std::vector<oclMat> I1s;
std::vector<oclMat> u1s;
std::vector<oclMat> u2s;
// Scratch buffers kept as members.
oclMat I1x_buf;
oclMat I1y_buf;
oclMat I1w_buf;
oclMat I1wx_buf;
oclMat I1wy_buf;
oclMat grad_buf;
oclMat rho_c_buf;
oclMat p11_buf;
oclMat p12_buf;
oclMat p21_buf;
oclMat p22_buf;
oclMat diff_buf;
oclMat norm_buf;
};
}
}
#if defined _MSC_VER && _MSC_VER >= 1200

@ -45,4 +45,4 @@
#error this is a compatibility header which should not be used inside the OpenCV library
#endif
#include "opencv2/ocl.hpp"
#include "opencv2/ocl.hpp"

@ -22,6 +22,7 @@
// Jiang Liyuan, jlyuan001.good@163.com
// Rock Li, Rock.Li@amd.com
// Zailong Wu, bullet@yeah.net
// Peng Xiao, pengxiao@outlook.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
@ -61,8 +62,6 @@ namespace cv
namespace ocl
{
////////////////////////////////OpenCL kernel strings/////////////////////
extern const char *bitwise;
extern const char *bitwiseM;
extern const char *transpose_kernel;
extern const char *arithm_nonzero;
extern const char *arithm_sum;
@ -76,19 +75,11 @@ namespace cv
extern const char *arithm_add;
extern const char *arithm_add_scalar;
extern const char *arithm_add_scalar_mask;
extern const char *arithm_bitwise_binary;
extern const char *arithm_bitwise_binary_mask;
extern const char *arithm_bitwise_binary_scalar;
extern const char *arithm_bitwise_binary_scalar_mask;
extern const char *arithm_bitwise_not;
extern const char *arithm_bitwise_and;
extern const char *arithm_bitwise_and_mask;
extern const char *arithm_bitwise_and_scalar;
extern const char *arithm_bitwise_and_scalar_mask;
extern const char *arithm_bitwise_or;
extern const char *arithm_bitwise_or_mask;
extern const char *arithm_bitwise_or_scalar;
extern const char *arithm_bitwise_or_scalar_mask;
extern const char *arithm_bitwise_xor;
extern const char *arithm_bitwise_xor_mask;
extern const char *arithm_bitwise_xor_scalar;
extern const char *arithm_bitwise_xor_scalar_mask;
extern const char *arithm_compare_eq;
extern const char *arithm_compare_ne;
extern const char *arithm_mul;
@ -126,7 +117,7 @@ inline int divUp(int total, int grain)
/////////////////////// add subtract multiply divide /////////////////////////
//////////////////////////////////////////////////////////////////////////////
template<typename T>
void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst,
void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst,
String kernelName, const char **kernelString, void *_scalar, int op_type = 0)
{
if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
@ -195,12 +186,12 @@ void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst,
openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
}
}
static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst,
static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst,
String kernelName, const char **kernelString, int op_type = 0)
{
arithmetic_run<char>(src1, src2, dst, kernelName, kernelString, (void *)NULL, op_type);
}
static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask,
static void arithmetic_run(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask,
String kernelName, const char **kernelString, int op_type = 0)
{
if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
@ -295,6 +286,7 @@ void cv::ocl::multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, doub
else
arithmetic_run<float>(src1, src2, dst, "arithm_mul", &arithm_mul, (void *)(&scalar));
}
void cv::ocl::divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar)
{
@ -479,6 +471,11 @@ void cv::ocl::subtract(const Scalar &src2, const oclMat &src1, oclMat &dst, cons
arithmetic_scalar( src1, src2, dst, mask, kernelName, kernelString, -1);
}
void cv::ocl::multiply(double scalar, const oclMat &src, oclMat &dst)
{
String kernelName = "arithm_muls";
arithmetic_scalar_run( src, dst, kernelName, &arithm_mul, scalar);
}
void cv::ocl::divide(double scalar, const oclMat &src, oclMat &dst)
{
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE))
@ -1647,7 +1644,8 @@ static void bitwise_run(const oclMat &src1, oclMat &dst, String kernelName, cons
template<typename T>
void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName, const char **kernelString, void *_scalar)
void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName,
const char **kernelString, void *_scalar, const char* _opt = NULL)
{
dst.create(src1.size(), src1.type());
CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols &&
@ -1697,13 +1695,15 @@ void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String ker
args.push_back( std::make_pair( sizeof(T), (void *)&scalar ));
}
openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth, _opt);
}
static void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, String kernelName, const char **kernelString)
static void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst,
String kernelName, const char **kernelString, const char* _opt = NULL)
{
bitwise_run<char>(src1, src2, dst, kernelName, kernelString, (void *)NULL);
bitwise_run<char>(src1, src2, dst, kernelName, kernelString, (void *)NULL, _opt);
}
static void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask, String kernelName, const char **kernelString)
static void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst,
const oclMat &mask, String kernelName, const char **kernelString, const char* _opt = NULL)
{
dst.create(src1.size(), src1.type());
CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols &&
@ -1751,12 +1751,13 @@ static void bitwise_run(const oclMat &src1, const oclMat &src2, oclMat &dst, con
args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols ));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 ));
openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth);
openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, _opt);
}
template <typename WT , typename CL_WT>
void bitwise_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, String kernelName, const char **kernelString, int isMatSubScalar)
void bitwise_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst,
const oclMat &mask, String kernelName, const char **kernelString, int isMatSubScalar, const char* opt = NULL)
{
dst.create(src1.size(), src1.type());
@ -1818,14 +1819,16 @@ void bitwise_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst, con
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&isMatSubScalar));
}
openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth);
openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, channels, depth, opt);
}
typedef void (*BitwiseFuncS)(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, String kernelName, const char **kernelString, int isMatSubScalar);
typedef void (*BitwiseFuncS)(const oclMat &src1, const Scalar &src2, oclMat &dst,
const oclMat &mask, String kernelName, const char **kernelString, int isMatSubScalar, const char* opt);
static void bitwise_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, String kernelName, const char **kernelString, int isMatSubScalar)
static void bitwise_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst,
const oclMat &mask, String kernelName, const char **kernelString, int isMatSubScalar, const char* opt)
{
static BitwiseFuncS tab[8] =
{
@ -1853,11 +1856,12 @@ static void bitwise_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst,
BitwiseFuncS func = tab[src1.depth()];
if(func == 0)
cv::error(Error::StsBadArg, "Unsupported arithmetic operation", "", __FILE__, __LINE__);
func(src1, src2, dst, mask, kernelName, kernelString, isMatSubScalar);
func(src1, src2, dst, mask, kernelName, kernelString, isMatSubScalar, opt);
}
static void bitwise_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, String kernelName, const char **kernelString)
static void bitwise_scalar(const oclMat &src1, const Scalar &src2, oclMat &dst,
const oclMat &mask, String kernelName, const char **kernelString, const char * opt = NULL)
{
bitwise_scalar(src1, src2, dst, mask, kernelName, kernelString, 0);
bitwise_scalar(src1, src2, dst, mask, kernelName, kernelString, 0, opt);
}
void cv::ocl::bitwise_not(const oclMat &src, oclMat &dst)
@ -1880,12 +1884,13 @@ void cv::ocl::bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, co
std::cout << "Selected device do not support double" << std::endl;
return;
}
oclMat emptyMat;
String kernelName = mask.empty() ? "arithm_bitwise_or" : "arithm_bitwise_or_with_mask";
String kernelName = mask.empty() ? "arithm_bitwise_binary" : "arithm_bitwise_binary_with_mask";
static const char opt [] = "-D OP_BINARY=|";
if (mask.empty())
bitwise_run(src1, src2, dst, kernelName, &arithm_bitwise_or);
bitwise_run(src1, src2, dst, kernelName, &arithm_bitwise_binary, opt);
else
bitwise_run(src1, src2, dst, mask, kernelName, &arithm_bitwise_or_mask);
bitwise_run(src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_mask, opt);
}
@ -1896,11 +1901,12 @@ void cv::ocl::bitwise_or(const oclMat &src1, const Scalar &src2, oclMat &dst, co
std::cout << "Selected device do not support double" << std::endl;
return;
}
String kernelName = mask.data ? "arithm_s_bitwise_or_with_mask" : "arithm_s_bitwise_or";
static const char opt [] = "-D OP_BINARY=|";
String kernelName = mask.data ? "arithm_s_bitwise_binary_with_mask" : "arithm_s_bitwise_binary";
if (mask.data)
bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_or_scalar_mask);
bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar_mask, opt);
else
bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_or_scalar);
bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar, opt);
}
void cv::ocl::bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
@ -1913,12 +1919,13 @@ void cv::ocl::bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, c
}
oclMat emptyMat;
String kernelName = mask.empty() ? "arithm_bitwise_and" : "arithm_bitwise_and_with_mask";
String kernelName = mask.empty() ? "arithm_bitwise_binary" : "arithm_bitwise_binary_with_mask";
static const char opt [] = "-D OP_BINARY=&";
if (mask.empty())
bitwise_run(src1, src2, dst, kernelName, &arithm_bitwise_and);
bitwise_run(src1, src2, dst, kernelName, &arithm_bitwise_binary, opt);
else
bitwise_run(src1, src2, dst, mask, kernelName, &arithm_bitwise_and_mask);
bitwise_run(src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_mask, opt);
}
void cv::ocl::bitwise_and(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask)
@ -1928,11 +1935,12 @@ void cv::ocl::bitwise_and(const oclMat &src1, const Scalar &src2, oclMat &dst, c
std::cout << "Selected device do not support double" << std::endl;
return;
}
String kernelName = mask.data ? "arithm_s_bitwise_and_with_mask" : "arithm_s_bitwise_and";
static const char opt [] = "-D OP_BINARY=&";
String kernelName = mask.data ? "arithm_s_bitwise_binary_with_mask" : "arithm_s_bitwise_binary";
if (mask.data)
bitwise_scalar(src1, src2, dst, mask, kernelName, &arithm_bitwise_and_scalar_mask);
bitwise_scalar(src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar_mask, opt);
else
bitwise_scalar(src1, src2, dst, mask, kernelName, &arithm_bitwise_and_scalar);
bitwise_scalar(src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar, opt);
}
void cv::ocl::bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask)
@ -1942,14 +1950,14 @@ void cv::ocl::bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, c
std::cout << "Selected device do not support double" << std::endl;
return;
}
oclMat emptyMat;
String kernelName = mask.empty() ? "arithm_bitwise_xor" : "arithm_bitwise_xor_with_mask";
String kernelName = mask.empty() ? "arithm_bitwise_binary" : "arithm_bitwise_binary_with_mask";
static const char opt [] = "-D OP_BINARY=^";
if (mask.empty())
bitwise_run(src1, src2, dst, kernelName, &arithm_bitwise_xor);
bitwise_run(src1, src2, dst, kernelName, &arithm_bitwise_binary, opt);
else
bitwise_run(src1, src2, dst, mask, kernelName, &arithm_bitwise_xor_mask);
bitwise_run(src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_mask, opt);
}
@ -1961,11 +1969,12 @@ void cv::ocl::bitwise_xor(const oclMat &src1, const Scalar &src2, oclMat &dst, c
std::cout << "Selected device do not support double" << std::endl;
return;
}
String kernelName = mask.data ? "arithm_s_bitwise_xor_with_mask" : "arithm_s_bitwise_xor";
String kernelName = mask.data ? "arithm_s_bitwise_binary_with_mask" : "arithm_s_bitwise_binary";
static const char opt [] = "-D OP_BINARY=^";
if (mask.data)
bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_xor_scalar_mask);
bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar_mask, opt);
else
bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_xor_scalar);
bitwise_scalar( src1, src2, dst, mask, kernelName, &arithm_bitwise_binary_scalar, opt);
}
oclMat cv::ocl::operator ~ (const oclMat &src)

@ -844,8 +844,8 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatch2Collection(const oclMat &quer
if (query.empty() || trainCollection.empty())
return;
typedef void (*caller_t)(const oclMat & query, const oclMat & trains, const oclMat & masks,
const oclMat & trainIdx, const oclMat & imgIdx, const oclMat & distance);
// typedef void (*caller_t)(const oclMat & query, const oclMat & trains, const oclMat & masks,
// const oclMat & trainIdx, const oclMat & imgIdx, const oclMat & distance);
CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
@ -992,7 +992,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatch(const oclMat &query, std::vec
// radiusMatchSingle
void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchSingle(const oclMat &query, const oclMat &train,
oclMat &trainIdx, oclMat &distance, oclMat &nMatches, float maxDistance, const oclMat &mask)
oclMat &trainIdx, oclMat &distance, oclMat &nMatches, float maxDistance, const oclMat &mask)
{
if (query.empty() || train.empty())
return;
@ -1094,9 +1094,9 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchCollection(const oclMat &qu
if (query.empty() || empty())
return;
#if 0
typedef void (*caller_t)(const oclMat & query, const oclMat * trains, int n, float maxDistance, const oclMat * masks,
const oclMat & trainIdx, const oclMat & imgIdx, const oclMat & distance, const oclMat & nMatches);
#if 0
static const caller_t callers[3][6] =
{
{

@ -60,7 +60,7 @@ void cv::ocl::gemm(const oclMat &src1, const oclMat &src2, double alpha,
const oclMat &src3, double beta, oclMat &dst, int flags)
{
CV_Assert(src1.cols == src2.rows &&
(src3.empty() || src1.rows == src3.rows && src2.cols == src3.cols));
(src3.empty() || (src1.rows == src3.rows && src2.cols == src3.cols)));
CV_Assert(!(cv::GEMM_3_T & flags)); // cv::GEMM_3_T is not supported
if(!src3.empty())
{

@ -20,6 +20,7 @@
// Jia Haipeng, jiahaipeng95@gmail.com
// Wu Xinglong, wxl370@126.com
// Wang Yao, bitwangyaoyao@gmail.com
// Sen Liu, swjtuls1987@126.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
@ -144,7 +145,8 @@ typedef struct
int imgoff;
float factor;
} detect_piramid_info;
#ifdef WIN32
#if defined WIN32 && !defined __MINGW__ && !defined __MINGW32__
#define _ALIGNED_ON(_ALIGNMENT) __declspec(align(_ALIGNMENT))
typedef _ALIGNED_ON(128) struct GpuHidHaarFeature
{
@ -841,15 +843,13 @@ static void gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade
} /* j */
}
}
CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemStorage *storage, double scaleFactor,
int minNeighbors, int flags, CvSize minSize, CvSize maxSize)
{
CvHaarClassifierCascade *cascade = oldCascade;
//double alltime = (double)cvGetTickCount();
//double t = (double)cvGetTickCount();
const double GROUP_EPS = 0.2;
oclMat gtemp, gsum1, gtilted1, gsqsum1, gnormImg, gsumcanny;
CvSeq *result_seq = 0;
cv::Ptr<CvMemStorage> temp_storage;
@ -860,7 +860,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
int datasize=0;
int totalclassifier=0;
//void *out;
GpuHidHaarClassifierCascade *gcascade;
GpuHidHaarStageClassifier *stage;
GpuHidHaarClassifier *classifier;
@ -869,11 +868,8 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
int *candidate;
cl_int status;
// bool doCannyPruning = (flags & CV_HAAR_DO_CANNY_PRUNING) != 0;
bool findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0;
// bool roughSearch = (flags & CV_HAAR_DO_ROUGH_SEARCH) != 0;
//double t = 0;
if( maxSize.height == 0 || maxSize.width == 0 )
{
maxSize.height = gimg.rows;
@ -895,27 +891,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
if( findBiggestObject )
flags &= ~CV_HAAR_SCALE_IMAGE;
//gtemp = oclMat( gimg.rows, gimg.cols, CV_8UC1);
//gsum1 = oclMat( gimg.rows + 1, gimg.cols + 1, CV_32SC1 );
//gsqsum1 = oclMat( gimg.rows + 1, gimg.cols + 1, CV_32FC1 );
if( !cascade->hid_cascade )
/*out = (void *)*/gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier);
if( cascade->hid_cascade->has_tilted_features )
gtilted1 = oclMat( gimg.rows + 1, gimg.cols + 1, CV_32SC1 );
gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier);
result_seq = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvAvgComp), storage );
if( CV_MAT_CN(gimg.type()) > 1 )
{
oclMat gtemp;
cvtColor( gimg, gtemp, COLOR_BGR2GRAY );
gimg = gtemp;
}
if( findBiggestObject )
flags &= ~(CV_HAAR_SCALE_IMAGE | CV_HAAR_DO_CANNY_PRUNING);
//t = (double)cvGetTickCount() - t;
//printf( "before if time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
if( gimg.cols < minSize.width || gimg.rows < minSize.height )
CV_Error(CV_StsError, "Image too small");
@ -923,12 +912,9 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
if( (flags & CV_HAAR_SCALE_IMAGE) )
{
CvSize winSize0 = cascade->orig_window_size;
//float scalefactor = 1.1f;
//float factor = 1.f;
int totalheight = 0;
int indexy = 0;
CvSize sz;
//t = (double)cvGetTickCount();
std::vector<CvSize> sizev;
std::vector<float> scalev;
for(factor = 1.f;; factor *= scaleFactor)
@ -949,20 +935,15 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
sizev.push_back(sz);
scalev.push_back(factor);
}
//int flag = 0;
oclMat gimg1(gimg.rows, gimg.cols, CV_8UC1);
oclMat gsum(totalheight + 4, gimg.cols + 1, CV_32SC1);
oclMat gsqsum(totalheight + 4, gimg.cols + 1, CV_32FC1);
//cl_mem cascadebuffer;
cl_mem stagebuffer;
//cl_mem classifierbuffer;
cl_mem nodebuffer;
cl_mem candidatebuffer;
cl_mem scaleinfobuffer;
//cl_kernel kernel;
//kernel = openCLGetKernelFromSource(gimg.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade");
cv::Rect roi, roi2;
cv::Mat imgroi, imgroisq;
cv::ocl::oclMat resizeroi, gimgroi, gimgroisq;
@ -970,18 +951,13 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
size_t blocksize = 8;
size_t localThreads[3] = { blocksize, blocksize , 1 };
size_t globalThreads[3] = { grp_per_CU *((gsum.clCxt)->computeUnits()) *localThreads[0],
size_t globalThreads[3] = { grp_per_CU * gsum.clCxt->computeUnits() *localThreads[0],
localThreads[1], 1
};
int outputsz = 256 * globalThreads[0] / localThreads[0];
int loopcount = sizev.size();
detect_piramid_info *scaleinfo = (detect_piramid_info *)malloc(sizeof(detect_piramid_info) * loopcount);
//t = (double)cvGetTickCount() - t;
// printf( "pre time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
//int *it =scaleinfo;
// t = (double)cvGetTickCount();
for( int i = 0; i < loopcount; i++ )
{
sz = sizev[i];
@ -991,7 +967,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
resizeroi = gimg1(roi2);
gimgroi = gsum(roi);
gimgroisq = gsqsum(roi);
//scaleinfo[i].rows = gimgroi.rows;
int width = gimgroi.cols - 1 - cascade->orig_window_size.width;
int height = gimgroi.rows - 1 - cascade->orig_window_size.height;
scaleinfo[i].width_height = (width << 16) | height;
@ -999,76 +974,40 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
int grpnumperline = (width + localThreads[0] - 1) / localThreads[0];
int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline;
//outputsz +=width*height;
scaleinfo[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp;
scaleinfo[i].imgoff = gimgroi.offset >> 2;
scaleinfo[i].factor = factor;
//printf("rows = %d,ystep = %d,width = %d,height = %d,grpnumperline = %d,totalgrp = %d,imgoff = %d,factor = %f\n",
// scaleinfo[i].rows,scaleinfo[i].ystep,scaleinfo[i].width,scaleinfo[i].height,scaleinfo[i].grpnumperline,
// scaleinfo[i].totalgrp,scaleinfo[i].imgoff,scaleinfo[i].factor);
cv::ocl::resize(gimg, resizeroi, Size(sz.width - 1, sz.height - 1), 0, 0, INTER_LINEAR);
//cv::imwrite("D:\\1.jpg",gimg1);
cv::ocl::integral(resizeroi, gimgroi, gimgroisq);
//cv::ocl::oclMat chk(sz.height,sz.width,CV_32SC1),chksq(sz.height,sz.width,CV_32FC1);
//cv::ocl::integral(gimg1, chk, chksq);
//double r = cv::norm(chk,gimgroi,NORM_INF);
//if(r > std::numeric_limits<double>::epsilon())
//{
// printf("failed");
//}
indexy += sz.height;
}
//int ystep = factor > 2 ? 1 : 2;
// t = (double)cvGetTickCount() - t;
//printf( "resize integral time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
//t = (double)cvGetTickCount();
gcascade = (GpuHidHaarClassifierCascade *)cascade->hid_cascade;
stage = (GpuHidHaarStageClassifier *)(gcascade + 1);
classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
node = (GpuHidHaarTreeNode *)(classifier->node);
//int m,n;
//m = (gsum.cols - 1 - cascade->orig_window_size.width + ystep - 1)/ystep;
//n = (gsum.rows - 1 - cascade->orig_window_size.height + ystep - 1)/ystep;
//int counter = m*n;
int nodenum = (datasize - sizeof(GpuHidHaarClassifierCascade) -
sizeof(GpuHidHaarStageClassifier) * gcascade->count - sizeof(GpuHidHaarClassifier) * totalclassifier) / sizeof(GpuHidHaarTreeNode);
//if(flag == 0){
candidate = (int *)malloc(4 * sizeof(int) * outputsz);
//memset((char*)candidate,0,4*sizeof(int)*outputsz);
gpuSetImagesForHaarClassifierCascade( cascade,/* &sum1, &sqsum1, _tilted,*/ 1., gsum.step / 4 );
//cascadebuffer = clCreateBuffer(gsum.clCxt->clContext,CL_MEM_READ_ONLY,sizeof(GpuHidHaarClassifierCascade),NULL,&status);
//openCLVerifyCall(status);
//openCLSafeCall(clEnqueueWriteBuffer(gsum.clCxt->clCmdQueue,cascadebuffer,1,0,sizeof(GpuHidHaarClassifierCascade),gcascade,0,NULL,NULL));
gpuSetImagesForHaarClassifierCascade( cascade, 1., gsum.step / 4 );
stagebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(GpuHidHaarStageClassifier) * gcascade->count);
//openCLVerifyCall(status);
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
//classifierbuffer = clCreateBuffer(gsum.clCxt->clContext,CL_MEM_READ_ONLY,sizeof(GpuHidHaarClassifier)*totalclassifier,NULL,&status);
//status = clEnqueueWriteBuffer(gsum.clCxt->clCmdQueue,classifierbuffer,1,0,sizeof(GpuHidHaarClassifier)*totalclassifier,classifier,0,NULL,NULL);
cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
openCLSafeCall(clEnqueueWriteBuffer(qu, stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, nodenum * sizeof(GpuHidHaarTreeNode));
//openCLVerifyCall(status);
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), nodebuffer, 1, 0,
nodenum * sizeof(GpuHidHaarTreeNode),
openCLSafeCall(clEnqueueWriteBuffer(qu, nodebuffer, 1, 0, nodenum * sizeof(GpuHidHaarTreeNode),
node, 0, NULL, NULL));
candidatebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_WRITE_ONLY, 4 * sizeof(int) * outputsz);
//openCLVerifyCall(status);
scaleinfobuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount);
//openCLVerifyCall(status);
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL));
//flag = 1;
//}
//t = (double)cvGetTickCount() - t;
//printf( "update time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
scaleinfobuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount);
openCLSafeCall(clEnqueueWriteBuffer(qu, scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL));
//size_t globalThreads[3] = { counter+blocksize*blocksize-counter%(blocksize*blocksize),1,1};
//t = (double)cvGetTickCount();
int startstage = 0;
int endstage = gcascade->count;
int startnode = 0;
@ -1086,11 +1025,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
pq.s[3] = gcascade->pq3;
float correction = gcascade->inv_window_area;
//int grpnumperline = ((m + localThreads[0] - 1) / localThreads[0]);
//int totalgrp = ((n + localThreads[1] - 1) / localThreads[1])*grpnumperline;
// openCLVerifyKernel(gsum.clCxt, kernel, &blocksize, globalThreads, localThreads);
//openCLSafeCall(clSetKernelArg(kernel,argcount++,sizeof(cl_mem),(void*)&cascadebuffer));
std::vector<std::pair<size_t, const void *> > args;
args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&stagebuffer ));
args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&scaleinfobuffer ));
@ -1110,28 +1044,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
args.push_back ( std::make_pair(sizeof(cl_float) , (void *)&correction ));
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1);
//t = (double)cvGetTickCount() - t;
//printf( "detection time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
//t = (double)cvGetTickCount();
//openCLSafeCall(clEnqueueReadBuffer(gsum.clCxt->impl->clCmdQueue, candidatebuffer, 1, 0, 4 * sizeof(int)*outputsz, candidate, 0, NULL, NULL));
openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
for(int i = 0; i < outputsz; i++)
if(candidate[4 * i + 2] != 0)
allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1], candidate[4 * i + 2], candidate[4 * i + 3]));
// t = (double)cvGetTickCount() - t;
//printf( "post time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
//t = (double)cvGetTickCount();
allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
candidate[4 * i + 2], candidate[4 * i + 3]));
free(scaleinfo);
free(candidate);
//openCLSafeCall(clReleaseMemObject(cascadebuffer));
openCLSafeCall(clReleaseMemObject(stagebuffer));
openCLSafeCall(clReleaseMemObject(scaleinfobuffer));
openCLSafeCall(clReleaseMemObject(nodebuffer));
openCLSafeCall(clReleaseMemObject(candidatebuffer));
// openCLSafeCall(clReleaseKernel(kernel));
//t = (double)cvGetTickCount() - t;
//printf( "release time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
}
else
{
@ -1149,7 +1075,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
node = (GpuHidHaarTreeNode *)(classifier->node);
cl_mem stagebuffer;
//cl_mem classifierbuffer;
cl_mem nodebuffer;
cl_mem candidatebuffer;
cl_mem scaleinfobuffer;
@ -1184,24 +1109,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
size_t blocksize = 8;
size_t localThreads[3] = { blocksize, blocksize , 1 };
size_t globalThreads[3] = { grp_per_CU *gsum.clCxt->computeUnits() *localThreads[0],
localThreads[1], 1
};
localThreads[1], 1 };
int outputsz = 256 * globalThreads[0] / localThreads[0];
int nodenum = (datasize - sizeof(GpuHidHaarClassifierCascade) -
sizeof(GpuHidHaarStageClassifier) * gcascade->count - sizeof(GpuHidHaarClassifier) * totalclassifier) / sizeof(GpuHidHaarTreeNode);
nodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY,
nodenum * sizeof(GpuHidHaarTreeNode));
//openCLVerifyCall(status);
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), nodebuffer, 1, 0,
cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
openCLSafeCall(clEnqueueWriteBuffer(qu, nodebuffer, 1, 0,
nodenum * sizeof(GpuHidHaarTreeNode),
node, 0, NULL, NULL));
cl_mem newnodebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_WRITE,
loopcount * nodenum * sizeof(GpuHidHaarTreeNode));
int startstage = 0;
int endstage = gcascade->count;
//cl_kernel kernel;
//kernel = openCLGetKernelFromSource(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2");
//cl_kernel kernel2 = openCLGetKernelFromSource(gimg.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier");
for(int i = 0; i < loopcount; i++)
{
sz = sizev[i];
@ -1220,7 +1141,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
int height = (gsum.rows - 1 - sz.height + ystep - 1) / ystep;
int grpnumperline = (width + localThreads[0] - 1) / localThreads[0];
int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline;
//outputsz +=width*height;
scaleinfo[i].width_height = (width << 16) | height;
scaleinfo[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp;
scaleinfo[i].imgoff = 0;
@ -1238,28 +1159,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
size_t globalThreads2[3] = {nodenum, 1, 1};
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1);
//clEnqueueNDRangeKernel(gsum.clCxt->impl->clCmdQueue, kernel2, 1, NULL, globalThreads2, 0, 0, NULL, NULL);
//clFinish(gsum.clCxt->impl->clCmdQueue);
}
//clReleaseKernel(kernel2);
int step = gsum.step / 4;
int startnode = 0;
int splitstage = 3;
int splitnode = stage[0].count + stage[1].count + stage[2].count;
stagebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(GpuHidHaarStageClassifier) * gcascade->count);
//openCLVerifyCall(status);
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
openCLSafeCall(clEnqueueWriteBuffer(qu, stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
candidatebuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, 4 * sizeof(int) * outputsz);
//openCLVerifyCall(status);
scaleinfobuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount);
//openCLVerifyCall(status);
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL));
openCLSafeCall(clEnqueueWriteBuffer(qu, scaleinfobuffer, 1, 0, sizeof(detect_piramid_info)*loopcount, scaleinfo, 0, NULL, NULL));
pbuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(cl_int4) * loopcount);
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), pbuffer, 1, 0, sizeof(cl_int4)*loopcount, p, 0, NULL, NULL));
openCLSafeCall(clEnqueueWriteBuffer(qu, pbuffer, 1, 0, sizeof(cl_int4)*loopcount, p, 0, NULL, NULL));
correctionbuffer = openCLCreateBuffer(gsum.clCxt, CL_MEM_READ_ONLY, sizeof(cl_float) * loopcount);
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), correctionbuffer, 1, 0, sizeof(cl_float)*loopcount, correction, 0, NULL, NULL));
//int argcount = 0;
openCLSafeCall(clEnqueueWriteBuffer(qu, correctionbuffer, 1, 0, sizeof(cl_float)*loopcount, correction, 0, NULL, NULL));
std::vector<std::pair<size_t, const void *> > args;
args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&stagebuffer ));
@ -1268,22 +1181,21 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&candidatebuffer ));
args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&gsum.rows ));
args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&gsum.cols ));
args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&step ));
args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&loopcount ));
args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startstage ));
args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&splitstage ));
args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&endstage ));
args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startnode ));
args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&splitnode ));
args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&pbuffer ));
args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&correctionbuffer ));
args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&nodenum ));
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1);
//openCLSafeCall(clEnqueueReadBuffer(gsum.clCxt->clCmdQueue,candidatebuffer,1,0,4*sizeof(int)*outputsz,candidate,0,NULL,NULL));
candidate = (int *)clEnqueueMapBuffer((cl_command_queue)gsum.clCxt->oclCommandQueue(), candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int), 0, 0, 0, &status);
candidate = (int *)clEnqueueMapBuffer(qu, candidatebuffer, 1, CL_MAP_READ, 0, 4 * sizeof(int) * outputsz, 0, 0, 0, &status);
for(int i = 0; i < outputsz; i++)
{
@ -1294,7 +1206,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
free(scaleinfo);
free(p);
free(correction);
clEnqueueUnmapMemObject((cl_command_queue)gsum.clCxt->oclCommandQueue(), candidatebuffer, candidate, 0, 0, 0);
clEnqueueUnmapMemObject(qu, candidatebuffer, candidate, 0, 0, 0);
openCLSafeCall(clReleaseMemObject(stagebuffer));
openCLSafeCall(clReleaseMemObject(scaleinfobuffer));
openCLSafeCall(clReleaseMemObject(nodebuffer));
@ -1303,20 +1215,547 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
openCLSafeCall(clReleaseMemObject(pbuffer));
openCLSafeCall(clReleaseMemObject(correctionbuffer));
}
//t = (double)cvGetTickCount() ;
cvFree(&cascade->hid_cascade);
// printf("%d\n",globalcounter);
rectList.resize(allCandidates.size());
if(!allCandidates.empty())
std::copy(allCandidates.begin(), allCandidates.end(), rectList.begin());
//cout << "count = " << rectList.size()<< endl;
if( minNeighbors != 0 || findBiggestObject )
groupRectangles(rectList, rweights, std::max(minNeighbors, 1), GROUP_EPS);
else
rweights.resize(rectList.size(), 0);
if( findBiggestObject && rectList.size() )
{
CvAvgComp result_comp = {{0, 0, 0, 0}, 0};
for( size_t i = 0; i < rectList.size(); i++ )
{
cv::Rect r = rectList[i];
if( r.area() > cv::Rect(result_comp.rect).area() )
{
result_comp.rect = r;
result_comp.neighbors = rweights[i];
}
}
cvSeqPush( result_seq, &result_comp );
}
else
{
for( size_t i = 0; i < rectList.size(); i++ )
{
CvAvgComp c;
c.rect = rectList[i];
c.neighbors = rweights[i];
cvSeqPush( result_seq, &c );
}
}
return result_seq;
}
// OpenCL memory handles kept alive between OclCascadeClassifierBuf calls.
// The class allocates one instance with malloc(), stores it in its `buffers`
// member and casts that member back to OclBuffers* on every access.
struct OclBuffers
{
// GpuHidHaarStageClassifier records (one per cascade stage); rewritten before each detection kernel launch.
cl_mem stagebuffer;
// GpuHidHaarTreeNode records of the cascade (m_nodenum entries).
cl_mem nodebuffer;
// Kernel output: outputsz groups of 4 ints, read back as (x, y, width, height) rectangles.
cl_mem candidatebuffer;
// Passed as the second argument of both detection kernels.
// NOTE(review): presumably one detect_piramid_info per scale, as in oclHaarDetectObjects - confirm where it is filled.
cl_mem scaleinfobuffer;
// One cl_int4 per scale (m_loopcount entries); only used on the non-CV_HAAR_SCALE_IMAGE path.
cl_mem pbuffer;
// One cl_float per scale (m_loopcount entries); only used on the non-CV_HAAR_SCALE_IMAGE path.
cl_mem correctionbuffer;
// Written by the "gpuscaleclassifier" kernel and consumed by "gpuRunHaarClassifierCascade_scaled2".
cl_mem newnodebuffer;
};
// Function object that projects a CvAvgComp detection record onto its rectangle.
struct getRect
{
    // Returns the `rect` field of `e`, converted to a Rect.
    Rect operator()(const CvAvgComp &e) const
    {
        const Rect box = e.rect;
        return box;
    }
};
// Runs the Haar cascade held by this object over `gimg` on the OpenCL device,
// reusing the buffers and integral images cached on the instance.
//
// gimg         - 8-bit input image; a multi-channel image is converted to
//                grayscale and `gimg` is REPLACED by the converted image.
// faces        - receives the detections (filled by GenResult()).
// scaleFactor  - scale step between pyramid levels; validated by Init().
// minNeighbors - grouping threshold handed to groupRectangles(); 0 disables
//                grouping unless CV_HAAR_FIND_BIGGEST_OBJECT is set.
// flags        - CV_HAAR_* flags; CV_HAAR_SCALE_IMAGE selects the path that
//                rescales the image instead of the classifier.
// minSize/maxSize - window size limits, forwarded to Init().
//
// Errors are reported through CV_Error / openCLSafeCall.
void cv::ocl::OclCascadeClassifierBuf::detectMultiScale(oclMat &gimg, CV_OUT std::vector<cv::Rect>& faces,
                                                        double scaleFactor, int minNeighbors, int flags,
                                                        Size minSize, Size maxSize)
{
    // size_t constants: the thread-size arrays below are brace-initialised,
    // where an int -> size_t conversion would be a narrowing conversion.
    const size_t blocksize = 8;
    const int grp_per_CU = 12;
    size_t localThreads[3] = { blocksize, blocksize, 1 };
    size_t globalThreads[3] = { grp_per_CU * Context::getContext()->computeUnits() * localThreads[0],
                                localThreads[1],
                                1 };
    const int outputsz = static_cast<int>(256 * globalThreads[0] / localThreads[0]);

    Init(gimg.rows, gimg.cols, scaleFactor, flags, outputsz, localThreads, minSize, maxSize);

    // Init() removes CV_HAAR_SCALE_IMAGE from its own copy of the flags when
    // the biggest-object search is requested and prepares the cached buffers
    // for that mode. Apply the same masking here so the branch taken below
    // matches the buffers that were actually prepared.
    if( findBiggestObject )
        flags &= ~CV_HAAR_SCALE_IMAGE;

    const double GROUP_EPS = 0.2;

    cv::ConcurrentRectVector allCandidates;
    std::vector<cv::Rect> rectList;
    std::vector<int> rweights;

    CvHaarClassifierCascade *cascade = oldCascade;
    GpuHidHaarClassifierCascade *gcascade;
    GpuHidHaarStageClassifier *stage;
    GpuHidHaarClassifier *classifier;
    GpuHidHaarTreeNode *node;

    if( CV_MAT_DEPTH(gimg.type()) != CV_8U )
        CV_Error( CV_StsUnsupportedFormat, "Only 8-bit images are supported" );

    if( CV_MAT_CN(gimg.type()) > 1 )
    {
        oclMat gtemp;
        cvtColor( gimg, gtemp, CV_BGR2GRAY );
        gimg = gtemp;
    }

    // Typed view of the handle bundle allocated by Init()/CreateBaseBufs().
    OclBuffers *const bufs = static_cast<OclBuffers *>(buffers);
    const size_t candidateBytes = 4 * sizeof(int) * outputsz;

    if( (flags & CV_HAAR_SCALE_IMAGE) )
    {
        // Build one resized image + integral images per scale, stacked
        // vertically inside gsum / gsqsum.
        int indexy = 0;
        CvSize sz;

        cv::Rect roi, roi2;
        cv::ocl::oclMat resizeroi, gimgroi, gimgroisq;

        for( int i = 0; i < m_loopcount; i++ )
        {
            sz = sizev[i];
            roi = Rect(0, indexy, sz.width, sz.height);
            roi2 = Rect(0, 0, sz.width - 1, sz.height - 1);
            resizeroi = gimg1(roi2);
            gimgroi = gsum(roi);
            gimgroisq = gsqsum(roi);

            cv::ocl::resize(gimg, resizeroi, Size(sz.width - 1, sz.height - 1), 0, 0, INTER_LINEAR);
            cv::ocl::integral(resizeroi, gimgroi, gimgroisq);
            indexy += sz.height;
        }

        // The hidden cascade is one contiguous blob: header, stages, classifiers, nodes.
        gcascade   = (GpuHidHaarClassifierCascade *)(cascade->hid_cascade);
        stage      = (GpuHidHaarStageClassifier *)(gcascade + 1);
        classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
        node       = (GpuHidHaarTreeNode *)(classifier->node);

        gpuSetImagesForHaarClassifierCascade( cascade, 1., gsum.step / 4 );

        cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
        openCLSafeCall(clEnqueueWriteBuffer(qu, bufs->stagebuffer, 1, 0,
                                            sizeof(GpuHidHaarStageClassifier) * gcascade->count,
                                            stage, 0, NULL, NULL));

        openCLSafeCall(clEnqueueWriteBuffer(qu, bufs->nodebuffer, 1, 0,
                                            m_nodenum * sizeof(GpuHidHaarTreeNode),
                                            node, 0, NULL, NULL));

        int startstage = 0;
        int endstage = gcascade->count;
        int startnode = 0;
        int pixelstep = gsum.step / 4;
        int splitstage = 3;
        int splitnode = stage[0].count + stage[1].count + stage[2].count;
        cl_int4 p, pq;
        p.s[0] = gcascade->p0;
        p.s[1] = gcascade->p1;
        p.s[2] = gcascade->p2;
        p.s[3] = gcascade->p3;
        pq.s[0] = gcascade->pq0;
        pq.s[1] = gcascade->pq1;
        pq.s[2] = gcascade->pq2;
        pq.s[3] = gcascade->pq3;
        float correction = gcascade->inv_window_area;

        // Argument order must match the kernel signature.
        std::vector<std::pair<size_t, const void *> > args;
        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&bufs->stagebuffer ));
        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&bufs->scaleinfobuffer ));
        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&bufs->nodebuffer ));
        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&bufs->candidatebuffer ));
        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&pixelstep ));
        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&m_loopcount ));
        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startstage ));
        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&splitstage ));
        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&endstage ));
        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startnode ));
        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&splitnode ));
        args.push_back ( std::make_pair(sizeof(cl_int4) , (void *)&p ));
        args.push_back ( std::make_pair(sizeof(cl_int4) , (void *)&pq ));
        args.push_back ( std::make_pair(sizeof(cl_float) , (void *)&correction ));

        openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1);

        // Zero-initialised host copy of the result buffer; a vector (instead of
        // malloc/free) is released even if the read-back throws.
        std::vector<int> candidate(4 * static_cast<size_t>(outputsz), 0);
        int *const candidateData = candidate.empty() ? 0 : &candidate[0];
        openCLReadBuffer( gsum.clCxt, bufs->candidatebuffer, candidateData, candidateBytes );

        // A zero width marks an unused output slot.
        for(int i = 0; i < outputsz; i++)
            if(candidate[4 * i + 2] != 0)
                allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
                                             candidate[4 * i + 2], candidate[4 * i + 3]));
    }
    else
    {
        cv::ocl::integral(gimg, gsum, gsqsum);

        gpuSetHaarClassifierCascade(cascade);

        gcascade   = (GpuHidHaarClassifierCascade *)cascade->hid_cascade;
        stage      = (GpuHidHaarStageClassifier *)(gcascade + 1);
        classifier = (GpuHidHaarClassifier *)(stage + gcascade->count);
        node       = (GpuHidHaarTreeNode *)(classifier->node);

        cl_command_queue qu = (cl_command_queue)gsum.clCxt->oclCommandQueue();
        openCLSafeCall(clEnqueueWriteBuffer(qu, bufs->nodebuffer, 1, 0,
                                            m_nodenum * sizeof(GpuHidHaarTreeNode),
                                            node, 0, NULL, NULL));

        // Per-scale parameters; vectors so nothing leaks if an OpenCL call throws.
        std::vector<cl_int4> p(m_loopcount);
        std::vector<float> correction(m_loopcount);
        int startstage = 0;
        int endstage = gcascade->count;
        double factor;
        for(int i = 0; i < m_loopcount; i++)
        {
            factor = scalev[i];
            int equRect_x = (int)(factor * gcascade->p0 + 0.5);
            int equRect_y = (int)(factor * gcascade->p1 + 0.5);
            int equRect_w = (int)(factor * gcascade->p3 + 0.5);
            int equRect_h = (int)(factor * gcascade->p2 + 0.5);
            p[i].s[0] = equRect_x;
            p[i].s[1] = equRect_y;
            p[i].s[2] = equRect_x + equRect_w;
            p[i].s[3] = equRect_y + equRect_h;
            correction[i] = 1. / (equRect_w * equRect_h);
            int startnodenum = m_nodenum * i;
            float factor2 = (float)factor;

            // Scale the cascade nodes for this level into its slice of newnodebuffer.
            std::vector<std::pair<size_t, const void *> > args1;
            args1.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&bufs->nodebuffer ));
            args1.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&bufs->newnodebuffer ));
            args1.push_back ( std::make_pair(sizeof(cl_float) , (void *)&factor2 ));
            args1.push_back ( std::make_pair(sizeof(cl_float) , (void *)&correction[i] ));
            args1.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startnodenum ));

            size_t globalThreads2[3] = {static_cast<size_t>(m_nodenum), 1, 1};

            openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1);
        }

        int step = gsum.step / 4;
        int startnode = 0;
        int splitstage = 3;
        const cl_int4 *const pData = p.empty() ? 0 : &p[0];
        const float *const correctionData = correction.empty() ? 0 : &correction[0];
        openCLSafeCall(clEnqueueWriteBuffer(qu, bufs->stagebuffer, 1, 0, sizeof(GpuHidHaarStageClassifier)*gcascade->count, stage, 0, NULL, NULL));
        openCLSafeCall(clEnqueueWriteBuffer(qu, bufs->pbuffer, 1, 0, sizeof(cl_int4)*m_loopcount, pData, 0, NULL, NULL));
        openCLSafeCall(clEnqueueWriteBuffer(qu, bufs->correctionbuffer, 1, 0, sizeof(cl_float)*m_loopcount, correctionData, 0, NULL, NULL));

        // Argument order must match the kernel signature.
        std::vector<std::pair<size_t, const void *> > args;
        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&bufs->stagebuffer ));
        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&bufs->scaleinfobuffer ));
        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&bufs->newnodebuffer ));
        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&gsum.data ));
        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&gsqsum.data ));
        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&bufs->candidatebuffer ));
        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&gsum.rows ));
        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&gsum.cols ));
        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&step ));
        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&m_loopcount ));
        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startstage ));
        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&splitstage ));
        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&endstage ));
        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startnode ));
        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&bufs->pbuffer ));
        args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&bufs->correctionbuffer ));
        args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&m_nodenum ));

        openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2", globalThreads, localThreads, args, -1, -1);

        // Blocking map of the result buffer. On failure clEnqueueMapBuffer
        // returns NULL and reports the reason through errcode_ret, so the
        // status has to be checked before the pointer is dereferenced.
        cl_int status = 0;
        int *candidate = (int *)clEnqueueMapBuffer(qu, bufs->candidatebuffer, 1, CL_MAP_READ, 0, candidateBytes, 0, 0, 0, &status);
        openCLSafeCall(status);

        // A zero width marks an unused output slot.
        for(int i = 0; i < outputsz; i++)
        {
            if(candidate[4 * i + 2] != 0)
                allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
                                             candidate[4 * i + 2], candidate[4 * i + 3]));
        }

        openCLSafeCall(clEnqueueUnmapMemObject(qu, bufs->candidatebuffer, candidate, 0, 0, 0));
    }

    rectList.resize(allCandidates.size());
    if(!allCandidates.empty())
        std::copy(allCandidates.begin(), allCandidates.end(), rectList.begin());

    if( minNeighbors != 0 || findBiggestObject )
        groupRectangles(rectList, rweights, std::max(minNeighbors, 1), GROUP_EPS);
    else
        rweights.resize(rectList.size(), 0);

    GenResult(faces, rectList, rweights);
}
void cv::ocl::OclCascadeClassifierBuf::Init(const int rows, const int cols,
double scaleFactor, int flags,
const int outputsz, const size_t localThreads[],
CvSize minSize, CvSize maxSize)
{
CvHaarClassifierCascade *cascade = oldCascade;
if( !CV_IS_HAAR_CLASSIFIER(cascade) )
CV_Error( !cascade ? CV_StsNullPtr : CV_StsBadArg, "Invalid classifier cascade" );
if( scaleFactor <= 1 )
CV_Error( CV_StsOutOfRange, "scale factor must be > 1" );
if( cols < minSize.width || rows < minSize.height )
CV_Error(CV_StsError, "Image too small");
int datasize=0;
int totalclassifier=0;
if( !cascade->hid_cascade )
gpuCreateHidHaarClassifierCascade(cascade, &datasize, &totalclassifier);
if( maxSize.height == 0 || maxSize.width == 0 )
{
maxSize.height = rows;
maxSize.width = cols;
}
findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0;
if( findBiggestObject )
flags &= ~(CV_HAAR_SCALE_IMAGE | CV_HAAR_DO_CANNY_PRUNING);
CreateBaseBufs(datasize, totalclassifier, flags, outputsz);
CreateFactorRelatedBufs(rows, cols, flags, scaleFactor, localThreads, minSize, maxSize);
m_scaleFactor = scaleFactor;
m_rows = rows;
m_cols = cols;
m_flags = flags;
m_minSize = minSize;
m_maxSize = maxSize;
initialized = true;
}
// Creates the OpenCL buffers that do not depend on the image size or scale
// factor.
//
// On the first call (object not yet initialized) the OclBuffers bundle is
// allocated, m_nodenum is derived from the size of the hidden cascade blob,
// and the stage and node buffers are created. The candidate (result) buffer
// is created on every call, so a buffer left over from a previous call is
// released first.
//
// datasize        - byte size of the hidden cascade blob.
// totalclassifier - number of GpuHidHaarClassifier records inside that blob.
// flags           - CV_HAAR_* flags; CV_HAAR_SCALE_IMAGE selects how the
//                   candidate buffer is allocated.
// outputsz        - number of 4-int candidate slots.
void cv::ocl::OclCascadeClassifierBuf::CreateBaseBufs(const int datasize, const int totalclassifier,
                                                      const int flags, const int outputsz)
{
    if (!initialized)
    {
        buffers = malloc(sizeof(OclBuffers));

        // Blob layout: header, stage records, classifier records, tree nodes.
        // Whatever is left after the first three parts is the node array.
        size_t tempSize =
            sizeof(GpuHidHaarStageClassifier) * ((GpuHidHaarClassifierCascade *)oldCascade->hid_cascade)->count;
        m_nodenum = (datasize - sizeof(GpuHidHaarClassifierCascade) - tempSize - sizeof(GpuHidHaarClassifier) * totalclassifier)
                    / sizeof(GpuHidHaarTreeNode);

        ((OclBuffers *)buffers)->stagebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, tempSize);
        ((OclBuffers *)buffers)->nodebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, m_nodenum * sizeof(GpuHidHaarTreeNode));
    }
    else
    {
        // A candidate buffer from the previous call is still alive. Release it
        // unconditionally: a new one is created below on every call, and
        // overwriting the handle without releasing it leaked one OpenCL buffer
        // per call whenever the CV_HAAR_SCALE_IMAGE mode had not changed.
        openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->candidatebuffer));
    }

    if (flags & CV_HAAR_SCALE_IMAGE)
    {
        // Results are copied back with openCLReadBuffer on this path.
        ((OclBuffers *)buffers)->candidatebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(),
                                                                      CL_MEM_WRITE_ONLY,
                                                                      4 * sizeof(int) * outputsz);
    }
    else
    {
        // Results are accessed through clEnqueueMapBuffer on this path.
        ((OclBuffers *)buffers)->candidatebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(),
                                                                      CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
                                                                      4 * sizeof(int) * outputsz);
    }
}
void cv::ocl::OclCascadeClassifierBuf::CreateFactorRelatedBufs(
const int rows, const int cols, const int flags,
const double scaleFactor, const size_t localThreads[],
CvSize minSize, CvSize maxSize)
{
if (initialized)
{
if ((m_flags & CV_HAAR_SCALE_IMAGE) && !(flags & CV_HAAR_SCALE_IMAGE))
{
gimg1.release();
gsum.release();
gsqsum.release();
}
else if (!(m_flags & CV_HAAR_SCALE_IMAGE) && (flags & CV_HAAR_SCALE_IMAGE))
{
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer));
}
else if ((m_flags & CV_HAAR_SCALE_IMAGE) && (flags & CV_HAAR_SCALE_IMAGE))
{
if (fabs(m_scaleFactor - scaleFactor) < 1e-6
&& (rows == m_rows && cols == m_cols)
&& (minSize.width == m_minSize.width)
&& (minSize.height == m_minSize.height)
&& (maxSize.width == m_maxSize.width)
&& (maxSize.height == m_maxSize.height))
{
return;
}
}
else
{
if (fabs(m_scaleFactor - scaleFactor) < 1e-6
&& (rows == m_rows && cols == m_cols)
&& (minSize.width == m_minSize.width)
&& (minSize.height == m_minSize.height)
&& (maxSize.width == m_maxSize.width)
&& (maxSize.height == m_maxSize.height))
{
return;
}
else
{
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer));
}
}
}
int loopcount;
int indexy = 0;
int totalheight = 0;
double factor;
Rect roi;
CvSize sz;
CvSize winSize0 = oldCascade->orig_window_size;
detect_piramid_info *scaleinfo;
if (flags & CV_HAAR_SCALE_IMAGE)
{
for(factor = 1.f;; factor *= scaleFactor)
{
CvSize winSize = { cvRound(winSize0.width * factor), cvRound(winSize0.height * factor) };
sz.width = cvRound( cols / factor ) + 1;
sz.height = cvRound( rows / factor ) + 1;
CvSize sz1 = { sz.width - winSize0.width - 1, sz.height - winSize0.height - 1 };
if( sz1.width <= 0 || sz1.height <= 0 )
break;
if( winSize.width > maxSize.width || winSize.height > maxSize.height )
break;
if( winSize.width < minSize.width || winSize.height < minSize.height )
continue;
totalheight += sz.height;
sizev.push_back(sz);
scalev.push_back(static_cast<float>(factor));
}
loopcount = sizev.size();
gimg1.create(rows, cols, CV_8UC1);
gsum.create(totalheight + 4, cols + 1, CV_32SC1);
gsqsum.create(totalheight + 4, cols + 1, CV_32FC1);
scaleinfo = (detect_piramid_info *)malloc(sizeof(detect_piramid_info) * loopcount);
for( int i = 0; i < loopcount; i++ )
{
sz = sizev[i];
roi = Rect(0, indexy, sz.width, sz.height);
int width = sz.width - 1 - oldCascade->orig_window_size.width;
int height = sz.height - 1 - oldCascade->orig_window_size.height;
int grpnumperline = (width + localThreads[0] - 1) / localThreads[0];
int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline;
((detect_piramid_info *)scaleinfo)[i].width_height = (width << 16) | height;
((detect_piramid_info *)scaleinfo)[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp;
((detect_piramid_info *)scaleinfo)[i].imgoff = gsum(roi).offset >> 2;
((detect_piramid_info *)scaleinfo)[i].factor = scalev[i];
indexy += sz.height;
}
}
else
{
for(factor = 1;
cvRound(factor * winSize0.width) < cols - 10 && cvRound(factor * winSize0.height) < rows - 10;
factor *= scaleFactor)
{
CvSize winSize = { cvRound( winSize0.width * factor ), cvRound( winSize0.height * factor ) };
if( winSize.width < minSize.width || winSize.height < minSize.height )
{
continue;
}
sizev.push_back(winSize);
scalev.push_back(factor);
}
loopcount = scalev.size();
if(loopcount == 0)
{
loopcount = 1;
sizev.push_back(minSize);
scalev.push_back( min(cvRound(minSize.width / winSize0.width), cvRound(minSize.height / winSize0.height)) );
}
((OclBuffers *)buffers)->pbuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY,
sizeof(cl_int4) * loopcount);
((OclBuffers *)buffers)->correctionbuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY,
sizeof(cl_float) * loopcount);
((OclBuffers *)buffers)->newnodebuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_WRITE,
loopcount * m_nodenum * sizeof(GpuHidHaarTreeNode));
scaleinfo = (detect_piramid_info *)malloc(sizeof(detect_piramid_info) * loopcount);
for( int i = 0; i < loopcount; i++ )
{
sz = sizev[i];
factor = scalev[i];
int ystep = cvRound(std::max(2., factor));
int width = (cols - 1 - sz.width + ystep - 1) / ystep;
int height = (rows - 1 - sz.height + ystep - 1) / ystep;
int grpnumperline = (width + localThreads[0] - 1) / localThreads[0];
int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline;
((detect_piramid_info *)scaleinfo)[i].width_height = (width << 16) | height;
((detect_piramid_info *)scaleinfo)[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp;
((detect_piramid_info *)scaleinfo)[i].imgoff = 0;
((detect_piramid_info *)scaleinfo)[i].factor = factor;
}
}
if (loopcount != m_loopcount)
{
if (initialized)
{
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->scaleinfobuffer));
}
((OclBuffers *)buffers)->scaleinfobuffer = openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, sizeof(detect_piramid_info) * loopcount);
}
openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)cv::ocl::Context::getContext()->oclCommandQueue(), ((OclBuffers *)buffers)->scaleinfobuffer, 1, 0,
sizeof(detect_piramid_info)*loopcount,
scaleinfo, 0, NULL, NULL));
free(scaleinfo);
m_loopcount = loopcount;
}
void cv::ocl::OclCascadeClassifierBuf::GenResult(CV_OUT std::vector<cv::Rect>& faces,
const std::vector<cv::Rect> &rectList,
const std::vector<int> &rweights)
{
CvSeq *result_seq = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvAvgComp), cvCreateMemStorage(0) );
if( findBiggestObject && rectList.size() )
{
@ -1343,13 +1782,34 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
cvSeqPush( result_seq, &c );
}
}
//t = (double)cvGetTickCount() - t;
//printf( "get face time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
//alltime = (double)cvGetTickCount() - alltime;
//printf( "all time = %g ms\n", alltime/((double)cvGetTickFrequency()*1000.) );
return result_seq;
vector<CvAvgComp> vecAvgComp;
Seq<CvAvgComp>(result_seq).copyTo(vecAvgComp);
faces.resize(vecAvgComp.size());
std::transform(vecAvgComp.begin(), vecAvgComp.end(), faces.begin(), getRect());
}
void cv::ocl::OclCascadeClassifierBuf::release()
{
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->stagebuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->scaleinfobuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->nodebuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->candidatebuffer));
if( (m_flags & CV_HAAR_SCALE_IMAGE) )
{
cvFree(&oldCascade->hid_cascade);
}
else
{
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer));
openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer));
}
free(buffers);
buffers = NULL;
}
#ifndef _MAX_PATH
#define _MAX_PATH 1024

@ -1012,10 +1012,8 @@ namespace cv
warpPerspective_gpu(src, dst, coeffs, interpolation);
}
////////////////////////////////////////////////////////////////////////
// integral
void integral(const oclMat &src, oclMat &sum, oclMat &sqsum)
{
CV_Assert(src.type() == CV_8UC1);
@ -1029,13 +1027,24 @@ namespace cv
int vcols = (pre_invalid + src.cols + vlen - 1) / vlen;
oclMat t_sum , t_sqsum;
t_sum.create(src.cols, src.rows, CV_32SC1);
t_sqsum.create(src.cols, src.rows, CV_32FC1);
int w = src.cols + 1, h = src.rows + 1;
sum.create(h, w, CV_32SC1);
int depth;
if( src.cols * src.rows <= 2901 * 2901 ) //2901 is the maximum size for int when all values are 255
{
t_sum.create(src.cols, src.rows, CV_32SC1);
sum.create(h, w, CV_32SC1);
}
else
{
//Use float to prevent overflow
t_sum.create(src.cols, src.rows, CV_32FC1);
sum.create(h, w, CV_32FC1);
}
t_sqsum.create(src.cols, src.rows, CV_32FC1);
sqsum.create(h, w, CV_32FC1);
int sum_offset = sum.offset / vlen, sqsum_offset = sqsum.offset / vlen;
depth = sum.depth();
int sum_offset = sum.offset / vlen;
int sqsum_offset = sqsum.offset / vlen;
std::vector<std::pair<size_t , const void *> > args;
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
@ -1048,7 +1057,7 @@ namespace cv
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.step ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sum.step));
size_t gt[3] = {((vcols + 1) / 2) * 256, 1, 1}, lt[3] = {256, 1, 1};
openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_cols", gt, lt, args, -1, -1);
openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_cols", gt, lt, args, -1, depth);
args.clear();
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&t_sqsum.data ));
@ -1062,9 +1071,9 @@ namespace cv
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sum_offset));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sqsum_offset));
size_t gt2[3] = {t_sum.cols * 32, 1, 1}, lt2[3] = {256, 1, 1};
openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_rows", gt2, lt2, args, -1, -1);
//std::cout << "tested" << std::endl;
openCLExecuteKernel(src.clCxt, &imgproc_integral, "integral_rows", gt2, lt2, args, -1, depth);
}
void integral(const oclMat &src, oclMat &sum)
{
CV_Assert(src.type() == CV_8UC1);
@ -1074,10 +1083,18 @@ namespace cv
int vcols = (pre_invalid + src.cols + vlen - 1) / vlen;
oclMat t_sum;
t_sum.create(src.cols, src.rows, CV_32SC1);
int w = src.cols + 1, h = src.rows + 1;
sum.create(h, w, CV_32SC1);
int depth;
if(src.cols * src.rows <= 2901 * 2901)
{
t_sum.create(src.cols, src.rows, CV_32SC1);
sum.create(h, w, CV_32SC1);
}else
{
t_sum.create(src.cols, src.rows, CV_32FC1);
sum.create(h, w, CV_32FC1);
}
depth = sum.depth();
int sum_offset = sum.offset / vlen;
std::vector<std::pair<size_t , const void *> > args;
@ -1090,7 +1107,7 @@ namespace cv
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.step ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&t_sum.step));
size_t gt[3] = {((vcols + 1) / 2) * 256, 1, 1}, lt[3] = {256, 1, 1};
openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_cols", gt, lt, args, -1, -1);
openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_cols", gt, lt, args, -1, depth);
args.clear();
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&t_sum.data ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&sum.data ));
@ -1100,7 +1117,7 @@ namespace cv
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sum.step));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&sum_offset));
size_t gt2[3] = {t_sum.cols * 32, 1, 1}, lt2[3] = {256, 1, 1};
openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_rows", gt2, lt2, args, -1, -1);
openCLExecuteKernel(src.clCxt, &imgproc_integral_sum, "integral_sum_rows", gt2, lt2, args, -1, depth);
//std::cout << "tested" << std::endl;
}

@ -128,6 +128,8 @@ namespace cv
std::vector<cl_device_id> devices;
std::vector<String> devName;
String platName;
String clVersion;
cl_context oclcontext;
cl_command_queue clCmdQueue;
int devnum;
@ -260,7 +262,7 @@ namespace cv
int setDevMemType(DevMemRW rw_type, DevMemType mem_type)
{
if( (mem_type == DEVICE_MEM_PM &&
if( (mem_type == DEVICE_MEM_PM &&
Context::getContext()->impl->unified_memory == 0) )
return -1;
gDeviceMemRW = rw_type;
@ -303,6 +305,7 @@ namespace cv
const static int max_name_length = 256;
char deviceName[max_name_length];
char plfmName[max_name_length];
char clVersion[256];
for (unsigned i = 0; i < numPlatforms; ++i)
{
@ -322,6 +325,8 @@ namespace cv
ocltmpinfo.PlatformName = String(plfmName);
ocltmpinfo.impl->platName = String(plfmName);
ocltmpinfo.impl->oclplatform = platforms[i];
openCLSafeCall(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, sizeof(clVersion), clVersion, NULL));
ocltmpinfo.impl->clVersion = clVersion;
for(unsigned j = 0; j < numsdev; ++j)
{
ocltmpinfo.impl->devices.push_back(devices[j]);
@ -424,13 +429,13 @@ namespace cv
}
void openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch,
size_t widthInBytes, size_t height,
size_t widthInBytes, size_t height,
DevMemRW rw_type, DevMemType mem_type, void* hptr)
{
cl_int status;
if(hptr && (mem_type==DEVICE_MEM_UHP || mem_type==DEVICE_MEM_CHP))
*dev_ptr = clCreateBuffer(clCxt->impl->oclcontext,
gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
*dev_ptr = clCreateBuffer(clCxt->impl->oclcontext,
gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
widthInBytes * height, hptr, &status);
else
*dev_ptr = clCreateBuffer(clCxt->impl->oclcontext, gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
@ -985,6 +990,8 @@ namespace cv
return impl->double_support == 1;
case CL_UNIFIED_MEM:
return impl->unified_memory == 1;
case CL_VER_1_2:
return impl->clVersion.find("OpenCL 1.2") != String::npos;
default:
return false;
}

@ -196,7 +196,7 @@ void cv::ocl::oclMat::upload(const Mat &m)
// try to use host ptr
createEx(wholeSize, m.type(), gDeviceMemRW, gDeviceMemType, m.datastart);
if(gDeviceMemType!=DEVICE_MEM_UHP && gDeviceMemType!=DEVICE_MEM_CHP)
openCLMemcpy2D(clCxt, data, step, m.datastart, m.step,
openCLMemcpy2D(clCxt, data, step, m.datastart, m.step,
wholeSize.width * elemSize(), wholeSize.height, clMemcpyHostToDevice);
}
@ -571,11 +571,16 @@ static void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, Stri
CV_Error(Error::StsUnsupportedFormat, "unknown depth");
}
#ifdef CL_VERSION_1_2
if(dst.offset == 0 && dst.cols == dst.wholecols)
//this enables backwards portability to
//run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
if(Context::getContext()->supportsFeature(Context::CL_VER_1_2) &&
dst.offset == 0 && dst.cols == dst.wholecols)
{
clEnqueueFillBuffer((cl_command_queue)dst.clCxt->oclCommandQueue(), (cl_mem)dst.data, args[0].second, args[0].first, 0, dst.step * dst.rows, 0, NULL, NULL);
clEnqueueFillBuffer((cl_command_queue)dst.clCxt->oclCommandQueue(),
(cl_mem)dst.data, args[0].second, args[0].first, 0, dst.step * dst.rows, 0, NULL, NULL);
}
else
#endif
{
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols ));
@ -583,17 +588,8 @@ static void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, Stri
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&step_in_pixel ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset_in_pixel));
openCLExecuteKernel(dst.clCxt , &operator_setTo, kernelName, globalThreads,
localThreads, args, -1, -1, compile_option);
localThreads, args, -1, -1, compile_option);
}
#else
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&step_in_pixel ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset_in_pixel));
openCLExecuteKernel(dst.clCxt , &operator_setTo, kernelName, globalThreads,
localThreads, args, -1, -1, compile_option);
#endif
}
static void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat &mask, String kernelName)
@ -887,7 +883,7 @@ oclMat cv::ocl::oclMat::reshape(int new_cn, int new_rows) const
}
void cv::ocl::oclMat::createEx(Size size, int type,
void cv::ocl::oclMat::createEx(Size size, int type,
DevMemRW rw_type, DevMemType mem_type, void* hptr)
{
createEx(size.height, size.width, type, rw_type, mem_type, hptr);
@ -898,7 +894,7 @@ void cv::ocl::oclMat::create(int _rows, int _cols, int _type)
createEx(_rows, _cols, _type, gDeviceMemRW, gDeviceMemType);
}
void cv::ocl::oclMat::createEx(int _rows, int _cols, int _type,
void cv::ocl::oclMat::createEx(int _rows, int _cols, int _type,
DevMemRW rw_type, DevMemType mem_type, void* hptr)
{
clCxt = Context::getContext();
@ -919,7 +915,7 @@ void cv::ocl::oclMat::createEx(int _rows, int _cols, int _type,
size_t esz = elemSize();
void *dev_ptr;
openCLMallocPitchEx(clCxt, &dev_ptr, &step, GPU_MATRIX_MALLOC_STEP(esz * cols),
openCLMallocPitchEx(clCxt, &dev_ptr, &step, GPU_MATRIX_MALLOC_STEP(esz * cols),
rows, rw_type, mem_type, hptr);
if (esz * cols == step)

@ -43,11 +43,10 @@
//
//M*/
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
#include "precomp.hpp"
#ifndef CL_VERSION_1_2
#define CL_VERSION_1_2 0
#endif
using namespace std;
namespace cv
{
@ -160,30 +159,44 @@ namespace cv
CV_Error(-1, "Image forma is not supported");
break;
}
#if CL_VERSION_1_2
cl_image_desc desc;
desc.image_type = CL_MEM_OBJECT_IMAGE2D;
desc.image_width = mat.cols;
desc.image_height = mat.rows;
desc.image_depth = 0;
desc.image_array_size = 1;
desc.image_row_pitch = 0;
desc.image_slice_pitch = 0;
desc.buffer = NULL;
desc.num_mip_levels = 0;
desc.num_samples = 0;
texture = clCreateImage((cl_context)mat.clCxt->oclContext(), CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
#else
texture = clCreateImage2D(
(cl_context)mat.clCxt->oclContext(),
CL_MEM_READ_WRITE,
&format,
mat.cols,
mat.rows,
0,
NULL,
&err);
#ifdef CL_VERSION_1_2
//this enables backwards portability to
//run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
if(Context::getContext()->supportsFeature(Context::CL_VER_1_2))
{
cl_image_desc desc;
desc.image_type = CL_MEM_OBJECT_IMAGE2D;
desc.image_width = mat.cols;
desc.image_height = mat.rows;
desc.image_depth = 0;
desc.image_array_size = 1;
desc.image_row_pitch = 0;
desc.image_slice_pitch = 0;
desc.buffer = NULL;
desc.num_mip_levels = 0;
desc.num_samples = 0;
texture = clCreateImage((cl_context)mat.clCxt->oclContext(), CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
}
else
#endif
{
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
texture = clCreateImage2D(
(cl_context)mat.clCxt->oclContext(),
CL_MEM_READ_WRITE,
&format,
mat.cols,
mat.rows,
0,
NULL,
&err);
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
}
size_t origin[] = { 0, 0, 0 };
size_t region[] = { mat.cols, mat.rows, 1 };
@ -196,7 +209,7 @@ namespace cv
clEnqueueCopyBufferRect((cl_command_queue)mat.clCxt->oclCommandQueue(), (cl_mem)mat.data, devData, origin, origin,
regin, mat.step, 0, mat.cols * mat.elemSize(), 0, 0, NULL, NULL);
clFlush((cl_command_queue)mat.clCxt->oclCommandQueue());
}
}
else
{
devData = (cl_mem)mat.data;
@ -212,7 +225,6 @@ namespace cv
openCLSafeCall(err);
return texture;
}
void releaseTexture(cl_mem& texture)
{
openCLFree(texture);

@ -330,7 +330,7 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
mom->m12 = dstsum[8];
mom->m03 = dstsum[9];
delete [] dstsum;
openCLSafeCall(clReleaseMemObject(sum));
icvCompleteMomentState( mom );
}

@ -1,966 +0,0 @@
////////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jiang Liyuan, jlyuan001.good@163.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_AND////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************and with scalar without mask**************************************/
// Bitwise AND of uchar data with the scalar src2.x (no mask).
// Each work-item processes one group of four destination bytes; the in-row byte
// offset of the group is rounded down to a multiple of 4. Bytes of the group that lie
// outside [row_begin, row_end) keep the value already stored in dst.
__kernel void arithm_s_bitwise_and_C1_D0 (
    __global uchar *src1, int src1_step, int src1_offset,
    __global uchar *dst,  int dst_step,  int dst_offset,
    uchar4 src2, int rows, int cols, int dst_step1)
{
    const int gx  = get_global_id(0);
    const int row = get_global_id(1);

    if (gx >= cols || row >= rows)
        return;

    const int col = gx << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
    const int src_pos   = mad24(row, src1_step, col + src1_offset - dst_align);
    const int row_begin = mad24(row, dst_step, dst_offset);
    const int row_end   = mad24(row, dst_step, dst_offset + dst_step1);
    // '+' binds tighter than '&': the mask applies to the sum.
    const int dst_pos   = mad24(row, dst_step, (dst_offset + col) & (int)0xfffffffc);

    const uchar4 lhs   = vload4(0, src1 + src_pos);
    const uchar4 rhs   = (uchar4)(src2.x, src2.x, src2.x, src2.x);
    const uchar4 anded = lhs & rhs;

    uchar4 out = *((__global uchar4 *)(dst + dst_pos));

    if ((dst_pos + 0 >= row_begin) && (dst_pos + 0 < row_end)) out.x = anded.x;
    if ((dst_pos + 1 >= row_begin) && (dst_pos + 1 < row_end)) out.y = anded.y;
    if ((dst_pos + 2 >= row_begin) && (dst_pos + 2 < row_end)) out.z = anded.z;
    if ((dst_pos + 3 >= row_begin) && (dst_pos + 3 < row_end)) out.w = anded.w;

    *((__global uchar4 *)(dst + dst_pos)) = out;
}
// Bitwise AND of char data with the scalar src2.x (no mask).
// Same scheme as the uchar variant: one work-item per group of four destination
// bytes, with bytes outside [row_begin, row_end) left untouched.
__kernel void arithm_s_bitwise_and_C1_D1 (
    __global char *src1, int src1_step, int src1_offset,
    __global char *dst,  int dst_step,  int dst_offset,
    char4 src2, int rows, int cols, int dst_step1)
{
    const int gx  = get_global_id(0);
    const int row = get_global_id(1);

    if (gx >= cols || row >= rows)
        return;

    const int col = gx << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
    const int src_pos   = mad24(row, src1_step, col + src1_offset - dst_align);
    const int row_begin = mad24(row, dst_step, dst_offset);
    const int row_end   = mad24(row, dst_step, dst_offset + dst_step1);
    // '+' binds tighter than '&': the mask applies to the sum.
    const int dst_pos   = mad24(row, dst_step, (dst_offset + col) & (int)0xfffffffc);

    const char4 lhs   = vload4(0, src1 + src_pos);
    const char4 rhs   = (char4)(src2.x, src2.x, src2.x, src2.x);
    const char4 anded = lhs & rhs;

    char4 out = *((__global char4 *)(dst + dst_pos));

    if ((dst_pos + 0 >= row_begin) && (dst_pos + 0 < row_end)) out.x = anded.x;
    if ((dst_pos + 1 >= row_begin) && (dst_pos + 1 < row_end)) out.y = anded.y;
    if ((dst_pos + 2 >= row_begin) && (dst_pos + 2 < row_end)) out.z = anded.z;
    if ((dst_pos + 3 >= row_begin) && (dst_pos + 3 < row_end)) out.w = anded.w;

    *((__global char4 *)(dst + dst_pos)) = out;
}
// Bitwise AND of ushort data with the scalar src2.x (no mask).
// Each work-item processes two 16-bit elements (four bytes). The first element is
// written only if it starts at or after row_begin, the second only if it starts
// before row_end; otherwise the value already in dst is kept.
__kernel void arithm_s_bitwise_and_C1_D2 (
    __global ushort *src1, int src1_step, int src1_offset,
    __global ushort *dst,  int dst_step,  int dst_offset,
    ushort4 src2, int rows, int cols, int dst_step1)
{
    const int gx  = get_global_id(0);
    const int row = get_global_id(1);

    if (gx >= cols || row >= rows)
        return;

    const int col      = gx << 1;   // element index
    const int byte_col = col << 1;  // byte offset of that element

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
    const int src_pos   = mad24(row, src1_step, byte_col + src1_offset - (dst_align << 1));
    const int row_begin = mad24(row, dst_step, dst_offset);
    const int row_end   = mad24(row, dst_step, dst_offset + dst_step1);
    // '+' binds tighter than '&': the mask applies to the sum.
    const int dst_pos   = mad24(row, dst_step, (dst_offset + byte_col) & (int)0xfffffffc);

    const ushort2 lhs   = vload2(0, (__global ushort *)((__global char *)src1 + src_pos));
    const ushort2 rhs   = (ushort2)(src2.x, src2.x);
    const ushort2 anded = lhs & rhs;

    ushort2 out = *((__global ushort2 *)((__global uchar *)dst + dst_pos));

    if (dst_pos + 0 >= row_begin) out.x = anded.x;
    if (dst_pos + 2 <  row_end)   out.y = anded.y;

    *((__global ushort2 *)((__global uchar *)dst + dst_pos)) = out;
}
// Bitwise AND of short data with the scalar src2.x (no mask).
// Each work-item processes two 16-bit elements (four bytes). The first element is
// written only if it starts at or after row_begin, the second only if it starts
// before row_end; otherwise the value already in dst is kept.
__kernel void arithm_s_bitwise_and_C1_D3 (
    __global short *src1, int src1_step, int src1_offset,
    __global short *dst,  int dst_step,  int dst_offset,
    short4 src2, int rows, int cols, int dst_step1)
{
    const int gx  = get_global_id(0);
    const int row = get_global_id(1);

    if (gx >= cols || row >= rows)
        return;

    const int col      = gx << 1;   // element index
    const int byte_col = col << 1;  // byte offset of that element

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
    const int src_pos   = mad24(row, src1_step, byte_col + src1_offset - (dst_align << 1));
    const int row_begin = mad24(row, dst_step, dst_offset);
    const int row_end   = mad24(row, dst_step, dst_offset + dst_step1);
    // '+' binds tighter than '&': the mask applies to the sum.
    const int dst_pos   = mad24(row, dst_step, (dst_offset + byte_col) & (int)0xfffffffc);

    const short2 lhs   = vload2(0, (__global short *)((__global char *)src1 + src_pos));
    const short2 rhs   = (short2)(src2.x, src2.x);
    const short2 anded = lhs & rhs;

    short2 out = *((__global short2 *)((__global uchar *)dst + dst_pos));

    if (dst_pos + 0 >= row_begin) out.x = anded.x;
    if (dst_pos + 2 <  row_end)   out.y = anded.y;

    *((__global short2 *)((__global uchar *)dst + dst_pos)) = out;
}
// Bitwise AND of int data with the scalar src2.x (no mask).
// One work-item per 4-byte element; offsets and steps are applied as byte counts.
__kernel void arithm_s_bitwise_and_C1_D4 (
    __global int *src1, int src1_step, int src1_offset,
    __global int *dst,  int dst_step,  int dst_offset,
    int4 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (col >= cols || row >= rows)
        return;

    const int byte_col = col << 2;
    const int src_pos  = mad24(row, src1_step, byte_col + src1_offset);
    const int dst_pos  = mad24(row, dst_step,  byte_col + dst_offset);

    const int lhs = *((__global int *)((__global char *)src1 + src_pos));
    const int rhs = src2.x;

    *((__global int *)((__global char *)dst + dst_pos)) = lhs & rhs;
}
// Bitwise AND of 4-byte elements, handled as raw bytes, with the first four bytes of
// src2 (s0..s3). One work-item per element; each of the four bytes is written only if
// its position lies inside [row_begin, row_end), otherwise dst keeps its value.
__kernel void arithm_s_bitwise_and_C1_D5 (
    __global char *src1, int src1_step, int src1_offset,
    __global char *dst,  int dst_step,  int dst_offset,
    char16 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (col >= cols || row >= rows)
        return;

    const int byte_col  = col << 2;
    const int src_pos   = mad24(row, src1_step, byte_col + src1_offset);
    const int row_begin = mad24(row, dst_step, dst_offset);
    const int row_end   = mad24(row, dst_step, dst_offset + dst_step1);
    const int dst_pos   = mad24(row, dst_step, byte_col + dst_offset);

    const char4 lhs   = *((__global char4 *)((__global char *)src1 + src_pos));
    const char4 rhs   = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
    const char4 anded = lhs & rhs;

    char4 out = *((__global char4 *)((__global char *)dst + dst_pos));

    if ((dst_pos + 0 >= row_begin) && (dst_pos + 0 < row_end)) out.x = anded.x;
    if ((dst_pos + 1 >= row_begin) && (dst_pos + 1 < row_end)) out.y = anded.y;
    if ((dst_pos + 2 >= row_begin) && (dst_pos + 2 < row_end)) out.z = anded.z;
    if ((dst_pos + 3 >= row_begin) && (dst_pos + 3 < row_end)) out.w = anded.w;

    *((__global char4 *)((__global char *)dst + dst_pos)) = out;
}
#if defined (DOUBLE_SUPPORT)
// Bitwise AND of 8-byte elements, handled as four shorts, with the first four shorts
// of src2 (s0..s3). One work-item per element; compiled only when DOUBLE_SUPPORT is
// defined.
__kernel void arithm_s_bitwise_and_C1_D6 (
    __global short *src1, int src1_step, int src1_offset,
    __global short *dst,  int dst_step,  int dst_offset,
    short16 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (col >= cols || row >= rows)
        return;

    const int byte_col = col << 3;
    const int src_pos  = mad24(row, src1_step, byte_col + src1_offset);
    const int dst_pos  = mad24(row, dst_step,  byte_col + dst_offset);

    const short4 lhs = *((__global short4 *)((__global char *)src1 + src_pos));
    const short4 rhs = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);

    *((__global short4 *)((__global char *)dst + dst_pos)) = lhs & rhs;
}
#endif
// Bitwise AND of two-channel uchar data with the scalar (src2.x, src2.y), no mask.
// Each work-item processes two pixels (four bytes). The first pixel is written only
// if it starts at or after row_begin, the second only if it starts before row_end;
// otherwise the value already in dst is kept.
__kernel void arithm_s_bitwise_and_C2_D0 (
    __global uchar *src1, int src1_step, int src1_offset,
    __global uchar *dst,  int dst_step,  int dst_offset,
    uchar4 src2, int rows, int cols, int dst_step1)
{
    const int gx  = get_global_id(0);
    const int row = get_global_id(1);

    if (gx >= cols || row >= rows)
        return;

    const int col      = gx << 1;   // pixel index
    const int byte_col = col << 1;  // byte offset of that pixel

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
    const int src_pos   = mad24(row, src1_step, byte_col + src1_offset - (dst_align << 1));
    const int row_begin = mad24(row, dst_step, dst_offset);
    const int row_end   = mad24(row, dst_step, dst_offset + dst_step1);
    // '+' binds tighter than '&': the mask applies to the sum.
    const int dst_pos   = mad24(row, dst_step, (dst_offset + byte_col) & (int)0xfffffffc);

    const uchar4 lhs   = vload4(0, src1 + src_pos);
    const uchar4 rhs   = (uchar4)(src2.x, src2.y, src2.x, src2.y);
    const uchar4 anded = lhs & rhs;

    uchar4 out = *((__global uchar4 *)(dst + dst_pos));

    if (dst_pos + 0 >= row_begin) out.xy = anded.xy;
    if (dst_pos + 2 <  row_end)   out.zw = anded.zw;

    *((__global uchar4 *)(dst + dst_pos)) = out;
}
// Bitwise AND of two-channel char data with the scalar (src2.x, src2.y), no mask.
// Each work-item processes two pixels (four bytes). The first pixel is written only
// if it starts at or after row_begin, the second only if it starts before row_end;
// otherwise the value already in dst is kept.
__kernel void arithm_s_bitwise_and_C2_D1 (
    __global char *src1, int src1_step, int src1_offset,
    __global char *dst,  int dst_step,  int dst_offset,
    char4 src2, int rows, int cols, int dst_step1)
{
    const int gx  = get_global_id(0);
    const int row = get_global_id(1);

    if (gx >= cols || row >= rows)
        return;

    const int col      = gx << 1;   // pixel index
    const int byte_col = col << 1;  // byte offset of that pixel

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
    const int src_pos   = mad24(row, src1_step, byte_col + src1_offset - (dst_align << 1));
    const int row_begin = mad24(row, dst_step, dst_offset);
    const int row_end   = mad24(row, dst_step, dst_offset + dst_step1);
    // '+' binds tighter than '&': the mask applies to the sum.
    const int dst_pos   = mad24(row, dst_step, (dst_offset + byte_col) & (int)0xfffffffc);

    const char4 lhs   = vload4(0, src1 + src_pos);
    const char4 rhs   = (char4)(src2.x, src2.y, src2.x, src2.y);
    const char4 anded = lhs & rhs;

    char4 out = *((__global char4 *)(dst + dst_pos));

    if (dst_pos + 0 >= row_begin) out.xy = anded.xy;
    if (dst_pos + 2 <  row_end)   out.zw = anded.zw;

    *((__global char4 *)(dst + dst_pos)) = out;
}
// Bitwise AND of two-channel ushort data with the scalar (src2.x, src2.y), no mask.
// One work-item per 4-byte pixel; offsets and steps are applied as byte counts.
__kernel void arithm_s_bitwise_and_C2_D2 (
    __global ushort *src1, int src1_step, int src1_offset,
    __global ushort *dst,  int dst_step,  int dst_offset,
    ushort4 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (col >= cols || row >= rows)
        return;

    const int byte_col = col << 2;
    const int src_pos  = mad24(row, src1_step, byte_col + src1_offset);
    const int dst_pos  = mad24(row, dst_step,  byte_col + dst_offset);

    const ushort2 lhs = *((__global ushort2 *)((__global char *)src1 + src_pos));
    const ushort2 rhs = (ushort2)(src2.x, src2.y);

    *((__global ushort2 *)((__global char *)dst + dst_pos)) = lhs & rhs;
}
// Bitwise AND of two-channel short data with the scalar (src2.x, src2.y), no mask.
// One work-item per 4-byte pixel; offsets and steps are applied as byte counts.
__kernel void arithm_s_bitwise_and_C2_D3 (
    __global short *src1, int src1_step, int src1_offset,
    __global short *dst,  int dst_step,  int dst_offset,
    short4 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (col >= cols || row >= rows)
        return;

    const int byte_col = col << 2;
    const int src_pos  = mad24(row, src1_step, byte_col + src1_offset);
    const int dst_pos  = mad24(row, dst_step,  byte_col + dst_offset);

    const short2 lhs = *((__global short2 *)((__global char *)src1 + src_pos));
    const short2 rhs = (short2)(src2.x, src2.y);

    *((__global short2 *)((__global char *)dst + dst_pos)) = lhs & rhs;
}
// dst = src1 & scalar, one int2 (8 bytes) per work-item.
// Only the first two lanes of src2 take part in the operation.
// Steps and offsets are byte quantities: they are added to char pointers.
__kernel void arithm_s_bitwise_and_C2_D4 (
    __global int *src1, int src1_step, int src1_offset,
    __global int *dst,  int dst_step,  int dst_offset,
    int4 src2, int rows, int cols, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (x >= cols || y >= rows)
        return;

    // x << 3: every work-item advances by 8 bytes within the row.
    const int src_pos = mad24(y, src1_step, (x << 3) + src1_offset);
    const int dst_pos = mad24(y, dst_step,  (x << 3) + dst_offset);

    __global int2 *in  = (__global int2 *)((__global char *)src1 + src_pos);
    __global int2 *out = (__global int2 *)((__global char *)dst  + dst_pos);

    const int2 scalar = src2.xy;
    const int2 masked = *in & scalar;
    *out = masked;
}
// dst = src1 & scalar on raw bytes, 8 bytes (one char8) per work-item.
// The scalar arrives as 16 bytes; only its lower 8 (s0..s7) are used.
__kernel void arithm_s_bitwise_and_C2_D5 (
    __global char *src1, int src1_step, int src1_offset,
    __global char *dst,  int dst_step,  int dst_offset,
    char16 src2, int rows, int cols, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (x >= cols || y >= rows)
        return;

    // x << 3: every work-item advances by 8 bytes within the row.
    const int src_pos = mad24(y, src1_step, (x << 3) + src1_offset);
    const int dst_pos = mad24(y, dst_step,  (x << 3) + dst_offset);

    __global char8 *in  = (__global char8 *)((__global char *)src1 + src_pos);
    __global char8 *out = (__global char8 *)((__global char *)dst  + dst_pos);

    // .lo of a 16-lane vector is lanes s0..s7.
    const char8 scalar = src2.lo;
    const char8 masked = *in & scalar;
    *out = masked;
}
#if defined (DOUBLE_SUPPORT)
// dst = src1 & scalar on 16-bit words, 16 bytes (one short8) per work-item.
// The scalar arrives as 16 shorts; only its lower 8 (s0..s7) are used.
// Compiled only when DOUBLE_SUPPORT is defined.
__kernel void arithm_s_bitwise_and_C2_D6 (
    __global short *src1, int src1_step, int src1_offset,
    __global short *dst,  int dst_step,  int dst_offset,
    short16 src2, int rows, int cols, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (x >= cols || y >= rows)
        return;

    // x << 4: every work-item advances by 16 bytes within the row.
    const int src_pos = mad24(y, src1_step, (x << 4) + src1_offset);
    const int dst_pos = mad24(y, dst_step,  (x << 4) + dst_offset);

    __global short8 *in  = (__global short8 *)((__global char *)src1 + src_pos);
    __global short8 *out = (__global short8 *)((__global char *)dst  + dst_pos);

    // .lo of a 16-lane vector is lanes s0..s7.
    const short8 scalar = src2.lo;
    const short8 masked = *in & scalar;
    *out = masked;
}
#endif
// dst = src1 & scalar for uchar data processed as (x, y, z) triplets (C3 variant).
// Each work-item covers 12 consecutive bytes = four triplets, handled as three
// uchar4 vectors. The scalar triplet (src2.x, src2.y, src2.z) is repeated across
// those 12 bytes. A triplet is written only when its first byte lies inside
// [row start, row start + dst_step1); otherwise dst keeps its previous contents.
__kernel void arithm_s_bitwise_and_C3_D0 (
    __global uchar *src1, int src1_step, int src1_offset,
    __global uchar *dst,  int dst_step,  int dst_offset,
    uchar4 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int y   = get_global_id(1);

    if (col >= cols || y >= rows)
        return;

    // Index of the first of the four triplets handled here.
    const int x = col << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
    const int src_pos   = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
    const int row_begin = mad24(y, dst_step, dst_offset);
    const int row_end   = mad24(y, dst_step, dst_offset + dst_step1);
    const int dst_pos   = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));

    __global uchar4 *out0 = (__global uchar4 *)(dst + dst_pos + 0);
    __global uchar4 *out1 = (__global uchar4 *)(dst + dst_pos + 4);
    __global uchar4 *out2 = (__global uchar4 *)(dst + dst_pos + 8);

    // Scalar pattern over the 12 bytes: xyz xyz xyz xyz, cut into groups of four.
    const uchar4 and0 = vload4(0, src1 + src_pos + 0) & src2.xyzx;
    const uchar4 and1 = vload4(0, src1 + src_pos + 4) & src2.yzxy;
    const uchar4 and2 = vload4(0, src1 + src_pos + 8) & src2.zxyz;

    uchar4 res0 = *out0;
    uchar4 res1 = *out1;
    uchar4 res2 = *out2;

    // Per-triplet write permission (byte offsets 0, 3, 6, 9).
    // The first triplet is only checked against the row start.
    const bool keep0 = (dst_pos + 0 >= row_begin);
    const bool keep1 = (dst_pos + 3 >= row_begin) && (dst_pos + 3 < row_end);
    const bool keep2 = (dst_pos + 6 >= row_begin) && (dst_pos + 6 < row_end);
    const bool keep3 = (dst_pos + 9 >= row_begin) && (dst_pos + 9 < row_end);

    if (keep0)
        res0.xyz = and0.xyz;
    if (keep1)
    {
        res0.w  = and0.w;
        res1.xy = and1.xy;
    }
    if (keep2)
    {
        res1.zw = and1.zw;
        res2.x  = and2.x;
    }
    if (keep3)
        res2.yzw = and2.yzw;

    *out0 = res0;
    *out1 = res1;
    *out2 = res2;
}
// dst = src1 & scalar for char data processed as (x, y, z) triplets (C3 variant).
// Each work-item covers 12 consecutive bytes = four triplets, handled as three
// char4 vectors. The scalar triplet (src2.x, src2.y, src2.z) is repeated across
// those 12 bytes. A triplet is written only when its first byte lies inside
// [row start, row start + dst_step1); otherwise dst keeps its previous contents.
//
// The AND is applied directly to the signed lanes. Routing the operands through
// convert_uchar4_sat / convert_char4_sat clamps negative inputs to 0 and results
// above 127 to 127, which corrupts the bit pattern (e.g. -1 & -1 would give 0).
__kernel void arithm_s_bitwise_and_C3_D1 (
    __global char *src1, int src1_step, int src1_offset,
    __global char *dst,  int dst_step,  int dst_offset,
    char4 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int y   = get_global_id(1);

    if (col >= cols || y >= rows)
        return;

    // Index of the first of the four triplets handled here.
    const int x = col << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
    const int src_pos   = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
    const int row_begin = mad24(y, dst_step, dst_offset);
    const int row_end   = mad24(y, dst_step, dst_offset + dst_step1);
    const int dst_pos   = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));

    __global char4 *out0 = (__global char4 *)(dst + dst_pos + 0);
    __global char4 *out1 = (__global char4 *)(dst + dst_pos + 4);
    __global char4 *out2 = (__global char4 *)(dst + dst_pos + 8);

    // Scalar pattern over the 12 bytes: xyz xyz xyz xyz, cut into groups of four.
    const char4 and0 = vload4(0, src1 + src_pos + 0) & src2.xyzx;
    const char4 and1 = vload4(0, src1 + src_pos + 4) & src2.yzxy;
    const char4 and2 = vload4(0, src1 + src_pos + 8) & src2.zxyz;

    char4 res0 = *out0;
    char4 res1 = *out1;
    char4 res2 = *out2;

    // Per-triplet write permission (byte offsets 0, 3, 6, 9).
    // The first triplet is only checked against the row start.
    const bool keep0 = (dst_pos + 0 >= row_begin);
    const bool keep1 = (dst_pos + 3 >= row_begin) && (dst_pos + 3 < row_end);
    const bool keep2 = (dst_pos + 6 >= row_begin) && (dst_pos + 6 < row_end);
    const bool keep3 = (dst_pos + 9 >= row_begin) && (dst_pos + 9 < row_end);

    if (keep0)
        res0.xyz = and0.xyz;
    if (keep1)
    {
        res0.w  = and0.w;
        res1.xy = and1.xy;
    }
    if (keep2)
    {
        res1.zw = and1.zw;
        res2.x  = and2.x;
    }
    if (keep3)
        res2.yzw = and2.yzw;

    *out0 = res0;
    *out1 = res1;
    *out2 = res2;
}
// dst = src1 & scalar for ushort data processed as (x, y, z) triplets (C3 variant).
// Each work-item covers 12 consecutive bytes = two triplets, handled as three
// ushort2 vectors. The scalar triplet (src2.x, src2.y, src2.z) is repeated twice.
// Lanes whose guarding byte offset falls outside the row keep dst's old value.
__kernel void arithm_s_bitwise_and_C3_D2 (
    __global ushort *src1, int src1_step, int src1_offset,
    __global ushort *dst,  int dst_step,  int dst_offset,
    ushort4 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int y   = get_global_id(1);

    if (col >= cols || y >= rows)
        return;

    // Index of the first of the two triplets handled here.
    const int x = col << 1;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
    const int src_pos   = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
    const int row_begin = mad24(y, dst_step, dst_offset);
    const int row_end   = mad24(y, dst_step, dst_offset + dst_step1);
    const int dst_pos   = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));

    __global char *in_bytes  = (__global char *)src1 + src_pos;
    __global char *out_bytes = (__global char *)dst  + dst_pos;

    __global ushort2 *out0 = (__global ushort2 *)(out_bytes + 0);
    __global ushort2 *out1 = (__global ushort2 *)(out_bytes + 4);
    __global ushort2 *out2 = (__global ushort2 *)(out_bytes + 8);

    // Scalar pattern over the six elements: xy | zx | yz.
    const ushort2 and0 = vload2(0, (__global ushort *)(in_bytes + 0)) & src2.xy;
    const ushort2 and1 = vload2(0, (__global ushort *)(in_bytes + 4)) & src2.zx;
    const ushort2 and2 = vload2(0, (__global ushort *)(in_bytes + 8)) & src2.yz;

    ushort2 res0 = *out0;
    ushort2 res1 = *out1;
    ushort2 res2 = *out2;

    // First triplet (byte offset 0): its leading two elements are only checked
    // against the row start, its third element against both bounds.
    if (dst_pos + 0 >= row_begin)
        res0.xy = and0.xy;
    if ((dst_pos + 0 >= row_begin) && (dst_pos + 0 < row_end))
        res1.x = and1.x;

    // Second triplet (byte offset 6).
    if ((dst_pos + 6 >= row_begin) && (dst_pos + 6 < row_end))
    {
        res1.y  = and1.y;
        res2.xy = and2.xy;
    }

    *out0 = res0;
    *out1 = res1;
    *out2 = res2;
}
// dst = src1 & scalar for short data processed as (x, y, z) triplets (C3 variant).
// Each work-item covers 12 consecutive bytes = two triplets, handled as three
// short2 vectors. The scalar triplet (src2.x, src2.y, src2.z) is repeated twice.
// Lanes whose guarding byte offset falls outside the row keep dst's old value.
__kernel void arithm_s_bitwise_and_C3_D3 (
    __global short *src1, int src1_step, int src1_offset,
    __global short *dst,  int dst_step,  int dst_offset,
    short4 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int y   = get_global_id(1);

    if (col >= cols || y >= rows)
        return;

    // Index of the first of the two triplets handled here.
    const int x = col << 1;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
    const int src_pos   = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
    const int row_begin = mad24(y, dst_step, dst_offset);
    const int row_end   = mad24(y, dst_step, dst_offset + dst_step1);
    const int dst_pos   = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));

    __global char *in_bytes  = (__global char *)src1 + src_pos;
    __global char *out_bytes = (__global char *)dst  + dst_pos;

    __global short2 *out0 = (__global short2 *)(out_bytes + 0);
    __global short2 *out1 = (__global short2 *)(out_bytes + 4);
    __global short2 *out2 = (__global short2 *)(out_bytes + 8);

    // Scalar pattern over the six elements: xy | zx | yz.
    const short2 and0 = vload2(0, (__global short *)(in_bytes + 0)) & src2.xy;
    const short2 and1 = vload2(0, (__global short *)(in_bytes + 4)) & src2.zx;
    const short2 and2 = vload2(0, (__global short *)(in_bytes + 8)) & src2.yz;

    short2 res0 = *out0;
    short2 res1 = *out1;
    short2 res2 = *out2;

    // First triplet (byte offset 0): its leading two elements are only checked
    // against the row start, its third element against both bounds.
    if (dst_pos + 0 >= row_begin)
        res0.xy = and0.xy;
    if ((dst_pos + 0 >= row_begin) && (dst_pos + 0 < row_end))
        res1.x = and1.x;

    // Second triplet (byte offset 6).
    if ((dst_pos + 6 >= row_begin) && (dst_pos + 6 < row_end))
    {
        res1.y  = and1.y;
        res2.xy = and2.xy;
    }

    *out0 = res0;
    *out1 = res1;
    *out2 = res2;
}
// dst = src1 & scalar, three ints (12 bytes) per work-item.
// Lanes x, y, z of src2 are applied to the three consecutive ints; src2.w is unused.
// Every int is overwritten unconditionally, so dst is never read here
// (the previous version loaded three dst values and discarded them).
__kernel void arithm_s_bitwise_and_C3_D4 (
    __global int *src1, int src1_step, int src1_offset,
    __global int *dst,  int dst_step,  int dst_offset,
    int4 src2, int rows, int cols, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (x >= cols || y >= rows)
        return;

    // x * 12: every work-item advances by 12 bytes within the row.
    const int src_pos = mad24(y, src1_step, (x * 12) + src1_offset);
    const int dst_pos = mad24(y, dst_step, dst_offset + (x * 12));

    __global int *in  = (__global int *)((__global char *)src1 + src_pos);
    __global int *out = (__global int *)((__global char *)dst  + dst_pos);

    // Finish all loads before the first store.
    const int and0 = in[0] & src2.x;
    const int and1 = in[1] & src2.y;
    const int and2 = in[2] & src2.z;

    out[0] = and0;
    out[1] = and1;
    out[2] = and2;
}
// dst = src1 & scalar on raw bytes, 12 bytes (three char4 groups) per work-item.
// The scalar arrives as 16 bytes; only its first 12 (s0..sB) are used.
// Every byte is overwritten unconditionally, so dst is never read here
// (the previous version loaded three dst vectors and discarded them).
__kernel void arithm_s_bitwise_and_C3_D5 (
    __global char *src1, int src1_step, int src1_offset,
    __global char *dst,  int dst_step,  int dst_offset,
    char16 src2, int rows, int cols, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (x >= cols || y >= rows)
        return;

    // x * 12: every work-item advances by 12 bytes within the row.
    const int src_pos = mad24(y, src1_step, (x * 12) + src1_offset);
    const int dst_pos = mad24(y, dst_step, dst_offset + (x * 12));

    __global char4 *in  = (__global char4 *)((__global char *)src1 + src_pos);
    __global char4 *out = (__global char4 *)((__global char *)dst  + dst_pos);

    // Finish all loads before the first store.
    const char4 and0 = in[0] & src2.s0123;
    const char4 and1 = in[1] & src2.s4567;
    const char4 and2 = in[2] & src2.s89ab;

    out[0] = and0;
    out[1] = and1;
    out[2] = and2;
}
#if defined (DOUBLE_SUPPORT)
// dst = src1 & scalar on 16-bit words, 24 bytes (three short4 groups) per work-item.
// The scalar arrives as 16 shorts; only its first 12 (s0..sb) are used.
// Every word is overwritten unconditionally, so dst is never read here
// (the previous version loaded three dst vectors and discarded them).
// Compiled only when DOUBLE_SUPPORT is defined.
__kernel void arithm_s_bitwise_and_C3_D6 (
    __global short *src1, int src1_step, int src1_offset,
    __global short *dst,  int dst_step,  int dst_offset,
    short16 src2, int rows, int cols, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (x >= cols || y >= rows)
        return;

    // x * 24: every work-item advances by 24 bytes within the row.
    const int src_pos = mad24(y, src1_step, (x * 24) + src1_offset);
    const int dst_pos = mad24(y, dst_step, dst_offset + (x * 24));

    __global short4 *in  = (__global short4 *)((__global char *)src1 + src_pos);
    __global short4 *out = (__global short4 *)((__global char *)dst  + dst_pos);

    // Finish all loads before the first store.
    const short4 and0 = in[0] & src2.s0123;
    const short4 and1 = in[1] & src2.s4567;
    const short4 and2 = in[2] & src2.s89ab;

    out[0] = and0;
    out[1] = and1;
    out[2] = and2;
}
#endif
// dst = src1 & scalar, one uchar4 (4 bytes) per work-item.
// All four lanes of src2 are used as-is.
__kernel void arithm_s_bitwise_and_C4_D0 (
    __global uchar *src1, int src1_step, int src1_offset,
    __global uchar *dst,  int dst_step,  int dst_offset,
    uchar4 src2, int rows, int cols, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (x >= cols || y >= rows)
        return;

    // x << 2: every work-item advances by 4 bytes within the row.
    const int src_pos = mad24(y, src1_step, (x << 2) + src1_offset);
    const int dst_pos = mad24(y, dst_step,  (x << 2) + dst_offset);

    __global uchar4 *in  = (__global uchar4 *)(src1 + src_pos);
    __global uchar4 *out = (__global uchar4 *)(dst  + dst_pos);

    const uchar4 masked = *in & src2;
    *out = masked;
}
// dst = src1 & scalar, one char4 (4 bytes) per work-item.
// All four lanes of src2 are used as-is.
__kernel void arithm_s_bitwise_and_C4_D1 (
    __global char *src1, int src1_step, int src1_offset,
    __global char *dst,  int dst_step,  int dst_offset,
    char4 src2, int rows, int cols, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (x >= cols || y >= rows)
        return;

    // x << 2: every work-item advances by 4 bytes within the row.
    const int src_pos = mad24(y, src1_step, (x << 2) + src1_offset);
    const int dst_pos = mad24(y, dst_step,  (x << 2) + dst_offset);

    __global char4 *in  = (__global char4 *)(src1 + src_pos);
    __global char4 *out = (__global char4 *)(dst  + dst_pos);

    const char4 masked = *in & src2;
    *out = masked;
}
// dst = src1 & scalar, one ushort4 (8 bytes) per work-item.
// All four lanes of src2 are used as-is.
// Steps and offsets are byte quantities: they are added to char pointers.
__kernel void arithm_s_bitwise_and_C4_D2 (
    __global ushort *src1, int src1_step, int src1_offset,
    __global ushort *dst,  int dst_step,  int dst_offset,
    ushort4 src2, int rows, int cols, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (x >= cols || y >= rows)
        return;

    // x << 3: every work-item advances by 8 bytes within the row.
    const int src_pos = mad24(y, src1_step, (x << 3) + src1_offset);
    const int dst_pos = mad24(y, dst_step,  (x << 3) + dst_offset);

    __global ushort4 *in  = (__global ushort4 *)((__global char *)src1 + src_pos);
    __global ushort4 *out = (__global ushort4 *)((__global char *)dst  + dst_pos);

    const ushort4 masked = *in & src2;
    *out = masked;
}
// dst = src1 & scalar, one short4 (8 bytes) per work-item.
// All four lanes of src2 are used as-is.
// Steps and offsets are byte quantities: they are added to char pointers.
__kernel void arithm_s_bitwise_and_C4_D3 (
    __global short *src1, int src1_step, int src1_offset,
    __global short *dst,  int dst_step,  int dst_offset,
    short4 src2, int rows, int cols, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (x >= cols || y >= rows)
        return;

    // x << 3: every work-item advances by 8 bytes within the row.
    const int src_pos = mad24(y, src1_step, (x << 3) + src1_offset);
    const int dst_pos = mad24(y, dst_step,  (x << 3) + dst_offset);

    __global short4 *in  = (__global short4 *)((__global char *)src1 + src_pos);
    __global short4 *out = (__global short4 *)((__global char *)dst  + dst_pos);

    const short4 masked = *in & src2;
    *out = masked;
}
// dst = src1 & scalar, one int4 (16 bytes) per work-item.
// All four lanes of src2 are used as-is.
// Steps and offsets are byte quantities: they are added to char pointers.
__kernel void arithm_s_bitwise_and_C4_D4 (
    __global int *src1, int src1_step, int src1_offset,
    __global int *dst,  int dst_step,  int dst_offset,
    int4 src2, int rows, int cols, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (x >= cols || y >= rows)
        return;

    // x << 4: every work-item advances by 16 bytes within the row.
    const int src_pos = mad24(y, src1_step, (x << 4) + src1_offset);
    const int dst_pos = mad24(y, dst_step,  (x << 4) + dst_offset);

    __global int4 *in  = (__global int4 *)((__global char *)src1 + src_pos);
    __global int4 *out = (__global int4 *)((__global char *)dst  + dst_pos);

    const int4 masked = *in & src2;
    *out = masked;
}
// dst = src1 & scalar on raw bytes, 16 bytes (one char16) per work-item.
// All 16 bytes of src2 take part, lane for lane.
__kernel void arithm_s_bitwise_and_C4_D5 (
    __global char *src1, int src1_step, int src1_offset,
    __global char *dst,  int dst_step,  int dst_offset,
    char16 src2, int rows, int cols, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (x >= cols || y >= rows)
        return;

    // x << 4: every work-item advances by 16 bytes within the row.
    const int src_pos = mad24(y, src1_step, (x << 4) + src1_offset);
    const int dst_pos = mad24(y, dst_step,  (x << 4) + dst_offset);

    __global char16 *in  = (__global char16 *)((__global char *)src1 + src_pos);
    __global char16 *out = (__global char16 *)((__global char *)dst  + dst_pos);

    // The scalar is already a char16, so it is used directly.
    const char16 masked = *in & src2;
    *out = masked;
}
#if defined (DOUBLE_SUPPORT)
// dst = src1 & scalar on 16-bit words, 32 bytes (four short4 groups) per work-item.
// All 16 shorts of src2 take part, four lanes per group.
// Compiled only when DOUBLE_SUPPORT is defined.
__kernel void arithm_s_bitwise_and_C4_D6 (
    __global short *src1, int src1_step, int src1_offset,
    __global short *dst,  int dst_step,  int dst_offset,
    short16 src2, int rows, int cols, int dst_step1)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (x >= cols || y >= rows)
        return;

    // x << 5: every work-item advances by 32 bytes within the row.
    const int src_pos = mad24(y, src1_step, (x << 5) + src1_offset);
    const int dst_pos = mad24(y, dst_step,  (x << 5) + dst_offset);

    // Consecutive short4 elements sit at byte offsets 0, 8, 16 and 24.
    __global short4 *in  = (__global short4 *)((__global char *)src1 + src_pos);
    __global short4 *out = (__global short4 *)((__global char *)dst  + dst_pos);

    // Finish all loads before the first store.
    const short4 and0 = in[0] & src2.s0123;
    const short4 and1 = in[1] & src2.s4567;
    const short4 and2 = in[2] & src2.s89ab;
    const short4 and3 = in[3] & src2.scdef;

    out[0] = and0;
    out[1] = and1;
    out[2] = and2;
    out[3] = and3;
}
#endif

@ -16,6 +16,7 @@
//
// @Authors
// Jiang Liyuan, jlyuan001.good@163.com
// Peng Xiao, pengxiao@outlook.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
@ -50,11 +51,17 @@
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_AND////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_and without mask**************************************/
__kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int src1_offset,
//bitwise_binary without mask for and, or, xor operators
/////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////bitwise_binary///////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef OP_BINARY
#define OP_BINARY &
#endif
__kernel void arithm_bitwise_binary_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
@ -95,7 +102,7 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data & src2_data;
uchar4 tmp_data = src1_data OP_BINARY src2_data;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
@ -107,7 +114,7 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
}
__kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src1_offset,
__kernel void arithm_bitwise_binary_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
@ -148,7 +155,7 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data & src2_data;
char4 tmp_data = src1_data OP_BINARY src2_data;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
@ -160,7 +167,7 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
}
__kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int src1_offset,
__kernel void arithm_bitwise_binary_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
@ -202,7 +209,7 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 tmp_data = src1_data & src2_data;
ushort4 tmp_data = src1_data OP_BINARY src2_data;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
@ -215,7 +222,7 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
__kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int src1_offset,
__kernel void arithm_bitwise_binary_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
@ -257,7 +264,7 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 tmp_data = src1_data & src2_data;
short4 tmp_data = src1_data OP_BINARY src2_data;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
@ -270,7 +277,7 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
__kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1_offset,
__kernel void arithm_bitwise_binary_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
@ -286,13 +293,13 @@ __kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1
int data1 = *((__global int *)((__global char *)src1 + src1_index));
int data2 = *((__global int *)((__global char *)src2 + src2_index));
int tmp = data1 & data2;
int tmp = data1 OP_BINARY data2;
*((__global int *)((__global char *)dst + dst_index)) = tmp;
}
}
__kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src1_offset,
__kernel void arithm_bitwise_binary_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
@ -308,14 +315,14 @@ __kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src
char4 data1 = *((__global char4 *)((__global char *)src1 + src1_index));
char4 data2 = *((__global char4 *)((__global char *)src2 + src2_index));
char4 tmp = data1 & data2;
char4 tmp = data1 OP_BINARY data2;
*((__global char4 *)((__global char *)dst + dst_index)) = tmp;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_and_D6 (__global char *src1, int src1_step, int src1_offset,
__kernel void arithm_bitwise_binary_D6 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
@ -332,7 +339,7 @@ __kernel void arithm_bitwise_and_D6 (__global char *src1, int src1_step, int src
char8 data1 = *((__global char8 *)((__global char *)src1 + src1_index));
char8 data2 = *((__global char8 *)((__global char *)src2 + src2_index));
*((__global char8 *)((__global char *)dst + dst_index)) = data1 & data2;
*((__global char8 *)((__global char *)dst + dst_index)) = data1 OP_BINARY data2;
}
}
#endif

@ -16,6 +16,7 @@
//
// @Authors
// Jiang Liyuan, jlyuan001.good@163.com
// Peng Xiao, pengxiao@outlook.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
@ -49,11 +50,16 @@
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
#ifndef OP_BINARY
#define OP_BINARY &
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////bitwise_binary////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_AND////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_and with mask**************************************/
__kernel void arithm_bitwise_and_with_mask_C1_D0 (
/**************************************bitwise_binary with mask**************************************/
__kernel void arithm_bitwise_binary_with_mask_C1_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -85,7 +91,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D0 (
uchar4 mask_data = vload4(0, mask + mask_index);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data & src2_data;
uchar4 tmp_data = src1_data OP_BINARY src2_data;
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
@ -98,7 +104,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D0 (
__kernel void arithm_bitwise_and_with_mask_C1_D1 (
__kernel void arithm_bitwise_binary_with_mask_C1_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -130,7 +136,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D1 (
uchar4 mask_data = vload4(0, mask + mask_index);
char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data & src2_data;
char4 tmp_data = src1_data OP_BINARY src2_data;
data.x = convert_char((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = convert_char((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
@ -143,7 +149,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D1 (
__kernel void arithm_bitwise_and_with_mask_C1_D2 (
__kernel void arithm_bitwise_binary_with_mask_C1_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -161,7 +167,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D2 (
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -175,7 +181,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D2 (
uchar2 mask_data = vload2(0, mask + mask_index);
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
ushort2 tmp_data = src1_data & src2_data;
ushort2 tmp_data = src1_data OP_BINARY src2_data;
data.x = convert_ushort((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = convert_ushort((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
@ -186,7 +192,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D2 (
__kernel void arithm_bitwise_and_with_mask_C1_D3 (
__kernel void arithm_bitwise_binary_with_mask_C1_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -204,7 +210,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -218,7 +224,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (
uchar2 mask_data = vload2(0, mask + mask_index);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data & src2_data;
short2 tmp_data = src1_data OP_BINARY src2_data;
data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
@ -229,7 +235,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (
__kernel void arithm_bitwise_and_with_mask_C1_D4 (
__kernel void arithm_bitwise_binary_with_mask_C1_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -253,7 +259,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D4 (
int src_data2 = *((__global int *)((__global char *)src2 + src2_index));
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = src_data1 & src_data2;
int data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
@ -262,7 +268,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D4 (
__kernel void arithm_bitwise_and_with_mask_C1_D5 (
__kernel void arithm_bitwise_binary_with_mask_C1_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -286,7 +292,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D5 (
char4 src_data2 = *((__global char4 *)((__global char *)src2 + src2_index));
char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index));
char4 data = src_data1 & src_data2;
char4 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global char4 *)((__global char *)dst + dst_index)) = data;
@ -295,7 +301,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D5 (
__kernel void arithm_bitwise_and_with_mask_C1_D6 (
__kernel void arithm_bitwise_binary_with_mask_C1_D6 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -319,7 +325,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D6 (
char8 src_data2 = *((__global char8 *)((__global char *)src2 + src2_index));
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
char8 data = src_data1 & src_data2;
char8 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
@ -329,7 +335,7 @@ __kernel void arithm_bitwise_and_with_mask_C1_D6 (
__kernel void arithm_bitwise_and_with_mask_C2_D0 (
__kernel void arithm_bitwise_binary_with_mask_C2_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -347,7 +353,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D0 (
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -361,7 +367,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D0 (
uchar2 mask_data = vload2(0, mask + mask_index);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data & src2_data;
uchar4 tmp_data = src1_data OP_BINARY src2_data;
data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw;
@ -371,7 +377,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D0 (
}
__kernel void arithm_bitwise_and_with_mask_C2_D1 (
__kernel void arithm_bitwise_binary_with_mask_C2_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -389,7 +395,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D1 (
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -403,7 +409,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D1 (
uchar2 mask_data = vload2(0, mask + mask_index);
char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data & src2_data;
char4 tmp_data = src1_data OP_BINARY src2_data;
data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw;
@ -412,7 +418,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D1 (
}
}
__kernel void arithm_bitwise_and_with_mask_C2_D2 (
__kernel void arithm_bitwise_binary_with_mask_C2_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -436,13 +442,13 @@ __kernel void arithm_bitwise_and_with_mask_C2_D2 (
ushort2 src_data2 = *((__global ushort2 *)((__global char *)src2 + src2_index));
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
ushort2 data = src_data1 & src_data2;
ushort2 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_and_with_mask_C2_D3 (
__kernel void arithm_bitwise_binary_with_mask_C2_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -466,13 +472,13 @@ __kernel void arithm_bitwise_and_with_mask_C2_D3 (
short2 src_data2 = *((__global short2 *)((__global char *)src2 + src2_index));
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
short2 data = src_data1 & src_data2;
short2 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_and_with_mask_C2_D4 (
__kernel void arithm_bitwise_binary_with_mask_C2_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -496,13 +502,13 @@ __kernel void arithm_bitwise_and_with_mask_C2_D4 (
int2 src_data2 = *((__global int2 *)((__global char *)src2 + src2_index));
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = src_data1 & src_data2;
int2 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_and_with_mask_C2_D5 (
__kernel void arithm_bitwise_binary_with_mask_C2_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -526,14 +532,14 @@ __kernel void arithm_bitwise_and_with_mask_C2_D5 (
char8 src_data2 = *((__global char8 *)((__global char *)src2 + src2_index));
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
char8 data = src_data1 & src_data2;
char8 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_and_with_mask_C2_D6 (
__kernel void arithm_bitwise_binary_with_mask_C2_D6 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -557,7 +563,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D6 (
char16 src_data2 = *((__global char16 *)((__global char *)src2 + src2_index));
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
char16 data = src_data1 & src_data2;
char16 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global char16 *)((__global char *)dst + dst_index)) = data;
@ -565,398 +571,7 @@ __kernel void arithm_bitwise_and_with_mask_C2_D6 (
}
// Masked bitwise AND of two uchar images whose pixels occupy 3 bytes each.
//
// Every work-item covers a group of four pixels (12 bytes) in row
// get_global_id(1). The group is shifted back by dst_align pixels, so pixels
// of the group may fall before the row start (dst_start) or past the row end
// (dst_end); those pixels, and pixels whose mask byte is zero, keep the value
// already stored in dst. The whole 12-byte group is always written back.
//
// *_step / *_offset are used as byte strides and byte offsets; dst_step1 is
// the distance from dst_start to dst_end within a row.
__kernel void arithm_bitwise_and_with_mask_C3_D0 (
    __global uchar *src1, int src1_step, int src1_offset,
    __global uchar *src2, int src2_step, int src2_offset,
    __global uchar *mask, int mask_step, int mask_offset,
    __global uchar *dst,  int dst_step,  int dst_offset,
    int rows, int cols, int dst_step1)
{
    const int gx  = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the launch rectangle have nothing to do.
    if (gx >= cols || row >= rows)
        return;

    // First pixel of the four-pixel group handled by this work-item.
    const int px = gx << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)

    const int a_pos = mad24(row, src1_step, (px * 3) + src1_offset - (dst_align * 3));
    const int b_pos = mad24(row, src2_step, (px * 3) + src2_offset - (dst_align * 3));
    const int m_pos = mad24(row, mask_step, px + mask_offset - dst_align);

    const int dst_start = mad24(row, dst_step, dst_offset);
    const int dst_end   = mad24(row, dst_step, dst_offset + dst_step1);
    const int d_pos     = mad24(row, dst_step, dst_offset + (px * 3) - (dst_align * 3));

    // Fetch the 12 source bytes of each operand as three uchar4 vectors.
    __global uchar *a = src1 + a_pos;
    const uchar4 a0 = vload4(0, a);
    const uchar4 a1 = vload4(1, a);
    const uchar4 a2 = vload4(2, a);

    __global uchar *b = src2 + b_pos;
    const uchar4 b0 = vload4(0, b);
    const uchar4 b1 = vload4(1, b);
    const uchar4 b2 = vload4(2, b);

    // One mask byte per pixel of the group.
    const uchar4 m = vload4(0, mask + m_pos);

    // Current destination contents; unselected bytes are written back as-is.
    __global uchar4 *out = (__global uchar4 *)(dst + d_pos);
    uchar4 r0 = out[0];
    uchar4 r1 = out[1];
    uchar4 r2 = out[2];

    const uchar4 and0 = a0 & b0;
    const uchar4 and1 = a1 & b1;
    const uchar4 and2 = a2 & b2;

    // Per-pixel decision: mask set and pixel inside the row bounds.
    // The first pixel is only checked against the row start.
    const int take0 = (m.x) && (d_pos + 0 >= dst_start);
    const int take1 = (m.y) && (d_pos + 3 >= dst_start) && (d_pos + 3 < dst_end);
    const int take2 = (m.z) && (d_pos + 6 >= dst_start) && (d_pos + 6 < dst_end);
    const int take3 = (m.w) && (d_pos + 9 >= dst_start) && (d_pos + 9 < dst_end);

    // Pixel 0 -> bytes 0..2, pixel 1 -> bytes 3..5,
    // pixel 2 -> bytes 6..8, pixel 3 -> bytes 9..11.
    if (take0)
    {
        r0.xyz = and0.xyz;
    }
    if (take1)
    {
        r0.w  = and0.w;
        r1.xy = and1.xy;
    }
    if (take2)
    {
        r1.zw = and1.zw;
        r2.x  = and2.x;
    }
    if (take3)
    {
        r2.yzw = and2.yzw;
    }

    out[0] = r0;
    out[1] = r1;
    out[2] = r2;
}
// Masked bitwise AND of two char images whose pixels occupy 3 bytes each.
//
// Every work-item covers a group of four pixels (12 bytes) in row
// get_global_id(1). The group is shifted back by dst_align pixels, so pixels
// of the group may fall before the row start (dst_start) or past the row end
// (dst_end); those pixels, and pixels whose mask byte is zero, keep the value
// already stored in dst. The whole 12-byte group is always written back.
//
// *_step / *_offset are used as byte strides and byte offsets; dst_step1 is
// the distance from dst_start to dst_end within a row.
__kernel void arithm_bitwise_and_with_mask_C3_D1 (
    __global char *src1, int src1_step, int src1_offset,
    __global char *src2, int src2_step, int src2_offset,
    __global uchar *mask, int mask_step, int mask_offset,
    __global char *dst,  int dst_step,  int dst_offset,
    int rows, int cols, int dst_step1)
{
    const int gx  = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the launch rectangle have nothing to do.
    if (gx >= cols || row >= rows)
        return;

    // First pixel of the four-pixel group handled by this work-item.
    const int px = gx << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)

    const int a_pos = mad24(row, src1_step, (px * 3) + src1_offset - (dst_align * 3));
    const int b_pos = mad24(row, src2_step, (px * 3) + src2_offset - (dst_align * 3));
    const int m_pos = mad24(row, mask_step, px + mask_offset - dst_align);

    const int dst_start = mad24(row, dst_step, dst_offset);
    const int dst_end   = mad24(row, dst_step, dst_offset + dst_step1);
    const int d_pos     = mad24(row, dst_step, dst_offset + (px * 3) - (dst_align * 3));

    // Fetch the 12 source bytes of each operand as three char4 vectors.
    __global char *a = src1 + a_pos;
    const char4 a0 = vload4(0, a);
    const char4 a1 = vload4(1, a);
    const char4 a2 = vload4(2, a);

    __global char *b = src2 + b_pos;
    const char4 b0 = vload4(0, b);
    const char4 b1 = vload4(1, b);
    const char4 b2 = vload4(2, b);

    // One mask byte per pixel of the group.
    const uchar4 m = vload4(0, mask + m_pos);

    // Current destination contents; unselected bytes are written back as-is.
    __global char4 *out = (__global char4 *)(dst + d_pos);
    char4 r0 = out[0];
    char4 r1 = out[1];
    char4 r2 = out[2];

    const char4 and0 = a0 & b0;
    const char4 and1 = a1 & b1;
    const char4 and2 = a2 & b2;

    // Per-pixel decision: mask set and pixel inside the row bounds.
    // The first pixel is only checked against the row start.
    const int take0 = (m.x) && (d_pos + 0 >= dst_start);
    const int take1 = (m.y) && (d_pos + 3 >= dst_start) && (d_pos + 3 < dst_end);
    const int take2 = (m.z) && (d_pos + 6 >= dst_start) && (d_pos + 6 < dst_end);
    const int take3 = (m.w) && (d_pos + 9 >= dst_start) && (d_pos + 9 < dst_end);

    // Pixel 0 -> bytes 0..2, pixel 1 -> bytes 3..5,
    // pixel 2 -> bytes 6..8, pixel 3 -> bytes 9..11.
    if (take0)
    {
        r0.xyz = and0.xyz;
    }
    if (take1)
    {
        r0.w  = and0.w;
        r1.xy = and1.xy;
    }
    if (take2)
    {
        r1.zw = and1.zw;
        r2.x  = and2.x;
    }
    if (take3)
    {
        r2.yzw = and2.yzw;
    }

    out[0] = r0;
    out[1] = r1;
    out[2] = r2;
}
// Masked bitwise AND of two ushort images whose pixels occupy 6 bytes each
// (three ushort values).
//
// Every work-item covers a pair of pixels (12 bytes) in row get_global_id(1).
// The pair is shifted back by dst_align pixels, so a pixel of the pair may
// fall before the row start (dst_start) or past the row end (dst_end); such
// pixels, and pixels whose mask byte is zero, keep the value already stored
// in dst. The whole 12-byte group is always written back.
//
// *_step / *_offset are used as byte strides and byte offsets; dst_step1 is
// the distance from dst_start to dst_end within a row.
__kernel void arithm_bitwise_and_with_mask_C3_D2 (
    __global ushort *src1, int src1_step, int src1_offset,
    __global ushort *src2, int src2_step, int src2_offset,
    __global uchar *mask, int mask_step, int mask_offset,
    __global ushort *dst,  int dst_step,  int dst_offset,
    int rows, int cols, int dst_step1)
{
    const int gx  = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the launch rectangle have nothing to do.
    if (gx >= cols || row >= rows)
        return;

    // First pixel of the two-pixel group handled by this work-item.
    const int px = gx << 1;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)

    const int a_pos = mad24(row, src1_step, (px * 6) + src1_offset - (dst_align * 6));
    const int b_pos = mad24(row, src2_step, (px * 6) + src2_offset - (dst_align * 6));
    const int m_pos = mad24(row, mask_step, px + mask_offset - dst_align);

    const int dst_start = mad24(row, dst_step, dst_offset);
    const int dst_end   = mad24(row, dst_step, dst_offset + dst_step1);
    const int d_pos     = mad24(row, dst_step, dst_offset + (px * 6) - (dst_align * 6));

    // The computed positions are byte offsets, hence the detour through char*.
    __global ushort *a = (__global ushort *)((__global char *)src1 + a_pos);
    const ushort2 a0 = vload2(0, a);
    const ushort2 a1 = vload2(1, a);
    const ushort2 a2 = vload2(2, a);

    __global ushort *b = (__global ushort *)((__global char *)src2 + b_pos);
    const ushort2 b0 = vload2(0, b);
    const ushort2 b1 = vload2(1, b);
    const ushort2 b2 = vload2(2, b);

    // One mask byte per pixel of the pair.
    const uchar2 m = vload2(0, mask + m_pos);

    // Current destination contents; unselected values are written back as-is.
    __global ushort2 *out = (__global ushort2 *)((__global char *)dst + d_pos);
    ushort2 r0 = out[0];
    ushort2 r1 = out[1];
    ushort2 r2 = out[2];

    const ushort2 and0 = a0 & b0;
    const ushort2 and1 = a1 & b1;
    const ushort2 and2 = a2 & b2;

    // First pixel: its leading two values are only checked against the row
    // start, its third value is additionally checked against the row end.
    const int first_head = (m.x) && (d_pos + 0 >= dst_start);
    const int first_tail = first_head && (d_pos + 0 < dst_end);
    // Second pixel starts 6 bytes further on.
    const int second     = (m.y) && (d_pos + 6 >= dst_start) && (d_pos + 6 < dst_end);

    if (first_head)
    {
        r0.xy = and0.xy;
    }
    if (first_tail)
    {
        r1.x = and1.x;
    }
    if (second)
    {
        r1.y  = and1.y;
        r2.xy = and2.xy;
    }

    out[0] = r0;
    out[1] = r1;
    out[2] = r2;
}
// Masked bitwise AND of two short images whose pixels occupy 6 bytes each
// (three short values).
//
// Every work-item covers a pair of pixels (12 bytes) in row get_global_id(1).
// The pair is shifted back by dst_align pixels, so a pixel of the pair may
// fall before the row start (dst_start) or past the row end (dst_end); such
// pixels, and pixels whose mask byte is zero, keep the value already stored
// in dst. The whole 12-byte group is always written back.
//
// *_step / *_offset are used as byte strides and byte offsets; dst_step1 is
// the distance from dst_start to dst_end within a row.
__kernel void arithm_bitwise_and_with_mask_C3_D3 (
    __global short *src1, int src1_step, int src1_offset,
    __global short *src2, int src2_step, int src2_offset,
    __global uchar *mask, int mask_step, int mask_offset,
    __global short *dst,  int dst_step,  int dst_offset,
    int rows, int cols, int dst_step1)
{
    const int gx  = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the launch rectangle have nothing to do.
    if (gx >= cols || row >= rows)
        return;

    // First pixel of the two-pixel group handled by this work-item.
    const int px = gx << 1;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)

    const int a_pos = mad24(row, src1_step, (px * 6) + src1_offset - (dst_align * 6));
    const int b_pos = mad24(row, src2_step, (px * 6) + src2_offset - (dst_align * 6));
    const int m_pos = mad24(row, mask_step, px + mask_offset - dst_align);

    const int dst_start = mad24(row, dst_step, dst_offset);
    const int dst_end   = mad24(row, dst_step, dst_offset + dst_step1);
    const int d_pos     = mad24(row, dst_step, dst_offset + (px * 6) - (dst_align * 6));

    // The computed positions are byte offsets, hence the detour through char*.
    __global short *a = (__global short *)((__global char *)src1 + a_pos);
    const short2 a0 = vload2(0, a);
    const short2 a1 = vload2(1, a);
    const short2 a2 = vload2(2, a);

    __global short *b = (__global short *)((__global char *)src2 + b_pos);
    const short2 b0 = vload2(0, b);
    const short2 b1 = vload2(1, b);
    const short2 b2 = vload2(2, b);

    // One mask byte per pixel of the pair.
    const uchar2 m = vload2(0, mask + m_pos);

    // Current destination contents; unselected values are written back as-is.
    __global short2 *out = (__global short2 *)((__global char *)dst + d_pos);
    short2 r0 = out[0];
    short2 r1 = out[1];
    short2 r2 = out[2];

    const short2 and0 = a0 & b0;
    const short2 and1 = a1 & b1;
    const short2 and2 = a2 & b2;

    // First pixel: its leading two values are only checked against the row
    // start, its third value is additionally checked against the row end.
    const int first_head = (m.x) && (d_pos + 0 >= dst_start);
    const int first_tail = first_head && (d_pos + 0 < dst_end);
    // Second pixel starts 6 bytes further on.
    const int second     = (m.y) && (d_pos + 6 >= dst_start) && (d_pos + 6 < dst_end);

    if (first_head)
    {
        r0.xy = and0.xy;
    }
    if (first_tail)
    {
        r1.x = and1.x;
    }
    if (second)
    {
        r1.y  = and1.y;
        r2.xy = and2.xy;
    }

    out[0] = r0;
    out[1] = r1;
    out[2] = r2;
}
// Masked bitwise AND of two int images whose pixels occupy 12 bytes each
// (three int values).
//
// One work-item handles exactly one pixel at column get_global_id(0), row
// get_global_id(1). When the pixel's mask byte is non-zero the three values
// become src1 & src2; otherwise the values already in dst are written back.
//
// *_step / *_offset are used as byte strides and byte offsets. dst_step1 is
// not used by this kernel.
__kernel void arithm_bitwise_and_with_mask_C3_D4 (
    __global int *src1, int src1_step, int src1_offset,
    __global int *src2, int src2_step, int src2_offset,
    __global uchar *mask, int mask_step, int mask_offset,
    __global int *dst,  int dst_step,  int dst_offset,
    int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the launch rectangle have nothing to do.
    if (col >= cols || row >= rows)
        return;

    const int a_pos = mad24(row, src1_step, (col * 12) + src1_offset);
    const int b_pos = mad24(row, src2_step, (col * 12) + src2_offset);
    const int m_pos = mad24(row, mask_step, col + mask_offset);
    const int d_pos = mad24(row, dst_step, dst_offset + (col * 12));

    // The computed positions are byte offsets, hence the detour through char*.
    __global int *a   = (__global int *)((__global char *)src1 + a_pos);
    __global int *b   = (__global int *)((__global char *)src2 + b_pos);
    __global int *out = (__global int *)((__global char *)dst + d_pos);

    const int a0 = a[0];
    const int a1 = a[1];
    const int a2 = a[2];

    const int b0 = b[0];
    const int b1 = b[1];
    const int b2 = b[2];

    const uchar selected = mask[m_pos];

    // Start from the current destination values and overwrite them only
    // for a selected pixel.
    int r0 = out[0];
    int r1 = out[1];
    int r2 = out[2];

    if (selected)
    {
        r0 = a0 & b0;
        r1 = a1 & b1;
        r2 = a2 & b2;
    }

    out[0] = r0;
    out[1] = r1;
    out[2] = r2;
}
// Masked bitwise AND of two images whose pixels occupy 12 bytes each; the
// data is handled as raw bytes (three char4 groups per pixel).
//
// One work-item handles exactly one pixel at column get_global_id(0), row
// get_global_id(1). When the pixel's mask byte is non-zero the 12 bytes
// become src1 & src2; otherwise the bytes already in dst are written back.
//
// *_step / *_offset are used as byte strides and byte offsets. dst_step1 is
// not used by this kernel.
__kernel void arithm_bitwise_and_with_mask_C3_D5 (
    __global char *src1, int src1_step, int src1_offset,
    __global char *src2, int src2_step, int src2_offset,
    __global uchar *mask, int mask_step, int mask_offset,
    __global char *dst,  int dst_step,  int dst_offset,
    int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the launch rectangle have nothing to do.
    if (col >= cols || row >= rows)
        return;

    const int a_pos = mad24(row, src1_step, (col * 12) + src1_offset);
    const int b_pos = mad24(row, src2_step, (col * 12) + src2_offset);
    const int m_pos = mad24(row, mask_step, col + mask_offset);
    const int d_pos = mad24(row, dst_step, dst_offset + (col * 12));

    // View the pixel as three consecutive 4-byte groups.
    __global char4 *a   = (__global char4 *)((__global char *)src1 + a_pos);
    __global char4 *b   = (__global char4 *)((__global char *)src2 + b_pos);
    __global char4 *out = (__global char4 *)((__global char *)dst + d_pos);

    const char4 a0 = a[0];
    const char4 a1 = a[1];
    const char4 a2 = a[2];

    const char4 b0 = b[0];
    const char4 b1 = b[1];
    const char4 b2 = b[2];

    const uchar selected = mask[m_pos];

    // Start from the current destination bytes and overwrite them only
    // for a selected pixel.
    char4 r0 = out[0];
    char4 r1 = out[1];
    char4 r2 = out[2];

    if (selected)
    {
        r0 = a0 & b0;
        r1 = a1 & b1;
        r2 = a2 & b2;
    }

    out[0] = r0;
    out[1] = r1;
    out[2] = r2;
}
#if defined (DOUBLE_SUPPORT)
// Masked bitwise AND of two images whose pixels occupy 24 bytes each; the
// data is handled as raw bytes (three char8 groups per pixel).
//
// One work-item handles exactly one pixel at column get_global_id(0), row
// get_global_id(1). When the pixel's mask byte is non-zero the 24 bytes
// become src1 & src2; otherwise the bytes already in dst are written back.
//
// *_step / *_offset are used as byte strides and byte offsets. dst_step1 is
// not used by this kernel.
__kernel void arithm_bitwise_and_with_mask_C3_D6 (
    __global char *src1, int src1_step, int src1_offset,
    __global char *src2, int src2_step, int src2_offset,
    __global uchar *mask, int mask_step, int mask_offset,
    __global char *dst,  int dst_step,  int dst_offset,
    int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the launch rectangle have nothing to do.
    if (col >= cols || row >= rows)
        return;

    const int a_pos = mad24(row, src1_step, (col * 24) + src1_offset);
    const int b_pos = mad24(row, src2_step, (col * 24) + src2_offset);
    const int m_pos = mad24(row, mask_step, col + mask_offset);
    const int d_pos = mad24(row, dst_step, dst_offset + (col * 24));

    // View the pixel as three consecutive 8-byte groups.
    __global char8 *a   = (__global char8 *)((__global char *)src1 + a_pos);
    __global char8 *b   = (__global char8 *)((__global char *)src2 + b_pos);
    __global char8 *out = (__global char8 *)((__global char *)dst + d_pos);

    const char8 a0 = a[0];
    const char8 a1 = a[1];
    const char8 a2 = a[2];

    const char8 b0 = b[0];
    const char8 b1 = b[1];
    const char8 b2 = b[2];

    const uchar selected = mask[m_pos];

    // Start from the current destination bytes and overwrite them only
    // for a selected pixel.
    char8 r0 = out[0];
    char8 r1 = out[1];
    char8 r2 = out[2];

    if (selected)
    {
        r0 = a0 & b0;
        r1 = a1 & b1;
        r2 = a2 & b2;
    }

    out[0] = r0;
    out[1] = r1;
    out[2] = r2;
}
#endif
__kernel void arithm_bitwise_and_with_mask_C4_D0 (
__kernel void arithm_bitwise_binary_with_mask_C4_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -980,7 +595,7 @@ __kernel void arithm_bitwise_and_with_mask_C4_D0 (
uchar4 src_data2 = *((__global uchar4 *)(src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = src_data1 & src_data2;
uchar4 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
@ -988,7 +603,7 @@ __kernel void arithm_bitwise_and_with_mask_C4_D0 (
}
__kernel void arithm_bitwise_and_with_mask_C4_D1 (
__kernel void arithm_bitwise_binary_with_mask_C4_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -1012,14 +627,14 @@ __kernel void arithm_bitwise_and_with_mask_C4_D1 (
char4 src_data2 = *((__global char4 *)(src2 + src2_index));
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 data = src_data1 & src_data2;
char4 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global char4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_and_with_mask_C4_D2 (
__kernel void arithm_bitwise_binary_with_mask_C4_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -1043,13 +658,13 @@ __kernel void arithm_bitwise_and_with_mask_C4_D2 (
ushort4 src_data2 = *((__global ushort4 *)((__global char *)src2 + src2_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = src_data1 & src_data2;
ushort4 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_and_with_mask_C4_D3 (
__kernel void arithm_bitwise_binary_with_mask_C4_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -1073,13 +688,13 @@ __kernel void arithm_bitwise_and_with_mask_C4_D3 (
short4 src_data2 = *((__global short4 *)((__global char *)src2 + src2_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = src_data1 & src_data2;
short4 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_and_with_mask_C4_D4 (
__kernel void arithm_bitwise_binary_with_mask_C4_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -1103,13 +718,13 @@ __kernel void arithm_bitwise_and_with_mask_C4_D4 (
int4 src_data2 = *((__global int4 *)((__global char *)src2 + src2_index));
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = src_data1 & src_data2;
int4 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_and_with_mask_C4_D5 (
__kernel void arithm_bitwise_binary_with_mask_C4_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -1133,14 +748,14 @@ __kernel void arithm_bitwise_and_with_mask_C4_D5 (
char16 src_data2 = *((__global char16 *)((__global char *)src2 + src2_index));
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
char16 data = src_data1 & src_data2;
char16 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_and_with_mask_C4_D6 (
__kernel void arithm_bitwise_binary_with_mask_C4_D6 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -1175,10 +790,10 @@ __kernel void arithm_bitwise_and_with_mask_C4_D6 (
char8 dst_data_2 = *((__global char8 *)((__global char *)dst + dst_index + 16));
char8 dst_data_3 = *((__global char8 *)((__global char *)dst + dst_index + 24));
char8 data_0 = src_data1_0 & src_data2_0;
char8 data_1 = src_data1_1 & src_data2_1;
char8 data_2 = src_data1_2 & src_data2_2;
char8 data_3 = src_data1_3 & src_data2_3;
char8 data_0 = src_data1_0 OP_BINARY src_data2_0;
char8 data_1 = src_data1_1 OP_BINARY src_data2_1;
char8 data_2 = src_data1_2 OP_BINARY src_data2_2;
char8 data_3 = src_data1_3 OP_BINARY src_data2_3;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;

@ -16,6 +16,7 @@
//
// @Authors
// Jiang Liyuan, jlyuan001.good@163.com
// Peng Xiao, pengxiao@outlook.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
@ -49,11 +50,16 @@
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
#ifndef OP_BINARY
#define OP_BINARY &
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************xor with scalar without mask**************************************/
__kernel void arithm_s_bitwise_xor_C1_D0 (
////////////////////////////////////////////bitwise_binary/////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/******************************bitwise binary with scalar without mask********************************/
__kernel void arithm_s_bitwise_binary_C1_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
@ -79,7 +85,7 @@ __kernel void arithm_s_bitwise_xor_C1_D0 (
uchar4 src2_data = (uchar4)(src2.x, src2.x, src2.x, src2.x);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data ^ src2_data;
uchar4 tmp_data = src1_data OP_BINARY src2_data;
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
@ -91,7 +97,7 @@ __kernel void arithm_s_bitwise_xor_C1_D0 (
}
__kernel void arithm_s_bitwise_xor_C1_D1 (
__kernel void arithm_s_bitwise_binary_C1_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
@ -117,7 +123,7 @@ __kernel void arithm_s_bitwise_xor_C1_D1 (
char4 src2_data = (char4)(src2.x, src2.x, src2.x, src2.x);
char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data ^ src2_data;
char4 tmp_data = src1_data OP_BINARY src2_data;
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
@ -128,7 +134,7 @@ __kernel void arithm_s_bitwise_xor_C1_D1 (
}
}
__kernel void arithm_s_bitwise_xor_C1_D2 (
__kernel void arithm_s_bitwise_binary_C1_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
@ -155,7 +161,7 @@ __kernel void arithm_s_bitwise_xor_C1_D2 (
ushort2 src2_data = (ushort2)(src2.x, src2.x);
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
ushort2 tmp_data = src1_data ^ src2_data;
ushort2 tmp_data = src1_data OP_BINARY src2_data;
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
@ -163,7 +169,7 @@ __kernel void arithm_s_bitwise_xor_C1_D2 (
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C1_D3 (
__kernel void arithm_s_bitwise_binary_C1_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
@ -190,7 +196,7 @@ __kernel void arithm_s_bitwise_xor_C1_D3 (
short2 src2_data = (short2)(src2.x, src2.x);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data ^ src2_data;
short2 tmp_data = src1_data OP_BINARY src2_data;
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
@ -198,7 +204,7 @@ __kernel void arithm_s_bitwise_xor_C1_D3 (
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C1_D4 (
__kernel void arithm_s_bitwise_binary_C1_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
@ -215,12 +221,12 @@ __kernel void arithm_s_bitwise_xor_C1_D4 (
int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
int src_data2 = src2.x;
int data = src_data1 ^ src_data2;
int data = src_data1 OP_BINARY src_data2;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C1_D5 (
__kernel void arithm_s_bitwise_binary_C1_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
@ -241,7 +247,7 @@ __kernel void arithm_s_bitwise_xor_C1_D5 (
char4 src2_data = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 data = *((__global char4 *)((__global char *)dst + dst_index));
char4 tmp_data = src1_data ^ src2_data;
char4 tmp_data = src1_data OP_BINARY src2_data;
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
@ -251,9 +257,8 @@ __kernel void arithm_s_bitwise_xor_C1_D5 (
*((__global char4 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_xor_C1_D6 (
__kernel void arithm_s_bitwise_binary_C1_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
@ -270,13 +275,13 @@ __kernel void arithm_s_bitwise_xor_C1_D6 (
short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index));
short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 tmp_data = src1_data ^ src2_data;
short4 tmp_data = src1_data OP_BINARY src2_data;
*((__global short4 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#endif
__kernel void arithm_s_bitwise_xor_C2_D0 (
__kernel void arithm_s_bitwise_binary_C2_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
@ -303,7 +308,7 @@ __kernel void arithm_s_bitwise_xor_C2_D0 (
uchar4 src2_data = (uchar4)(src2.x, src2.y, src2.x, src2.y);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data ^ src2_data;
uchar4 tmp_data = src1_data OP_BINARY src2_data;
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
@ -314,7 +319,7 @@ __kernel void arithm_s_bitwise_xor_C2_D0 (
}
__kernel void arithm_s_bitwise_xor_C2_D1 (
__kernel void arithm_s_bitwise_binary_C2_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
@ -341,7 +346,7 @@ __kernel void arithm_s_bitwise_xor_C2_D1 (
char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y);
char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data ^ src2_data;
char4 tmp_data = src1_data OP_BINARY src2_data;
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
@ -350,7 +355,7 @@ __kernel void arithm_s_bitwise_xor_C2_D1 (
}
}
__kernel void arithm_s_bitwise_xor_C2_D2 (
__kernel void arithm_s_bitwise_binary_C2_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
@ -367,12 +372,12 @@ __kernel void arithm_s_bitwise_xor_C2_D2 (
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
ushort2 src_data2 = (ushort2)(src2.x, src2.y);
ushort2 data = src_data1 ^ src_data2;
ushort2 data = src_data1 OP_BINARY src_data2;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C2_D3 (
__kernel void arithm_s_bitwise_binary_C2_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
@ -389,12 +394,12 @@ __kernel void arithm_s_bitwise_xor_C2_D3 (
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
short2 src_data2 = (short2)(src2.x, src2.y);
short2 data = src_data1 ^ src_data2;
short2 data = src_data1 OP_BINARY src_data2;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C2_D4 (
__kernel void arithm_s_bitwise_binary_C2_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
@ -411,11 +416,11 @@ __kernel void arithm_s_bitwise_xor_C2_D4 (
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 data = src_data1 ^ src_data2;
int2 data = src_data1 OP_BINARY src_data2;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C2_D5 (
__kernel void arithm_s_bitwise_binary_C2_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
@ -432,13 +437,13 @@ __kernel void arithm_s_bitwise_xor_C2_D5 (
char8 src1_data = *((__global char8 *)((__global char *)src1 + src1_index));
char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
char8 tmp_data = src1_data ^ src2_data;
char8 tmp_data = src1_data OP_BINARY src2_data;
*((__global char8 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_xor_C2_D6 (
__kernel void arithm_s_bitwise_binary_C2_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
@ -455,347 +460,14 @@ __kernel void arithm_s_bitwise_xor_C2_D6 (
short8 src1_data = *((__global short8 *)((__global char *)src1 + src1_index));
short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
short8 tmp_data = src1_data ^ src2_data;
short8 tmp_data = src1_data OP_BINARY src2_data;
*((__global short8 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#endif
/*
 * Bitwise XOR of a 3-channel image of uchar elements with the scalar src2
 * (variant without a mask).
 *
 * Every work-item handles four consecutive pixels (12 bytes) of row
 * get_global_id(1) as three uchar4 vectors.  The group start is moved back by
 * dst_align pixels, so a group can begin before the first byte of the row or
 * extend past its end; lanes outside [dst_start, dst_end) keep the value that
 * is already stored in dst.
 *
 * src1 / dst : source and destination buffers
 * *_step     : byte stride between consecutive rows
 * *_offset   : byte offset of the first element
 * src2       : scalar operand; only .x, .y and .z are read
 * rows, cols : work-item bounds (cols presumably counts 4-pixel groups rather
 *              than pixels -- confirm against the host code)
 * dst_step1  : byte length of the writable part of a dst row, as implied by
 *              the row-end computation below
 */
__kernel void arithm_s_bitwise_xor_C3_D0 (
    __global uchar *src1, int src1_step, int src1_offset,
    __global uchar *dst,  int dst_step,  int dst_offset,
    uchar4 src2, int rows, int cols, int dst_step1)
{
    const int gx = get_global_id(0);
    const int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    // Index of the first of the four pixels owned by this work-item.
    const int px = gx << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
    const int src_pos   = mad24(gy, src1_step, (px * 3) + src1_offset - (dst_align * 3));
    const int row_begin = mad24(gy, dst_step, dst_offset);
    const int row_end   = mad24(gy, dst_step, dst_offset + dst_step1);
    const int dst_pos   = mad24(gy, dst_step, dst_offset + (px * 3) - (dst_align * 3));

    // Current destination contents, needed for the lanes that must stay untouched.
    uchar4 out_lo  = *((__global uchar4 *)(dst + dst_pos + 0));
    uchar4 out_mid = *((__global uchar4 *)(dst + dst_pos + 4));
    uchar4 out_hi  = *((__global uchar4 *)(dst + dst_pos + 8));

    // The scalar's three channels repeated over the 12-byte window: xyz xyz xyz xyz.
    const uchar4 scalar_lo  = (uchar4)(src2.x, src2.y, src2.z, src2.x);
    const uchar4 scalar_mid = (uchar4)(src2.y, src2.z, src2.x, src2.y);
    const uchar4 scalar_hi  = (uchar4)(src2.z, src2.x, src2.y, src2.z);

    const uchar4 res_lo  = vload4(0, src1 + src_pos + 0) ^ scalar_lo;
    const uchar4 res_mid = vload4(0, src1 + src_pos + 4) ^ scalar_mid;
    const uchar4 res_hi  = vload4(0, src1 + src_pos + 8) ^ scalar_hi;

    // Per-pixel write permissions (pixel k starts at byte offset 3 * k).
    const int write0 = (dst_pos + 0 >= row_begin);
    const int write1 = (dst_pos + 3 >= row_begin) && (dst_pos + 3 < row_end);
    const int write2 = (dst_pos + 6 >= row_begin) && (dst_pos + 6 < row_end);
    const int write3 = (dst_pos + 9 >= row_begin) && (dst_pos + 9 < row_end);

    out_lo.xyz  = write0 ? res_lo.xyz  : out_lo.xyz;
    out_lo.w    = write1 ? res_lo.w    : out_lo.w;
    out_mid.xy  = write1 ? res_mid.xy  : out_mid.xy;
    out_mid.zw  = write2 ? res_mid.zw  : out_mid.zw;
    out_hi.x    = write2 ? res_hi.x    : out_hi.x;
    out_hi.yzw  = write3 ? res_hi.yzw  : out_hi.yzw;

    *((__global uchar4 *)(dst + dst_pos + 0)) = out_lo;
    *((__global uchar4 *)(dst + dst_pos + 4)) = out_mid;
    *((__global uchar4 *)(dst + dst_pos + 8)) = out_hi;
}
/*
 * Bitwise XOR of a 3-channel image of char elements with the scalar src2
 * (variant without a mask).
 *
 * Every work-item handles four consecutive pixels (12 bytes) of row
 * get_global_id(1) as three char4 vectors.  The group start is moved back by
 * dst_align pixels, so a group can begin before the first byte of the row or
 * extend past its end; lanes outside [dst_start, dst_end) keep the value that
 * is already stored in dst.
 *
 * src1 / dst : source and destination buffers
 * *_step     : byte stride between consecutive rows
 * *_offset   : byte offset of the first element
 * src2       : scalar operand; only .x, .y and .z are read
 * rows, cols : work-item bounds (cols presumably counts 4-pixel groups rather
 *              than pixels -- confirm against the host code)
 * dst_step1  : byte length of the writable part of a dst row, as implied by
 *              the row-end computation below
 */
__kernel void arithm_s_bitwise_xor_C3_D1 (
    __global char *src1, int src1_step, int src1_offset,
    __global char *dst,  int dst_step,  int dst_offset,
    char4 src2, int rows, int cols, int dst_step1)
{
    const int gx = get_global_id(0);
    const int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    // Index of the first of the four pixels owned by this work-item.
    const int px = gx << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
    const int src_pos   = mad24(gy, src1_step, (px * 3) + src1_offset - (dst_align * 3));
    const int row_begin = mad24(gy, dst_step, dst_offset);
    const int row_end   = mad24(gy, dst_step, dst_offset + dst_step1);
    const int dst_pos   = mad24(gy, dst_step, dst_offset + (px * 3) - (dst_align * 3));

    // Current destination contents, needed for the lanes that must stay untouched.
    char4 out_lo  = *((__global char4 *)(dst + dst_pos + 0));
    char4 out_mid = *((__global char4 *)(dst + dst_pos + 4));
    char4 out_hi  = *((__global char4 *)(dst + dst_pos + 8));

    // The scalar's three channels repeated over the 12-byte window: xyz xyz xyz xyz.
    const char4 scalar_lo  = (char4)(src2.x, src2.y, src2.z, src2.x);
    const char4 scalar_mid = (char4)(src2.y, src2.z, src2.x, src2.y);
    const char4 scalar_hi  = (char4)(src2.z, src2.x, src2.y, src2.z);

    const char4 res_lo  = vload4(0, src1 + src_pos + 0) ^ scalar_lo;
    const char4 res_mid = vload4(0, src1 + src_pos + 4) ^ scalar_mid;
    const char4 res_hi  = vload4(0, src1 + src_pos + 8) ^ scalar_hi;

    // Per-pixel write permissions (pixel k starts at byte offset 3 * k).
    const int write0 = (dst_pos + 0 >= row_begin);
    const int write1 = (dst_pos + 3 >= row_begin) && (dst_pos + 3 < row_end);
    const int write2 = (dst_pos + 6 >= row_begin) && (dst_pos + 6 < row_end);
    const int write3 = (dst_pos + 9 >= row_begin) && (dst_pos + 9 < row_end);

    out_lo.xyz  = write0 ? res_lo.xyz  : out_lo.xyz;
    out_lo.w    = write1 ? res_lo.w    : out_lo.w;
    out_mid.xy  = write1 ? res_mid.xy  : out_mid.xy;
    out_mid.zw  = write2 ? res_mid.zw  : out_mid.zw;
    out_hi.x    = write2 ? res_hi.x    : out_hi.x;
    out_hi.yzw  = write3 ? res_hi.yzw  : out_hi.yzw;

    *((__global char4 *)(dst + dst_pos + 0)) = out_lo;
    *((__global char4 *)(dst + dst_pos + 4)) = out_mid;
    *((__global char4 *)(dst + dst_pos + 8)) = out_hi;
}
/*
 * Bitwise XOR of a 3-channel image of ushort elements with the scalar src2
 * (variant without a mask).
 *
 * Every work-item handles two consecutive pixels (2 x 6 = 12 bytes) of row
 * get_global_id(1) as three ushort2 vectors.  The pair start is moved back by
 * dst_align pixels, so a pair can begin before the first byte of the row or
 * extend past its end; lanes that fail the range checks below keep the value
 * already stored in dst.
 *
 * src1 / dst : source and destination buffers, indexed here in bytes
 * *_step     : byte stride between consecutive rows
 * *_offset   : byte offset of the first element
 * src2       : scalar operand; only .x, .y and .z are read
 * rows, cols : work-item bounds (cols presumably counts 2-pixel groups rather
 *              than pixels -- confirm against the host code)
 * dst_step1  : byte length of the writable part of a dst row, as implied by
 *              the row-end computation below
 */
__kernel void arithm_s_bitwise_xor_C3_D2 (
    __global ushort *src1, int src1_step, int src1_offset,
    __global ushort *dst,  int dst_step,  int dst_offset,
    ushort4 src2, int rows, int cols, int dst_step1)
{
    const int gx = get_global_id(0);
    const int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    // Index of the first of the two pixels owned by this work-item.
    const int px = gx << 1;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
    const int src_pos   = mad24(gy, src1_step, (px * 6) + src1_offset - (dst_align * 6));
    const int row_begin = mad24(gy, dst_step, dst_offset);
    const int row_end   = mad24(gy, dst_step, dst_offset + dst_step1);
    const int dst_pos   = mad24(gy, dst_step, dst_offset + (px * 6) - (dst_align * 6));

    // Current destination contents, needed for the lanes that must stay untouched.
    ushort2 out_lo  = *((__global ushort2 *)((__global char *)dst + dst_pos + 0));
    ushort2 out_mid = *((__global ushort2 *)((__global char *)dst + dst_pos + 4));
    ushort2 out_hi  = *((__global ushort2 *)((__global char *)dst + dst_pos + 8));

    // The scalar's three channels repeated over the six elements: xy | zx | yz.
    const ushort2 scalar_lo  = (ushort2)(src2.x, src2.y);
    const ushort2 scalar_mid = (ushort2)(src2.z, src2.x);
    const ushort2 scalar_hi  = (ushort2)(src2.y, src2.z);

    const ushort2 res_lo  = vload2(0, (__global ushort *)((__global char *)src1 + src_pos + 0)) ^ scalar_lo;
    const ushort2 res_mid = vload2(0, (__global ushort *)((__global char *)src1 + src_pos + 4)) ^ scalar_mid;
    const ushort2 res_hi  = vload2(0, (__global ushort *)((__global char *)src1 + src_pos + 8)) ^ scalar_hi;

    // First pixel starts at byte 0 of the window, second pixel at byte 6.
    const int first_started = (dst_pos + 0 >= row_begin);
    const int first_inside  = (dst_pos + 0 >= row_begin) && (dst_pos + 0 < row_end);
    const int second_inside = (dst_pos + 6 >= row_begin) && (dst_pos + 6 < row_end);

    out_lo.xy  = first_started ? res_lo.xy  : out_lo.xy;
    out_mid.x  = first_inside  ? res_mid.x  : out_mid.x;
    out_mid.y  = second_inside ? res_mid.y  : out_mid.y;
    out_hi.xy  = second_inside ? res_hi.xy  : out_hi.xy;

    *((__global ushort2 *)((__global char *)dst + dst_pos + 0)) = out_lo;
    *((__global ushort2 *)((__global char *)dst + dst_pos + 4)) = out_mid;
    *((__global ushort2 *)((__global char *)dst + dst_pos + 8)) = out_hi;
}
/*
 * Bitwise XOR of a 3-channel image of short elements with the scalar src2
 * (variant without a mask).
 *
 * Every work-item handles two consecutive pixels (2 x 6 = 12 bytes) of row
 * get_global_id(1) as three short2 vectors.  The pair start is moved back by
 * dst_align pixels, so a pair can begin before the first byte of the row or
 * extend past its end; lanes that fail the range checks below keep the value
 * already stored in dst.
 *
 * src1 / dst : source and destination buffers, indexed here in bytes
 * *_step     : byte stride between consecutive rows
 * *_offset   : byte offset of the first element
 * src2       : scalar operand; only .x, .y and .z are read
 * rows, cols : work-item bounds (cols presumably counts 2-pixel groups rather
 *              than pixels -- confirm against the host code)
 * dst_step1  : byte length of the writable part of a dst row, as implied by
 *              the row-end computation below
 */
__kernel void arithm_s_bitwise_xor_C3_D3 (
    __global short *src1, int src1_step, int src1_offset,
    __global short *dst,  int dst_step,  int dst_offset,
    short4 src2, int rows, int cols, int dst_step1)
{
    const int gx = get_global_id(0);
    const int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    // Index of the first of the two pixels owned by this work-item.
    const int px = gx << 1;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
    const int src_pos   = mad24(gy, src1_step, (px * 6) + src1_offset - (dst_align * 6));
    const int row_begin = mad24(gy, dst_step, dst_offset);
    const int row_end   = mad24(gy, dst_step, dst_offset + dst_step1);
    const int dst_pos   = mad24(gy, dst_step, dst_offset + (px * 6) - (dst_align * 6));

    // Current destination contents, needed for the lanes that must stay untouched.
    short2 out_lo  = *((__global short2 *)((__global char *)dst + dst_pos + 0));
    short2 out_mid = *((__global short2 *)((__global char *)dst + dst_pos + 4));
    short2 out_hi  = *((__global short2 *)((__global char *)dst + dst_pos + 8));

    // The scalar's three channels repeated over the six elements: xy | zx | yz.
    const short2 scalar_lo  = (short2)(src2.x, src2.y);
    const short2 scalar_mid = (short2)(src2.z, src2.x);
    const short2 scalar_hi  = (short2)(src2.y, src2.z);

    const short2 res_lo  = vload2(0, (__global short *)((__global char *)src1 + src_pos + 0)) ^ scalar_lo;
    const short2 res_mid = vload2(0, (__global short *)((__global char *)src1 + src_pos + 4)) ^ scalar_mid;
    const short2 res_hi  = vload2(0, (__global short *)((__global char *)src1 + src_pos + 8)) ^ scalar_hi;

    // First pixel starts at byte 0 of the window, second pixel at byte 6.
    const int first_started = (dst_pos + 0 >= row_begin);
    const int first_inside  = (dst_pos + 0 >= row_begin) && (dst_pos + 0 < row_end);
    const int second_inside = (dst_pos + 6 >= row_begin) && (dst_pos + 6 < row_end);

    out_lo.xy  = first_started ? res_lo.xy  : out_lo.xy;
    out_mid.x  = first_inside  ? res_mid.x  : out_mid.x;
    out_mid.y  = second_inside ? res_mid.y  : out_mid.y;
    out_hi.xy  = second_inside ? res_hi.xy  : out_hi.xy;

    *((__global short2 *)((__global char *)dst + dst_pos + 0)) = out_lo;
    *((__global short2 *)((__global char *)dst + dst_pos + 4)) = out_mid;
    *((__global short2 *)((__global char *)dst + dst_pos + 8)) = out_hi;
}
/*
 * Bitwise XOR of a 3-channel image of int elements with the scalar src2
 * (variant without a mask).
 *
 * One work-item processes exactly one pixel (3 x 4 = 12 bytes) at column
 * get_global_id(0), row get_global_id(1).  All three channels are overwritten
 * unconditionally, so the previous contents of dst are not read.
 *
 * src1 / dst : source and destination buffers, indexed here in bytes
 * *_step     : byte stride between consecutive rows
 * *_offset   : byte offset of the first element
 * src2       : scalar operand; only .x, .y and .z are read
 * rows, cols : work-item bounds
 * dst_step1  : not used by this kernel; kept so the argument list matches the
 *              other kernels of the family
 */
__kernel void arithm_s_bitwise_xor_C3_D4 (
    __global int *src1, int src1_step, int src1_offset,
    __global int *dst,  int dst_step,  int dst_offset,
    int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
        int dst_index  = mad24(y, dst_step, dst_offset + (x * 12));

        int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
        int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
        int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));

        int src2_data_0 = src2.x;
        int src2_data_1 = src2.y;
        int src2_data_2 = src2.z;

        int tmp_data_0 = src1_data_0 ^ src2_data_0;
        int tmp_data_1 = src1_data_1 ^ src2_data_1;
        int tmp_data_2 = src1_data_2 ^ src2_data_2;

        *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
        *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
        *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
    }
}
/*
 * Bitwise XOR of a 3-channel image with the scalar src2 (variant without a
 * mask), where each channel value is a 4-byte group handled as char4.
 * NOTE(review): presumably these are 32-bit float elements processed as raw
 * bytes -- confirm against the host code.
 *
 * One work-item processes exactly one pixel (3 x 4 = 12 bytes) at column
 * get_global_id(0), row get_global_id(1).  All twelve bytes are overwritten
 * unconditionally, so the previous contents of dst are not read.
 *
 * src1 / dst : source and destination buffers, indexed in bytes
 * *_step     : byte stride between consecutive rows
 * *_offset   : byte offset of the first element
 * src2       : scalar operand as 16 bytes; bytes s0..sB are read
 * rows, cols : work-item bounds
 * dst_step1  : not used by this kernel; kept so the argument list matches the
 *              other kernels of the family
 */
__kernel void arithm_s_bitwise_xor_C3_D5 (
    __global char *src1, int src1_step, int src1_offset,
    __global char *dst,  int dst_step,  int dst_offset,
    char16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
        int dst_index  = mad24(y, dst_step, dst_offset + (x * 12));

        char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
        char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
        char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));

        // One char4 per channel, taken from the first twelve bytes of the scalar.
        char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
        char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
        char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);

        char4 tmp_data_0 = src1_data_0 ^ src2_data_0;
        char4 tmp_data_1 = src1_data_1 ^ src2_data_1;
        char4 tmp_data_2 = src1_data_2 ^ src2_data_2;

        *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
        *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
        *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
    }
}
#if defined (DOUBLE_SUPPORT)
/*
 * Bitwise XOR of a 3-channel image with the scalar src2 (variant without a
 * mask), where each channel value is an 8-byte group handled as short4.
 * Only compiled when DOUBLE_SUPPORT is defined.
 * NOTE(review): presumably these are 64-bit double elements processed as raw
 * 16-bit words -- confirm against the host code.
 *
 * One work-item processes exactly one pixel (3 x 8 = 24 bytes) at column
 * get_global_id(0), row get_global_id(1).  All 24 bytes are overwritten
 * unconditionally, so the previous contents of dst are not read.
 *
 * src1 / dst : source and destination buffers, indexed here in bytes
 * *_step     : byte stride between consecutive rows
 * *_offset   : byte offset of the first element
 * src2       : scalar operand as 16 shorts; elements s0..sb are read
 * rows, cols : work-item bounds
 * dst_step1  : not used by this kernel; kept so the argument list matches the
 *              other kernels of the family
 */
__kernel void arithm_s_bitwise_xor_C3_D6 (
    __global short *src1, int src1_step, int src1_offset,
    __global short *dst,  int dst_step,  int dst_offset,
    short16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
        int dst_index  = mad24(y, dst_step, dst_offset + (x * 24));

        short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
        short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
        short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));

        // One short4 per channel, taken from the first twelve shorts of the scalar.
        short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
        short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
        short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);

        short4 tmp_data_0 = src1_data_0 ^ src2_data_0;
        short4 tmp_data_1 = src1_data_1 ^ src2_data_1;
        short4 tmp_data_2 = src1_data_2 ^ src2_data_2;

        *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
        *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
        *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
    }
}
#endif
__kernel void arithm_s_bitwise_xor_C4_D0 (
__kernel void arithm_s_bitwise_binary_C4_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
@ -811,14 +483,14 @@ __kernel void arithm_s_bitwise_xor_C4_D0 (
uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
uchar4 data = src_data1 ^ src2;
uchar4 data = src_data1 OP_BINARY src2;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C4_D1 (
__kernel void arithm_s_bitwise_binary_C4_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
@ -834,13 +506,13 @@ __kernel void arithm_s_bitwise_xor_C4_D1 (
char4 src_data1 = *((__global char4 *)(src1 + src1_index));
char4 data = src_data1 ^ src2;
char4 data = src_data1 OP_BINARY src2;
*((__global char4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C4_D2 (
__kernel void arithm_s_bitwise_binary_C4_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
@ -856,12 +528,12 @@ __kernel void arithm_s_bitwise_xor_C4_D2 (
ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
ushort4 data = src_data1 ^ src2;
ushort4 data = src_data1 OP_BINARY src2;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C4_D3 (
__kernel void arithm_s_bitwise_binary_C4_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
@ -877,12 +549,12 @@ __kernel void arithm_s_bitwise_xor_C4_D3 (
short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
short4 data = src_data1 ^ src2;
short4 data = src_data1 OP_BINARY src2;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C4_D4 (
__kernel void arithm_s_bitwise_binary_C4_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
@ -898,12 +570,12 @@ __kernel void arithm_s_bitwise_xor_C4_D4 (
int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
int4 data = src_data1 ^ src2;
int4 data = src_data1 OP_BINARY src2;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C4_D5 (
__kernel void arithm_s_bitwise_binary_C4_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
@ -921,13 +593,13 @@ __kernel void arithm_s_bitwise_xor_C4_D5 (
char16 src2_data = (char16)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7,
src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf);
char16 tmp_data = src1_data ^ src2_data;
char16 tmp_data = src1_data OP_BINARY src2_data;
*((__global char16 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_xor_C4_D6 (
__kernel void arithm_s_bitwise_binary_C4_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
@ -951,10 +623,10 @@ __kernel void arithm_s_bitwise_xor_C4_D6 (
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);
short4 tmp_data_0 = src1_data_0 ^ src2_data_0;
short4 tmp_data_1 = src1_data_1 ^ src2_data_1;
short4 tmp_data_2 = src1_data_2 ^ src2_data_2;
short4 tmp_data_3 = src1_data_3 ^ src2_data_3;
short4 tmp_data_0 = src1_data_0 OP_BINARY src2_data_0;
short4 tmp_data_1 = src1_data_1 OP_BINARY src2_data_1;
short4 tmp_data_2 = src1_data_2 OP_BINARY src2_data_2;
short4 tmp_data_3 = src1_data_3 OP_BINARY src2_data_3;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
@ -963,4 +635,4 @@ __kernel void arithm_s_bitwise_xor_C4_D6 (
}
}
#endif
#endif

@ -49,11 +49,16 @@
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
#ifndef OP_BINARY
#define OP_BINARY &
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////bitwise_binary////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_AND////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_and with scalar with mask**************************************/
__kernel void arithm_s_bitwise_and_with_mask_C1_D0 (
/**************************************bitwise_binary with scalar with mask**************************************/
__kernel void arithm_s_bitwise_binary_with_mask_C1_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -83,7 +88,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D0 (
uchar4 mask_data = vload4(0, mask + mask_index);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data & src2_data;
uchar4 tmp_data = src1_data OP_BINARY src2_data;
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
@ -95,7 +100,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D0 (
}
__kernel void arithm_s_bitwise_and_with_mask_C1_D1 (
__kernel void arithm_s_bitwise_binary_with_mask_C1_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -125,7 +130,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D1 (
uchar4 mask_data = vload4(0, mask + mask_index);
char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data & src2_data;
char4 tmp_data = src1_data OP_BINARY src2_data;
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
@ -136,7 +141,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D1 (
}
}
__kernel void arithm_s_bitwise_and_with_mask_C1_D2 (
__kernel void arithm_s_bitwise_binary_with_mask_C1_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -153,7 +158,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D2 (
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -166,7 +171,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D2 (
uchar2 mask_data = vload2(0, mask + mask_index);
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
ushort2 tmp_data = src1_data & src2_data;
ushort2 tmp_data = src1_data OP_BINARY src2_data;
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y;
@ -174,7 +179,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D2 (
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_with_mask_C1_D3 (
__kernel void arithm_s_bitwise_binary_with_mask_C1_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -191,7 +196,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D3 (
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -204,7 +209,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D3 (
uchar2 mask_data = vload2(0, mask + mask_index);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data & src2_data;
short2 tmp_data = src1_data OP_BINARY src2_data;
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y;
@ -212,7 +217,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D3 (
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_with_mask_C1_D4 (
__kernel void arithm_s_bitwise_binary_with_mask_C1_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -234,14 +239,14 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D4 (
int src_data2 = src2.x;
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = src_data1 & src_data2;
int data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_with_mask_C1_D5 (
__kernel void arithm_s_bitwise_binary_with_mask_C1_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -263,7 +268,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D5 (
char4 src2_data = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index));
char4 data = src1_data & src2_data;
char4 data = src1_data OP_BINARY src2_data;
data = mask_data ? data : dst_data;
*((__global char4 *)((__global char *)dst + dst_index)) = data;
@ -271,7 +276,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D5 (
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_and_with_mask_C1_D6 (
__kernel void arithm_s_bitwise_binary_with_mask_C1_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -293,14 +298,14 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D6 (
short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = src1_data & src2_data;
short4 data = src1_data OP_BINARY src2_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
#endif
__kernel void arithm_s_bitwise_and_with_mask_C2_D0 (
__kernel void arithm_s_bitwise_binary_with_mask_C2_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -317,7 +322,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D0 (
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -330,7 +335,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D0 (
uchar2 mask_data = vload2(0, mask + mask_index);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data & src2_data;
uchar4 tmp_data = src1_data OP_BINARY src2_data;
data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw;
@ -340,7 +345,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D0 (
}
__kernel void arithm_s_bitwise_and_with_mask_C2_D1 (
__kernel void arithm_s_bitwise_binary_with_mask_C2_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -357,7 +362,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D1 (
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -370,7 +375,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D1 (
uchar2 mask_data = vload2(0, mask + mask_index);
char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data & src2_data;
char4 tmp_data = src1_data OP_BINARY src2_data;
data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw;
@ -379,7 +384,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D1 (
}
}
__kernel void arithm_s_bitwise_and_with_mask_C2_D2 (
__kernel void arithm_s_bitwise_binary_with_mask_C2_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -401,13 +406,13 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D2 (
ushort2 src_data2 = (ushort2)(src2.x, src2.y);
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
ushort2 data = src_data1 & src_data2;
ushort2 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_with_mask_C2_D3 (
__kernel void arithm_s_bitwise_binary_with_mask_C2_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -429,13 +434,13 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D3 (
short2 src_data2 = (short2)(src2.x, src2.y);
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
short2 data = src_data1 & src_data2;
short2 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_with_mask_C2_D4 (
__kernel void arithm_s_bitwise_binary_with_mask_C2_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -457,13 +462,13 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D4 (
int2 src_data2 = (int2)(src2.x, src2.y);
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = src_data1 & src_data2;
int2 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_with_mask_C2_D5 (
__kernel void arithm_s_bitwise_binary_with_mask_C2_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -485,7 +490,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D5 (
char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
char8 data = src1_data & src2_data;
char8 data = src1_data OP_BINARY src2_data;
data = mask_data ? data : dst_data;
@ -493,7 +498,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D5 (
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_and_with_mask_C2_D6 (
__kernel void arithm_s_bitwise_binary_with_mask_C2_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -515,388 +520,14 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D6 (
short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
short8 dst_data = *((__global short8 *)((__global char *)dst + dst_index));
short8 data = src1_data & src2_data;
short8 data = src1_data OP_BINARY src2_data;
data = mask_data ? data : dst_data;
*((__global short8 *)((__global char *)dst + dst_index)) = data;
}
}
#endif
// Masked bitwise AND of src1 with the scalar src2 for pixels made of three interleaved
// uchar channels. Each work-item covers four consecutive pixels (12 bytes) of one row,
// handled as three uchar4 vectors, so pixels straddle the vector boundaries.
//   src1, dst, mask (+ *_step, *_offset) : base pointers with the row stride and start
//                                          offset used in the index computations below
//   src2       : scalar operand; only .x, .y and .z are read
//   rows, cols : upper bounds for get_global_id(1) / get_global_id(0); cols is compared
//                before x is scaled by 4, i.e. it bounds the work-item index
//   dst_step1  : added to dst_offset to form the exclusive end of the writable row span
//                (presumably the row length in bytes -- TODO confirm against the host code)
__kernel void arithm_s_bitwise_and_with_mask_C3_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
// From here on x is the index of the first of the four pixels of this work-item.
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
// Number of pixels (0..3) by which the 4-pixel group is moved back; the same shift is
// applied to the src1, mask and dst indices so that they stay in step.
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
// [dst_start, dst_end) is the byte range of this row that may be overwritten.
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
// The three scalar channels repeated over 12 bytes: x y z x | y z x y | z x y z.
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
// One mask byte per pixel: .x, .y, .z, .w gate pixels 0, 1, 2, 3.
uchar4 mask_data = vload4(0, mask + mask_index);
// Current destination contents, kept for every byte that is not selected below.
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
uchar4 tmp_data_0 = src1_data_0 & src2_data_0;
uchar4 tmp_data_1 = src1_data_1 & src2_data_1;
uchar4 tmp_data_2 = src1_data_2 & src2_data_2;
// Pixel 0 (bytes 0..2): only the lower bound is checked.
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
// Pixel 1 (bytes 3..5) is split between data_0.w and data_1.xy.
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
// Pixel 2 (bytes 6..8) is split between data_1.zw and data_2.x.
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
// Pixel 3 (bytes 9..11).
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
// All 12 bytes are written back; unselected bytes carry their previously read value.
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
*((__global uchar4 *)(dst + dst_index + 4)) = data_1;
*((__global uchar4 *)(dst + dst_index + 8)) = data_2;
}
}
// Masked bitwise AND of src1 with the scalar src2 for pixels made of three interleaved
// char channels. Each work-item covers four consecutive pixels (12 bytes) of one row,
// handled as three char4 vectors, so pixels straddle the vector boundaries.
//   src1, dst, mask (+ *_step, *_offset) : base pointers with the row stride and start
//                                          offset used in the index computations below
//   src2       : scalar operand; only .x, .y and .z are read
//   rows, cols : upper bounds for get_global_id(1) / get_global_id(0); cols is compared
//                before x is scaled by 4, i.e. it bounds the work-item index
//   dst_step1  : added to dst_offset to form the exclusive end of the writable row span
//                (presumably the row length in bytes -- TODO confirm against the host code)
__kernel void arithm_s_bitwise_and_with_mask_C3_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
// From here on x is the index of the first of the four pixels of this work-item.
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
// Number of pixels (0..3) by which the 4-pixel group is moved back; the same shift is
// applied to the src1, mask and dst indices so that they stay in step.
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
// [dst_start, dst_end) is the byte range of this row that may be overwritten.
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
char4 src1_data_0 = vload4(0, src1 + src1_index + 0);
char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
// The three scalar channels repeated over 12 bytes: x y z x | y z x y | z x y z.
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
// One mask byte per pixel: .x, .y, .z, .w gate pixels 0, 1, 2, 3.
uchar4 mask_data = vload4(0, mask + mask_index);
// Current destination contents, kept for every byte that is not selected below.
char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
char4 data_2 = *((__global char4 *)(dst + dst_index + 8));
char4 tmp_data_0 = src1_data_0 & src2_data_0;
char4 tmp_data_1 = src1_data_1 & src2_data_1;
char4 tmp_data_2 = src1_data_2 & src2_data_2;
// Pixel 0 (bytes 0..2): only the lower bound is checked.
data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
// Pixel 1 (bytes 3..5) is split between data_0.w and data_1.xy.
data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
// Pixel 2 (bytes 6..8) is split between data_1.zw and data_2.x.
data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
// Pixel 3 (bytes 9..11).
data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
// All 12 bytes are written back; unselected bytes carry their previously read value.
*((__global char4 *)(dst + dst_index + 0)) = data_0;
*((__global char4 *)(dst + dst_index + 4)) = data_1;
*((__global char4 *)(dst + dst_index + 8)) = data_2;
}
}
// Masked bitwise AND of src1 with the scalar src2 for pixels made of three interleaved
// ushort channels (6 bytes per pixel). Each work-item covers two consecutive pixels
// (12 bytes) of one row, handled as three ushort2 vectors; the middle vector holds the
// last channel of pixel 0 and the first channel of pixel 1.
//   src1, dst, mask (+ *_step, *_offset) : base pointers with the row stride and start
//                                          offset; src1/dst indices are byte offsets
//   src2       : scalar operand; only .x, .y and .z are read
//   rows, cols : upper bounds for get_global_id(1) / get_global_id(0); cols is compared
//                before x is scaled by 2, i.e. it bounds the work-item index
//   dst_step1  : added to dst_offset to form the exclusive end of the writable row span
//                (presumably the row length in bytes -- TODO confirm against the host code)
__kernel void arithm_s_bitwise_and_with_mask_C3_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
// From here on x is the index of the first of the two pixels of this work-item.
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
// 0 or 1: number of pixels by which the 2-pixel group is moved back; the same shift is
// applied to the src1, mask and dst indices so that they stay in step.
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
// [dst_start, dst_end) is the byte range of this row that may be overwritten.
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
// Pointers are cast to char* first because the indices are expressed in bytes.
ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
// The three scalar channels repeated over six elements: x y | z x | y z.
ushort2 src2_data_0 = (ushort2)(src2.x, src2.y);
ushort2 src2_data_1 = (ushort2)(src2.z, src2.x);
ushort2 src2_data_2 = (ushort2)(src2.y, src2.z);
// One mask byte per pixel: .x gates pixel 0, .y gates pixel 1.
uchar2 mask_data = vload2(0, mask + mask_index);
// Current destination contents, kept for every element that is not selected below.
ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
ushort2 tmp_data_0 = src1_data_0 & src2_data_0;
ushort2 tmp_data_1 = src1_data_1 & src2_data_1;
ushort2 tmp_data_2 = src1_data_2 & src2_data_2;
// Pixel 0 (bytes 0..5): data_0.xy plus data_1.x; data_0 checks only the lower bound.
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
// Pixel 1 (bytes 6..11): data_1.y plus data_2.xy.
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
// All 12 bytes are written back; unselected elements carry their previously read value.
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
// Masked bitwise AND of src1 with the scalar src2 for pixels made of three interleaved
// short channels (6 bytes per pixel). Each work-item covers two consecutive pixels
// (12 bytes) of one row, handled as three short2 vectors; the middle vector holds the
// last channel of pixel 0 and the first channel of pixel 1.
//   src1, dst, mask (+ *_step, *_offset) : base pointers with the row stride and start
//                                          offset; src1/dst indices are byte offsets
//   src2       : scalar operand; only .x, .y and .z are read
//   rows, cols : upper bounds for get_global_id(1) / get_global_id(0); cols is compared
//                before x is scaled by 2, i.e. it bounds the work-item index
//   dst_step1  : added to dst_offset to form the exclusive end of the writable row span
//                (presumably the row length in bytes -- TODO confirm against the host code)
__kernel void arithm_s_bitwise_and_with_mask_C3_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
// From here on x is the index of the first of the two pixels of this work-item.
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
// 0 or 1: number of pixels by which the 2-pixel group is moved back; the same shift is
// applied to the src1, mask and dst indices so that they stay in step.
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
// [dst_start, dst_end) is the byte range of this row that may be overwritten.
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
// Pointers are cast to char* first because the indices are expressed in bytes.
short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
// The three scalar channels repeated over six elements: x y | z x | y z.
short2 src2_data_0 = (short2)(src2.x, src2.y);
short2 src2_data_1 = (short2)(src2.z, src2.x);
short2 src2_data_2 = (short2)(src2.y, src2.z);
// One mask byte per pixel: .x gates pixel 0, .y gates pixel 1.
uchar2 mask_data = vload2(0, mask + mask_index);
// Current destination contents, kept for every element that is not selected below.
short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
short2 tmp_data_0 = src1_data_0 & src2_data_0;
short2 tmp_data_1 = src1_data_1 & src2_data_1;
short2 tmp_data_2 = src1_data_2 & src2_data_2;
// Pixel 0 (bytes 0..5): data_0.xy plus data_1.x; data_0 checks only the lower bound.
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
// Pixel 1 (bytes 6..11): data_1.y plus data_2.xy.
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
// All 12 bytes are written back; unselected elements carry their previously read value.
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
// Masked bitwise AND of a 3 x int pixel with the scalar src2 (.x/.y/.z are used).
// One work-item handles one 12-byte pixel: where the mask byte is non-zero the pixel
// becomes src1 & src2, otherwise the value already stored in dst is written back.
// dst_step1 is accepted for signature parity but is not read by this kernel.
__kernel void arithm_s_bitwise_and_with_mask_C3_D4 (
    __global int *src1, int src1_step, int src1_offset,
    __global int *dst, int dst_step, int dst_offset,
    __global uchar *mask, int mask_step, int mask_offset,
    int4 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the image have nothing to do.
    if (col >= cols || row >= rows)
        return;

    // Steps and offsets are byte quantities, hence the detour through char*.
    __global int *in_px = (__global int *)((__global char *)src1 + mad24(row, src1_step, (col * 12) + src1_offset));
    __global int *out_px = (__global int *)((__global char *)dst + mad24(row, dst_step, dst_offset + (col * 12)));

    // Read every input before the first store.
    const int in_c0 = in_px[0];
    const int in_c1 = in_px[1];
    const int in_c2 = in_px[2];

    const uchar selected = mask[mad24(row, mask_step, col + mask_offset)];

    const int old_c0 = out_px[0];
    const int old_c1 = out_px[1];
    const int old_c2 = out_px[2];

    // The pixel is always stored; unselected pixels get their old value back.
    out_px[0] = selected ? (in_c0 & src2.x) : old_c0;
    out_px[1] = selected ? (in_c1 & src2.y) : old_c1;
    out_px[2] = selected ? (in_c2 & src2.z) : old_c2;
}
// Masked bitwise AND of a 12-byte pixel (three 4-byte channels, each treated as a
// char4) with the scalar src2, whose first twelve bytes (s0..sb) are used.
// One work-item handles one pixel: where the mask byte is non-zero the pixel becomes
// src1 & src2, otherwise the value already stored in dst is written back.
// dst_step1 is accepted for signature parity but is not read by this kernel.
__kernel void arithm_s_bitwise_and_with_mask_C3_D5 (
    __global char *src1, int src1_step, int src1_offset,
    __global char *dst, int dst_step, int dst_offset,
    __global uchar *mask, int mask_step, int mask_offset,
    char16 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the image have nothing to do.
    if (col >= cols || row >= rows)
        return;

    __global char4 *in_vec = (__global char4 *)((__global char *)src1 + mad24(row, src1_step, (col * 12) + src1_offset));
    __global char4 *out_vec = (__global char4 *)((__global char *)dst + mad24(row, dst_step, dst_offset + (col * 12)));

    // Read every input before the first store; consecutive char4 elements are 4 bytes apart.
    const char4 in_0 = in_vec[0];
    const char4 in_1 = in_vec[1];
    const char4 in_2 = in_vec[2];

    const uchar selected = mask[mad24(row, mask_step, col + mask_offset)];

    const char4 old_0 = out_vec[0];
    const char4 old_1 = out_vec[1];
    const char4 old_2 = out_vec[2];

    const char4 and_0 = in_0 & src2.s0123;
    const char4 and_1 = in_1 & src2.s4567;
    const char4 and_2 = in_2 & src2.s89ab;

    // The pixel is always stored; unselected pixels get their old value back.
    out_vec[0] = selected ? and_0 : old_0;
    out_vec[1] = selected ? and_1 : old_1;
    out_vec[2] = selected ? and_2 : old_2;
}
#if defined (DOUBLE_SUPPORT)
// Masked bitwise AND of a 24-byte pixel (three 8-byte channels, each treated as a
// short4) with the scalar src2, whose first twelve elements (s0..sb) are used.
// One work-item handles one pixel: where the mask byte is non-zero the pixel becomes
// src1 & src2, otherwise the value already stored in dst is written back.
// dst_step1 is accepted for signature parity but is not read by this kernel.
__kernel void arithm_s_bitwise_and_with_mask_C3_D6 (
    __global short *src1, int src1_step, int src1_offset,
    __global short *dst, int dst_step, int dst_offset,
    __global uchar *mask, int mask_step, int mask_offset,
    short16 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the image have nothing to do.
    if (col >= cols || row >= rows)
        return;

    __global short4 *in_vec = (__global short4 *)((__global char *)src1 + mad24(row, src1_step, (col * 24) + src1_offset));
    __global short4 *out_vec = (__global short4 *)((__global char *)dst + mad24(row, dst_step, dst_offset + (col * 24)));

    // Read every input before the first store; consecutive short4 elements are 8 bytes apart.
    const short4 in_0 = in_vec[0];
    const short4 in_1 = in_vec[1];
    const short4 in_2 = in_vec[2];

    const uchar selected = mask[mad24(row, mask_step, col + mask_offset)];

    const short4 old_0 = out_vec[0];
    const short4 old_1 = out_vec[1];
    const short4 old_2 = out_vec[2];

    const short4 and_0 = in_0 & src2.s0123;
    const short4 and_1 = in_1 & src2.s4567;
    const short4 and_2 = in_2 & src2.s89ab;

    // The pixel is always stored; unselected pixels get their old value back.
    out_vec[0] = selected ? and_0 : old_0;
    out_vec[1] = selected ? and_1 : old_1;
    out_vec[2] = selected ? and_2 : old_2;
}
#endif
__kernel void arithm_s_bitwise_and_with_mask_C4_D0 (
__kernel void arithm_s_bitwise_binary_with_mask_C4_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -917,7 +548,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D0 (
uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = src_data1 & src2;
uchar4 data = src_data1 OP_BINARY src2;
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
@ -925,7 +556,7 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D0 (
}
__kernel void arithm_s_bitwise_and_with_mask_C4_D1 (
__kernel void arithm_s_bitwise_binary_with_mask_C4_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -946,14 +577,14 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D1 (
char4 src_data1 = *((__global char4 *)(src1 + src1_index));
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 data = src_data1 & src2;
char4 data = src_data1 OP_BINARY src2;
data = mask_data ? data : dst_data;
*((__global char4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_with_mask_C4_D2 (
__kernel void arithm_s_bitwise_binary_with_mask_C4_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -974,13 +605,13 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D2 (
ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = src_data1 & src2;
ushort4 data = src_data1 OP_BINARY src2;
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_with_mask_C4_D3 (
__kernel void arithm_s_bitwise_binary_with_mask_C4_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -1001,13 +632,13 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D3 (
short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = src_data1 & src2;
short4 data = src_data1 OP_BINARY src2;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_with_mask_C4_D4 (
__kernel void arithm_s_bitwise_binary_with_mask_C4_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -1028,13 +659,13 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D4 (
int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = src_data1 & src2;
int4 data = src_data1 OP_BINARY src2;
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_with_mask_C4_D5 (
__kernel void arithm_s_bitwise_binary_with_mask_C4_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -1057,14 +688,14 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D5 (
src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf);
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
char16 data = src1_data & src2_data;
char16 data = src1_data OP_BINARY src2_data;
data = mask_data ? data : dst_data;
*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_and_with_mask_C4_D6 (
__kernel void arithm_s_bitwise_binary_with_mask_C4_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
@ -1097,10 +728,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D6 (
short4 dst_data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16));
short4 dst_data_3 = *((__global short4 *)((__global char *)dst + dst_index + 24));
short4 data_0 = src1_data_0 & src2_data_0;
short4 data_1 = src1_data_1 & src2_data_1;
short4 data_2 = src1_data_2 & src2_data_2;
short4 data_3 = src1_data_3 & src2_data_3;
short4 data_0 = src1_data_0 OP_BINARY src2_data_0;
short4 data_1 = src1_data_1 OP_BINARY src2_data_1;
short4 data_2 = src1_data_2 OP_BINARY src2_data_2;
short4 data_3 = src1_data_3 OP_BINARY src2_data_3;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;

@ -1,294 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jiang Liyuan, jlyuan001.good@163.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_OR////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_or without mask**************************************/
// Element-wise bitwise OR of two uchar arrays, four bytes per work-item.
// The destination is accessed as a uchar4 at a position rounded down to a multiple of
// four; both sources are shifted back by the same amount (dst_align) so lanes line up.
// Only lanes whose byte position lies inside [dst_start, dst_end) receive a new value.
// dst_step1 is added to dst_offset to form the exclusive end of the writable row span.
__kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                    __global uchar *src2, int src2_step, int src2_offset,
                                    __global uchar *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x >= cols || y >= rows)
        return;

    // Each work-item covers four consecutive bytes.
    x <<= 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
    const int lhs_pos = mad24(y, src1_step, x + src1_offset - dst_align);
    const int rhs_pos = mad24(y, src2_step, x + src2_offset - dst_align);

    const int row_begin = mad24(y, dst_step, dst_offset);
    const int row_end = mad24(y, dst_step, dst_offset + dst_step1);
    const int out_pos = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);

    // A negative source position is clamped to 0 for the load ...
    uchar4 lhs = vload4(0, src1 + ((lhs_pos < 0) ? 0 : lhs_pos));
    uchar4 rhs = vload4(0, src2 + ((rhs_pos < 0) ? 0 : rhs_pos));

    // ... and the loaded lanes are rotated so element 0 ends up in the lane it would
    // have occupied had the load started at the negative position.
    if (lhs_pos == -1)
        lhs = lhs.wxyz;
    else if (lhs_pos == -2)
        lhs = lhs.zwxy;
    else if (lhs_pos < 0)
        lhs = lhs.yzwx;

    if (rhs_pos == -1)
        rhs = rhs.wxyz;
    else if (rhs_pos == -2)
        rhs = rhs.zwxy;
    else if (rhs_pos < 0)
        rhs = rhs.yzwx;

    __global uchar4 *out = (__global uchar4 *)(dst + out_pos);
    uchar4 result = *out;
    const uchar4 ored = lhs | rhs;

    // Replace only the lanes that fall inside the writable span of this row.
    if ((out_pos + 0 >= row_begin) && (out_pos + 0 < row_end))
        result.x = ored.x;
    if ((out_pos + 1 >= row_begin) && (out_pos + 1 < row_end))
        result.y = ored.y;
    if ((out_pos + 2 >= row_begin) && (out_pos + 2 < row_end))
        result.z = ored.z;
    if ((out_pos + 3 >= row_begin) && (out_pos + 3 < row_end))
        result.w = ored.w;

    *out = result;
}
// Element-wise bitwise OR of two char arrays, four bytes per work-item.
// The destination is accessed as a char4 at a position rounded down to a multiple of
// four; both sources are shifted back by the same amount (dst_align) so lanes line up.
// Only lanes whose byte position lies inside [dst_start, dst_end) receive a new value.
//   src1, src2, dst (+ *_step, *_offset) : base pointers with row stride and start offset
//   rows, cols : upper bounds for get_global_id(1) / get_global_id(0)
//   dst_step1  : added to dst_offset to form the exclusive end of the writable row span
__kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1_offset,
                                    __global char *src2, int src2_step, int src2_offset,
                                    __global char *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        // Each work-item covers four consecutive bytes.
        x = x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

        // Subtracting dst_align can make a source index negative (down to -3) when the
        // source offset is smaller than the destination misalignment. Loading from a
        // negative index would read in front of the buffer, so the load is clamped to 0,
        // exactly as the uchar variant of this kernel does.
        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        char4 src1_data = vload4(0, src1 + src1_index_fix);
        char4 src2_data = vload4(0, src2 + src2_index_fix);

        // After a clamped load, rotate the lanes so that element 0 sits in the lane it
        // would have occupied with the negative index. The lanes that wrap around hold
        // unrelated data, but they map to bytes before dst_start and are never stored.
        if (src1_index < 0)
        {
            char4 tmp;
            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy : src1_data.yzwx;
            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz : tmp.xyzw;
        }
        if (src2_index < 0)
        {
            char4 tmp;
            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy : src2_data.yzwx;
            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz : tmp.xyzw;
        }

        char4 dst_data = *((__global char4 *)(dst + dst_index));
        char4 tmp_data = src1_data | src2_data;

        // Replace only the lanes that fall inside the writable span of this row.
        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global char4 *)(dst + dst_index)) = dst_data;
    }
}
// Element-wise bitwise OR of two ushort arrays, four elements (8 bytes) per work-item.
// The destination is accessed as a ushort4 at a byte position rounded down to a multiple
// of eight; both sources are shifted back by the same number of elements (dst_align) so
// lanes line up. Only lanes whose byte position lies inside [dst_start, dst_end) receive
// a new value.
//   src1, src2, dst (+ *_step, *_offset) : base pointers with row stride and start
//                                          offset; all index arithmetic is in bytes
//   rows, cols : upper bounds for get_global_id(1) / get_global_id(0)
//   dst_step1  : added to dst_offset to form the exclusive end of the writable row span
__kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                    __global ushort *src2, int src2_step, int src2_offset,
                                    __global ushort *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        // Each work-item covers four consecutive elements.
        x = x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);

        // Subtracting (dst_align << 1) can make a source byte index negative (down to -6)
        // when the source offset is smaller than the destination misalignment. Loading
        // from a negative index would read in front of the buffer, so the load is clamped
        // to 0, following the clamp-and-rotate scheme of the uchar variant of this kernel.
        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));

        // After a clamped load, rotate the lanes by the number of missing elements
        // (-2 bytes = 1 element, -4 = 2, otherwise 3) so that element 0 sits in the lane
        // it would have occupied with the negative index. Assumes offsets are multiples
        // of the 2-byte element size. The lanes that wrap around hold unrelated data, but
        // they map to bytes before dst_start and are never stored.
        if (src1_index < 0)
        {
            ushort4 tmp;
            tmp.xyzw = (src1_index == -4) ? src1_data.zwxy : src1_data.yzwx;
            src1_data.xyzw = (src1_index == -2) ? src1_data.wxyz : tmp.xyzw;
        }
        if (src2_index < 0)
        {
            ushort4 tmp;
            tmp.xyzw = (src2_index == -4) ? src2_data.zwxy : src2_data.yzwx;
            src2_data.xyzw = (src2_index == -2) ? src2_data.wxyz : tmp.xyzw;
        }

        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
        ushort4 tmp_data = src1_data | src2_data;

        // Replace only the lanes that fall inside the writable span of this row.
        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
    }
}
// Element-wise bitwise OR of two short arrays, four elements (8 bytes) per work-item.
// The destination is accessed as a short4 at a byte position rounded down to a multiple
// of eight; both sources are shifted back by the same number of elements (dst_align) so
// lanes line up. Only lanes whose byte position lies inside [dst_start, dst_end) receive
// a new value.
//   src1, src2, dst (+ *_step, *_offset) : base pointers with row stride and start
//                                          offset; all index arithmetic is in bytes
//   rows, cols : upper bounds for get_global_id(1) / get_global_id(0)
//   dst_step1  : added to dst_offset to form the exclusive end of the writable row span
__kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src1_offset,
                                    __global short *src2, int src2_step, int src2_offset,
                                    __global short *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        // Each work-item covers four consecutive elements.
        x = x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);

        // Subtracting (dst_align << 1) can make a source byte index negative (down to -6)
        // when the source offset is smaller than the destination misalignment. Loading
        // from a negative index would read in front of the buffer, so the load is clamped
        // to 0, following the clamp-and-rotate scheme of the uchar variant of this kernel.
        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));

        // After a clamped load, rotate the lanes by the number of missing elements
        // (-2 bytes = 1 element, -4 = 2, otherwise 3) so that element 0 sits in the lane
        // it would have occupied with the negative index. Assumes offsets are multiples
        // of the 2-byte element size. The lanes that wrap around hold unrelated data, but
        // they map to bytes before dst_start and are never stored.
        if (src1_index < 0)
        {
            short4 tmp;
            tmp.xyzw = (src1_index == -4) ? src1_data.zwxy : src1_data.yzwx;
            src1_data.xyzw = (src1_index == -2) ? src1_data.wxyz : tmp.xyzw;
        }
        if (src2_index < 0)
        {
            short4 tmp;
            tmp.xyzw = (src2_index == -4) ? src2_data.zwxy : src2_data.yzwx;
            src2_data.xyzw = (src2_index == -2) ? src2_data.wxyz : tmp.xyzw;
        }

        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
        short4 tmp_data = src1_data | src2_data;

        // Replace only the lanes that fall inside the writable span of this row.
        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
    }
}
// Element-wise bitwise OR of two buffers of 32-bit integers: dst = src1 | src2.
// One work-item handles one element at (get_global_id(0), get_global_id(1)).
// Steps and offsets are applied to char-pointer casts, i.e. used as byte counts.
// dst_step1 is not used by this kernel.
__kernel void arithm_bitwise_or_D4 (__global int *src1, int src1_step, int src1_offset,
                                    __global int *src2, int src2_step, int src2_offset,
                                    __global int *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the cols x rows range do nothing.
    if (col >= cols || row >= rows)
        return;

    // 4 bytes per element.
    const int col_bytes = col << 2;

    __global int *lhs = (__global int *)((__global char *)src1 + mad24(row, src1_step, col_bytes + src1_offset));
    __global int *rhs = (__global int *)((__global char *)src2 + mad24(row, src2_step, col_bytes + src2_offset));
    __global int *out = (__global int *)((__global char *)dst + mad24(row, dst_step, col_bytes + dst_offset));

    *out = *lhs | *rhs;
}
// Element-wise bitwise OR of two buffers of 4-byte elements, handled as raw
// bytes (char4): dst = src1 | src2. One work-item per element.
// NOTE(review): the D5 suffix presumably denotes 32-bit float data - confirm with the host code.
// Steps and offsets are used as byte counts. dst_step1 is not used by this kernel.
__kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1_offset,
                                    __global char *src2, int src2_step, int src2_offset,
                                    __global char *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the cols x rows range do nothing.
    if (col >= cols || row >= rows)
        return;

    // 4 bytes per element.
    const int col_bytes = col << 2;

    __global char4 *lhs = (__global char4 *)((__global char *)src1 + mad24(row, src1_step, col_bytes + src1_offset));
    __global char4 *rhs = (__global char4 *)((__global char *)src2 + mad24(row, src2_step, col_bytes + src2_offset));
    __global char4 *out = (__global char4 *)((__global char *)dst + mad24(row, dst_step, col_bytes + dst_offset));

    const char4 a = *lhs;
    const char4 b = *rhs;
    *out = a | b;
}
#if defined (DOUBLE_SUPPORT)
// Element-wise bitwise OR of two buffers of 8-byte elements, handled as raw
// bytes (char8): dst = src1 | src2. One work-item per element.
// Only compiled when DOUBLE_SUPPORT is defined.
// Steps and offsets are used as byte counts. dst_step1 is not used by this kernel.
__kernel void arithm_bitwise_or_D6 (__global char *src1, int src1_step, int src1_offset,
                                    __global char *src2, int src2_step, int src2_offset,
                                    __global char *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the cols x rows range do nothing.
    if (col >= cols || row >= rows)
        return;

    // 8 bytes per element.
    const int col_bytes = col << 3;

    __global char8 *lhs = (__global char8 *)((__global char *)src1 + mad24(row, src1_step, col_bytes + src1_offset));
    __global char8 *rhs = (__global char8 *)((__global char *)src2 + mad24(row, src2_step, col_bytes + src2_offset));
    __global char8 *out = (__global char8 *)((__global char *)dst + mad24(row, dst_step, col_bytes + dst_offset));

    const char8 a = *lhs;
    const char8 b = *rhs;
    *out = a | b;
}
#endif

File diff suppressed because it is too large Load Diff

@ -1,973 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jiang Liyuan, jlyuan001.good@163.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_OR////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************or with scalar without mask**************************************/
// Bitwise OR of an 8-bit unsigned single-channel buffer with a scalar:
// dst = src1 | src2.x. Each work-item processes four consecutive bytes of row y.
// Steps and offsets are added directly to uchar pointers (byte quantities).
// dst_step1 is added to the row start to form dst_end, bounding the writable row span.
__kernel void arithm_s_bitwise_or_C1_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
// cols bounds get_global_id(0); since x is scaled by 4 below it counts 4-byte groups.
if (x < cols && y < rows)
{
// Index of the first element of this work-item's 4-element group.
x = x << 2;
// dst_align is redefined per kernel; the macro stays defined after this function.
#ifdef dst_align
#undef dst_align
#endif
// Number of bytes by which dst_offset is past a 4-byte boundary (0..3).
#define dst_align (dst_offset & 3)
// The source offset is moved back by dst_align so its lanes match the rounded-down dst vector.
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
// Byte range [dst_start, dst_end) of row y that may be written.
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
// '+' binds tighter than '&': (dst_offset + x) is rounded down to a multiple of 4
// before the row base y*dst_step is added.
// NOTE(review): the uchar4 access is only 4-byte aligned if dst_step is a multiple of 4 - confirm.
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
// vload4 is used because the source address need not be vector-aligned.
uchar4 src1_data = vload4(0, src1 + src1_index);
// Only the first scalar component is used, replicated to all four lanes.
uchar4 src2_data = (uchar4)(src2.x, src2.x, src2.x, src2.x);
// Current destination contents, kept for lanes outside the valid range.
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data | src2_data;
// Replace a lane only if its byte address lies inside [dst_start, dst_end).
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
// Bitwise OR of an 8-bit signed single-channel buffer with a scalar:
// dst = src1 | src2.x. Each work-item processes four consecutive bytes of row y.
// Steps and offsets are added directly to char pointers (byte quantities).
// dst_step1 is added to the row start to form dst_end, bounding the writable row span.
__kernel void arithm_s_bitwise_or_C1_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
// cols bounds get_global_id(0); since x is scaled by 4 below it counts 4-byte groups.
if (x < cols && y < rows)
{
// Index of the first element of this work-item's 4-element group.
x = x << 2;
// dst_align is redefined per kernel; the macro stays defined after this function.
#ifdef dst_align
#undef dst_align
#endif
// Number of bytes by which dst_offset is past a 4-byte boundary (0..3).
#define dst_align (dst_offset & 3)
// The source offset is moved back by dst_align so its lanes match the rounded-down dst vector.
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
// Byte range [dst_start, dst_end) of row y that may be written.
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
// '+' binds tighter than '&': (dst_offset + x) is rounded down to a multiple of 4
// before the row base y*dst_step is added.
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
// vload4 is used because the source address need not be vector-aligned.
char4 src1_data = vload4(0, src1 + src1_index);
// Only the first scalar component is used, replicated to all four lanes.
char4 src2_data = (char4)(src2.x, src2.x, src2.x, src2.x);
// Current destination contents, kept for lanes outside the valid range.
char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data | src2_data;
// Replace a lane only if its byte address lies inside [dst_start, dst_end).
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global char4 *)(dst + dst_index)) = data;
}
}
// Bitwise OR of a 16-bit unsigned single-channel buffer with a scalar:
// dst = src1 | src2.x. Each work-item processes two consecutive ushorts (4 bytes) of row y.
// Steps and offsets are applied to char/uchar-pointer casts (byte quantities).
// dst_step1 is added to the row start to form dst_end, bounding the writable row span.
__kernel void arithm_s_bitwise_or_C1_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
// cols bounds get_global_id(0); since x is scaled by 2 below it counts 2-element groups.
if (x < cols && y < rows)
{
// Index of the first element of this work-item's 2-element group.
x = x << 1;
// dst_align is redefined per kernel; the macro stays defined after this function.
#ifdef dst_align
#undef dst_align
#endif
// 1 if dst_offset is one 16-bit element past a 4-byte boundary, else 0.
#define dst_align ((dst_offset >> 1) & 1)
// The source byte offset is moved back by dst_align elements to match the rounded-down dst vector.
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
// Byte range [dst_start, dst_end) of row y that may be written.
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
// '+' binds tighter than '&': (dst_offset + 2*x) is rounded down to a multiple of 4
// before the row base y*dst_step is added.
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
// vload2 is used because the source address need not be vector-aligned.
ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
// Only the first scalar component is used, replicated to both lanes.
ushort2 src2_data = (ushort2)(src2.x, src2.x);
// Current destination contents, kept for lanes outside the valid range.
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
ushort2 tmp_data = src1_data | src2_data;
// Lane x is only checked against the row start, lane y only against the row end.
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
// Bitwise OR of a 16-bit signed single-channel buffer with a scalar:
// dst = src1 | src2.x. Each work-item processes two consecutive shorts (4 bytes) of row y.
// Steps and offsets are applied to char/uchar-pointer casts (byte quantities).
// dst_step1 is added to the row start to form dst_end, bounding the writable row span.
__kernel void arithm_s_bitwise_or_C1_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
// cols bounds get_global_id(0); since x is scaled by 2 below it counts 2-element groups.
if (x < cols && y < rows)
{
// Index of the first element of this work-item's 2-element group.
x = x << 1;
// dst_align is redefined per kernel; the macro stays defined after this function.
#ifdef dst_align
#undef dst_align
#endif
// 1 if dst_offset is one 16-bit element past a 4-byte boundary, else 0.
#define dst_align ((dst_offset >> 1) & 1)
// The source byte offset is moved back by dst_align elements to match the rounded-down dst vector.
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
// Byte range [dst_start, dst_end) of row y that may be written.
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
// '+' binds tighter than '&': (dst_offset + 2*x) is rounded down to a multiple of 4
// before the row base y*dst_step is added.
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
// vload2 is used because the source address need not be vector-aligned.
short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
// Only the first scalar component is used, replicated to both lanes.
short2 src2_data = (short2)(src2.x, src2.x);
// Current destination contents, kept for lanes outside the valid range.
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data | src2_data;
// Lane x is only checked against the row start, lane y only against the row end.
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
// Bitwise OR of a 32-bit integer single-channel buffer with a scalar:
// dst = src1 | src2.x. One work-item per element.
// Steps and offsets are used as byte counts. dst_step1 is not used by this kernel.
__kernel void arithm_s_bitwise_or_C1_D4 (
    __global int *src1, int src1_step, int src1_offset,
    __global int *dst, int dst_step, int dst_offset,
    int4 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the cols x rows range do nothing.
    if (col >= cols || row >= rows)
        return;

    // 4 bytes per element.
    const int col_bytes = col << 2;

    __global int *in = (__global int *)((__global char *)src1 + mad24(row, src1_step, col_bytes + src1_offset));
    __global int *out = (__global int *)((__global char *)dst + mad24(row, dst_step, col_bytes + dst_offset));

    // Only the first scalar component applies to single-channel data.
    *out = *in | src2.x;
}
// Bitwise OR of a single-channel buffer of 4-byte elements with a scalar, both
// handled as raw bytes: dst = src1 | src2.s0123. One work-item per element.
// Steps and offsets are used as byte counts. dst_step1 is not used by this kernel.
__kernel void arithm_s_bitwise_or_C1_D5 (
    __global char *src1, int src1_step, int src1_offset,
    __global char *dst, int dst_step, int dst_offset,
    char16 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the cols x rows range do nothing.
    if (col >= cols || row >= rows)
        return;

    // 4 bytes per element.
    const int col_bytes = col << 2;

    __global char4 *in = (__global char4 *)((__global char *)src1 + mad24(row, src1_step, col_bytes + src1_offset));
    __global char4 *out = (__global char4 *)((__global char *)dst + mad24(row, dst_step, col_bytes + dst_offset));

    // The first four bytes of src2 hold the scalar for the single channel.
    const char4 scalar_bytes = src2.s0123;
    const char4 pixel_bytes = *in;
    *out = pixel_bytes | scalar_bytes;
}
#if defined (DOUBLE_SUPPORT)
// Bitwise OR of a single-channel buffer of 8-byte elements with a scalar, both
// handled as four 16-bit words: dst = src1 | src2.s0123. One work-item per element.
// Only compiled when DOUBLE_SUPPORT is defined.
// Steps and offsets are used as byte counts. dst_step1 is not used by this kernel.
__kernel void arithm_s_bitwise_or_C1_D6 (
    __global short *src1, int src1_step, int src1_offset,
    __global short *dst, int dst_step, int dst_offset,
    short16 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the cols x rows range do nothing.
    if (col >= cols || row >= rows)
        return;

    // 8 bytes per element.
    const int col_bytes = col << 3;

    __global short4 *in = (__global short4 *)((__global char *)src1 + mad24(row, src1_step, col_bytes + src1_offset));
    __global short4 *out = (__global short4 *)((__global char *)dst + mad24(row, dst_step, col_bytes + dst_offset));

    // The first four 16-bit words of src2 hold the scalar for the single channel.
    const short4 scalar_words = src2.s0123;
    const short4 pixel_words = *in;
    *out = pixel_words | scalar_words;
}
#endif
// Bitwise OR of an 8-bit unsigned two-channel buffer with a scalar:
// dst = src1 | (src2.x, src2.y). Each work-item processes two consecutive
// 2-byte pixels (4 bytes) of row y.
// Steps and offsets are added directly to uchar pointers (byte quantities).
// dst_step1 is added to the row start to form dst_end, bounding the writable row span.
__kernel void arithm_s_bitwise_or_C2_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
// cols bounds get_global_id(0); since x is scaled by 2 below it counts 2-pixel groups.
if (x < cols && y < rows)
{
// Index of the first pixel of this work-item's 2-pixel group.
x = x << 1;
// dst_align is redefined per kernel; the macro stays defined after this function.
#ifdef dst_align
#undef dst_align
#endif
// 1 if dst_offset is one 2-byte pixel past a 4-byte boundary, else 0.
#define dst_align ((dst_offset >> 1) & 1)
// The source byte offset is moved back by dst_align pixels to match the rounded-down dst vector.
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
// Byte range [dst_start, dst_end) of row y that may be written.
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
// '+' binds tighter than '&': (dst_offset + 2*x) is rounded down to a multiple of 4
// before the row base y*dst_step is added.
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
// vload4 is used because the source address need not be vector-aligned.
uchar4 src1_data = vload4(0, src1 + src1_index);
// Scalar channels (x, y) repeated for the two pixels in the vector.
uchar4 src2_data = (uchar4)(src2.x, src2.y, src2.x, src2.y);
// Current destination contents, kept for pixels outside the valid range.
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data | src2_data;
// First pixel (lanes xy) is only checked against the row start,
// second pixel (lanes zw) only against the row end.
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
// Bitwise OR of an 8-bit signed two-channel buffer with a scalar:
// dst = src1 | (src2.x, src2.y). Each work-item processes two consecutive
// 2-byte pixels (4 bytes) of row y.
// Steps and offsets are added directly to char pointers (byte quantities).
// dst_step1 is added to the row start to form dst_end, bounding the writable row span.
__kernel void arithm_s_bitwise_or_C2_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
// cols bounds get_global_id(0); since x is scaled by 2 below it counts 2-pixel groups.
if (x < cols && y < rows)
{
// Index of the first pixel of this work-item's 2-pixel group.
x = x << 1;
// dst_align is redefined per kernel; the macro stays defined after this function.
#ifdef dst_align
#undef dst_align
#endif
// 1 if dst_offset is one 2-byte pixel past a 4-byte boundary, else 0.
#define dst_align ((dst_offset >> 1) & 1)
// The source byte offset is moved back by dst_align pixels to match the rounded-down dst vector.
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
// Byte range [dst_start, dst_end) of row y that may be written.
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
// '+' binds tighter than '&': (dst_offset + 2*x) is rounded down to a multiple of 4
// before the row base y*dst_step is added.
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
// vload4 is used because the source address need not be vector-aligned.
char4 src1_data = vload4(0, src1 + src1_index);
// Scalar channels (x, y) repeated for the two pixels in the vector.
char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y);
// Current destination contents, kept for pixels outside the valid range.
char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data | src2_data;
// First pixel (lanes xy) is only checked against the row start,
// second pixel (lanes zw) only against the row end.
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
*((__global char4 *)(dst + dst_index)) = data;
}
}
// Bitwise OR of a 16-bit unsigned two-channel buffer with a scalar:
// dst = src1 | (src2.x, src2.y). One work-item per 4-byte pixel.
// Steps and offsets are used as byte counts. dst_step1 is not used by this kernel.
__kernel void arithm_s_bitwise_or_C2_D2 (
    __global ushort *src1, int src1_step, int src1_offset,
    __global ushort *dst, int dst_step, int dst_offset,
    ushort4 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the cols x rows range do nothing.
    if (col >= cols || row >= rows)
        return;

    // 2 channels x 2 bytes = 4 bytes per pixel.
    const int col_bytes = col << 2;

    __global ushort2 *in = (__global ushort2 *)((__global char *)src1 + mad24(row, src1_step, col_bytes + src1_offset));
    __global ushort2 *out = (__global ushort2 *)((__global char *)dst + mad24(row, dst_step, col_bytes + dst_offset));

    const ushort2 pixel = *in;
    *out = pixel | src2.xy;
}
// Bitwise OR of a 16-bit signed two-channel buffer with a scalar:
// dst = src1 | (src2.x, src2.y). One work-item per 4-byte pixel.
// Steps and offsets are used as byte counts. dst_step1 is not used by this kernel.
__kernel void arithm_s_bitwise_or_C2_D3 (
    __global short *src1, int src1_step, int src1_offset,
    __global short *dst, int dst_step, int dst_offset,
    short4 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the cols x rows range do nothing.
    if (col >= cols || row >= rows)
        return;

    // 2 channels x 2 bytes = 4 bytes per pixel.
    const int col_bytes = col << 2;

    __global short2 *in = (__global short2 *)((__global char *)src1 + mad24(row, src1_step, col_bytes + src1_offset));
    __global short2 *out = (__global short2 *)((__global char *)dst + mad24(row, dst_step, col_bytes + dst_offset));

    const short2 pixel = *in;
    *out = pixel | src2.xy;
}
// Bitwise OR of a 32-bit integer two-channel buffer with a scalar:
// dst = src1 | (src2.x, src2.y). One work-item per 8-byte pixel.
// Steps and offsets are used as byte counts. dst_step1 is not used by this kernel.
__kernel void arithm_s_bitwise_or_C2_D4 (__global int *src1, int src1_step, int src1_offset,
                                         __global int *dst, int dst_step, int dst_offset,
                                         int4 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the cols x rows range do nothing.
    if (col >= cols || row >= rows)
        return;

    // 2 channels x 4 bytes = 8 bytes per pixel.
    const int col_bytes = col << 3;

    __global int2 *in = (__global int2 *)((__global char *)src1 + mad24(row, src1_step, col_bytes + src1_offset));
    __global int2 *out = (__global int2 *)((__global char *)dst + mad24(row, dst_step, col_bytes + dst_offset));

    const int2 pixel = *in;
    *out = pixel | src2.xy;
}
// Bitwise OR of a two-channel buffer of 4-byte elements with a scalar, both
// handled as raw bytes: dst = src1 | (first 8 bytes of src2).
// One work-item per 8-byte pixel.
// Steps and offsets are used as byte counts. dst_step1 is not used by this kernel.
__kernel void arithm_s_bitwise_or_C2_D5 (
    __global char *src1, int src1_step, int src1_offset,
    __global char *dst, int dst_step, int dst_offset,
    char16 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the cols x rows range do nothing.
    if (col >= cols || row >= rows)
        return;

    // 2 channels x 4 bytes = 8 bytes per pixel.
    const int col_bytes = col << 3;

    __global char8 *in = (__global char8 *)((__global char *)src1 + mad24(row, src1_step, col_bytes + src1_offset));
    __global char8 *out = (__global char8 *)((__global char *)dst + mad24(row, dst_step, col_bytes + dst_offset));

    // Components s0..s7 of src2 carry the scalar bytes for the two channels.
    const char8 scalar_bytes = src2.s01234567;
    const char8 pixel_bytes = *in;
    *out = pixel_bytes | scalar_bytes;
}
#if defined (DOUBLE_SUPPORT)
// Bitwise OR of a two-channel buffer of 8-byte elements with a scalar, both
// handled as eight 16-bit words: dst = src1 | (src2.s0 .. src2.s7).
// One work-item per 16-byte pixel. Only compiled when DOUBLE_SUPPORT is defined.
// Steps and offsets are used as byte counts. dst_step1 is not used by this kernel.
__kernel void arithm_s_bitwise_or_C2_D6 (
    __global short *src1, int src1_step, int src1_offset,
    __global short *dst, int dst_step, int dst_offset,
    short16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    if (x < cols && y < rows)
    {
        // 2 channels x 8 bytes = 16 bytes per pixel.
        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
        int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
        short8 src1_data = *((__global short8 *)((__global char *)src1 + src1_index));
        // Components s0..s7 of src2 carry the scalar words for the two channels.
        short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
        // This is the OR kernel: combine with '|'. The previous '&' computed a
        // bitwise AND, unlike every other arithm_s_bitwise_or_* kernel.
        short8 tmp_data = src1_data | src2_data;
        *((__global short8 *)((__global char *)dst + dst_index)) = tmp_data;
    }
}
#endif
// Bitwise OR of an 8-bit unsigned three-channel buffer with a scalar:
// dst = src1 | (src2.x, src2.y, src2.z). Each work-item processes four
// consecutive 3-byte pixels (12 bytes, handled as three uchar4 vectors) of row y.
// Steps and offsets are added directly to uchar pointers (byte quantities).
// dst_step1 is added to the row start to form dst_end, bounding the writable row span.
__kernel void arithm_s_bitwise_or_C3_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
// cols bounds get_global_id(0); since x is scaled by 4 below it counts 4-pixel groups.
if (x < cols && y < rows)
{
// Index of the first pixel of this work-item's 4-pixel group.
x = x << 2;
// dst_align is redefined per kernel; the macro stays defined after this function.
#ifdef dst_align
#undef dst_align
#endif
// Pixel position of dst_offset within its row (offset mod step, 3 bytes per pixel), modulo 4.
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
// Both source and destination start dst_align pixels (3 bytes each) earlier,
// so the group begins on a multiple-of-4 pixel position.
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
// Byte range [dst_start, dst_end) of row y that may be written.
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
// vload4 is used because the source addresses need not be vector-aligned.
uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
// Scalar channels x,y,z repeated over 12 bytes: xyzx | yzxy | zxyz.
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
// Current destination contents, kept for pixels outside the valid range.
// NOTE(review): these uchar4 pointer accesses presume dst + dst_index is 4-byte aligned - confirm with host code.
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
uchar4 tmp_data_0 = src1_data_0 | src2_data_0 ;
uchar4 tmp_data_1 = src1_data_1 | src2_data_1 ;
uchar4 tmp_data_2 = src1_data_2 | src2_data_2 ;
// Pixels start at byte offsets 0, 3, 6 and 9 and straddle the three vectors;
// each pixel's lanes are replaced only if that pixel's first byte is inside the row range.
// Pixel 0 (bytes 0..2) is only checked against the row start.
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
// Pixel 1 (bytes 3..5).
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
// Pixel 2 (bytes 6..8).
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
// Pixel 3 (bytes 9..11).
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
*((__global uchar4 *)(dst + dst_index + 4)) = data_1;
*((__global uchar4 *)(dst + dst_index + 8)) = data_2;
}
}
// Bitwise OR of an 8-bit signed three-channel buffer with a scalar:
// dst = src1 | (src2.x, src2.y, src2.z). Each work-item processes four
// consecutive 3-byte pixels (12 bytes, handled as three char4 vectors) of row y.
// Steps and offsets are added directly to char pointers (byte quantities).
// dst_step1 is added to the row start to form dst_end, bounding the writable row span.
__kernel void arithm_s_bitwise_or_C3_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
// cols bounds get_global_id(0); since x is scaled by 4 below it counts 4-pixel groups.
if (x < cols && y < rows)
{
// Index of the first pixel of this work-item's 4-pixel group.
x = x << 2;
// dst_align is redefined per kernel; the macro stays defined after this function.
#ifdef dst_align
#undef dst_align
#endif
// Pixel position of dst_offset within its row (offset mod step, 3 bytes per pixel), modulo 4.
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
// Both source and destination start dst_align pixels (3 bytes each) earlier,
// so the group begins on a multiple-of-4 pixel position.
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
// Byte range [dst_start, dst_end) of row y that may be written.
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
// vload4 is used because the source addresses need not be vector-aligned.
char4 src1_data_0 = vload4(0, src1 + src1_index + 0);
char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
// Scalar channels x,y,z repeated over 12 bytes: xyzx | yzxy | zxyz.
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
// Current destination contents, kept for pixels outside the valid range.
// NOTE(review): these char4 pointer accesses presume dst + dst_index is 4-byte aligned - confirm with host code.
char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
char4 data_2 = *((__global char4 *)(dst + dst_index + 8));
char4 tmp_data_0 = src1_data_0 | src2_data_0;
char4 tmp_data_1 = src1_data_1 | src2_data_1;
char4 tmp_data_2 = src1_data_2 | src2_data_2;
// Pixels start at byte offsets 0, 3, 6 and 9 and straddle the three vectors;
// each pixel's lanes are replaced only if that pixel's first byte is inside the row range.
// Pixel 0 (bytes 0..2) is only checked against the row start.
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
// Pixel 1 (bytes 3..5).
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
// Pixel 2 (bytes 6..8).
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
// Pixel 3 (bytes 9..11).
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global char4 *)(dst + dst_index + 0)) = data_0;
*((__global char4 *)(dst + dst_index + 4)) = data_1;
*((__global char4 *)(dst + dst_index + 8)) = data_2;
}
}
// Bitwise OR of a 16-bit unsigned three-channel buffer with a scalar:
// dst = src1 | (src2.x, src2.y, src2.z). Each work-item processes two
// consecutive 6-byte pixels (12 bytes, handled as three ushort2 vectors) of row y.
// Steps and offsets are applied to char-pointer casts (byte quantities).
// dst_step1 is added to the row start to form dst_end, bounding the writable row span.
__kernel void arithm_s_bitwise_or_C3_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
// cols bounds get_global_id(0); since x is scaled by 2 below it counts 2-pixel groups.
if (x < cols && y < rows)
{
// Index of the first pixel of this work-item's 2-pixel group.
x = x << 1;
// dst_align is redefined per kernel; the macro stays defined after this function.
#ifdef dst_align
#undef dst_align
#endif
// Pixel position of dst_offset within its row (offset mod step, 6 bytes per pixel), modulo 2.
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
// Both source and destination start dst_align pixels (6 bytes each) earlier,
// so the group begins on an even pixel position.
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
// Byte range [dst_start, dst_end) of row y that may be written.
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
// vload2 is used because the source addresses need not be vector-aligned.
ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
// Scalar channels x,y,z repeated over two pixels: xy | zx | yz.
ushort2 src2_data_0 = (ushort2)(src2.x, src2.y);
ushort2 src2_data_1 = (ushort2)(src2.z, src2.x);
ushort2 src2_data_2 = (ushort2)(src2.y, src2.z);
// Current destination contents, kept for pixels outside the valid range.
// NOTE(review): these ushort2 pointer accesses presume dst + dst_index is 4-byte aligned - confirm with host code.
ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
ushort2 tmp_data_0 = src1_data_0 | src2_data_0 ;
ushort2 tmp_data_1 = src1_data_1 | src2_data_1 ;
ushort2 tmp_data_2 = src1_data_2 | src2_data_2 ;
// Pixel 0 occupies bytes 0..5 (data_0.xy and data_1.x), pixel 1 bytes 6..11
// (data_1.y and data_2.xy). A pixel's lanes are replaced only if the pixel's
// first byte passes the range test written on that line.
// The first two channels of pixel 0 are only checked against the row start.
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
// Bitwise OR of a 16-bit signed three-channel buffer with a scalar:
// dst = src1 | (src2.x, src2.y, src2.z). Each work-item processes two
// consecutive 6-byte pixels (12 bytes, handled as three short2 vectors) of row y.
// Steps and offsets are applied to char-pointer casts (byte quantities).
// dst_step1 is added to the row start to form dst_end, bounding the writable row span.
__kernel void arithm_s_bitwise_or_C3_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
// cols bounds get_global_id(0); since x is scaled by 2 below it counts 2-pixel groups.
if (x < cols && y < rows)
{
// Index of the first pixel of this work-item's 2-pixel group.
x = x << 1;
// dst_align is redefined per kernel; the macro stays defined after this function.
#ifdef dst_align
#undef dst_align
#endif
// Pixel position of dst_offset within its row (offset mod step, 6 bytes per pixel), modulo 2.
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
// Both source and destination start dst_align pixels (6 bytes each) earlier,
// so the group begins on an even pixel position.
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
// Byte range [dst_start, dst_end) of row y that may be written.
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
// vload2 is used because the source addresses need not be vector-aligned.
short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
// Scalar channels x,y,z repeated over two pixels: xy | zx | yz.
short2 src2_data_0 = (short2)(src2.x, src2.y);
short2 src2_data_1 = (short2)(src2.z, src2.x);
short2 src2_data_2 = (short2)(src2.y, src2.z);
// Current destination contents, kept for pixels outside the valid range.
// NOTE(review): these short2 pointer accesses presume dst + dst_index is 4-byte aligned - confirm with host code.
short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
short2 tmp_data_0 = src1_data_0 | src2_data_0 ;
short2 tmp_data_1 = src1_data_1 | src2_data_1 ;
short2 tmp_data_2 = src1_data_2 | src2_data_2 ;
// Pixel 0 occupies bytes 0..5 (data_0.xy and data_1.x), pixel 1 bytes 6..11
// (data_1.y and data_2.xy). A pixel's lanes are replaced only if the pixel's
// first byte passes the range test written on that line.
// The first two channels of pixel 0 are only checked against the row start.
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
// Bitwise OR of a 32-bit integer three-channel buffer with a scalar:
// dst = src1 | (src2.x, src2.y, src2.z). One work-item per 12-byte pixel.
// Steps and offsets are used as byte counts. dst_step1 is not used by this kernel.
__kernel void arithm_s_bitwise_or_C3_D4 (
    __global int *src1, int src1_step, int src1_offset,
    __global int *dst, int dst_step, int dst_offset,
    int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    if (x < cols && y < rows)
    {
        // 3 channels x 4 bytes = 12 bytes per pixel.
        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
        int dst_index = mad24(y, dst_step, dst_offset + (x * 12));

        // Load all three channels before storing anything.
        int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
        int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
        int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));

        // Every channel is overwritten unconditionally, so the destination is
        // not read first (the former dst loads were never used).
        int tmp_data_0 = src1_data_0 | src2.x;
        int tmp_data_1 = src1_data_1 | src2.y;
        int tmp_data_2 = src1_data_2 | src2.z;

        *((__global int *)((__global char *)dst + dst_index + 0)) = tmp_data_0;
        *((__global int *)((__global char *)dst + dst_index + 4)) = tmp_data_1;
        *((__global int *)((__global char *)dst + dst_index + 8)) = tmp_data_2;
    }
}
// Bitwise OR of a three-channel buffer of 4-byte elements with a scalar, both
// handled as raw bytes: dst = src1 | (first 12 bytes of src2).
// One work-item per 12-byte pixel, processed as three char4 pieces.
// Steps and offsets are used as byte counts. dst_step1 is not used by this kernel.
__kernel void arithm_s_bitwise_or_C3_D5 (
    __global char *src1, int src1_step, int src1_offset,
    __global char *dst, int dst_step, int dst_offset,
    char16 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the cols x rows range do nothing.
    if (col >= cols || row >= rows)
        return;

    // 3 channels x 4 bytes = 12 bytes per pixel.
    const int col_bytes = col * 12;

    // Consecutive char4 slots are 4 bytes apart: in[0..2] / out[0..2] cover the pixel.
    __global char4 *in = (__global char4 *)((__global char *)src1 + mad24(row, src1_step, col_bytes + src1_offset));
    __global char4 *out = (__global char4 *)((__global char *)dst + mad24(row, dst_step, dst_offset + col_bytes));

    // Read the whole pixel first, then write it back.
    const char4 chan0 = in[0];
    const char4 chan1 = in[1];
    const char4 chan2 = in[2];

    const char4 res0 = chan0 | src2.s0123;
    const char4 res1 = chan1 | src2.s4567;
    const char4 res2 = chan2 | src2.s89ab;

    out[0] = res0;
    out[1] = res1;
    out[2] = res2;
}
#if defined (DOUBLE_SUPPORT)
// Bitwise OR of a three-channel buffer of 8-byte elements with a scalar, both
// handled as 16-bit words: dst = src1 | (src2.s0 .. src2.sb).
// One work-item per 24-byte pixel, processed as three short4 pieces.
// Only compiled when DOUBLE_SUPPORT is defined.
// Steps and offsets are used as byte counts. dst_step1 is not used by this kernel.
__kernel void arithm_s_bitwise_or_C3_D6 (
    __global short *src1, int src1_step, int src1_offset,
    __global short *dst, int dst_step, int dst_offset,
    short16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    if (x < cols && y < rows)
    {
        // 3 channels x 8 bytes = 24 bytes per pixel.
        int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
        int dst_index = mad24(y, dst_step, dst_offset + (x * 24));

        // Load all three channels before storing anything.
        short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
        short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
        short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));

        // Four 16-bit words of src2 per channel.
        short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
        short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
        short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);

        // Every channel is overwritten unconditionally, so the destination is
        // not read first (the former dst loads were never used).
        short4 tmp_data_0 = src1_data_0 | src2_data_0;
        short4 tmp_data_1 = src1_data_1 | src2_data_1;
        short4 tmp_data_2 = src1_data_2 | src2_data_2;

        *((__global short4 *)((__global char *)dst + dst_index + 0 )) = tmp_data_0;
        *((__global short4 *)((__global char *)dst + dst_index + 8 )) = tmp_data_1;
        *((__global short4 *)((__global char *)dst + dst_index + 16)) = tmp_data_2;
    }
}
#endif
// Bitwise OR of an 8-bit unsigned four-channel buffer with a scalar:
// dst = src1 | src2 (all four components). One work-item per 4-byte pixel.
// Steps and offsets are added to uchar pointers (byte counts).
// dst_step1 is not used by this kernel.
__kernel void arithm_s_bitwise_or_C4_D0 (
    __global uchar *src1, int src1_step, int src1_offset,
    __global uchar *dst, int dst_step, int dst_offset,
    uchar4 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the cols x rows range do nothing.
    if (col >= cols || row >= rows)
        return;

    // 4 channels x 1 byte = 4 bytes per pixel.
    const int col_bytes = col << 2;

    __global uchar4 *in = (__global uchar4 *)(src1 + mad24(row, src1_step, col_bytes + src1_offset));
    __global uchar4 *out = (__global uchar4 *)(dst + mad24(row, dst_step, col_bytes + dst_offset));

    const uchar4 pixel = *in;
    *out = pixel | src2;
}
// Bitwise OR of an 8-bit signed four-channel buffer with a scalar:
// dst = src1 | src2 (all four components). One work-item per 4-byte pixel.
// Steps and offsets are added to char pointers (byte counts).
// dst_step1 is not used by this kernel.
__kernel void arithm_s_bitwise_or_C4_D1 (
    __global char *src1, int src1_step, int src1_offset,
    __global char *dst, int dst_step, int dst_offset,
    char4 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the cols x rows range do nothing.
    if (col >= cols || row >= rows)
        return;

    // 4 channels x 1 byte = 4 bytes per pixel.
    const int col_bytes = col << 2;

    __global char4 *in = (__global char4 *)(src1 + mad24(row, src1_step, col_bytes + src1_offset));
    __global char4 *out = (__global char4 *)(dst + mad24(row, dst_step, col_bytes + dst_offset));

    const char4 pixel = *in;
    *out = pixel | src2;
}
// Bitwise OR of a 16-bit unsigned four-channel buffer with a scalar:
// dst = src1 | src2 (all four components). One work-item per 8-byte pixel.
// Steps and offsets are used as byte counts. dst_step1 is not used by this kernel.
__kernel void arithm_s_bitwise_or_C4_D2 (
    __global ushort *src1, int src1_step, int src1_offset,
    __global ushort *dst, int dst_step, int dst_offset,
    ushort4 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the cols x rows range do nothing.
    if (col >= cols || row >= rows)
        return;

    // 4 channels x 2 bytes = 8 bytes per pixel.
    const int col_bytes = col << 3;

    __global ushort4 *in = (__global ushort4 *)((__global char *)src1 + mad24(row, src1_step, col_bytes + src1_offset));
    __global ushort4 *out = (__global ushort4 *)((__global char *)dst + mad24(row, dst_step, col_bytes + dst_offset));

    const ushort4 pixel = *in;
    *out = pixel | src2;
}
// Bitwise OR of a 16-bit signed four-channel buffer with a scalar:
// dst = src1 | src2 (all four components). One work-item per 8-byte pixel.
// Steps and offsets are used as byte counts. dst_step1 is not used by this kernel.
__kernel void arithm_s_bitwise_or_C4_D3 (
    __global short *src1, int src1_step, int src1_offset,
    __global short *dst, int dst_step, int dst_offset,
    short4 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the cols x rows range do nothing.
    if (col >= cols || row >= rows)
        return;

    // 4 channels x 2 bytes = 8 bytes per pixel.
    const int col_bytes = col << 3;

    __global short4 *in = (__global short4 *)((__global char *)src1 + mad24(row, src1_step, col_bytes + src1_offset));
    __global short4 *out = (__global short4 *)((__global char *)dst + mad24(row, dst_step, col_bytes + dst_offset));

    const short4 pixel = *in;
    *out = pixel | src2;
}
// Bitwise OR of every 16-byte int4 element of src1 with the constant vector src2.
//   src1 / dst        : base pointers; they are viewed as char pointers before the
//                       steps and offsets are added, so those are byte counts
//   src2              : value OR-ed into every element
//   rows / cols       : extent of the processed region, one work-item per element
//   dst_step1         : not used by this kernel
__kernel void arithm_s_bitwise_or_C4_D4 (
    __global int *src1, int src1_step, int src1_offset,
    __global int *dst, int dst_step, int dst_offset,
    int4 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the cols x rows region have nothing to do.
    if (col >= cols || row >= rows)
        return;

    // One element is an int4 (16 bytes), hence the column index is scaled by 16.
    const int col_bytes = col << 4;
    __global char *in_bytes  = (__global char *)src1 + mad24(row, src1_step, col_bytes + src1_offset);
    __global char *out_bytes = (__global char *)dst + mad24(row, dst_step, col_bytes + dst_offset);

    const int4 pixel = *((__global int4 *)in_bytes);
    *((__global int4 *)out_bytes) = pixel | src2;
}
// Bitwise OR of every 16-byte element of src1 with the 16-byte constant src2.
// The data is handled as raw bytes (char16), so only the bit pattern matters.
//   src1 / dst        : base pointers; the steps and offsets are added as byte counts
//   src2              : byte pattern OR-ed into every element
//   rows / cols       : extent of the processed region, one work-item per element
//   dst_step1         : not used by this kernel
__kernel void arithm_s_bitwise_or_C4_D5 (
    __global char *src1, int src1_step, int src1_offset,
    __global char *dst, int dst_step, int dst_offset,
    char16 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the cols x rows region have nothing to do.
    if (col >= cols || row >= rows)
        return;

    // One element is 16 bytes wide, hence the column index is scaled by 16.
    const int col_bytes = col << 4;
    __global char *in_bytes  = (__global char *)src1 + mad24(row, src1_step, col_bytes + src1_offset);
    __global char *out_bytes = (__global char *)dst + mad24(row, dst_step, col_bytes + dst_offset);

    const char16 pixel = *((__global char16 *)in_bytes);
    // src2 already is a char16, so it can be combined with the loaded bytes directly.
    *((__global char16 *)out_bytes) = pixel | src2;
}
#if defined (DOUBLE_SUPPORT)
// Bitwise OR of every 32-byte element of src1 with the 32-byte constant src2.
// The element is processed as four 8-byte short4 pieces at byte offsets 0, 8, 16 and 24.
//   src1 / dst        : base pointers; they are viewed as char pointers before the
//                       steps and offsets are added, so those are byte counts
//   src2              : bit pattern OR-ed into every element (16 shorts = 32 bytes)
//   rows / cols       : extent of the processed region, one work-item per element
//   dst_step1         : not used by this kernel
__kernel void arithm_s_bitwise_or_C4_D6 (
    __global short *src1, int src1_step, int src1_offset,
    __global short *dst, int dst_step, int dst_offset,
    short16 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items outside the cols x rows region have nothing to do.
    if (col >= cols || row >= rows)
        return;

    // One element is 32 bytes wide, hence the column index is scaled by 32.
    const int col_bytes = col << 5;
    __global char *in_bytes  = (__global char *)src1 + mad24(row, src1_step, col_bytes + src1_offset);
    __global char *out_bytes = (__global char *)dst + mad24(row, dst_step, col_bytes + dst_offset);

    // Read all four pieces before anything is written.
    const short4 in_0 = *((__global short4 *)(in_bytes + 0));
    const short4 in_1 = *((__global short4 *)(in_bytes + 8));
    const short4 in_2 = *((__global short4 *)(in_bytes + 16));
    const short4 in_3 = *((__global short4 *)(in_bytes + 24));

    // Matching quarters of the scalar operand.
    const short4 out_0 = in_0 | src2.s0123;
    const short4 out_1 = in_1 | src2.s4567;
    const short4 out_2 = in_2 | src2.s89ab;
    const short4 out_3 = in_3 | src2.scdef;

    *((__global short4 *)(out_bytes + 0))  = out_0;
    *((__global short4 *)(out_bytes + 8))  = out_1;
    *((__global short4 *)(out_bytes + 16)) = out_2;
    *((__global short4 *)(out_bytes + 24)) = out_3;
}
#endif

File diff suppressed because it is too large Load Diff

@ -1,340 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jiang Liyuan, jlyuan001.good@163.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_xor without mask**************************************/
// Element-wise XOR of two 8-bit unsigned buffers: dst = src1 ^ src2.
// Every work-item handles one 4-byte-aligned group of four dst bytes. Lanes of that
// group that fall outside [dst_start, dst_end) keep the value already stored in dst.
//   *_step / *_offset : added to the uchar base pointers, i.e. byte counts
//   dst_step1         : added to dst_offset to form the end of the writable span of a row
//   rows / cols       : bounds for get_global_id(1) / get_global_id(0)
__kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                     __global uchar *src2, int src2_step, int src2_offset,
                                     __global uchar *dst, int dst_step, int dst_offset,
                                     int rows, int cols, int dst_step1)
{
    const int gx = get_global_id(0);
    const int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    // First of the four bytes covered by this work-item.
    const int x4 = gx << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
    // The sources are shifted back by the misalignment of dst so that lane k of the
    // loaded vectors lines up with lane k of the aligned dst group.
    const int in1 = mad24(gy, src1_step, x4 + src1_offset - dst_align);
    const int in2 = mad24(gy, src2_step, x4 + src2_offset - dst_align);

    const int dst_start = mad24(gy, dst_step, dst_offset);
    const int dst_end   = mad24(gy, dst_step, dst_offset + dst_step1);
    const int out       = mad24(gy, dst_step, (dst_offset + x4) & (int)0xfffffffc);

    // A negative source index is clamped to 0 for the load ...
    uchar4 a = vload4(0, src1 + max(in1, 0));
    uchar4 b = vload4(0, src2 + max(in2, 0));

    // ... and the lanes are rotated afterwards so element 0 lands in the lane it
    // would have occupied had the load started at the negative index.
    if (in1 < 0)
    {
        if (in1 == -1)
            a = a.wxyz;
        else if (in1 == -2)
            a = a.zwxy;
        else
            a = a.yzwx;
    }
    if (in2 < 0)
    {
        if (in2 == -1)
            b = b.wxyz;
        else if (in2 == -2)
            b = b.zwxy;
        else
            b = b.yzwx;
    }

    __global uchar4 *out_ptr = (__global uchar4 *)(dst + out);
    uchar4 merged = *out_ptr;
    const uchar4 result = a ^ b;

    // Replace only the lanes that lie inside the writable span of the row.
    if ((out + 0 >= dst_start) && (out + 0 < dst_end)) merged.x = result.x;
    if ((out + 1 >= dst_start) && (out + 1 < dst_end)) merged.y = result.y;
    if ((out + 2 >= dst_start) && (out + 2 < dst_end)) merged.z = result.z;
    if ((out + 3 >= dst_start) && (out + 3 < dst_end)) merged.w = result.w;

    *out_ptr = merged;
}
// Element-wise XOR of two 8-bit signed buffers: dst = src1 ^ src2.
// Every work-item handles one 4-byte-aligned group of four dst bytes. Lanes of that
// group that fall outside [dst_start, dst_end) keep the value already stored in dst.
//   *_step / *_offset : added to the char base pointers, i.e. byte counts
//   dst_step1         : added to dst_offset to form the end of the writable span of a row
//   rows / cols       : bounds for get_global_id(1) / get_global_id(0)
__kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src1_offset,
                                     __global char *src2, int src2_step, int src2_offset,
                                     __global char *dst, int dst_step, int dst_offset,
                                     int rows, int cols, int dst_step1)
{
    const int gx = get_global_id(0);
    const int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    // First of the four bytes covered by this work-item.
    const int x4 = gx << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
    // The sources are shifted back by the misalignment of dst so that lane k of the
    // loaded vectors lines up with lane k of the aligned dst group.
    const int in1 = mad24(gy, src1_step, x4 + src1_offset - dst_align);
    const int in2 = mad24(gy, src2_step, x4 + src2_offset - dst_align);

    const int dst_start = mad24(gy, dst_step, dst_offset);
    const int dst_end   = mad24(gy, dst_step, dst_offset + dst_step1);
    const int out       = mad24(gy, dst_step, (dst_offset + x4) & (int)0xfffffffc);

    // A negative source index is clamped to 0 for the load ...
    char4 a = vload4(0, src1 + max(in1, 0));
    char4 b = vload4(0, src2 + max(in2, 0));

    // ... and the lanes are rotated afterwards so element 0 lands in the lane it
    // would have occupied had the load started at the negative index.
    if (in1 < 0)
    {
        if (in1 == -1)
            a = a.wxyz;
        else if (in1 == -2)
            a = a.zwxy;
        else
            a = a.yzwx;
    }
    if (in2 < 0)
    {
        if (in2 == -1)
            b = b.wxyz;
        else if (in2 == -2)
            b = b.zwxy;
        else
            b = b.yzwx;
    }

    __global char4 *out_ptr = (__global char4 *)(dst + out);
    char4 merged = *out_ptr;
    const char4 result = a ^ b;

    // Replace only the lanes that lie inside the writable span of the row.
    if ((out + 0 >= dst_start) && (out + 0 < dst_end)) merged.x = result.x;
    if ((out + 1 >= dst_start) && (out + 1 < dst_end)) merged.y = result.y;
    if ((out + 2 >= dst_start) && (out + 2 < dst_end)) merged.z = result.z;
    if ((out + 3 >= dst_start) && (out + 3 < dst_end)) merged.w = result.w;

    *out_ptr = merged;
}
// Element-wise XOR of two 16-bit unsigned buffers: dst = src1 ^ src2.
// Every work-item handles one 8-byte-aligned group of four 16-bit dst values. Lanes of
// that group that fall outside [dst_start, dst_end) keep the value already stored in dst.
//   *_step / *_offset : applied after viewing the pointers as char pointers, i.e. byte counts
//   dst_step1         : added to dst_offset to form the end of the writable span of a row
//   rows / cols       : bounds for get_global_id(1) / get_global_id(0)
__kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                     __global ushort *src2, int src2_step, int src2_offset,
                                     __global ushort *dst, int dst_step, int dst_offset,
                                     int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        // First of the four elements covered by this work-item.
        x = x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
        // Byte indices. The sources are shifted back by the element misalignment of dst
        // so that lane k of the loaded vectors lines up with lane k of the dst group.
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);

        // A negative source index is clamped to 0 for the load.
        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));

        // Rotate the clamped load so element 0 lands in the lane it would have occupied
        // had the load started at the negative index. The index is in BYTES and elements
        // are 2 bytes wide, so -2 / -4 / -6 mean a shift of 1 / 2 / 3 lanes. (Comparing
        // against -1 and -2, as the 8-bit kernels do, picks the wrong rotation here.)
        if (src1_index < 0)
        {
            if (src1_index == -2)
                src1_data = src1_data.wxyz;
            else if (src1_index == -4)
                src1_data = src1_data.zwxy;
            else
                src1_data = src1_data.yzwx;
        }
        if (src2_index < 0)
        {
            if (src2_index == -2)
                src2_data = src2_data.wxyz;
            else if (src2_index == -4)
                src2_data = src2_data.zwxy;
            else
                src2_data = src2_data.yzwx;
        }

        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
        ushort4 tmp_data = src1_data ^ src2_data;

        // Replace only the lanes whose byte position lies inside the writable span.
        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
    }
}
// Element-wise XOR of two 16-bit signed buffers: dst = src1 ^ src2.
// Every work-item handles one 8-byte-aligned group of four 16-bit dst values. Lanes of
// that group that fall outside [dst_start, dst_end) keep the value already stored in dst.
//   *_step / *_offset : applied after viewing the pointers as char pointers, i.e. byte counts
//   dst_step1         : added to dst_offset to form the end of the writable span of a row
//   rows / cols       : bounds for get_global_id(1) / get_global_id(0)
__kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int src1_offset,
                                     __global short *src2, int src2_step, int src2_offset,
                                     __global short *dst, int dst_step, int dst_offset,
                                     int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        // First of the four elements covered by this work-item.
        x = x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
        // Byte indices. The sources are shifted back by the element misalignment of dst
        // so that lane k of the loaded vectors lines up with lane k of the dst group.
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);

        // A negative source index is clamped to 0 for the load.
        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));

        // Rotate the clamped load so element 0 lands in the lane it would have occupied
        // had the load started at the negative index. The index is in BYTES and elements
        // are 2 bytes wide, so -2 / -4 / -6 mean a shift of 1 / 2 / 3 lanes. (Comparing
        // against -1 and -2, as the 8-bit kernels do, picks the wrong rotation here.)
        if (src1_index < 0)
        {
            if (src1_index == -2)
                src1_data = src1_data.wxyz;
            else if (src1_index == -4)
                src1_data = src1_data.zwxy;
            else
                src1_data = src1_data.yzwx;
        }
        if (src2_index < 0)
        {
            if (src2_index == -2)
                src2_data = src2_data.wxyz;
            else if (src2_index == -4)
                src2_data = src2_data.zwxy;
            else
                src2_data = src2_data.yzwx;
        }

        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
        short4 tmp_data = src1_data ^ src2_data;

        // Replace only the lanes whose byte position lies inside the writable span.
        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
    }
}
// Element-wise XOR of two 32-bit integer buffers: dst = src1 ^ src2.
// One work-item handles exactly one 4-byte element.
//   *_step / *_offset : applied after viewing the pointers as char pointers, i.e. byte counts
//   rows / cols       : bounds for get_global_id(1) / get_global_id(0)
//   dst_step1         : not used by this kernel
__kernel void arithm_bitwise_xor_D4 (__global int *src1, int src1_step, int src1_offset,
                                     __global int *src2, int src2_step, int src2_offset,
                                     __global int *dst, int dst_step, int dst_offset,
                                     int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (col >= cols || row >= rows)
        return;

    // 4 bytes per element.
    const int col_bytes = col << 2;
    __global char *lhs_bytes = (__global char *)src1 + mad24(row, src1_step, col_bytes + src1_offset);
    __global char *rhs_bytes = (__global char *)src2 + mad24(row, src2_step, col_bytes + src2_offset);
    __global char *out_bytes = (__global char *)dst + mad24(row, dst_step, col_bytes + dst_offset);

    const int lhs = *((__global int *)lhs_bytes);
    const int rhs = *((__global int *)rhs_bytes);
    *((__global int *)out_bytes) = lhs ^ rhs;
}
// Element-wise XOR of two buffers of 4-byte elements: dst = src1 ^ src2.
// The elements are handled as raw bytes (char4), so only the bit pattern matters.
//   *_step / *_offset : added to the char base pointers, i.e. byte counts
//   rows / cols       : bounds for get_global_id(1) / get_global_id(0)
//   dst_step1         : not used by this kernel
__kernel void arithm_bitwise_xor_D5 (__global char *src1, int src1_step, int src1_offset,
                                     __global char *src2, int src2_step, int src2_offset,
                                     __global char *dst, int dst_step, int dst_offset,
                                     int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (col >= cols || row >= rows)
        return;

    // 4 bytes per element.
    const int col_bytes = col << 2;
    __global char *lhs_bytes = (__global char *)src1 + mad24(row, src1_step, col_bytes + src1_offset);
    __global char *rhs_bytes = (__global char *)src2 + mad24(row, src2_step, col_bytes + src2_offset);
    __global char *out_bytes = (__global char *)dst + mad24(row, dst_step, col_bytes + dst_offset);

    const char4 lhs = *((__global char4 *)lhs_bytes);
    const char4 rhs = *((__global char4 *)rhs_bytes);
    *((__global char4 *)out_bytes) = lhs ^ rhs;
}
#if defined (DOUBLE_SUPPORT)
// Element-wise XOR of two buffers of 8-byte elements: dst = src1 ^ src2.
// The elements are handled as raw bytes (char8), so only the bit pattern matters.
//   *_step / *_offset : added to the char base pointers, i.e. byte counts
//   rows / cols       : bounds for get_global_id(1) / get_global_id(0)
//   dst_step1         : not used by this kernel
__kernel void arithm_bitwise_xor_D6 (__global char *src1, int src1_step, int src1_offset,
                                     __global char *src2, int src2_step, int src2_offset,
                                     __global char *dst, int dst_step, int dst_offset,
                                     int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (col >= cols || row >= rows)
        return;

    // 8 bytes per element.
    const int col_bytes = col << 3;
    __global char *lhs_bytes = (__global char *)src1 + mad24(row, src1_step, col_bytes + src1_offset);
    __global char *rhs_bytes = (__global char *)src2 + mad24(row, src2_step, col_bytes + src2_offset);
    __global char *out_bytes = (__global char *)dst + mad24(row, dst_step, col_bytes + dst_offset);

    const char8 lhs = *((__global char8 *)lhs_bytes);
    const char8 rhs = *((__global char8 *)rhs_bytes);
    *((__global char8 *)out_bytes) = lhs ^ rhs;
}
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -79,15 +79,73 @@
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
#endif
// Sizes the __local scratch rows of the box-filter kernels and bounds the local ids
// that are allowed to write output.
// NOTE(review): presumably equal to the work-group size along dimension 0 - confirm
// against the host-side launch code.
#define THREADS 256
// Yields elem1 when l_edge <= i < r_edge, otherwise elem2.
// NOTE(review): the expansion is not wrapped in parentheses, so parenthesize each use
// when combining it with other operators.
#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2)
// Horizontal pass of the single-channel 8-bit box filter for one work-item.
// Sums 2*anX+1 neighbouring uint4 groups from the __local row `temp`, divides the sum by
// alpha (truncated to an unsigned integer) and stores up to four bytes at dst[0..3].
//   dst                   : address of the first of the four output bytes of this work-item
//   temp                  : __local row of column sums, read as a flat uint array
//   dst_rows / dst_cols   : bounds used to suppress writes outside the destination
//   dst_startX, dst_x_off : used to derive the column position (posX) of dst[0]
//   alpha                 : divisor applied to the accumulated sum
// anX, ksX and THREADS are compile-time macros supplied outside this function.
// NOTE(review): the row bound check uses only get_group_id(1) << 1, independent of which
// row `dst` actually points to - confirm callers that pass a pointer to the following row.
inline void update_dst_C1_D0(__global uchar *dst, __local uint* temp,
                             int dst_rows, int dst_cols,
                             int dst_startX, int dst_x_off,
                             float alpha)
{
    const size_t lid = get_local_id(0);

    // Only local ids in [anX, THREADS-ksX+anX] produce output.
    if (lid < anX || lid >= (THREADS-ksX+anX+1))
        return;

    const int posX = dst_startX - dst_x_off + (lid-anX)*4;
    const int posY = (get_group_id(1) << 1);

    // vload4(lid, temp + k) reads the four uints starting at temp[4*lid + k], so stepping
    // k slides the window by one column at a time.
    uint4 acc = 0;
    for (int k = -anX; k <= anX; ++k)
        acc += vload4(lid, temp + k);

    if (!(posY < dst_rows && posX < dst_cols))
        return;

    acc = acc / (uint4) alpha;

    // Each of the four columns is written only when it lies inside [0, dst_cols).
    if (posX >= 0 && posX < dst_cols)
        dst[0] = acc.x;
    if (posX+1 >= 0 && posX+1 < dst_cols)
        dst[1] = acc.y;
    if (posX+2 >= 0 && posX+2 < dst_cols)
        dst[2] = acc.z;
    if (posX+3 >= 0 && posX+3 < dst_cols)
        dst[3] = acc.w;
}
// Horizontal pass of the four-channel 8-bit box filter for one work-item.
// Sums 2*anX+1 consecutive uint4 entries of the __local row `temp`, starting at this
// work-item's local id, divides the sum by alpha in floating point and stores one uchar4.
//   dst                   : address of the output pixel of this work-item
//   temp                  : __local row of per-column sums
//   dst_rows / dst_cols   : bounds used to suppress writes outside the destination
//   dst_startX, dst_x_off : used to derive the column position (posX) of the output pixel
//   alpha                 : divisor applied to the accumulated sum
// anX, ksX and THREADS are compile-time macros supplied outside this function.
// NOTE(review): the row bound check uses only get_group_id(1) << 1, independent of which
// row `dst` actually points to - confirm callers that pass a pointer to the following row.
inline void update_dst_C4_D0(__global uchar4 *dst, __local uint4* temp,
                             int dst_rows, int dst_cols,
                             int dst_startX, int dst_x_off,
                             float alpha)
{
    const size_t lid = get_local_id(0);

    // Local ids at or beyond THREADS-ksX+1 produce no output.
    if (lid >= (THREADS-ksX+1))
        return;

    const int posX = dst_startX - dst_x_off + lid;
    const int posY = (get_group_id(1) << 1);

    // Window temp[lid] .. temp[lid + 2*anX].
    uint4 acc = 0;
    for (int k = 0; k <= 2*anX; ++k)
        acc += temp[lid + k];

    const bool col_ok = posX >= 0 && posX < dst_cols;
    const bool row_ok = posY >= 0 && posY < dst_rows;
    if (col_ok && row_ok)
        *dst = convert_uchar4(convert_float4(acc)/alpha);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
#define THREADS 256
#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2)
__kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global uchar *dst, float alpha,
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step
)
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step
)
{
int col = get_local_id(0);
@ -105,115 +163,84 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
int dst_startY = (gY << 1) + dst_y_off;
uint4 data[ksY+1];
__local uint4 temp[(THREADS<<1)];
__local uint4 temp[2][THREADS];
#ifdef BORDER_CONSTANT
for(int i=0; i < ksY+1; i++)
for(int i=0; i < ksY+1; i++)
{
if(startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3<src_whole_cols)
{
if(startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3<src_whole_cols)
data[i] = convert_uint4(vload4(col,(__global uchar*)(src+(startY+i)*src_step + startX)));
else
{
data[i]=0;
int con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4<src_whole_cols;
if(con)data[i].s0 = *(src+(startY+i)*src_step + startX + col*4);
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+1 >=0 && startX+col*4+1<src_whole_cols;
if(con)data[i].s1 = *(src+(startY+i)*src_step + startX + col*4+1) ;
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+2 >=0 && startX+col*4+2<src_whole_cols;
if(con)data[i].s2 = *(src+(startY+i)*src_step + startX + col*4+2);
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+3 >=0 && startX+col*4+3<src_whole_cols;
if(con)data[i].s3 = *(src+(startY+i)*src_step + startX + col*4+3);
}
data[i].x = *(src+(startY+i)*src_step + startX + col * 4);
data[i].y = *(src+(startY+i)*src_step + startX + col * 4 + 1);
data[i].z = *(src+(startY+i)*src_step + startX + col * 4 + 2);
data[i].w = *(src+(startY+i)*src_step + startX + col * 4 + 3);
}
else
{
data[i]=0;
int con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4<src_whole_cols;
if(con)data[i].s0 = *(src+(startY+i)*src_step + startX + col*4);
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+1 >=0 && startX+col*4+1<src_whole_cols;
if(con)data[i].s1 = *(src+(startY+i)*src_step + startX + col*4+1) ;
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+2 >=0 && startX+col*4+2<src_whole_cols;
if(con)data[i].s2 = *(src+(startY+i)*src_step + startX + col*4+2);
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+3 >=0 && startX+col*4+3<src_whole_cols;
if(con)data[i].s3 = *(src+(startY+i)*src_step + startX + col*4+3);
}
}
#else
int not_all_in_range;
for(int i=0; i < ksY+1; i++)
{
not_all_in_range = (startX+col*4<0) | (startX+col*4+3>src_whole_cols-1)
| (startY+i<0) | (startY+i>src_whole_rows-1);
if(not_all_in_range)
{
int selected_row;
int4 selected_col;
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
selected_col.x = ADDR_L(startX+col*4, 0, src_whole_cols);
selected_col.x = ADDR_R(startX+col*4, src_whole_cols, selected_col.x);
selected_col.y = ADDR_L(startX+col*4+1, 0, src_whole_cols);
selected_col.y = ADDR_R(startX+col*4+1, src_whole_cols, selected_col.y);
selected_col.z = ADDR_L(startX+col*4+2, 0, src_whole_cols);
selected_col.z = ADDR_R(startX+col*4+2, src_whole_cols, selected_col.z);
selected_col.w = ADDR_L(startX+col*4+3, 0, src_whole_cols);
selected_col.w = ADDR_R(startX+col*4+3, src_whole_cols, selected_col.w);
data[i].x = *(src + selected_row * src_step + selected_col.x);
data[i].y = *(src + selected_row * src_step + selected_col.y);
data[i].z = *(src + selected_row * src_step + selected_col.z);
data[i].w = *(src + selected_row * src_step + selected_col.w);
}
else
{
data[i] = convert_uint4(vload4(col,(__global uchar*)(src+(startY+i)*src_step + startX)));
}
}
#endif
uint4 sum0 = 0, sum1 = 0, sum2 = 0;
for(int i=1; i < ksY; i++)
int not_all_in_range;
for(int i=0; i < ksY+1; i++)
{
sum0 += (data[i]);
}
sum1 = sum0 + (data[0]);
sum2 = sum0 + (data[ksY]);
not_all_in_range = (startX+col*4<0) | (startX+col*4+3>src_whole_cols-1)
| (startY+i<0) | (startY+i>src_whole_rows-1);
if(not_all_in_range)
{
int selected_row;
int4 selected_col;
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
temp[col] = sum1;
temp[col+THREADS] = sum2;
barrier(CLK_LOCAL_MEM_FENCE);
selected_col.x = ADDR_L(startX+col*4, 0, src_whole_cols);
selected_col.x = ADDR_R(startX+col*4, src_whole_cols, selected_col.x);
if(col >= anX && col < (THREADS-ksX+anX+1))
{
int posX = dst_startX - dst_x_off + (col-anX)*4;
int posY = (gY << 1);
uint4 tmp_sum1=0, tmp_sum2=0;
for(int i=-anX; i<=anX; i++)
{
tmp_sum1 += vload4(col, (__local uint*)temp+i);
}
selected_col.y = ADDR_L(startX+col*4+1, 0, src_whole_cols);
selected_col.y = ADDR_R(startX+col*4+1, src_whole_cols, selected_col.y);
for(int i=-anX; i<=anX; i++)
{
tmp_sum2 += vload4(col, (__local uint*)(temp+THREADS)+i);
}
selected_col.z = ADDR_L(startX+col*4+2, 0, src_whole_cols);
selected_col.z = ADDR_R(startX+col*4+2, src_whole_cols, selected_col.z);
if(posY < dst_rows && posX < dst_cols)
{
if(posX >= 0 && posX < dst_cols)
*(dst+dst_startY * dst_step + dst_startX + (col-anX)*4) = tmp_sum1.x/alpha;
if(posX+1 >= 0 && posX+1 < dst_cols)
*(dst+dst_startY * dst_step + dst_startX+1 + (col-anX)*4) = tmp_sum1.y/alpha;
if(posX+2 >= 0 && posX+2 < dst_cols)
*(dst+dst_startY * dst_step + dst_startX+2 + (col-anX)*4) = tmp_sum1.z/alpha;
if(posX+3 >= 0 && posX+3 < dst_cols)
*(dst+dst_startY * dst_step + dst_startX+3 + (col-anX)*4) = tmp_sum1.w/alpha;
selected_col.w = ADDR_L(startX+col*4+3, 0, src_whole_cols);
selected_col.w = ADDR_R(startX+col*4+3, src_whole_cols, selected_col.w);
data[i].x = *(src + selected_row * src_step + selected_col.x);
data[i].y = *(src + selected_row * src_step + selected_col.y);
data[i].z = *(src + selected_row * src_step + selected_col.z);
data[i].w = *(src + selected_row * src_step + selected_col.w);
}
if(posY+1 < dst_rows && posX < dst_cols)
else
{
dst_startY+=1;
if(posX >= 0 && posX < dst_cols)
*(dst+dst_startY * dst_step + dst_startX + (col-anX)*4) = tmp_sum2.x/alpha;
if(posX+1 >= 0 && posX+1 < dst_cols)
*(dst+dst_startY * dst_step + dst_startX+1 + (col-anX)*4) = tmp_sum2.y/alpha;
if(posX+2 >= 0 && posX+2 < dst_cols)
*(dst+dst_startY * dst_step + dst_startX+2 + (col-anX)*4) = tmp_sum2.z/alpha;
if(posX+3 >= 0 && posX+3 < dst_cols)
*(dst+dst_startY * dst_step + dst_startX+3 + (col-anX)*4) = tmp_sum2.w/alpha;
data[i] = convert_uint4(vload4(col,(__global uchar*)(src+(startY+i)*src_step + startX)));
}
}
#endif
uint4 tmp_sum = 0;
for(int i=1; i < ksY; i++)
{
tmp_sum += (data[i]);
}
int index = dst_startY * dst_step + dst_startX + (col-anX)*4;
temp[0][col] = tmp_sum + (data[0]);
temp[1][col] = tmp_sum + (data[ksY]);
barrier(CLK_LOCAL_MEM_FENCE);
update_dst_C1_D0(dst+index, (__local uint *)(temp[0]),
dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
update_dst_C1_D0(dst+index+dst_step, (__local uint *)(temp[1]),
dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
}
@ -221,9 +248,9 @@ __kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global ucha
/////////////////////////////////////////8uC4////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uchar4 *dst, float alpha,
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step
)
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step
)
{
int col = get_local_id(0);
const int gX = get_group_id(0);
@ -238,81 +265,63 @@ __kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uch
int startY = (gY << 1) - anY + src_y_off;
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
//int end_addr = (src_whole_rows-1)*(src_step>>2) + src_whole_cols-4;
int end_addr = src_whole_cols-4;
uint4 data[ksY+1];
__local uint4 temp[2][THREADS];
#ifdef BORDER_CONSTANT
bool con;
uint4 ss;
for(int i=0; i < ksY+1; i++)
{
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
//int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr);
//ss = convert_uint4(src[cur_addr]);
int cur_col = clamp(startX + col, 0, src_whole_cols);
if(con)
ss = convert_uint4(src[(startY+i)*(src_step>>2) + cur_col]);
data[i] = con ? ss : 0;
data[i].x = con ? src[(startY+i)*(src_step>>2) + cur_col].x : 0;
data[i].y = con ? src[(startY+i)*(src_step>>2) + cur_col].y : 0;
data[i].z = con ? src[(startY+i)*(src_step>>2) + cur_col].z : 0;
data[i].w = con ? src[(startY+i)*(src_step>>2) + cur_col].w : 0;
}
#else
for(int i=0; i < ksY+1; i++)
{
int selected_row;
int selected_col;
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
for(int i=0; i < ksY+1; i++)
{
int selected_row;
int selected_col;
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
data[i] = convert_uint4(src[selected_row * (src_step>>2) + selected_col]);
}
data[i] = convert_uint4(src[selected_row * (src_step>>2) + selected_col]);
}
#endif
uint4 sum0 = 0, sum1 = 0, sum2 = 0;
uint4 tmp_sum = 0;
for(int i=1; i < ksY; i++)
{
sum0 += (data[i]);
tmp_sum += (data[i]);
}
sum1 = sum0 + (data[0]);
sum2 = sum0 + (data[ksY]);
temp[0][col] = sum1;
temp[1][col] = sum2;
barrier(CLK_LOCAL_MEM_FENCE);
if(col < (THREADS-(ksX-1)))
{
col += anX;
int posX = dst_startX - dst_x_off + col - anX;
int posY = (gY << 1);
uint4 tmp_sum[2]={(uint4)(0,0,0,0),(uint4)(0,0,0,0)};
for(int k=0; k<2; k++)
for(int i=-anX; i<=anX; i++)
{
tmp_sum[k] += temp[k][col+i];
}
for(int i=0; i<2; i++)
{
if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows)
dst[(dst_startY+i) * (dst_step>>2)+ dst_startX + col - anX] = convert_uchar4(convert_float4(tmp_sum[i])/alpha);
}
int index = dst_startY * (dst_step>>2)+ dst_startX + col;
temp[0][col] = tmp_sum + (data[0]);
temp[1][col] = tmp_sum + (data[ksY]);
barrier(CLK_LOCAL_MEM_FENCE);
update_dst_C4_D0(dst+index, (__local uint4 *)(temp[0]),
dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
update_dst_C4_D0(dst+index+(dst_step>>2), (__local uint4 *)(temp[1]),
dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////32fC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void boxFilter_C1_D5(__global const float *restrict src, __global float *dst, float alpha,
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step
)
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step
)
{
int col = get_local_id(0);
const int gX = get_group_id(0);
@ -327,7 +336,6 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float
int startY = (gY << 1) - anY + src_y_off;
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
int end_addr = (src_whole_rows-1)*(src_step>>2) + src_whole_cols-4;
float data[ksY+1];
__local float temp[2][THREADS];
#ifdef BORDER_CONSTANT
@ -336,28 +344,25 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float
for(int i=0; i < ksY+1; i++)
{
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
//int cur_addr = clamp((startY+i)*(src_step>>2)+(startX+col),0,end_addr);
//ss = src[cur_addr];
int cur_col = clamp(startX + col, 0, src_whole_cols);
//ss = src[(startY+i)*(src_step>>2) + cur_col];
ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>2) + cur_col]:0;
ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>2) + cur_col]:(float)0;
data[i] = con ? ss : 0.f;
}
#else
for(int i=0; i < ksY+1; i++)
{
int selected_row;
int selected_col;
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
for(int i=0; i < ksY+1; i++)
{
int selected_row;
int selected_col;
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
data[i] = src[selected_row * (src_step>>2) + selected_col];
}
data[i] = src[selected_row * (src_step>>2) + selected_col];
}
#endif
float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
@ -376,7 +381,7 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float
int posX = dst_startX - dst_x_off + col - anX;
int posY = (gY << 1);
float tmp_sum[2]={0.0, 0.0};
float tmp_sum[2]= {0.0, 0.0};
for(int k=0; k<2; k++)
for(int i=-anX; i<=anX; i++)
{
@ -395,9 +400,9 @@ __kernel void boxFilter_C1_D5(__global const float *restrict src, __global float
/////////////////////////////////////////32fC4////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global float4 *dst, float alpha,
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step
)
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step
)
{
int col = get_local_id(0);
const int gX = get_group_id(0);
@ -412,7 +417,6 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa
int startY = (gY << 1) - anY + src_y_off;
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
int end_addr = (src_whole_rows-1)*(src_step>>4) + src_whole_cols-16;
float4 data[ksY+1];
__local float4 temp[2][THREADS];
#ifdef BORDER_CONSTANT
@ -421,28 +425,25 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa
for(int i=0; i < ksY+1; i++)
{
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
//int cur_addr = clamp((startY+i)*(src_step>>4)+(startX+col),0,end_addr);
//ss = src[cur_addr];
int cur_col = clamp(startX + col, 0, src_whole_cols);
//ss = src[(startY+i)*(src_step>>4) + cur_col];
ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>4) + cur_col]:0;
ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>4) + cur_col]:(float4)0;
data[i] = con ? ss : (float4)(0.0,0.0,0.0,0.0);
}
#else
for(int i=0; i < ksY+1; i++)
{
int selected_row;
int selected_col;
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
for(int i=0; i < ksY+1; i++)
{
int selected_row;
int selected_col;
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
data[i] = src[selected_row * (src_step>>4) + selected_col];
}
data[i] = src[selected_row * (src_step>>4) + selected_col];
}
#endif
float4 sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
@ -461,7 +462,7 @@ __kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global floa
int posX = dst_startX - dst_x_off + col - anX;
int posY = (gY << 1);
float4 tmp_sum[2]={(float4)(0.0,0.0,0.0,0.0), (float4)(0.0,0.0,0.0,0.0)};
float4 tmp_sum[2]= {(float4)(0.0,0.0,0.0,0.0), (float4)(0.0,0.0,0.0,0.0)};
for(int k=0; k<2; k++)
for(int i=-anX; i<=anX; i++)
{

@ -112,7 +112,7 @@ typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
} GpuHidHaarClassifierCascade;
__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(//constant GpuHidHaarClassifierCascade * cascade,
__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(
global GpuHidHaarStageClassifier * stagecascadeptr,
global int4 * info,
global GpuHidHaarTreeNode * nodeptr,
@ -128,12 +128,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
const int splitnode,
const int4 p,
const int4 pq,
const float correction
//const int width,
//const int height,
//const int grpnumperline,
//const int totalgrp
)
const float correction)
{
int grpszx = get_local_size(0);
int grpszy = get_local_size(1);
@ -145,13 +140,8 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
int lcl_sz = mul24(grpszx,grpszy);
int lcl_id = mad24(lclidy,grpszx,lclidx);
//assume lcl_sz == 256 or 128 or 64
//int lcl_sz_shift = (lcl_sz == 256) ? 8 : 7;
//lcl_sz_shift = (lcl_sz == 64) ? 6 : lcl_sz_shift;
__local int lclshare[1024];
#define OFF 0
__local int* lcldata = lclshare + OFF;//for save win data
__local int* lcldata = lclshare;//for save win data
__local int* glboutindex = lcldata + 28*28;//for save global out index
__local int* lclcount = glboutindex + 1;//for save the numuber of temp pass pixel
__local int* lcloutindex = lclcount + 1;//for save info of temp pass pixel
@ -181,7 +171,6 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
int totalgrp = scaleinfo1.y & 0xffff;
int imgoff = scaleinfo1.z;
float factor = as_float(scaleinfo1.w);
//int ystep =1;// factor > 2.0 ? 1 : 2;
__global const int * sum = sum1 + imgoff;
__global const float * sqsum = sqsum1 + imgoff;
@ -191,8 +180,6 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
int grpidx = grploop - mul24(grpidy, grpnumperline);
int x = mad24(grpidx,grpszx,lclidx);
int y = mad24(grpidy,grpszy,lclidy);
//candidate_result.x = convert_int_rtn(x*factor);
//candidate_result.y = convert_int_rtn(y*factor);
int grpoffx = x-lclidx;
int grpoffy = y-lclidy;
@ -207,18 +194,11 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
int glb_x = grpoffx + (lcl_x<<2);
int glb_y = grpoffy + lcl_y;
int glb_off = mad24(glb_y,pixelstep,glb_x);
int glb_off = mad24(min(glb_y, height - 1),pixelstep,glb_x);
int4 data = *(__global int4*)&sum[glb_off];
int lcl_off = mad24(lcl_y, readwidth, lcl_x<<2);
#if OFF
lcldata[lcl_off] = data.x;
lcldata[lcl_off+1] = data.y;
lcldata[lcl_off+2] = data.z;
lcldata[lcl_off+3] = data.w;
#else
vstore4(data, 0, &lcldata[lcl_off]);
#endif
}
lcloutindex[lcl_id] = 0;
@ -231,184 +211,170 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
int lcl_off = mad24(lclidy,readwidth,lclidx);
int4 cascadeinfo1, cascadeinfo2;
cascadeinfo1 = p;
cascadeinfo2 = pq;// + mad24(y, pixelstep, x);
cascadeinfo2 = pq;
cascadeinfo1.x +=lcl_off;
cascadeinfo1.z +=lcl_off;
mean = (lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.x)] - lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.z)] -
lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.x)] + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.z)])
*correction;
//if((x < width) && (y < height))
{
cascadeinfo1.x +=lcl_off;
cascadeinfo1.z +=lcl_off;
mean = (lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.x)] - lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.z)] -
lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.x)] + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.z)])
*correction;
int p_offset = mad24(y, pixelstep, x);
cascadeinfo2.x +=p_offset;
cascadeinfo2.z +=p_offset;
variance_norm_factor =sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.x)] - sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.z)] -
sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.x)] + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.z)];
variance_norm_factor = variance_norm_factor * correction - mean * mean;
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f;
//if( cascade->is_stump_based )
//{
for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ )
{
float stage_sum = 0.f;
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
float stagethreshold = as_float(stageinfo.y);
for(int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++ )
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);
int p_offset = mad24(y, pixelstep, x);
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
cascadeinfo2.x +=p_offset;
cascadeinfo2.z +=p_offset;
variance_norm_factor =sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.x)] - sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.z)] -
sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.x)] + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.z)];
info1.x +=lcl_off;
info1.z +=lcl_off;
info2.x +=lcl_off;
info2.z +=lcl_off;
variance_norm_factor = variance_norm_factor * correction - mean * mean;
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f;
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ )
{
float stage_sum = 0.f;
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
float stagethreshold = as_float(stageinfo.y);
for(int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++ )
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
info1.x +=lcl_off;
info1.z +=lcl_off;
info2.x +=lcl_off;
info2.z +=lcl_off;
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
//if((info3.z - info3.x) && (!stageinfo.z))
//{
info3.x +=lcl_off;
info3.z +=lcl_off;
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
//}
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
nodecounter++;
}
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
result = (stage_sum >= stagethreshold);
}
info3.x +=lcl_off;
info3.z +=lcl_off;
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
if(result && (x < width) && (y < height))
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
nodecounter++;
}
result = (stage_sum >= stagethreshold);
}
if(result && (x < width) && (y < height))
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
}
barrier(CLK_LOCAL_MEM_FENCE);
int queuecount = lclcount[0];
barrier(CLK_LOCAL_MEM_FENCE);
nodecounter = splitnode;
for(int stageloop = split_stage; stageloop< end_stage && queuecount>0; stageloop++)
{
lclcount[0]=0;
barrier(CLK_LOCAL_MEM_FENCE);
int queuecount = lclcount[0];
barrier(CLK_LOCAL_MEM_FENCE);
nodecounter = splitnode;
for(int stageloop = split_stage; stageloop< end_stage && queuecount>0; stageloop++)
{
//barrier(CLK_LOCAL_MEM_FENCE);
//if(lcl_id == 0)
lclcount[0]=0;
barrier(CLK_LOCAL_MEM_FENCE);
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
float stagethreshold = as_float(stageinfo.y);
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
float stagethreshold = as_float(stageinfo.y);
int perfscale = queuecount > 4 ? 3 : 2;
int queuecount_loop = (queuecount + (1<<perfscale)-1) >> perfscale;
int lcl_compute_win = lcl_sz >> perfscale;
int lcl_compute_win_id = (lcl_id >>(6-perfscale));
int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale);
int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale));
for(int queueloop=0; queueloop<queuecount_loop/* && lcl_compute_win_id < queuecount*/; queueloop++)
int perfscale = queuecount > 4 ? 3 : 2;
int queuecount_loop = (queuecount + (1<<perfscale)-1) >> perfscale;
int lcl_compute_win = lcl_sz >> perfscale;
int lcl_compute_win_id = (lcl_id >>(6-perfscale));
int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale);
int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale));
for(int queueloop=0; queueloop<queuecount_loop; queueloop++)
{
float stage_sum = 0.f;
int temp_coord = lcloutindex[lcl_compute_win_id<<1];
float variance_norm_factor = as_float(lcloutindex[(lcl_compute_win_id<<1)+1]);
int queue_pixel = mad24(((temp_coord & (int)0xffff0000)>>16),readwidth,temp_coord & 0xffff);
if(lcl_compute_win_id < queuecount)
{
float stage_sum = 0.f;
int temp_coord = lcloutindex[lcl_compute_win_id<<1];
float variance_norm_factor = as_float(lcloutindex[(lcl_compute_win_id<<1)+1]);
int queue_pixel = mad24(((temp_coord & (int)0xffff0000)>>16),readwidth,temp_coord & 0xffff);
//barrier(CLK_LOCAL_MEM_FENCE);
if(lcl_compute_win_id < queuecount)
int tempnodecounter = lcl_compute_id;
float part_sum = 0.f;
for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x; lcl_loop++)
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter + tempnodecounter);
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x +=queue_pixel;
info1.z +=queue_pixel;
info2.x +=queue_pixel;
info2.z +=queue_pixel;
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
int tempnodecounter = lcl_compute_id;
float part_sum = 0.f;
for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x; lcl_loop++)
{
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter + tempnodecounter);
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2*)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x +=queue_pixel;
info1.z +=queue_pixel;
info2.x +=queue_pixel;
info2.z +=queue_pixel;
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
//if((info3.z - info3.x) && (!stageinfo.z))
//{
info3.x +=queue_pixel;
info3.z +=queue_pixel;
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
//}
part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
tempnodecounter +=lcl_compute_win;
}//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++)
partialsum[lcl_id]=part_sum;
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
info3.x +=queue_pixel;
info3.z +=queue_pixel;
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
part_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
tempnodecounter +=lcl_compute_win;
}//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++)
partialsum[lcl_id]=part_sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(lcl_compute_win_id < queuecount)
{
for(int i=0; i<lcl_compute_win && (lcl_compute_id==0); i++)
{
stage_sum += partialsum[lcl_id+i];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(lcl_compute_win_id < queuecount)
if(stage_sum >= stagethreshold && (lcl_compute_id==0))
{
for(int i=0; i<lcl_compute_win && (lcl_compute_id==0); i++)
{
stage_sum += partialsum[lcl_id+i];
}
if(stage_sum >= stagethreshold && (lcl_compute_id==0))
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex<<1] = temp_coord;
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
}
lcl_compute_win_id +=(1<<perfscale);
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex<<1] = temp_coord;
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
}
barrier(CLK_LOCAL_MEM_FENCE);
}//end for(int queueloop=0;queueloop<queuecount_loop;queueloop++)
//barrier(CLK_LOCAL_MEM_FENCE);
queuecount = lclcount[0];
lcl_compute_win_id +=(1<<perfscale);
}
barrier(CLK_LOCAL_MEM_FENCE);
nodecounter += stageinfo.x;
}//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++)
//barrier(CLK_LOCAL_MEM_FENCE);
if(lcl_id<queuecount)
{
int temp = lcloutindex[lcl_id<<1];
int x = mad24(grpidx,grpszx,temp & 0xffff);
int y = mad24(grpidy,grpszy,((temp & (int)0xffff0000) >> 16));
temp = glboutindex[0];
int4 candidate_result;
candidate_result.zw = (int2)convert_int_rtn(factor*20.f);
candidate_result.x = convert_int_rtn(x*factor);
candidate_result.y = convert_int_rtn(y*factor);
atomic_inc(glboutindex);
candidate[outputoff+temp+lcl_id] = candidate_result;
}
}//end for(int queueloop=0;queueloop<queuecount_loop;queueloop++)
queuecount = lclcount[0];
barrier(CLK_LOCAL_MEM_FENCE);
}//end if((x < width) && (y < height))
nodecounter += stageinfo.x;
}//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++)
if(lcl_id<queuecount)
{
int temp = lcloutindex[lcl_id<<1];
int x = mad24(grpidx,grpszx,temp & 0xffff);
int y = mad24(grpidy,grpszy,((temp & (int)0xffff0000) >> 16));
temp = glboutindex[0];
int4 candidate_result;
candidate_result.zw = (int2)convert_int_rtn(factor*20.f);
candidate_result.x = convert_int_rtn(x*factor);
candidate_result.y = convert_int_rtn(y*factor);
atomic_inc(glboutindex);
candidate[outputoff+temp+lcl_id] = candidate_result;
}
barrier(CLK_LOCAL_MEM_FENCE);
}//end for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
//outputoff +=mul24(width,height);
}//end for(int scalei = 0; scalei <loopcount; scalei++)
}

@ -16,6 +16,7 @@
//
// @Authors
// Wu Xinglong, wxl370@126.com
// Sen Liu, swjtuls1987@126.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
@ -52,11 +53,11 @@ typedef struct __attribute__((aligned(128))) GpuHidHaarFeature
{
struct __attribute__((aligned(32)))
{
int p0 __attribute__((aligned(4)));
int p1 __attribute__((aligned(4)));
int p2 __attribute__((aligned(4)));
int p3 __attribute__((aligned(4)));
float weight __attribute__((aligned(4)));
int p0 __attribute__((aligned(4)));
int p1 __attribute__((aligned(4)));
int p2 __attribute__((aligned(4)));
int p3 __attribute__((aligned(4)));
float weight __attribute__((aligned(4)));
}
rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned(32)));
}
@ -113,173 +114,168 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
global const int *restrict sum,
global const float *restrict sqsum,
global int4 *candidate,
const int rows,
const int cols,
const int step,
const int loopcount,
const int start_stage,
const int split_stage,
const int end_stage,
const int startnode,
const int splitnode,
global int4 *p,
//const int4 * pq,
global float *correction,
const int nodecount)
{
int grpszx = get_local_size(0);
int grpszy = get_local_size(1);
int grpnumx = get_num_groups(0);
int grpidx = get_group_id(0);
int lclidx = get_local_id(0);
int lclidy = get_local_id(1);
int lcl_sz = mul24(grpszx, grpszy);
int lcl_id = mad24(lclidy, grpszx, lclidx);
__local int lclshare[1024];
__local int *glboutindex = lclshare + 0;
__local int *lclcount = glboutindex + 1;
__local int *lcloutindex = lclcount + 1;
__local float *partialsum = (__local float *)(lcloutindex + (lcl_sz << 1));
glboutindex[0] = 0;
int outputoff = mul24(grpidx, 256);
candidate[outputoff + (lcl_id << 2)] = (int4)0;
candidate[outputoff + (lcl_id << 2) + 1] = (int4)0;
candidate[outputoff + (lcl_id << 2) + 2] = (int4)0;
candidate[outputoff + (lcl_id << 2) + 3] = (int4)0;
int grpszx = get_local_size(0);
int grpszy = get_local_size(1);
int grpnumx = get_num_groups(0);
int grpidx = get_group_id(0);
int lclidx = get_local_id(0);
int lclidy = get_local_id(1);
int lcl_sz = mul24(grpszx, grpszy);
int lcl_id = mad24(lclidy, grpszx, lclidx);
__local int glboutindex[1];
__local int lclcount[1];
__local int lcloutindex[64];
glboutindex[0] = 0;
int outputoff = mul24(grpidx, 256);
candidate[outputoff + (lcl_id << 2)] = (int4)0;
candidate[outputoff + (lcl_id << 2) + 1] = (int4)0;
candidate[outputoff + (lcl_id << 2) + 2] = (int4)0;
candidate[outputoff + (lcl_id << 2) + 3] = (int4)0;
int max_idx = rows * cols - 1;
for (int scalei = 0; scalei < loopcount; scalei++)
{
int4 scaleinfo1;
scaleinfo1 = info[scalei];
int width = (scaleinfo1.x & 0xffff0000) >> 16;
int height = scaleinfo1.x & 0xffff;
int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16;
int totalgrp = scaleinfo1.y & 0xffff;
float factor = as_float(scaleinfo1.w);
float correction_t = correction[scalei];
int ystep = (int)(max(2.0f, factor) + 0.5f);
for (int scalei = 0; scalei < loopcount; scalei++)
for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx)
{
int4 scaleinfo1;
scaleinfo1 = info[scalei];
int width = (scaleinfo1.x & 0xffff0000) >> 16;
int height = scaleinfo1.x & 0xffff;
int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16;
int totalgrp = scaleinfo1.y & 0xffff;
float factor = as_float(scaleinfo1.w);
float correction_t = correction[scalei];
int ystep = (int)(max(2.0f, factor) + 0.5f);
int4 cascadeinfo = p[scalei];
int grpidy = grploop / grpnumperline;
int grpidx = grploop - mul24(grpidy, grpnumperline);
int ix = mad24(grpidx, grpszx, lclidx);
int iy = mad24(grpidy, grpszy, lclidy);
int x = ix * ystep;
int y = iy * ystep;
lcloutindex[lcl_id] = 0;
lclcount[0] = 0;
int nodecounter;
float mean, variance_norm_factor;
//if((ix < width) && (iy < height))
{
const int p_offset = mad24(y, step, x);
cascadeinfo.x += p_offset;
cascadeinfo.z += p_offset;
mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)] - sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)] + sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)])
* correction_t;
variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)] - sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)] + sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)];
variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f;
bool result = true;
nodecounter = startnode + nodecount * scalei;
for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx)
for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++)
{
int4 cascadeinfo = p[scalei];
int grpidy = grploop / grpnumperline;
int grpidx = grploop - mul24(grpidy, grpnumperline);
int ix = mad24(grpidx, grpszx, lclidx);
int iy = mad24(grpidy, grpszy, lclidy);
int x = ix * ystep;
int y = iy * ystep;
lcloutindex[lcl_id] = 0;
lclcount[0] = 0;
int result = 1, nodecounter;
float mean, variance_norm_factor;
//if((ix < width) && (iy < height))
{
const int p_offset = mad24(y, step, x);
cascadeinfo.x += p_offset;
cascadeinfo.z += p_offset;
mean = (sum[mad24(cascadeinfo.y, step, cascadeinfo.x)] - sum[mad24(cascadeinfo.y, step, cascadeinfo.z)] -
sum[mad24(cascadeinfo.w, step, cascadeinfo.x)] + sum[mad24(cascadeinfo.w, step, cascadeinfo.z)])
* correction_t;
variance_norm_factor = sqsum[mad24(cascadeinfo.y, step, cascadeinfo.x)] - sqsum[mad24(cascadeinfo.y, step, cascadeinfo.z)] -
sqsum[mad24(cascadeinfo.w, step, cascadeinfo.x)] + sqsum[mad24(cascadeinfo.w, step, cascadeinfo.z)];
variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f;
result = 1;
nodecounter = startnode + nodecount * scalei;
for (int stageloop = start_stage; stageloop < end_stage && result; stageloop++)
{
float stage_sum = 0.f;
int4 stageinfo = *(global int4 *)(stagecascadeptr + stageloop);
float stagethreshold = as_float(stageinfo.y);
for (int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++)
{
__global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter);
int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4 *)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2 *)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x += p_offset;
info1.z += p_offset;
info2.x += p_offset;
info2.z += p_offset;
float classsum = (sum[mad24(info1.y, step, info1.x)] - sum[mad24(info1.y, step, info1.z)] -
sum[mad24(info1.w, step, info1.x)] + sum[mad24(info1.w, step, info1.z)]) * w.x;
classsum += (sum[mad24(info2.y, step, info2.x)] - sum[mad24(info2.y, step, info2.z)] -
sum[mad24(info2.w, step, info2.x)] + sum[mad24(info2.w, step, info2.z)]) * w.y;
info3.x += p_offset;
info3.z += p_offset;
classsum += (sum[mad24(info3.y, step, info3.x)] - sum[mad24(info3.y, step, info3.z)] -
sum[mad24(info3.w, step, info3.x)] + sum[mad24(info3.w, step, info3.z)]) * w.z;
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
nodecounter++;
}
result = (stage_sum >= stagethreshold);
}
float stage_sum = 0.f;
int stagecount = stagecascadeptr[stageloop].count;
for (int nodeloop = 0; nodeloop < stagecount; nodeloop++)
{
__global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter);
int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4 *)(&(currentnodeptr->weight[0]));
float2 alpha2 = *(__global float2 *)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;
info1.x += p_offset;
info1.z += p_offset;
info2.x += p_offset;
info2.z += p_offset;
float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)] - sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] -
sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)] + sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x;
classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)] - sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] -
sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)] + sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y;
info3.x += p_offset;
info3.z += p_offset;
classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)] - sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] -
sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)] + sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z;
stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
nodecounter++;
}
result = (bool)(stage_sum >= stagecascadeptr[stageloop].threshold);
}
if (result && (ix < width) && (iy < height))
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex << 1] = (y << 16) | x;
lcloutindex[(queueindex << 1) + 1] = as_int(variance_norm_factor);
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
int queuecount = lclcount[0];
nodecounter = splitnode + nodecount * scalei;
if (result && (ix < width) && (iy < height))
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex] = (y << 16) | x;
}
if (lcl_id < queuecount)
{
int temp = lcloutindex[lcl_id << 1];
int x = temp & 0xffff;
int y = (temp & (int)0xffff0000) >> 16;
temp = glboutindex[0];
int4 candidate_result;
candidate_result.zw = (int2)convert_int_rtn(factor * 20.f);
candidate_result.x = x;
candidate_result.y = y;
atomic_inc(glboutindex);
candidate[outputoff + temp + lcl_id] = candidate_result;
}
barrier(CLK_LOCAL_MEM_FENCE);
int queuecount = lclcount[0];
barrier(CLK_LOCAL_MEM_FENCE);
}
if (lcl_id < queuecount)
{
int temp = lcloutindex[lcl_id];
int x = temp & 0xffff;
int y = (temp & (int)0xffff0000) >> 16;
temp = atomic_inc(glboutindex);
int4 candidate_result;
candidate_result.zw = (int2)convert_int_rtn(factor * 20.f);
candidate_result.x = x;
candidate_result.y = y;
candidate[outputoff + temp + lcl_id] = candidate_result;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
}
// Rescales one Haar tree node and stores the result in `newnode`.
//
// Each work-item handles a single node: it reads orinode[get_global_id(0)],
// scales the node's three rectangles by `scale`, rebalances the first
// rectangle's weight, and writes the node to
// newnode[get_global_id(0) + nodenum].
//
// Parameters:
//   orinode      - source nodes, indexed by the work-item's global id.
//   newnode      - destination nodes, indexed by global id + nodenum.
//   scale        - factor applied to every rectangle component p[i][0..3].
//   weight_scale - factor applied to the three rectangle weights on output.
//   nodenum      - offset added to the global id to form the output index.
__kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, int nodenum)
{
    int counter = get_global_id(0);
    int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0;
    // Work on a private copy of the source node.
    GpuHidHaarTreeNode t1 = *(orinode + counter);

    // Scale each rectangle component; 0.5f is added before the truncating
    // cast to int.
#pragma unroll
    for (i = 0; i < 3; i++)
    {
        tr_x[i] = (int)(t1.p[i][0] * scale + 0.5f);
        tr_y[i] = (int)(t1.p[i][1] * scale + 0.5f);
        tr_w[i] = (int)(t1.p[i][2] * scale + 0.5f);
        tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f);
    }

    // Recompute the first weight from the scaled areas of the other
    // rectangles. The third rectangle only contributes when t1.p[2][0] is
    // non-zero.
    // NOTE(review): a zero scaled area for rectangle 0 makes the divisor zero;
    // confirm callers never produce that.
    t1.weight[0] = t1.p[2][0] ? -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]) : -t1.weight[1] * tr_h[1] * tr_w[1] / (tr_h[0] * tr_w[0]);

    // Switch from the input index to the output index (applied exactly once).
    counter += nodenum;

    // Output rectangles are stored as (x, y, x + w, y + h) of the scaled values.
#pragma unroll
    for (i = 0; i < 3; i++)
    {
        newnode[counter].p[i][0] = tr_x[i];
        newnode[counter].p[i][1] = tr_y[i];
        newnode[counter].p[i][2] = tr_x[i] + tr_w[i];
        newnode[counter].p[i][3] = tr_y[i] + tr_h[i];
        newnode[counter].weight[i] = t1.weight[i] * weight_scale;
    }

    // The remaining fields are copied through unchanged.
    newnode[counter].left = t1.left;
    newnode[counter].right = t1.right;
    newnode[counter].threshold = t1.threshold;
    newnode[counter].alpha[0] = t1.alpha[0];
    newnode[counter].alpha[1] = t1.alpha[1];
}

@ -60,7 +60,7 @@
#define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS)
kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float *sqsum,
kernel void integral_cols_D4(__global uchar4 *src,__global int *sum ,__global float *sqsum,
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
{
unsigned int lid = get_local_id(0);
@ -159,7 +159,7 @@ kernel void integral_cols(__global uchar4 *src,__global int *sum ,__global float
}
kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__global int *sum ,
kernel void integral_rows_D4(__global int4 *srcsum,__global float4 * srcsqsum,__global int *sum ,
__global float *sqsum,int rows,int cols,int src_step,int sum_step,
int sqsum_step,int sum_offset,int sqsum_offset)
{
@ -275,3 +275,219 @@ kernel void integral_rows(__global int4 *srcsum,__global float4 * srcsqsum,__glo
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// integral_cols_D5: column pass that produces float running sums and running
// squared sums from 8-bit source data.
//
// Each work-group loads two adjacent uchar4 elements per source row (indices
// gid*2 and gid*2 + 1, clamped to cols - 1), converts them to float4, and scans
// them down the rows in chunks of LSIZE_1 rows using local memory. The four
// float lanes of every scanned element are stored separately to sum / sqsum.
//
// Parameters:
//   src          source data, read as uchar4 elements
//   sum, sqsum   float outputs for the scanned values / scanned squared values
//   src_offset   offset added to every src index (in uchar4 elements)
//   pre_invalid  lanes with index gid*4+k < pre_invalid are never written, and
//                the output location is shifted back by pre_invalid * dst_step / 4
//   rows         number of source rows to scan
//   cols         used to clamp the uchar4 load index and to bound written lanes
//                (gid*4+k < cols + pre_invalid)
//   src_step     source row stride; shifted right by 2 before use
//                (presumably bytes -> uchar4 elements; TODO confirm with host)
//   dst_step     destination stride; used as dst_step / 4 per lane
//                (presumably bytes -> float elements; TODO confirm with host)
//
// NOTE(review): indexing with lid >> 7 and lid & 127 assumes a local size of
// LSIZE (256) work-items -- confirm against the host launch configuration.
kernel void integral_cols_D5(__global uchar4 *src,__global float *sum ,__global float *sqsum,
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
float4 src_t[2], sum_t[2];
float4 sqsum_t[2];
// Two scan buffers (one per loaded column element); the extra LOG_LSIZE slots
// hold the padding introduced by GET_CONFLICT_OFFSET.
__local float4 lm_sum[2][LSIZE + LOG_LSIZE];
__local float4 lm_sqsum[2][LSIZE + LOG_LSIZE];
__local float* sum_p;
__local float* sqsum_p;
src_step = src_step >> 2;
// From here on gid is the index of the first of the two uchar4 columns
// handled by this work-group.
gid = gid << 1;
// The loop advances by LSIZE_1 rows per iteration while LSIZE work-items load
// rows i .. i + LSIZE - 1.
for(int i = 0; i < rows; i =i + LSIZE_1)
{
// Load one row element per work-item; rows past the end contribute zero.
src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : (float4)0);
src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : (float4)0);
// Carry from the previous chunk: read the last scan slot before it is
// overwritten below (zero for the first chunk).
sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
// All work-items must have read the carry before local memory is reused.
barrier(CLK_LOCAL_MEM_FENCE);
// Padded local-memory slot of this work-item.
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
// src_t is already float4, so convert_float4 here leaves the product unchanged.
lm_sqsum[0][bf_loc] = convert_float4(src_t[0] * src_t[0]);
lm_sum[1][bf_loc] = src_t[1];
lm_sqsum[1][bf_loc] = convert_float4(src_t[1] * src_t[1]);
int offset = 1;
// Up-sweep: in each step the active work-items add element ai into element bi,
// with the pair distance (offset) doubling every step.
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
// lid >> 7 picks which of the two buffers this work-item operates on.
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
}
offset <<= 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Clear the last scan slot of both buffers before the down-sweep.
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
}
// Down-sweep: bi receives bi + ai, and ai receives the previous value of bi.
for(int d = 1; d < LSIZE; d <<= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
// new bi minus old ai equals the old value of bi.
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Output index for lane 0 of the first column element; the second element is
// stored dst_step further on.
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
// Work-item 0 writes nothing; work-item lid writes the entry for row i + lid - 1.
if(lid > 0 && (i+lid) <= rows)
{
// Add the carry of the previous chunks to this work-item's scan result.
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
lm_sqsum[0][bf_loc] += sqsum_t[0];
lm_sqsum[1][bf_loc] += sqsum_t[1];
// View the float4 results as four scalar lanes.
sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc]));
for(int k = 0; k < 4; k++)
{
// Skip lanes outside [pre_invalid, cols + pre_invalid).
if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
sum[loc_s0 + k * dst_step / 4] = sum_p[k];
sqsum[loc_s0 + k * dst_step / 4] = sqsum_p[k];
}
sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc]));
for(int k = 0; k < 4; k++)
{
// Lanes of the second element only need the upper bound check.
if(gid * 4 + k + 4 >= cols + pre_invalid) break;
sum[loc_s1 + k * dst_step / 4] = sum_p[k];
sqsum[loc_s1 + k * dst_step / 4] = sqsum_p[k];
}
}
// Keep local memory stable until every work-item has finished this chunk.
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// integral_rows_D5: second scan pass over float4 intermediate buffers, writing
// the final float sum and sqsum outputs.
//
// Each work-group loads two adjacent float4 elements per input row (indices
// gid*2 and gid*2 + 1) from srcsum and srcsqsum, scans them along the row index
// in chunks of LSIZE_1 using local memory, and stores the four lanes of each
// result separately. It also writes zeros to the first entries of the outputs
// (see the two zeroing blocks below).
//
// Parameters:
//   srcsum, srcsqsum  float4 inputs holding the values / squared values to scan
//   sum, sqsum        float outputs
//   rows              number of input rows to scan
//   cols              bounds the lanes that are written (gid*8 + k < cols)
//   src_step          input row stride; shifted right by 4 before use
//                     (presumably bytes -> float4 elements; TODO confirm with host)
//   sum_step,
//   sqsum_step        output strides; used as step / 4 per lane
//                     (presumably bytes -> float elements; TODO confirm with host)
//   sum_offset,
//   sqsum_offset      offsets added to every sum / sqsum index
//
// NOTE(review): indexing with lid >> 7 and lid & 127 assumes a local size of
// LSIZE (256) work-items -- confirm against the host launch configuration.
kernel void integral_rows_D5(__global float4 *srcsum,__global float4 * srcsqsum,__global float *sum ,
__global float *sqsum,int rows,int cols,int src_step,int sum_step,
int sqsum_step,int sum_offset,int sqsum_offset)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
float4 src_t[2], sum_t[2];
float4 sqsrc_t[2],sqsum_t[2];
// Two scan buffers per quantity; the extra LOG_LSIZE slots hold the padding
// introduced by GET_CONFLICT_OFFSET.
__local float4 lm_sum[2][LSIZE + LOG_LSIZE];
__local float4 lm_sqsum[2][LSIZE + LOG_LSIZE];
__local float *sum_p;
__local float *sqsum_p;
src_step = src_step >> 4;
// The loop advances by LSIZE_1 rows per iteration while LSIZE work-items load
// rows i .. i + LSIZE - 1.
for(int i = 0; i < rows; i =i + LSIZE_1)
{
// Load one row element per work-item; rows past the end contribute zero.
src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (float4)0;
sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : (float4)0;
src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
// Carry from the previous chunk: read the last scan slot before it is
// overwritten below (zero for the first chunk).
sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
// All work-items must have read the carry before local memory is reused.
barrier(CLK_LOCAL_MEM_FENCE);
// Padded local-memory slot of this work-item.
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
lm_sqsum[0][bf_loc] = sqsrc_t[0];
lm_sum[1][bf_loc] = src_t[1];
lm_sqsum[1][bf_loc] = sqsrc_t[1];
int offset = 1;
// Up-sweep: in each step the active work-items add element ai into element bi,
// with the pair distance (offset) doubling every step.
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
// lid >> 7 picks which of the two buffers this work-item operates on.
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
}
offset <<= 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Clear the last scan slot of both buffers before the down-sweep.
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
}
// Down-sweep: bi receives bi + ai, and ai receives the previous value of bi.
for(int d = 1; d < LSIZE; d <<= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
// new bi minus old ai equals the old value of bi.
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Work-group 0 zeroes entries sum_offset + 0 .. rows (and the sqsum
// counterpart), one entry per work-item per chunk.
if(gid == 0 && (i + lid) <= rows)
{
sum[sum_offset + i + lid] = 0;
sqsum[sqsum_offset + i + lid] = 0;
}
// Only work-item 0 of the first chunk: zero the first entry of up to eight
// output lines (k = 1..8) owned by this work-group, stopping at cols.
if(i + lid == 0)
{
int loc0 = gid * 2 * sum_step;
int loc1 = gid * 2 * sqsum_step;
for(int k = 1; k <= 8; k++)
{
if(gid * 8 + k > cols) break;
sum[sum_offset + loc0 + k * sum_step / 4] = 0;
sqsum[sqsum_offset + loc1 + k * sqsum_step / 4] = 0;
}
}
// Output index for lane 0 of the first element: one line (step / 4) past the
// group's base, at position i + lid. The second element is one full step on.
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ;
// Work-item 0 writes nothing here.
if(lid > 0 && (i+lid) <= rows)
{
// Add the carry of the previous chunks to this work-item's scan result.
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
lm_sqsum[0][bf_loc] += sqsum_t[0];
lm_sqsum[1][bf_loc] += sqsum_t[1];
// View the float4 results as four scalar lanes.
sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc]));
for(int k = 0; k < 4; k++)
{
// Stop at the first lane that falls outside cols.
if(gid * 8 + k >= cols) break;
sum[loc_s0 + k * sum_step / 4] = sum_p[k];
sqsum[loc_sq0 + k * sqsum_step / 4] = sqsum_p[k];
}
sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 8 + 4 + k >= cols) break;
sum[loc_s1 + k * sum_step / 4] = sum_p[k];
sqsum[loc_sq1 + k * sqsum_step / 4] = sqsum_p[k];
}
}
// Keep local memory stable until every work-item has finished this chunk.
barrier(CLK_LOCAL_MEM_FENCE);
}
}

@ -44,8 +44,13 @@
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
#define LSIZE 256
#define LSIZE_1 255
#define LSIZE_2 254
@ -56,8 +61,8 @@
#define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS)
kernel void integral_sum_cols(__global uchar4 *src,__global int *sum ,
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
kernel void integral_sum_cols_D4(__global uchar4 *src,__global int *sum ,
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
@ -114,7 +119,8 @@ kernel void integral_sum_cols(__global uchar4 *src,__global int *sum ,
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(lid > 0 && (i+lid) <= rows){
if(lid > 0 && (i+lid) <= rows)
{
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
@ -136,9 +142,9 @@ kernel void integral_sum_cols(__global uchar4 *src,__global int *sum ,
}
kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum ,
int rows,int cols,int src_step,int sum_step,
int sum_offset)
kernel void integral_sum_rows_D4(__global int4 *srcsum,__global int *sum ,
int rows,int cols,int src_step,int sum_step,
int sum_offset)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
@ -196,19 +202,20 @@ kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum ,
barrier(CLK_LOCAL_MEM_FENCE);
if(gid == 0 && (i + lid) <= rows)
{
sum[sum_offset + i + lid] = 0;
sum[sum_offset + i + lid] = 0;
}
if(i + lid == 0)
{
int loc0 = gid * 2 * sum_step;
for(int k = 1;k <= 8;k++)
for(int k = 1; k <= 8; k++)
{
if(gid * 8 + k > cols) break;
sum[sum_offset + loc0 + k * sum_step / 4] = 0;
}
}
if(lid > 0 && (i+lid) <= rows){
if(lid > 0 && (i+lid) <= rows)
{
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
@ -228,3 +235,178 @@ kernel void integral_sum_rows(__global int4 *srcsum,__global int *sum ,
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// integral_sum_cols_D5: column pass that produces float running sums from
// 8-bit source data (sum only, no squared sum).
//
// Each work-group loads two adjacent uchar4 elements per source row (indices
// gid*2 and gid*2 + 1), converts them to float4, and scans them down the rows
// in chunks of LSIZE_1 rows using local memory. The four float lanes of every
// scanned element are stored separately to sum.
//
// Parameters:
//   src          source data, read as uchar4 elements
//   sum          float output for the scanned values
//   src_offset   offset added to every src index (in uchar4 elements)
//   pre_invalid  lanes with index gid*4+k < pre_invalid are never written, and
//                the output location is shifted back by pre_invalid * dst_step / 4
//   rows         number of source rows to scan
//   cols         bounds the lanes that are written (gid*4+k < cols + pre_invalid)
//   src_step     source row stride; shifted right by 2 before use
//                (presumably bytes -> uchar4 elements; TODO confirm with host)
//   dst_step     destination stride; used as dst_step / 4 per lane
//                (presumably bytes -> float elements; TODO confirm with host)
//
// NOTE(review): indexing with lid >> 7 and lid & 127 assumes a local size of
// LSIZE (256) work-items -- confirm against the host launch configuration.
kernel void integral_sum_cols_D5(__global uchar4 *src,__global float *sum ,
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
float4 src_t[2], sum_t[2];
// Two scan buffers (one per loaded column element); the extra LOG_LSIZE slots
// hold the padding introduced by GET_CONFLICT_OFFSET.
__local float4 lm_sum[2][LSIZE + LOG_LSIZE];
__local float* sum_p;
src_step = src_step >> 2;
// From here on gid is the index of the first of the two uchar4 columns
// handled by this work-group.
gid = gid << 1;
// The loop advances by LSIZE_1 rows per iteration while LSIZE work-items load
// rows i .. i + LSIZE - 1.
for(int i = 0; i < rows; i =i + LSIZE_1)
{
// Load one row element per work-item; rows past the end contribute zero.
// NOTE(review): the column indices gid and gid + 1 are used unclamped here;
// confirm the host guarantees both stay inside the source buffer.
src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + gid]) : (float4)0);
src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + gid + 1]) : (float4)0);
// Carry from the previous chunk: read the last scan slot before it is
// overwritten below (zero for the first chunk).
sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
// All work-items must have read the carry before local memory is reused.
barrier(CLK_LOCAL_MEM_FENCE);
// Padded local-memory slot of this work-item.
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
lm_sum[1][bf_loc] = src_t[1];
int offset = 1;
// Up-sweep: in each step the active work-items add element ai into element bi,
// with the pair distance (offset) doubling every step.
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
// lid >> 7 picks which of the two buffers this work-item operates on.
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
}
offset <<= 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Clear the last scan slot of both buffers before the down-sweep.
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
}
// Down-sweep: bi receives bi + ai, and ai receives the previous value of bi.
for(int d = 1; d < LSIZE; d <<= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
// new bi minus old ai equals the old value of bi.
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Work-item 0 writes nothing; work-item lid writes the entry for row i + lid - 1.
if(lid > 0 && (i+lid) <= rows)
{
// Output index for lane 0 of the first column element; the second element is
// stored dst_step further on.
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
// Add the carry of the previous chunks to this work-item's scan result.
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
// View the float4 result as four scalar lanes.
sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
for(int k = 0; k < 4; k++)
{
// Skip lanes outside [pre_invalid, cols + pre_invalid).
if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
sum[loc_s0 + k * dst_step / 4] = sum_p[k];
}
sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
for(int k = 0; k < 4; k++)
{
// Lanes of the second element only need the upper bound check.
if(gid * 4 + k + 4 >= cols + pre_invalid) break;
sum[loc_s1 + k * dst_step / 4] = sum_p[k];
}
}
// Keep local memory stable until every work-item has finished this chunk.
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// integral_sum_rows_D5: second scan pass over a float4 intermediate buffer,
// writing the final float sum output (sum only, no squared sum).
//
// Each work-group loads two adjacent float4 elements per input row (indices
// gid*2 and gid*2 + 1) from srcsum, scans them along the row index in chunks of
// LSIZE_1 using local memory, and stores the four lanes of each result
// separately. It also writes zeros to the first entries of the output (see the
// two zeroing blocks below).
//
// Parameters:
//   srcsum      float4 input holding the values to scan
//   sum         float output
//   rows        number of input rows to scan
//   cols        bounds the lanes that are written (gid*8 + k < cols)
//   src_step    input row stride; shifted right by 4 before use
//               (presumably bytes -> float4 elements; TODO confirm with host)
//   sum_step    output stride; used as sum_step / 4 per lane
//               (presumably bytes -> float elements; TODO confirm with host)
//   sum_offset  offset added to every sum index
//
// NOTE(review): indexing with lid >> 7 and lid & 127 assumes a local size of
// LSIZE (256) work-items -- confirm against the host launch configuration.
kernel void integral_sum_rows_D5(__global float4 *srcsum,__global float *sum ,
int rows,int cols,int src_step,int sum_step,
int sum_offset)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
float4 src_t[2], sum_t[2];
// Two scan buffers; the extra LOG_LSIZE slots hold the padding introduced by
// GET_CONFLICT_OFFSET.
__local float4 lm_sum[2][LSIZE + LOG_LSIZE];
__local float *sum_p;
src_step = src_step >> 4;
// The loop advances by LSIZE_1 rows per iteration while LSIZE work-items load
// rows i .. i + LSIZE - 1.
for(int i = 0; i < rows; i =i + LSIZE_1)
{
// Load one row element per work-item; rows past the end contribute zero.
src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (float4)0;
src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
// Carry from the previous chunk: read the last scan slot before it is
// overwritten below (zero for the first chunk).
sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
// All work-items must have read the carry before local memory is reused.
barrier(CLK_LOCAL_MEM_FENCE);
// Padded local-memory slot of this work-item.
int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
lm_sum[0][bf_loc] = src_t[0];
lm_sum[1][bf_loc] = src_t[1];
int offset = 1;
// Up-sweep: in each step the active work-items add element ai into element bi,
// with the pair distance (offset) doubling every step.
for(int d = LSIZE >> 1 ; d > 0; d>>=1)
{
barrier(CLK_LOCAL_MEM_FENCE);
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
// lid >> 7 picks which of the two buffers this work-item operates on.
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
}
offset <<= 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
// Clear the last scan slot of both buffers before the down-sweep.
if(lid < 2)
{
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
}
// Down-sweep: bi receives bi + ai, and ai receives the previous value of bi.
for(int d = 1; d < LSIZE; d <<= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
offset >>= 1;
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
ai += GET_CONFLICT_OFFSET(ai);
bi += GET_CONFLICT_OFFSET(bi);
if((lid & 127) < d)
{
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
// new bi minus old ai equals the old value of bi.
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Work-group 0 zeroes entries sum_offset + 0 .. rows, one entry per work-item
// per chunk.
if(gid == 0 && (i + lid) <= rows)
{
sum[sum_offset + i + lid] = 0;
}
// Only work-item 0 of the first chunk: zero the first entry of up to eight
// output lines (k = 1..8) owned by this work-group, stopping at cols.
if(i + lid == 0)
{
int loc0 = gid * 2 * sum_step;
for(int k = 1; k <= 8; k++)
{
if(gid * 8 + k > cols) break;
sum[sum_offset + loc0 + k * sum_step / 4] = 0;
}
}
// Work-item 0 writes nothing here.
if(lid > 0 && (i+lid) <= rows)
{
// Output index for lane 0 of the first element: one line (sum_step / 4) past
// the group's base, at position i + lid. The second element is sum_step on.
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
// Add the carry of the previous chunks to this work-item's scan result.
lm_sum[0][bf_loc] += sum_t[0];
lm_sum[1][bf_loc] += sum_t[1];
// View the float4 result as four scalar lanes.
sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
for(int k = 0; k < 4; k++)
{
// Stop at the first lane that falls outside cols.
if(gid * 8 + k >= cols) break;
sum[loc_s0 + k * sum_step / 4] = sum_p[k];
}
sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
for(int k = 0; k < 4; k++)
{
if(gid * 8 + 4 + k >= cols) break;
sum[loc_s1 + k * sum_step / 4] = sum_p[k];
}
}
// Keep local memory stable until every work-item has finished this chunk.
barrier(CLK_LOCAL_MEM_FENCE);
}
}

@ -1,3 +1,48 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Sen Liu, swjtuls1987@126.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
@ -609,22 +654,33 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
int y = wgidy*TILE_SIZE; // real Y index of pixel
int x = wgidx*TILE_SIZE; // real X index of pixel
int kcn = (cn==2)?2:4;
int rstep = min(src_step/4, TILE_SIZE);
src_step /= sizeof(*src_data);
int rstep = min(src_step, TILE_SIZE);
tileSize_height = min(TILE_SIZE, src_rows - y);
tileSize_width = min(TILE_SIZE, src_cols -x);
if(tileSize_width < TILE_SIZE)
for(int i = tileSize_width; i < rstep; i++ )
*((__global float*)src_data+(y+lidy)*src_step/4+x+i) = 0;
int maxIdx = mul24(src_rows, src_cols);
int yOff = (y+lidy)*src_step;
int index;
if(tileSize_width < TILE_SIZE && yOff < src_rows)
for(int i = tileSize_width; i < rstep && (yOff+x+i) < maxIdx; i++ )
*(src_data+yOff+x+i) = 0;
if( coi > 0 )
for(int i=0; i < tileSize_width; i+=VLEN_F)
{
#pragma unroll
for(int j=0; j<4; j++)
tmp_coi[j] = *(src_data+(y+lidy)*src_step/4+(x+i+j)*kcn+coi-1);
{
index = yOff+(x+i+j)*kcn+coi-1;
if (index < maxIdx)
tmp_coi[j] = *(src_data+index);
else
tmp_coi[j] = 0;
}
tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
}
else
for(int i=0; i < tileSize_width; i+=VLEN_F)
tmp[i/VLEN_F] = (float4)(*(src_data+(y+lidy)*src_step/4+x+i),*(src_data+(y+lidy)*src_step/4+x+i+1),*(src_data+(y+lidy)*src_step/4+x+i+2),*(src_data+(y+lidy)*src_step/4+x+i+3));
for(int i=0; i < tileSize_width && (yOff+x+i) < maxIdx; i+=VLEN_F)
tmp[i/VLEN_F] = (*(__global float4 *)(src_data+yOff+x+i));
float4 zero = (float4)(0);
float4 full = (float4)(255);
if( binary )
@ -714,35 +770,59 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
// accumulate moments computed in each tile
dst_step /= sizeof(F);
int dst_x_off = mad24(wgidy, dst_cols, wgidx);
int dst_off = 0;
int max_dst_index = 10 * blocky * get_global_size(1);
// + m00 ( = m00' )
*(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
dst_off = mad24(DST_ROW_00 * blocky, dst_step, dst_x_off);
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[0];
// + m10 ( = m10' + x*m00' )
*(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
dst_off = mad24(DST_ROW_10 * blocky, dst_step, dst_x_off);
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[1] + xm;
// + m01 ( = m01' + y*m00' )
*(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
dst_off = mad24(DST_ROW_01 * blocky, dst_step, dst_x_off);
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[2] + ym;
// + m20 ( = m20' + 2*x*m10' + x*x*m00' )
*(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
dst_off = mad24(DST_ROW_20 * blocky, dst_step, dst_x_off);
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[3] + x * (mom[1] * 2 + xm);
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
*(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
dst_off = mad24(DST_ROW_11 * blocky, dst_step, dst_x_off);
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[4] + x * (mom[2] + ym) + y * mom[1];
// + m02 ( = m02' + 2*y*m01' + y*y*m00' )
*(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
dst_off = mad24(DST_ROW_02 * blocky, dst_step, dst_x_off);
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[5] + y * (mom[2] * 2 + ym);
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
*(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
dst_off = mad24(DST_ROW_30 * blocky, dst_step, dst_x_off);
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
*(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
dst_off = mad24(DST_ROW_21 * blocky, dst_step, dst_x_off);
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
*(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
dst_off = mad24(DST_ROW_12 * blocky, dst_step, dst_x_off);
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
*(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
dst_off = mad24(DST_ROW_03 * blocky, dst_step, dst_x_off);
if (dst_off < max_dst_index)
*(dst_m + dst_off) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
}
}

@ -18,6 +18,7 @@
// Zhang Chunpeng chunpeng@multicorewareinc.com
// Dachuan Zhao, dachuan@multicorewareinc.com
// Yao Wang, yao@multicorewareinc.com
// Peng Xiao, pengxiao@outlook.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
@ -47,7 +48,7 @@
//#pragma OPENCL EXTENSION cl_amd_printf : enable
uchar get_valid_uchar(uchar data)
uchar get_valid_uchar(float data)
{
return (uchar)(data <= 255 ? data : data > 0 ? 255 : 0);
}
@ -142,7 +143,7 @@ __kernel void pyrUp_C1_D0(__global uchar* src,__global uchar* dst,
sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][tidx];
if ((x < dstCols) && (y < dstRows))
dst[x + y * dstStep] = (float)(4.0f * sum);
dst[x + y * dstStep] = convert_uchar_sat_rte(4.0f * sum);
}
@ -244,7 +245,7 @@ __kernel void pyrUp_C1_D2(__global ushort* src,__global ushort* dst,
sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][get_local_id(0)];
if ((x < dstCols) && (y < dstRows))
dst[x + y * dstStep] = (float)(4.0f * sum);
dst[x + y * dstStep] = convert_short_sat_rte(4.0f * sum);
}
@ -351,31 +352,6 @@ __kernel void pyrUp_C1_D5(__global float* src,__global float* dst,
///////////////////////////////////////////////////////////////////////
////////////////////////// CV_8UC4 //////////////////////////////////
///////////////////////////////////////////////////////////////////////
float4 covert_uchar4_to_float4(uchar4 data)
{
float4 f4Data = {0,0,0,0};
f4Data.x = (float)data.x;
f4Data.y = (float)data.y;
f4Data.z = (float)data.z;
f4Data.w = (float)data.w;
return f4Data;
}
uchar4 convert_float4_to_uchar4(float4 data)
{
uchar4 u4Data;
u4Data.x = get_valid_uchar(data.x);
u4Data.y = get_valid_uchar(data.y);
u4Data.z = get_valid_uchar(data.z);
u4Data.w = get_valid_uchar(data.w);
return u4Data;
}
__kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
int srcRows,int dstRows,int srcCols,int dstCols,
int srcOffset,int dstOffset,int srcStep,int dstStep)
@ -406,7 +382,7 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
srcy = abs(srcy);
srcy = min(srcRows -1 ,srcy);
s_srcPatch[tidy][tidx] = covert_uchar4_to_float4(src[srcx + srcy * srcStep]);
s_srcPatch[tidy][tidx] = convert_float4(src[srcx + srcy * srcStep]);
}
barrier(CLK_LOCAL_MEM_FENCE);
@ -476,38 +452,12 @@ __kernel void pyrUp_C4_D0(__global uchar4* src,__global uchar4* dst,
if ((x < dstCols) && (y < dstRows))
{
dst[x + y * dstStep] = convert_float4_to_uchar4(4.0f * sum);
dst[x + y * dstStep] = convert_uchar4_sat_rte(4.0f * sum);
}
}
///////////////////////////////////////////////////////////////////////
////////////////////////// CV_16UC4 //////////////////////////////////
///////////////////////////////////////////////////////////////////////
float4 covert_ushort4_to_float4(ushort4 data)
{
float4 f4Data = {0,0,0,0};
f4Data.x = (float)data.x;
f4Data.y = (float)data.y;
f4Data.z = (float)data.z;
f4Data.w = (float)data.w;
return f4Data;
}
ushort4 convert_float4_to_ushort4(float4 data)
{
ushort4 u4Data;
u4Data.x = (float)data.x;
u4Data.y = (float)data.y;
u4Data.z = (float)data.z;
u4Data.w = (float)data.w;
return u4Data;
}
__kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
int srcRows,int dstRows,int srcCols,int dstCols,
int srcOffset,int dstOffset,int srcStep,int dstStep)
@ -535,7 +485,7 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
srcy = abs(srcy);
srcy = min(srcRows -1 ,srcy);
s_srcPatch[get_local_id(1)][get_local_id(0)] = covert_ushort4_to_float4(src[srcx + srcy * srcStep]);
s_srcPatch[get_local_id(1)][get_local_id(0)] = convert_float4(src[srcx + srcy * srcStep]);
}
barrier(CLK_LOCAL_MEM_FENCE);
@ -570,11 +520,11 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
if (eveny)
{
sum = sum + (evenFlag * co3) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * co3 ) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
sum = sum + (oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * co1 ) * s_srcPatch[0][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * co3) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
sum = sum + (oddFlag * co2 ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * co3 ) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[get_local_id(1)][get_local_id(0)] = sum;
@ -610,7 +560,7 @@ __kernel void pyrUp_C4_D2(__global ushort4* src,__global ushort4* dst,
if ((x < dstCols) && (y < dstRows))
{
dst[x + y * dstStep] = convert_float4_to_ushort4(4.0f * sum);
dst[x + y * dstStep] = convert_ushort4_sat_rte(4.0f * sum);
}
}
@ -681,11 +631,11 @@ __kernel void pyrUp_C4_D5(__global float4* src,__global float4* dst,
if (eveny)
{
sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * co3 ) * s_srcPatch[lsizey-16][1 + ((tidx - 2) >> 1)];
sum = sum + (oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * co1 ) * s_srcPatch[lsizey-16][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * co3) * s_srcPatch[lsizey-16][1 + ((tidx + 2) >> 1)];
sum = sum + ( oddFlag * co2 ) * s_srcPatch[lsizey-16][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * co3 ) * s_srcPatch[lsizey-16][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[tidy][tidx] = sum;

@ -16,6 +16,8 @@
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
// Sen Liu, swjtuls1987@126.com
// Peng Xiao, pengxiao@outlook.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
@ -50,59 +52,40 @@
#define STEREO_MIND 0 // The minimum d range to check
#define STEREO_DISP_STEP N_DISPARITIES // the d step, must be <= 1 to avoid aliasing
int SQ(int a)
{
return a * a;
}
#ifndef radius
#define radius 64
#endif
unsigned int CalcSSD(volatile __local unsigned int *col_ssd_cache,
volatile __local unsigned int *col_ssd, int radius)
unsigned int CalcSSD(__local unsigned int *col_ssd)
{
unsigned int cache = 0;
unsigned int cache2 = 0;
unsigned int cache = col_ssd[0];
for(int i = 1; i <= radius; i++)
#pragma unroll
for(int i = 1; i <= (radius << 1); i++)
cache += col_ssd[i];
col_ssd_cache[0] = cache;
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < BLOCK_W - radius)
cache2 = col_ssd_cache[radius];
else
for(int i = radius + 1; i < (2 * radius + 1); i++)
cache2 += col_ssd[i];
return col_ssd[0] + cache + cache2;
return cache;
}
uint2 MinSSD(volatile __local unsigned int *col_ssd_cache,
volatile __local unsigned int *col_ssd, int radius)
uint2 MinSSD(__local unsigned int *col_ssd)
{
unsigned int ssd[N_DISPARITIES];
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * radius)
ssd[0] = CalcSSD(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * radius), radius);
barrier(CLK_LOCAL_MEM_FENCE);
ssd[1] = CalcSSD(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * radius), radius);
barrier(CLK_LOCAL_MEM_FENCE);
ssd[2] = CalcSSD(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * radius), radius);
barrier(CLK_LOCAL_MEM_FENCE);
ssd[3] = CalcSSD(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * radius), radius);
barrier(CLK_LOCAL_MEM_FENCE);
ssd[4] = CalcSSD(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * radius), radius);
barrier(CLK_LOCAL_MEM_FENCE);
ssd[5] = CalcSSD(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * radius), radius);
barrier(CLK_LOCAL_MEM_FENCE);
ssd[6] = CalcSSD(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * radius), radius);
barrier(CLK_LOCAL_MEM_FENCE);
ssd[7] = CalcSSD(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * radius), radius);
barrier(CLK_LOCAL_MEM_FENCE);
const int win_size = (radius << 1);
//See above: #define COL_SSD_SIZE (BLOCK_W + WIN_SIZE)
ssd[0] = CalcSSD(col_ssd + 0 * (BLOCK_W + win_size));
ssd[1] = CalcSSD(col_ssd + 1 * (BLOCK_W + win_size));
ssd[2] = CalcSSD(col_ssd + 2 * (BLOCK_W + win_size));
ssd[3] = CalcSSD(col_ssd + 3 * (BLOCK_W + win_size));
ssd[4] = CalcSSD(col_ssd + 4 * (BLOCK_W + win_size));
ssd[5] = CalcSSD(col_ssd + 5 * (BLOCK_W + win_size));
ssd[6] = CalcSSD(col_ssd + 6 * (BLOCK_W + win_size));
ssd[7] = CalcSSD(col_ssd + 7 * (BLOCK_W + win_size));
unsigned int mssd = min(min(min(ssd[0], ssd[1]), min(ssd[4], ssd[5])), min(min(ssd[2], ssd[3]), min(ssd[6], ssd[7])));
int bestIdx = 0;
for (int i = 0; i < N_DISPARITIES; i++)
{
if (mssd == ssd[i])
@ -113,124 +96,66 @@ uint2 MinSSD(volatile __local unsigned int *col_ssd_cache,
}
void StepDown(int idx1, int idx2, __global unsigned char* imageL,
__global unsigned char* imageR, int d, volatile __local unsigned int *col_ssd, int radius)
__global unsigned char* imageR, int d, __local unsigned int *col_ssd)
{
unsigned char leftPixel1;
unsigned char leftPixel2;
unsigned char rightPixel1[8];
unsigned char rightPixel2[8];
unsigned int diff1, diff2;
leftPixel1 = imageL[idx1];
leftPixel2 = imageL[idx2];
idx1 = idx1 - d;
idx2 = idx2 - d;
rightPixel1[7] = imageR[idx1 - 7];
rightPixel1[0] = imageR[idx1 - 0];
rightPixel1[1] = imageR[idx1 - 1];
rightPixel1[2] = imageR[idx1 - 2];
rightPixel1[3] = imageR[idx1 - 3];
rightPixel1[4] = imageR[idx1 - 4];
rightPixel1[5] = imageR[idx1 - 5];
rightPixel1[6] = imageR[idx1 - 6];
rightPixel2[7] = imageR[idx2 - 7];
rightPixel2[0] = imageR[idx2 - 0];
rightPixel2[1] = imageR[idx2 - 1];
rightPixel2[2] = imageR[idx2 - 2];
rightPixel2[3] = imageR[idx2 - 3];
rightPixel2[4] = imageR[idx2 - 4];
rightPixel2[5] = imageR[idx2 - 5];
rightPixel2[6] = imageR[idx2 - 6];
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * radius)
diff1 = leftPixel1 - rightPixel1[0];
diff2 = leftPixel2 - rightPixel2[0];
col_ssd[0 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[1];
diff2 = leftPixel2 - rightPixel2[1];
col_ssd[1 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[2];
diff2 = leftPixel2 - rightPixel2[2];
col_ssd[2 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[3];
diff2 = leftPixel2 - rightPixel2[3];
col_ssd[3 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[4];
diff2 = leftPixel2 - rightPixel2[4];
col_ssd[4 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[5];
diff2 = leftPixel2 - rightPixel2[5];
col_ssd[5 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[6];
diff2 = leftPixel2 - rightPixel2[6];
col_ssd[6 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
diff1 = leftPixel1 - rightPixel1[7];
diff2 = leftPixel2 - rightPixel2[7];
col_ssd[7 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
uint8 imgR1 = convert_uint8(vload8(0, imageR + (idx1 - d - 7)));
uint8 imgR2 = convert_uint8(vload8(0, imageR + (idx2 - d - 7)));
uint8 diff1 = (uint8)(imageL[idx1]) - imgR1;
uint8 diff2 = (uint8)(imageL[idx2]) - imgR2;
uint8 res = diff2 * diff2 - diff1 * diff1;
const int win_size = (radius << 1);
col_ssd[0 * (BLOCK_W + win_size)] += res.s7;
col_ssd[1 * (BLOCK_W + win_size)] += res.s6;
col_ssd[2 * (BLOCK_W + win_size)] += res.s5;
col_ssd[3 * (BLOCK_W + win_size)] += res.s4;
col_ssd[4 * (BLOCK_W + win_size)] += res.s3;
col_ssd[5 * (BLOCK_W + win_size)] += res.s2;
col_ssd[6 * (BLOCK_W + win_size)] += res.s1;
col_ssd[7 * (BLOCK_W + win_size)] += res.s0;
}
// InitColSSD: for the column starting at (x_tex, y_tex), sums over
// (2 * radius + 1) consecutive rows the squared difference between the left
// pixel and each of the eight right-image pixels located d, d + 1, ..., d + 7
// positions to its left, then stores the eight sums into col_ssd, one every
// (BLOCK_W + 2 * radius) elements (slot k holds the sum for offset d + k).
// NOTE(review): this span appears to interleave the removed and the added
// lines of a diff (two parameter-list tails, a scalar loop body and a uint8
// vector loop body, two store sequences); it is not compilable as shown.
// In the vector variant 'radius' is not a parameter and is presumably
// provided elsewhere (e.g. as a build-time macro) - confirm.
void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imageL,
__global unsigned char* imageR, int d,
volatile __local unsigned int *col_ssd, int radius)
__local unsigned int *col_ssd)
{
unsigned char leftPixel1;
int idx;
unsigned int diffa[] = {0, 0, 0, 0, 0, 0, 0, 0};
for(int i = 0; i < (2 * radius + 1); i++)
uint8 leftPixel1;
uint8 diffa = 0;
int idx = y_tex * im_pitch + x_tex;
const int win_size = (radius << 1);
for(int i = 0; i < (win_size + 1); i++)
{
idx = y_tex * im_pitch + x_tex;
leftPixel1 = imageL[idx];
idx = idx - d;
diffa[0] += SQ(leftPixel1 - imageR[idx - 0]);
diffa[1] += SQ(leftPixel1 - imageR[idx - 1]);
diffa[2] += SQ(leftPixel1 - imageR[idx - 2]);
diffa[3] += SQ(leftPixel1 - imageR[idx - 3]);
diffa[4] += SQ(leftPixel1 - imageR[idx - 4]);
diffa[5] += SQ(leftPixel1 - imageR[idx - 5]);
diffa[6] += SQ(leftPixel1 - imageR[idx - 6]);
diffa[7] += SQ(leftPixel1 - imageR[idx - 7]);
y_tex += 1;
leftPixel1 = (uint8)(imageL[idx]);
uint8 imgR = convert_uint8(vload8(0, imageR + (idx - d - 7)));
uint8 res = leftPixel1 - imgR;
diffa += res * res;
idx += im_pitch;
}
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * radius)
col_ssd[0 * (BLOCK_W + 2 * radius)] = diffa[0];
col_ssd[1 * (BLOCK_W + 2 * radius)] = diffa[1];
col_ssd[2 * (BLOCK_W + 2 * radius)] = diffa[2];
col_ssd[3 * (BLOCK_W + 2 * radius)] = diffa[3];
col_ssd[4 * (BLOCK_W + 2 * radius)] = diffa[4];
col_ssd[5 * (BLOCK_W + 2 * radius)] = diffa[5];
col_ssd[6 * (BLOCK_W + 2 * radius)] = diffa[6];
col_ssd[7 * (BLOCK_W + 2 * radius)] = diffa[7];
//See above: #define COL_SSD_SIZE (BLOCK_W + WIN_SIZE)
col_ssd[0 * (BLOCK_W + win_size)] = diffa.s7;
col_ssd[1 * (BLOCK_W + win_size)] = diffa.s6;
col_ssd[2 * (BLOCK_W + win_size)] = diffa.s5;
col_ssd[3 * (BLOCK_W + win_size)] = diffa.s4;
col_ssd[4 * (BLOCK_W + win_size)] = diffa.s3;
col_ssd[5 * (BLOCK_W + win_size)] = diffa.s2;
col_ssd[6 * (BLOCK_W + win_size)] = diffa.s1;
col_ssd[7 * (BLOCK_W + win_size)] = diffa.s0;
}
__kernel void stereoKernel(__global unsigned char *left, __global unsigned char *right,
__global unsigned int *cminSSDImage, int cminSSD_step,
__global unsigned char *disp, int disp_step,int cwidth, int cheight,
int img_step, int maxdisp, int radius,
int img_step, int maxdisp,
__local unsigned int *col_ssd_cache)
{
volatile __local unsigned int *col_ssd = col_ssd_cache + BLOCK_W + get_local_id(0);
volatile __local unsigned int *col_ssd_extra = get_local_id(0) < (2 * radius) ? col_ssd + BLOCK_W : 0;
__local unsigned int *col_ssd = col_ssd_cache + get_local_id(0);
__local unsigned int *col_ssd_extra = get_local_id(0) < (radius << 1) ? col_ssd + BLOCK_W : 0;
int X = get_group_id(0) * BLOCK_W + get_local_id(0) + maxdisp + radius;
// int Y = get_group_id(1) * ROWSperTHREAD + radius;
#define Y (get_group_id(1) * ROWSperTHREAD + radius)
volatile __global unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
__global unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
__global unsigned char* disparImage = disp + X + Y * disp_step;
int end_row = ROWSperTHREAD < (cheight - Y) ? ROWSperTHREAD:(cheight - Y);
@ -244,14 +169,14 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
{
y_tex = Y - radius;
InitColSSD(x_tex, y_tex, img_step, left, right, d, col_ssd, radius);
InitColSSD(x_tex, y_tex, img_step, left, right, d, col_ssd);
if (col_ssd_extra > 0)
if (x_tex + BLOCK_W < cwidth)
InitColSSD(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra, radius);
InitColSSD(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra);
barrier(CLK_LOCAL_MEM_FENCE); //before MinSSD function
uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius);
uint2 minSSD = MinSSD(col_ssd);
if (X < cwidth - radius && Y < cheight - radius)
{
if (minSSD.x < minSSDImage[0])
@ -264,21 +189,18 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
for(int row = 1; row < end_row; row++)
{
int idx1 = y_tex * img_step + x_tex;
int idx2 = min(y_tex + (2 * radius + 1), cheight - 1) * img_step + x_tex;
barrier(CLK_GLOBAL_MEM_FENCE);
int idx2 = min(y_tex + ((radius << 1) + 1), cheight - 1) * img_step + x_tex;
barrier(CLK_LOCAL_MEM_FENCE);
StepDown(idx1, idx2, left, right, d, col_ssd, radius);
StepDown(idx1, idx2, left, right, d, col_ssd);
if (col_ssd_extra > 0)
if (x_tex + BLOCK_W < cwidth)
StepDown(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra, radius);
y_tex += 1;
StepDown(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra);
barrier(CLK_LOCAL_MEM_FENCE);
uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius);
uint2 minSSD = MinSSD(col_ssd);
if (X < cwidth - radius && row < cheight - radius - Y)
{
int idx = row * cminSSD_step;
@ -288,10 +210,11 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
minSSDImage[idx] = minSSD.x;
}
}
y_tex++;
} // for row loop
} // for d loop
}
//////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////// Sobel Prefilter (single channel)//////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////

@ -185,10 +185,10 @@ __kernel void data_step_down(__global T *src, int src_rows,
for (int d = 0; d < cndisp; ++d)
{
float dst_reg;
dst_reg = src[(d * src_rows + (2*y+0)) * src_step + 2*x+0];
dst_reg += src[(d * src_rows + (2*y+1)) * src_step + 2*x+0];
dst_reg += src[(d * src_rows + (2*y+0)) * src_step + 2*x+1];
dst_reg += src[(d * src_rows + (2*y+1)) * src_step + 2*x+1];
dst_reg = src[(d * src_rows + min(2*y+0, src_rows-1)) * src_step + 2*x+0];
dst_reg += src[(d * src_rows + min(2*y+1, src_rows-1)) * src_step + 2*x+0];
dst_reg += src[(d * src_rows + min(2*y+0, src_rows-1)) * src_step + 2*x+1];
dst_reg += src[(d * src_rows + min(2*y+1, src_rows-1)) * src_step + 2*x+1];
dst[(d * dst_rows + y) * dst_step + x] = saturate_cast(dst_reg);
}

File diff suppressed because it is too large Load Diff

@ -0,0 +1,407 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jin Ma jin@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
// Central-difference gradient of a float image.
//   src      : input image, src_col x src_row elements, src_step elements per row
//   dx, dy   : outputs, both addressed with dx_step elements per row
// One work-item handles one pixel; neighbour coordinates are clamped to the
// image, so border pixels use a one-sided difference scaled by 0.5.
__kernel void centeredGradientKernel(__global const float* src, int src_col, int src_row, int src_step,
                                     __global float* dx, __global float* dy, int dx_step)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Work-items that fall outside the image have nothing to do.
    if (col >= src_col || row >= src_row)
        return;

    // Clamped neighbour coordinates.
    const int right = min(col + 1, src_col - 1);
    const int left  = max(col - 1, 0);
    const int below = min(row + 1, src_row - 1);
    const int above = max(row - 1, 0);

    const int out = row * dx_step + col;

    dx[out] = 0.5f * (src[row * src_step + right] - src[row * src_step + left]);
    dy[out] = 0.5f * (src[below * src_step + col] - src[above * src_step + col]);
}
// Piecewise-cubic interpolation weight for a sample at signed distance x_.
// Returns 1 at distance 0, 0 at distance 1, and 0 for any distance >= 2.
float bicubicCoeff(float x_)
{
    const float dist = fabs(x_);

    // Inner lobe: |x| <= 1
    if (dist <= 1.0f)
        return dist * dist * (1.5f * dist - 2.5f) + 1.0f;

    // Outer lobe: 1 < |x| < 2
    if (dist < 2.0f)
        return dist * (dist * (-0.5f * dist + 2.5f) - 4.0f) + 2.0f;

    // Outside the support of the kernel.
    return 0.0f;
}
// Warps I1 and its two gradient images back by the flow (u1, u2) using a
// bicubic-weighted sum of image reads, then derives the per-pixel terms that
// depend only on the warped data:
//   I1w, I1wx, I1wy : warped image and warped gradients
//   grad            : I1wx^2 + I1wy^2
//   rho             : I1w - I1wx * u1 - I1wy * u2 - I0
// All five outputs are addressed with I1w_step elements per row. u1 / u2 are
// read at (x + offset_x, y + offset_y) with their own row steps.
// One work-item handles one pixel of the I0_col x I0_row image.
__kernel void warpBackwardKernel(__global const float* I0, int I0_step, int I0_col, int I0_row,
                                 image2d_t tex_I1, image2d_t tex_I1x, image2d_t tex_I1y,
                                 __global const float* u1, int u1_step,
                                 __global const float* u2,
                                 __global float* I1w,
                                 __global float* I1wx, /*int I1wx_step,*/
                                 __global float* I1wy, /*int I1wy_step,*/
                                 __global float* grad, /*int grad_step,*/
                                 __global float* rho,
                                 int I1w_step,
                                 int u2_step,
                                 int u1_offset_x,
                                 int u1_offset_y,
                                 int u2_offset_x,
                                 int u2_offset_y)
{
    // The sampler lives at the outermost kernel scope and is const: OpenCL C
    // leaves the behaviour of a sampler variable declared in a nested scope
    // implementation-defined, and samplers may not be modified.
    const sampler_t sampleri = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Work-items outside the image do nothing (the kernel has no barriers, so
    // returning early is safe).
    if (x >= I0_col || y >= I0_row)
        return;

    const float u1Val = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
    const float u2Val = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];

    // Warped (sub-pixel) source position.
    const float wx = x + u1Val;
    const float wy = y + u2Val;

    // Integer window covering [w - 2, w + 2] in each direction.
    const int xmin = ceil(wx - 2.0f);
    const int xmax = floor(wx + 2.0f);
    const int ymin = ceil(wy - 2.0f);
    const int ymax = floor(wy + 2.0f);

    float sum = 0.0f;
    float sumx = 0.0f;
    float sumy = 0.0f;
    float wsum = 0.0f;

    for (int cy = ymin; cy <= ymax; ++cy)
    {
        for (int cx = xmin; cx <= xmax; ++cx)
        {
            const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
            int2 cood = (int2)(cx, cy);

            sum += w * read_imagef(tex_I1, sampleri, cood).x;
            sumx += w * read_imagef(tex_I1x, sampleri, cood).x;
            sumy += w * read_imagef(tex_I1y, sampleri, cood).x;
            wsum += w;
        }
    }

    // Normalise by the accumulated weight.
    const float coeff = 1.0f / wsum;
    const float I1wVal = sum * coeff;
    const float I1wxVal = sumx * coeff;
    const float I1wyVal = sumy * coeff;

    I1w[y * I1w_step + x] = I1wVal;
    I1wx[y * I1w_step + x] = I1wxVal;
    I1wy[y * I1w_step + x] = I1wyVal;

    const float Ix2 = I1wxVal * I1wxVal;
    const float Iy2 = I1wyVal * I1wyVal;

    // store the |Grad(I1)|^2
    grad[y * I1w_step + x] = Ix2 + Iy2;

    // compute the constant part of the rho function
    const float I0Val = I0[y * I0_step + x];
    rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
}
// Reads one element of a float image stored with elemCntPerRow elements per
// row, clamping the coordinate to the image: x to [0, cols - 1] and y to
// [0, rows - 1]. Note the parameter order: rows comes before cols.
float readImage(__global const float *image, const int x, const int y, const int rows, const int cols, const int elemCntPerRow)
{
    const int col = clamp(x, 0, cols - 1);
    const int row = clamp(y, 0, rows - 1);
    return image[row * elemCntPerRow + col];
}
// Buffer-based variant of the backward warp: I1 and its gradients are plain
// global buffers read through readImage() instead of image objects.
// Warps I1 / I1x / I1y back by the flow (u1, u2) with a bicubic-weighted sum
// and writes:
//   I1w, I1wx, I1wy : warped image and warped gradients
//   grad            : I1wx^2 + I1wy^2
//   rho             : I1w - I1wx * u1 - I1wy * u2 - I0
// All five outputs are addressed with I1w_step elements per row. tex_I1 uses
// I1_step; tex_I1x and tex_I1y both use I1x_step.
// The source images are clamped with the I0 dimensions, i.e. they are assumed
// to have the same size as I0.
__kernel void warpBackwardKernelNoImage2d(__global const float* I0, int I0_step, int I0_col, int I0_row,
                                          __global const float* tex_I1, __global const float* tex_I1x, __global const float* tex_I1y,
                                          __global const float* u1, int u1_step,
                                          __global const float* u2,
                                          __global float* I1w,
                                          __global float* I1wx, /*int I1wx_step,*/
                                          __global float* I1wy, /*int I1wy_step,*/
                                          __global float* grad, /*int grad_step,*/
                                          __global float* rho,
                                          int I1w_step,
                                          int u2_step,
                                          int I1_step,
                                          int I1x_step)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if(x < I0_col&&y < I0_row)
    {
        const float u1Val = u1[y * u1_step + x];
        const float u2Val = u2[y * u2_step + x];

        // Warped (sub-pixel) source position.
        const float wx = x + u1Val;
        const float wy = y + u2Val;

        // Integer window covering [w - 2, w + 2] in each direction.
        const int xmin = ceil(wx - 2.0f);
        const int xmax = floor(wx + 2.0f);
        const int ymin = ceil(wy - 2.0f);
        const int ymax = floor(wy + 2.0f);

        float sum = 0.0f;
        float sumx = 0.0f;
        float sumy = 0.0f;
        float wsum = 0.0f;

        for (int cy = ymin; cy <= ymax; ++cy)
        {
            for (int cx = xmin; cx <= xmax; ++cx)
            {
                const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);

                // readImage() expects the row count BEFORE the column count.
                // Passing (I0_col, I0_row) would clamp x against the number of
                // rows and y against the number of columns, which samples the
                // wrong pixels on any non-square image.
                sum += w * readImage(tex_I1, cx, cy, I0_row, I0_col, I1_step);
                sumx += w * readImage(tex_I1x, cx, cy, I0_row, I0_col, I1x_step);
                sumy += w * readImage(tex_I1y, cx, cy, I0_row, I0_col, I1x_step);
                wsum += w;
            }
        }

        // Normalise by the accumulated weight.
        const float coeff = 1.0f / wsum;
        const float I1wVal = sum * coeff;
        const float I1wxVal = sumx * coeff;
        const float I1wyVal = sumy * coeff;

        I1w[y * I1w_step + x] = I1wVal;
        I1wx[y * I1w_step + x] = I1wxVal;
        I1wy[y * I1w_step + x] = I1wyVal;

        const float Ix2 = I1wxVal * I1wxVal;
        const float Iy2 = I1wyVal * I1wyVal;

        // store the |Grad(I1)|^2
        grad[y * I1w_step + x] = Ix2 + Iy2;

        // compute the constant part of the rho function
        const float I0Val = I0[y * I0_step + x];
        rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
    }
}
// Updates the four dual-variable planes (p11, p12, p21, p22) in place from the
// forward differences of the flow fields u1 and u2:
//   p <- (p + taut * d) / (1 + taut * |grad u|)
// where |grad u| = hypot(du/dx, du/dy). The "next" neighbour is clamped to the
// last column / row, so the difference is zero on the far border.
// u1 / u2 are read at (x + offset_x, y + offset_y) with their own row steps;
// all four p planes are addressed with p11_step elements per row.
__kernel void estimateDualVariablesKernel(__global const float* u1, int u1_col, int u1_row, int u1_step,
                                          __global const float* u2,
                                          __global float* p11, int p11_step,
                                          __global float* p12,
                                          __global float* p21,
                                          __global float* p22,
                                          const float taut,
                                          int u2_step,
                                          int u1_offset_x,
                                          int u1_offset_y,
                                          int u2_offset_x,
                                          int u2_offset_y)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Work-items outside the flow field do nothing.
    if (x >= u1_col || y >= u1_row)
        return;

    // Forward neighbours, clamped to the field.
    const int x_next = min(x + 1, u1_col - 1);
    const int y_next = min(y + 1, u1_row - 1);

    // Current element of each flow component.
    const float u1_here = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
    const float u2_here = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];

    // Forward differences.
    const float u1x = u1[(y + u1_offset_y) * u1_step + x_next + u1_offset_x] - u1_here;
    const float u1y = u1[(y_next + u1_offset_y) * u1_step + x + u1_offset_x] - u1_here;
    const float u2x = u2[(y + u2_offset_y) * u2_step + x_next + u2_offset_x] - u2_here;
    const float u2y = u2[(y_next + u2_offset_y) * u2_step + x + u2_offset_x] - u2_here;

    // Gradient magnitudes and the resulting normalisers.
    const float g1 = hypot(u1x, u1y);
    const float g2 = hypot(u2x, u2y);
    const float ng1 = 1.0f + taut * g1;
    const float ng2 = 1.0f + taut * g2;

    const int p = y * p11_step + x;
    p11[p] = (p11[p] + taut * u1x) / ng1;
    p12[p] = (p12[p] + taut * u1y) / ng1;
    p21[p] = (p21[p] + taut * u2x) / ng2;
    p22[p] = (p22[p] + taut * u2y) / ng2;
}
// Backward-difference divergence of the vector field (v1, v2) at (x, y):
//   d(v1)/dx + d(v2)/dy
// On the first column / first row the missing neighbour is simply omitted, so
// the corresponding term reduces to the value itself.
float divergence(__global const float* v1, __global const float* v2, int y, int x, int v1_step, int v2_step)
{
    // Interior: both backward neighbours exist.
    if (x > 0 && y > 0)
    {
        const float dv1_dx = v1[y * v1_step + x] - v1[y * v1_step + x - 1];
        const float dv2_dy = v2[y * v2_step + x] - v2[(y - 1) * v2_step + x];
        return dv1_dx + dv2_dy;
    }

    // First column, not first row: only the vertical neighbour exists.
    if (y > 0)
        return v1[y * v1_step + 0] + v2[y * v2_step + 0] - v2[(y - 1) * v2_step + 0];

    // First row, not first column: only the horizontal neighbour exists.
    if (x > 0)
        return v1[0 * v1_step + x] - v1[0 * v1_step + x - 1] + v2[0 * v2_step + x];

    // Top-left corner: no neighbours at all.
    return v1[0 * v1_step + 0] + v2[0 * v2_step + 0];
}
// Updates the flow (u1, u2) in place for one pixel and records the squared
// change in 'error':
//   1. rho = rho_c + I1wx * u1 + I1wy * u2
//   2. (d1, d2) is chosen by thresholding rho against l_t * grad
//   3. u_new = u_old + d + theta * div(p)
//   4. error = |u_new - u_old|^2
// I1wx, I1wy, grad, rho_c, the p planes and error are all addressed with
// I1wx_step elements per row; u1 / u2 are accessed at (x + offset_x,
// y + offset_y) with their own row steps.
__kernel void estimateUKernel(__global const float* I1wx, int I1wx_col, int I1wx_row, int I1wx_step,
                              __global const float* I1wy, /*int I1wy_step,*/
                              __global const float* grad, /*int grad_step,*/
                              __global const float* rho_c, /*int rho_c_step,*/
                              __global const float* p11, /*int p11_step,*/
                              __global const float* p12, /*int p12_step,*/
                              __global const float* p21, /*int p21_step,*/
                              __global const float* p22, /*int p22_step,*/
                              __global float* u1, int u1_step,
                              __global float* u2,
                              __global float* error, const float l_t, const float theta, int u2_step,
                              int u1_offset_x,
                              int u1_offset_y,
                              int u2_offset_x,
                              int u2_offset_y)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Work-items outside the image do nothing.
    if (x >= I1wx_col || y >= I1wx_row)
        return;

    const int pix = y * I1wx_step + x;
    const int u1_idx = (y + u1_offset_y) * u1_step + x + u1_offset_x;
    const int u2_idx = (y + u2_offset_y) * u2_step + x + u2_offset_x;

    const float I1wxVal = I1wx[pix];
    const float I1wyVal = I1wy[pix];
    const float gradVal = grad[pix];
    const float u1OldVal = u1[u1_idx];
    const float u2OldVal = u2[u2_idx];

    const float rho = rho_c[pix] + (I1wxVal * u1OldVal + I1wyVal * u2OldVal);

    // estimate the values of the variable (v1, v2) (thresholding operator TH)
    const float threshold = l_t * gradVal;
    float d1 = 0.0f;
    float d2 = 0.0f;

    if (rho < -threshold)
    {
        d1 = l_t * I1wxVal;
        d2 = l_t * I1wyVal;
    }
    else if (rho > threshold)
    {
        d1 = -l_t * I1wxVal;
        d2 = -l_t * I1wyVal;
    }
    else if (gradVal > 1.192092896e-07f)
    {
        // Only divide when the gradient magnitude is not (numerically) zero.
        const float fi = -rho / gradVal;
        d1 = fi * I1wxVal;
        d2 = fi * I1wyVal;
    }

    const float v1 = u1OldVal + d1;
    const float v2 = u2OldVal + d2;

    // compute the divergence of the dual variable (p1, p2)
    const float div_p1 = divergence(p11, p12, y, x, I1wx_step, I1wx_step);
    const float div_p2 = divergence(p21, p22, y, x, I1wx_step, I1wx_step);

    // estimate the values of the optical flow (u1, u2)
    const float u1NewVal = v1 + theta * div_p1;
    const float u2NewVal = v2 + theta * div_p2;

    u1[u1_idx] = u1NewVal;
    u2[u2_idx] = u2NewVal;

    // Squared magnitude of the update.
    const float du1 = u1OldVal - u1NewVal;
    const float du2 = u2OldVal - u2NewVal;
    const float n1 = du1 * du1;
    const float n2 = du2 * du2;
    error[pix] = n1 + n2;
}

@ -0,0 +1,756 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
// Jin Ma, jin@multicorewareinc.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::ocl;
using namespace std;
#if !defined (HAVE_OPENCL)
namespace cv
{
namespace ocl
{
void cv::ocl::StereoConstantSpaceBP::estimateRecommendedParams(int, int, int &, int &, int &, int &)
{
throw_nogpu();
}
cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int, int, int, int, int)
{
throw_nogpu();
}
cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int, int, int, int, float, float,
float, float, int, int)
{
throw_nogpu();
}
void cv::ocl::StereoConstantSpaceBP::operator()(const oclMat &, const oclMat &, oclMat &)
{
throw_nogpu();
}
}
}
#else /* !defined (HAVE_OPENCL) */
namespace cv { namespace ocl {

// OpenCL kernel source string for the constant-space BP kernels; defined in
// another translation unit.
extern const char *stereocsbp;

} } // namespace cv::ocl
namespace cv
{
namespace ocl
{
namespace stereoCSBP
{
//////////////////////////////////////////////////////////////////////////
//////////////////////////////common////////////////////////////////////
////////////////////////////////////////////////////////////////////////
// Integer division of total by grain, rounded up (for non-negative total and
// positive grain).
static inline int divUp(int total, int grain)
{
    const int padded = total + grain - 1;
    return padded / grain;
}
// Appends the message-type suffix to a kernel base name: "0" when data_type
// is CV_16S, "1" for any other type.
static String get_kernel_name(String kernel_name, int data_type)
{
    const char *suffix = (data_type == CV_16S) ? "0" : "1";
    return kernel_name + suffix;
}
using cv::ocl::StereoConstantSpaceBP;
//////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////init_data_cost//////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////
static void init_data_cost_caller(const oclMat &left, const oclMat &right, oclMat &temp,
StereoConstantSpaceBP &rthis,
int msg_step, int h, int w, int level)
{
Context *clCxt = left.clCxt;
int data_type = rthis.msg_type;
int channels = left.oclchannels();
String kernelName = get_kernel_name("init_data_cost_", data_type);
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
//size_t blockSize = 256;
size_t localThreads[] = {32, 8 ,1};
size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0],
divUp(h, localThreads[1]) *localThreads[1],
1
};
int cdisp_step1 = msg_step * h;
openCLVerifyKernel(clCxt, kernel, localThreads);
openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&temp.data));
openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&left.data));
openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&right.data));
openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&h));
openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&w));
openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&level));
openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&channels));
openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&msg_step));
openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_float), (void *)&rthis.data_weight));
openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_float), (void *)&rthis.max_data_term));
openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&cdisp_step1));
openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&rthis.min_disp_th));
openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&left.step));
openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&rthis.ndisp));
openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
globalThreads, localThreads, 0, NULL, NULL));
clFinish(*(cl_command_queue*)getoclCommandQueue());
openCLSafeCall(clReleaseKernel(kernel));
}
static void init_data_cost_reduce_caller(const oclMat &left, const oclMat &right, oclMat &temp,
StereoConstantSpaceBP &rthis,
int msg_step, int h, int w, int level)
{
Context *clCxt = left.clCxt;
int data_type = rthis.msg_type;
int channels = left.oclchannels();
int win_size = (int)std::pow(2.f, level);
String kernelName = get_kernel_name("init_data_cost_reduce_", data_type);
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
const int threadsNum = 256;
//size_t blockSize = threadsNum;
size_t localThreads[3] = {win_size, 1, threadsNum / win_size};
size_t globalThreads[3] = {w *localThreads[0],
h * divUp(rthis.ndisp, localThreads[2]) *localThreads[1], 1 * localThreads[2]
};
int local_mem_size = threadsNum * sizeof(float);
int cdisp_step1 = msg_step * h;
openCLVerifyKernel(clCxt, kernel, localThreads);
openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&temp.data));
openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&left.data));
openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&right.data));
openCLSafeCall(clSetKernelArg(kernel, 3, local_mem_size, (void *)NULL));
openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&level));
openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&left.rows));
openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&left.cols));
openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&h));
openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&win_size));
openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&channels));
openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&rthis.ndisp));
openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&left.step));
openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_float), (void *)&rthis.data_weight));
openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_float), (void *)&rthis.max_data_term));
openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_int), (void *)&rthis.min_disp_th));
openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&cdisp_step1));
openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int), (void *)&msg_step));
openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 3, NULL,
globalThreads, localThreads, 0, NULL, NULL));
clFinish(*(cl_command_queue*)getoclCommandQueue());
openCLSafeCall(clReleaseKernel(kernel));
}
// Launches the "get_first_k_initial_local_<suffix>" kernel (suffix chosen from
// rthis.msg_type) over a 2-D range that covers w x h, rounded up to the
// 32 x 8 work-group size, and waits for it to finish.
//   data_cost_selected, disp_selected_pyr : passed to the kernel as
//                 cl_mem-sized arguments 0 and 1
//   temp        : buffer passed as kernel argument 2 (also supplies the context)
//   msg_step    : row step passed to the kernel; msg_step * h is passed as the
//                 per-plane step
// Every OpenCL call, including the final clFinish, is checked with
// openCLSafeCall.
static void get_first_initial_local_caller(uchar *data_cost_selected, uchar *disp_selected_pyr,
        oclMat &temp, StereoConstantSpaceBP &rthis,
        int h, int w, int nr_plane, int msg_step)
{
    Context *clCxt = temp.clCxt;
    int data_type = rthis.msg_type;

    String kernelName = get_kernel_name("get_first_k_initial_local_", data_type);
    cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);

    size_t localThreads[] = {32, 8 ,1};
    size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0],
                              divUp(h, localThreads[1]) *localThreads[1],
                              1
                             };

    int disp_step = msg_step * h;

    openCLVerifyKernel(clCxt, kernel, localThreads);
    openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&data_cost_selected));
    openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&disp_selected_pyr));
    openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&temp.data));
    openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&h));
    openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&w));
    openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&nr_plane));
    openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&msg_step));
    openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&disp_step));
    openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&rthis.ndisp));

    openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
                                          globalThreads, localThreads, 0, NULL, NULL));

    // clFinish reports failures of the queued work; do not drop its result.
    openCLSafeCall(clFinish(*(cl_command_queue*)getoclCommandQueue()));
    openCLSafeCall(clReleaseKernel(kernel));
}
// Launches the "get_first_k_initial_global_<suffix>" kernel (suffix chosen
// from rthis.msg_type) over a 2-D range that covers w x h, rounded up to the
// 32 x 8 work-group size, and waits for it to finish.
//   data_cost_selected, disp_selected_pyr : passed to the kernel as
//                 cl_mem-sized arguments 0 and 1
//   temp        : buffer passed as kernel argument 2 (also supplies the context)
//   msg_step    : row step passed to the kernel; msg_step * h is passed as the
//                 per-plane step
// Every OpenCL call, including the final clFinish, is checked with
// openCLSafeCall.
static void get_first_initial_global_caller(uchar *data_cost_selected, uchar *disp_selected_pyr,
        oclMat &temp, StereoConstantSpaceBP &rthis,
        int h, int w, int nr_plane, int msg_step)
{
    Context *clCxt = temp.clCxt;
    int data_type = rthis.msg_type;

    String kernelName = get_kernel_name("get_first_k_initial_global_", data_type);
    cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);

    size_t localThreads[] = {32, 8, 1};
    size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0],
                              divUp(h, localThreads[1]) *localThreads[1],
                              1
                             };

    int disp_step = msg_step * h;

    openCLVerifyKernel(clCxt, kernel, localThreads);
    openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&data_cost_selected));
    openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&disp_selected_pyr));
    openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&temp.data));
    openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&h));
    openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&w));
    openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&nr_plane));
    openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&msg_step));
    openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&disp_step));
    openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&rthis.ndisp));

    openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
                                          globalThreads, localThreads, 0, NULL, NULL));

    // clFinish reports failures of the queued work; do not drop its result.
    openCLSafeCall(clFinish(*(cl_command_queue*)getoclCommandQueue()));
    openCLSafeCall(clReleaseKernel(kernel));
}
// Two-stage data-cost initialisation for one pyramid level:
//   1. fill 'temp' with the plain kernel for level <= 1, or with the reducing
//      kernel for higher levels;
//   2. run the "first k initial" selection, local or global depending on
//      rthis.use_local_init_data_cost.
static void init_data_cost(const oclMat &left, const oclMat &right, oclMat &temp, StereoConstantSpaceBP &rthis,
                           uchar *disp_selected_pyr, uchar *data_cost_selected,
                           size_t msg_step, int h, int w, int level, int nr_plane)
{
    // Stage 1: per-disparity data cost.
    if (level > 1)
    {
        init_data_cost_reduce_caller(left, right, temp, rthis, msg_step, h, w, level);
    }
    else
    {
        init_data_cost_caller(left, right, temp, rthis, msg_step, h, w, level);
    }

    // Stage 2: selection of the initial planes.
    const bool useLocal = (rthis.use_local_init_data_cost == true);
    if (!useLocal)
    {
        get_first_initial_global_caller(data_cost_selected, disp_selected_pyr, temp, rthis, h, w,
                                        nr_plane, msg_step);
        return;
    }
    get_first_initial_local_caller(data_cost_selected, disp_selected_pyr, temp, rthis, h, w, nr_plane, msg_step);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////compute_data_cost//////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////
// Launches the "compute_data_cost_<suffix>" kernel (suffix chosen from
// rthis.msg_type) over a 2-D range that covers w x h, rounded up to the
// 32 x 8 work-group size, and waits for it to finish.
//   disp_selected_pyr, data_cost : passed to the kernel as cl_mem-sized
//                 arguments 0 and 1
//   msg_step1, msg_step2 : row steps passed to the kernel; msg_step1 * h and
//                 msg_step2 * h2 are passed as the two per-plane steps
//   left, right : input images (left also supplies the context, channel count
//                 and row step)
// Every OpenCL call, including the final clFinish, is checked with
// openCLSafeCall.
static void compute_data_cost_caller(uchar *disp_selected_pyr, uchar *data_cost,
                                     StereoConstantSpaceBP &rthis, int msg_step1,
                                     int msg_step2, const oclMat &left, const oclMat &right, int h,
                                     int w, int h2, int level, int nr_plane)
{
    Context *clCxt = left.clCxt;
    int channels = left.oclchannels();
    int data_type = rthis.msg_type;

    String kernelName = get_kernel_name("compute_data_cost_", data_type);
    cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);

    size_t localThreads[] = {32, 8, 1};
    size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0],
                              divUp(h, localThreads[1]) *localThreads[1],
                              1
                             };

    int disp_step1 = msg_step1 * h;
    int disp_step2 = msg_step2 * h2;

    // The kernel argument is a cl_int. Copy the step into one so that exactly
    // sizeof(cl_int) meaningful bytes are handed over even if oclMat::step is
    // a wider type (NOTE(review): confirm the declared type of oclMat::step).
    cl_int left_step = (cl_int)left.step;

    openCLVerifyKernel(clCxt, kernel, localThreads);
    openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&disp_selected_pyr));
    openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&data_cost));
    openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&left.data));
    openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&right.data));
    openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&h));
    openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&w));
    openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&level));
    openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&nr_plane));
    openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&channels));
    openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&msg_step1));
    openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&msg_step2));
    openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&disp_step1));
    openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&disp_step2));
    openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_float), (void *)&rthis.data_weight));
    openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_float), (void *)&rthis.max_data_term));
    openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&left_step));
    openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int), (void *)&rthis.min_disp_th));

    openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
                                          globalThreads, localThreads, 0, NULL, NULL));

    // clFinish reports failures of the queued work; do not drop its result.
    openCLSafeCall(clFinish(*(cl_command_queue*)getoclCommandQueue()));
    openCLSafeCall(clReleaseKernel(kernel));
}
// Launches the "compute_data_cost_reduce_<type>" kernel (type suffix chosen from
// rthis.msg_type by get_kernel_name) over a 3-D range.
//
// The work-group is win_size x 1 x (256 / win_size) work-items, where
// win_size = 2^level. 256 * sizeof(float) bytes of local memory are reserved for
// the kernel (argument 4).
// msg_step1/msg_step2 are passed through to the kernel; the per-plane steps are
// derived as msg_step1 * h and msg_step2 * h2.
// The call blocks until the queue is drained (clFinish) and then releases the kernel.
//
// NOTE: `level` must be in [0, 8] so that win_size divides into 256 without
// producing a zero-sized work-group dimension; the visible caller only uses
// this path for level > 1.
static void compute_data_cost_reduce_caller(uchar *disp_selected_pyr, uchar *data_cost,
                                            StereoConstantSpaceBP &rthis, int msg_step1,
                                            int msg_step2, const oclMat &left, const oclMat &right, int h,
                                            int w, int h2, int level, int nr_plane)
{
    Context *clCxt = left.clCxt;
    int data_type = rthis.msg_type;
    int channels = left.oclchannels();

    // Exact integer power of two; avoids truncating a floating-point pow() result.
    int win_size = 1 << level;

    String kernelName = get_kernel_name("compute_data_cost_reduce_", data_type);
    cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);

    const size_t threadsNum = 256;
    const size_t winThreads = static_cast<size_t>(win_size);
    size_t localThreads[3] = {winThreads, 1, threadsNum / winThreads};
    size_t globalThreads[3] = {static_cast<size_t>(w * localThreads[0]),
                               static_cast<size_t>(h * divUp(nr_plane, localThreads[2]) * localThreads[1]),
                               1 * localThreads[2]
                              };

    int disp_step1 = msg_step1 * h;
    int disp_step2 = msg_step2 * h2;
    size_t local_mem_size = threadsNum * sizeof(float);

    openCLVerifyKernel(clCxt, kernel, localThreads);

    openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&disp_selected_pyr));
    openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&data_cost));
    openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&left.data));
    openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&right.data));
    // A NULL value with a non-zero size requests local memory of that size.
    openCLSafeCall(clSetKernelArg(kernel, 4, local_mem_size, (void *)NULL));
    openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&level));
    openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&left.rows));
    openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&left.cols));
    openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&h));
    openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&nr_plane));
    openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&channels));
    openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&win_size));
    openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&msg_step1));
    openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&msg_step2));
    openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_int), (void *)&disp_step1));
    openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&disp_step2));
    openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_float), (void *)&rthis.data_weight));
    openCLSafeCall(clSetKernelArg(kernel, 17, sizeof(cl_float), (void *)&rthis.max_data_term));
    openCLSafeCall(clSetKernelArg(kernel, 18, sizeof(cl_int), (void *)&left.step));
    openCLSafeCall(clSetKernelArg(kernel, 19, sizeof(cl_int), (void *)&rthis.min_disp_th));

    openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 3, NULL,
                                          globalThreads, localThreads, 0, NULL, NULL));
    clFinish(*(cl_command_queue*)getoclCommandQueue());
    openCLSafeCall(clReleaseKernel(kernel));
}
// Dispatches the data-cost computation for one pyramid level: levels above 1
// use the reduce launcher, levels 0 and 1 use the plain launcher. All arguments
// are forwarded unchanged.
static void compute_data_cost(uchar *disp_selected_pyr, uchar *data_cost, StereoConstantSpaceBP &rthis,
                              int msg_step1, int msg_step2, const oclMat &left, const oclMat &right, int h, int w,
                              int h2, int level, int nr_plane)
{
    if (level > 1)
    {
        compute_data_cost_reduce_caller(disp_selected_pyr, data_cost, rthis, msg_step1, msg_step2,
                                        left, right, h, w, h2, level, nr_plane);
        return;
    }

    compute_data_cost_caller(disp_selected_pyr, data_cost, rthis, msg_step1, msg_step2,
                             left, right, h, w, h2, level, nr_plane);
}
////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////init message//////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////
// Launches the "init_message_<type>" kernel (type suffix chosen from
// rthis.msg_type by get_kernel_name) over a 2-D range covering w x h.
//
// *_new are the buffers for the level being entered (size h x w, nr_plane
// planes); *_cur are the buffers of the previously processed level (size
// h2 x w2, nr_plane2 planes). All buffer pointers are passed to the kernel as
// cl_mem handles, together with temp.data, data_cost_selected and data_cost.
// msg_step1/msg_step2 are the row steps for the new/current level; the
// per-plane steps are derived as msg_step1 * h and msg_step2 * h2.
// The call blocks until the queue is drained (clFinish) and then releases the kernel.
static void init_message(uchar *u_new, uchar *d_new, uchar *l_new, uchar *r_new,
                         uchar *u_cur, uchar *d_cur, uchar *l_cur, uchar *r_cur,
                         uchar *disp_selected_pyr_new, uchar *disp_selected_pyr_cur,
                         uchar *data_cost_selected, uchar *data_cost, oclMat &temp, StereoConstantSpaceBP rthis,
                         size_t msg_step1, size_t msg_step2, int h, int w, int nr_plane,
                         int h2, int w2, int nr_plane2)
{
    Context *clCxt = temp.clCxt;
    int data_type = rthis.msg_type;
    String kernelName = get_kernel_name("init_message_", data_type);
    cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);

    size_t localThreads[] = {32, 8, 1};
    size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0],
                              divUp(h, localThreads[1]) *localThreads[1],
                              1
                             };

    int disp_step1 = static_cast<int>(msg_step1 * h);
    int disp_step2 = static_cast<int>(msg_step2 * h2);

    // The kernel receives the steps as 32-bit ints. Bind real cl_int objects
    // rather than the first four bytes of a size_t, which would only yield the
    // right value on little-endian hosts.
    cl_int msg_step1_arg = static_cast<cl_int>(msg_step1);
    cl_int msg_step2_arg = static_cast<cl_int>(msg_step2);

    openCLVerifyKernel(clCxt, kernel, localThreads);

    openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&u_new));
    openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d_new));
    openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&l_new));
    openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&r_new));
    openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&u_cur));
    openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&d_cur));
    openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&l_cur));
    openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *)&r_cur));
    openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *)&temp.data));
    openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *)&disp_selected_pyr_new));
    openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_mem), (void *)&disp_selected_pyr_cur));
    openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *)&data_cost_selected));
    openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_mem), (void *)&data_cost));
    openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&h));
    openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_int), (void *)&w));
    openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&nr_plane));
    openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int), (void *)&h2));
    openCLSafeCall(clSetKernelArg(kernel, 17, sizeof(cl_int), (void *)&w2));
    openCLSafeCall(clSetKernelArg(kernel, 18, sizeof(cl_int), (void *)&nr_plane2));
    openCLSafeCall(clSetKernelArg(kernel, 19, sizeof(cl_int), (void *)&disp_step1));
    openCLSafeCall(clSetKernelArg(kernel, 20, sizeof(cl_int), (void *)&disp_step2));
    openCLSafeCall(clSetKernelArg(kernel, 21, sizeof(cl_int), (void *)&msg_step1_arg));
    openCLSafeCall(clSetKernelArg(kernel, 22, sizeof(cl_int), (void *)&msg_step2_arg));

    openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
                                          globalThreads, localThreads, 0, NULL, NULL));
    clFinish(*(cl_command_queue*)getoclCommandQueue());
    openCLSafeCall(clReleaseKernel(kernel));
}
////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////calc_all_iterations////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////
// Launches one pass of the "compute_message_<type>" kernel (type suffix chosen
// from rthis.msg_type by get_kernel_name).
//
// The global range covers half of the columns (w is divided by twice the
// work-group width) and all h rows; `i` is forwarded to the kernel as argument
// 10 and is the 0/1 parity supplied by calc_all_iterations.
// The call blocks until the queue is drained (clFinish) and then releases the kernel.
static void calc_all_iterations_caller(uchar *u, uchar *d, uchar *l, uchar *r, uchar *data_cost_selected,
                                       uchar *disp_selected_pyr, oclMat &temp, StereoConstantSpaceBP rthis,
                                       int msg_step, int h, int w, int nr_plane, int i)
{
    Context *ctx = temp.clCxt;
    int msgType = rthis.msg_type;
    String name = get_kernel_name("compute_message_", msgType);
    cl_kernel kernel = openCLGetKernelFromSource(ctx, &stereocsbp, name);

    size_t localSize[] = {32, 8, 1};
    size_t globalSize[] = {divUp(w, (localSize[0]) << 1) * localSize[0],
                           divUp(h, localSize[1]) * localSize[1],
                           1
                          };

    int disp_step = msg_step * h;

    openCLVerifyKernel(ctx, kernel, localSize);

    // Kernel arguments in positional order (index == position in this table).
    struct KernelArg
    {
        size_t size;
        const void *value;
    };
    const KernelArg kernelArgs[] =
    {
        {sizeof(cl_mem),   &u},
        {sizeof(cl_mem),   &d},
        {sizeof(cl_mem),   &l},
        {sizeof(cl_mem),   &r},
        {sizeof(cl_mem),   &data_cost_selected},
        {sizeof(cl_mem),   &disp_selected_pyr},
        {sizeof(cl_mem),   &temp.data},
        {sizeof(cl_int),   &h},
        {sizeof(cl_int),   &w},
        {sizeof(cl_int),   &nr_plane},
        {sizeof(cl_int),   &i},
        {sizeof(cl_float), &rthis.max_disc_term},
        {sizeof(cl_int),   &disp_step},
        {sizeof(cl_int),   &msg_step},
        {sizeof(cl_float), &rthis.disc_single_jump}
    };
    const cl_uint kernelArgCount = sizeof(kernelArgs) / sizeof(kernelArgs[0]);
    for (cl_uint idx = 0; idx < kernelArgCount; ++idx)
    {
        openCLSafeCall(clSetKernelArg(kernel, idx, kernelArgs[idx].size, kernelArgs[idx].value));
    }

    cl_command_queue queue = *(cl_command_queue*)getoclCommandQueue();
    openCLSafeCall(clEnqueueNDRangeKernel(queue, kernel, 2, NULL,
                                          globalSize, localSize, 0, NULL, NULL));
    clFinish(*(cl_command_queue*)getoclCommandQueue());
    openCLSafeCall(clReleaseKernel(kernel));
}
// Runs rthis.iters message-passing passes on one pyramid level, alternating the
// parity argument 0, 1, 0, 1, ... between consecutive passes.
static void calc_all_iterations(uchar *u, uchar *d, uchar *l, uchar *r, uchar *data_cost_selected,
                                uchar *disp_selected_pyr, oclMat &temp, StereoConstantSpaceBP rthis,
                                int msg_step, int h, int w, int nr_plane)
{
    const int passCount = rthis.iters;
    int parity = 0;
    for (int pass = 0; pass < passCount; ++pass)
    {
        calc_all_iterations_caller(u, d, l, r, data_cost_selected, disp_selected_pyr, temp, rthis,
                                   msg_step, h, w, nr_plane, parity);
        parity ^= 1;
    }
}
///////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////compute_disp////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////
// Launches the "compute_disp_<type>" kernel (type suffix chosen from
// rthis.msg_type by get_kernel_name) over a 2-D range covering
// disp.cols x disp.rows, writing into disp.data.
//
// u/d/l/r, data_cost_selected and disp_selected_pyr are passed to the kernel as
// cl_mem handles. The kernel also receives the row step of `disp` in elements
// (disp.step / disp.elemSize()), msg_step, and the per-plane step
// disp.rows * msg_step.
// The call blocks until the queue is drained (clFinish) and then releases the kernel.
static void compute_disp(uchar *u, uchar *d, uchar *l, uchar *r, uchar *data_cost_selected,
                         uchar *disp_selected_pyr, StereoConstantSpaceBP &rthis, size_t msg_step,
                         oclMat &disp, int nr_plane)
{
    Context *clCxt = disp.clCxt;
    int data_type = rthis.msg_type;
    String kernelName = get_kernel_name("compute_disp_", data_type);
    cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);

    size_t localThreads[] = {32, 8, 1};
    size_t globalThreads[] = {divUp(disp.cols, localThreads[0]) *localThreads[0],
                              divUp(disp.rows, localThreads[1]) *localThreads[1],
                              1
                             };

    int step_size = static_cast<int>(disp.step / disp.elemSize());
    int disp_step = static_cast<int>(disp.rows * msg_step);

    // The kernel receives the step as a 32-bit int. Bind a real cl_int object
    // rather than the first four bytes of a size_t, which would only yield the
    // right value on little-endian hosts.
    cl_int msg_step_arg = static_cast<cl_int>(msg_step);

    openCLVerifyKernel(clCxt, kernel, localThreads);

    openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&u));
    openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d));
    openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&l));
    openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&r));
    openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&data_cost_selected));
    openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&disp_selected_pyr));
    openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&disp.data));
    openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&step_size));
    openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&disp.cols));
    openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&disp.rows));
    openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&nr_plane));
    openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&msg_step_arg));
    openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&disp_step));

    openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
                                          globalThreads, localThreads, 0, NULL, NULL));
    clFinish(*(cl_command_queue*)getoclCommandQueue());
    openCLSafeCall(clReleaseKernel(kernel));
}
}
}
}
// File-local defaults used by the StereoConstantSpaceBP constructor that does
// not take explicit cost parameters.
namespace
{
// Default for StereoConstantSpaceBP::max_data_term.
const float DEFAULT_MAX_DATA_TERM = 30.0f;
// Default for StereoConstantSpaceBP::data_weight.
const float DEFAULT_DATA_WEIGHT = 1.0f;
// Default for StereoConstantSpaceBP::max_disc_term.
const float DEFAULT_MAX_DISC_TERM = 160.0f;
// Default for StereoConstantSpaceBP::disc_single_jump.
const float DEFAULT_DISC_SINGLE_JUMP = 10.0f;
}
// Heuristically derives algorithm parameters from the image size.
//
// Outputs (all written through the reference parameters):
//   ndisp    - width / 3.14 truncated to int, bumped by one if odd
//   iters    - max(width, height) / 100, plus 4 (or minus 4 when the larger
//              dimension exceeds 1200)
//   levels   - (int)ln(max(width, height)) * 2 / 3 in integer arithmetic, at
//              least 1 when that evaluates to 0
//   nr_plane - ndisp / 2^(levels + 1), truncated to int
void cv::ocl::StereoConstantSpaceBP::estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels, int &nr_plane)
{
    ndisp = static_cast<int>(static_cast<float>(width) / 3.14f);
    if ((ndisp & 1) != 0)
    {
        ndisp++;
    }

    const int largerSide = ::max(width, height);

    const int iterAdjust = (largerSide > 1200) ? -4 : 4;
    iters = largerSide / 100 + iterAdjust;

    // The logarithm is truncated to int first; the 2/3 scaling is integer math.
    const int truncatedLog = static_cast<int>(::log(static_cast<double>(largerSide)));
    levels = truncatedLog * 2 / 3;
    if (levels == 0)
    {
        levels++;
    }

    nr_plane = static_cast<int>(static_cast<float>(ndisp) / std::pow(2.0, levels + 1));
}
// Constructs the matcher with the file-local default cost parameters
// (DEFAULT_MAX_DATA_TERM, DEFAULT_DATA_WEIGHT, DEFAULT_MAX_DISC_TERM,
// DEFAULT_DISC_SINGLE_JUMP), min_disp_th = 0 and use_local_init_data_cost = true.
// msg_type_ must be CV_32F or CV_16S; anything else fails the assertion.
cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp_, int iters_, int levels_, int nr_plane_,
        int msg_type_)
    : ndisp(ndisp_),
      iters(iters_),
      levels(levels_),
      nr_plane(nr_plane_),
      max_data_term(DEFAULT_MAX_DATA_TERM),
      data_weight(DEFAULT_DATA_WEIGHT),
      max_disc_term(DEFAULT_MAX_DISC_TERM),
      disc_single_jump(DEFAULT_DISC_SINGLE_JUMP),
      min_disp_th(0),
      msg_type(msg_type_),
      use_local_init_data_cost(true)
{
    CV_Assert(msg_type_ == CV_32F || msg_type_ == CV_16S);
}
// Constructs the matcher with caller-supplied cost parameters; every argument
// is stored in the member of the same name (without the trailing underscore),
// and use_local_init_data_cost is set to true.
// msg_type_ must be CV_32F or CV_16S; anything else fails the assertion.
cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp_, int iters_, int levels_, int nr_plane_,
        float max_data_term_, float data_weight_, float max_disc_term_, float disc_single_jump_,
        int min_disp_th_, int msg_type_)
    : ndisp(ndisp_),
      iters(iters_),
      levels(levels_),
      nr_plane(nr_plane_),
      max_data_term(max_data_term_),
      data_weight(data_weight_),
      max_disc_term(max_disc_term_),
      disc_single_jump(disc_single_jump_),
      min_disp_th(min_disp_th_),
      msg_type(msg_type_),
      use_local_init_data_cost(true)
{
    CV_Assert(msg_type_ == CV_32F || msg_type_ == CV_16S);
}
// Runs the whole constant-space BP pipeline for message element type T
// (instantiated for short and float in the `operators` table below).
//
// rthis                 - algorithm parameters; rthis.levels is clamped and written back here
// u, d, l, r            - two sets of message buffers; the loop alternates between index 0 and 1
// disp_selected_pyr     - two buffers, alternated together with the message buffers
// data_cost, data_cost_selected, temp, out - work buffers, (re)allocated and zeroed here
// left, right           - input images; must be CV_8UC1 or CV_8UC3 (same size/type checked in debug builds only)
// disp                  - result; created as CV_16S if empty, otherwise its existing type is kept
template<class T>
static void csbp_operator(StereoConstantSpaceBP &rthis, oclMat u[2], oclMat d[2], oclMat l[2], oclMat r[2],
oclMat disp_selected_pyr[2], oclMat &data_cost, oclMat &data_cost_selected,
oclMat &temp, oclMat &out, const oclMat &left, const oclMat &right, oclMat &disp)
{
// Parameter and input-shape sanity checks; this first one is compiled in debug builds only.
CV_DbgAssert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels && 0 < rthis.nr_plane
&& left.rows == right.rows && left.cols == right.cols && left.type() == right.type());
CV_Assert(rthis.levels <= 8 && (left.type() == CV_8UC1 || left.type() == CV_8UC3));
const Scalar zero = Scalar::all(0);
////////////////////////////////////Init///////////////////////////////////////////////////
int rows = left.rows;
int cols = left.cols;
// Limit the number of levels to int(log2(ndisp)); note this modifies the caller's object.
rthis.levels = min(rthis.levels, int(log((double)rthis.ndisp) / log(2.0)));
int levels = rthis.levels;
// One allocation holding four per-level int arrays: cols, rows, nr_plane, step.
AutoBuffer<int> buf(levels * 4);
int *cols_pyr = buf;
int *rows_pyr = cols_pyr + levels;
int *nr_plane_pyr = rows_pyr + levels;
int *step_pyr = nr_plane_pyr + levels;
cols_pyr[0] = cols;
rows_pyr[0] = rows;
nr_plane_pyr[0] = rthis.nr_plane;
const int n = 64;
// Row step in elements of T, computed from the row size in bytes passed through alignSize(.., 64)
// (presumably rounding up to a multiple of 64 bytes - confirm against alignSize).
step_pyr[0] = alignSize(cols * sizeof(T), n) / sizeof(T);
// Each further level halves the image size and doubles the number of planes.
for (int i = 1; i < levels; i++)
{
cols_pyr[i] = cols_pyr[i - 1] / 2;
rows_pyr[i] = rows_pyr[i - 1]/ 2;
nr_plane_pyr[i] = nr_plane_pyr[i - 1] * 2;
step_pyr[i] = alignSize(cols_pyr[i] * sizeof(T), n) / sizeof(T);
}
// All message buffers are sized for level 0: one row block per plane, stacked vertically.
// data_cost is twice as tall.
Size msg_size(step_pyr[0], rows * nr_plane_pyr[0]);
Size data_cost_size(step_pyr[0], rows * nr_plane_pyr[0] * 2);
u[0].create(msg_size, DataType<T>::type);
d[0].create(msg_size, DataType<T>::type);
l[0].create(msg_size, DataType<T>::type);
r[0].create(msg_size, DataType<T>::type);
u[1].create(msg_size, DataType<T>::type);
d[1].create(msg_size, DataType<T>::type);
l[1].create(msg_size, DataType<T>::type);
r[1].create(msg_size, DataType<T>::type);
disp_selected_pyr[0].create(msg_size, DataType<T>::type);
disp_selected_pyr[1].create(msg_size, DataType<T>::type);
data_cost.create(data_cost_size, DataType<T>::type);
data_cost_selected.create(msg_size, DataType<T>::type);
// temp must hold at least data_cost_size elements, and at least
// step_pyr[0] x (rows of the last level * ndisp) elements; take the larger of the two.
Size temp_size = data_cost_size;
if (data_cost_size.width * data_cost_size.height < step_pyr[0] * rows_pyr[levels - 1] * rthis.ndisp)
temp_size = Size(step_pyr[0], rows_pyr[levels - 1] * rthis.ndisp);
temp.create(temp_size, DataType<T>::type);
temp = zero;
///////////////////////////////// Compute////////////////////////////////////////////////
//csbp::load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight,
// rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp);
// Clear every work buffer before the first kernel launch.
l[0] = zero;
d[0] = zero;
r[0] = zero;
u[0] = zero;
disp_selected_pyr[0] = zero;
l[1] = zero;
d[1] = zero;
r[1] = zero;
u[1] = zero;
disp_selected_pyr[1] = zero;
data_cost = zero;
data_cost_selected = zero;
// cur_idx selects which of the two buffer sets currently holds the messages.
int cur_idx = 0;
// Walk the pyramid from the last (smallest) level down to level 0.
// Every launcher below is given step_pyr[0] as the step, regardless of the level.
for (int i = levels - 1; i >= 0; i--)
{
if (i == levels - 1)
{
// First iteration: build the initial data cost for the smallest level.
cv::ocl::stereoCSBP::init_data_cost(left, right, temp, rthis, disp_selected_pyr[cur_idx].data,
data_cost_selected.data, step_pyr[0], rows_pyr[i], cols_pyr[i],
i, nr_plane_pyr[i]);
}
else
{
// Later iterations: compute the data cost for level i using the planes of level i + 1,
// then initialise the other buffer set from the current one and switch to it.
cv::ocl::stereoCSBP::compute_data_cost(
disp_selected_pyr[cur_idx].data, data_cost.data, rthis, step_pyr[0],
step_pyr[0], left, right, rows_pyr[i], cols_pyr[i], rows_pyr[i + 1], i,
nr_plane_pyr[i + 1]);
int new_idx = (cur_idx + 1) & 1;
cv::ocl::stereoCSBP::init_message(u[new_idx].data, d[new_idx].data, l[new_idx].data, r[new_idx].data,
u[cur_idx].data, d[cur_idx].data, l[cur_idx].data, r[cur_idx].data,
disp_selected_pyr[new_idx].data, disp_selected_pyr[cur_idx].data,
data_cost_selected.data, data_cost.data, temp, rthis, step_pyr[0],
step_pyr[0], rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rows_pyr[i + 1],
cols_pyr[i + 1], nr_plane_pyr[i + 1]);
cur_idx = new_idx;
}
// Run rthis.iters message-passing passes on the active buffer set for this level.
cv::ocl::stereoCSBP::calc_all_iterations(u[cur_idx].data, d[cur_idx].data, l[cur_idx].data, r[cur_idx].data,
data_cost_selected.data, disp_selected_pyr[cur_idx].data, temp,
rthis, step_pyr[0], rows_pyr[i], cols_pyr[i], nr_plane_pyr[i]);
}
// The disparity kernel writes CV_16S. If disp already is CV_16S (or was empty and is created as
// such) it is written directly through `out`; otherwise `out` is a separate CV_16S buffer
// that is converted into disp's type afterwards.
if (disp.empty())
disp.create(rows, cols, CV_16S);
out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));
out = zero;
stereoCSBP::compute_disp(u[cur_idx].data, d[cur_idx].data, l[cur_idx].data, r[cur_idx].data,
data_cost_selected.data, disp_selected_pyr[cur_idx].data, rthis, step_pyr[0],
out, nr_plane_pyr[0]);
if (disp.type() != CV_16S)
out.convertTo(disp, disp.type());
}
// Signature shared by every csbp_operator<T> instantiation.
typedef void (*csbp_operator_t)(StereoConstantSpaceBP &rthis, oclMat u[2], oclMat d[2], oclMat l[2], oclMat r[2],
oclMat disp_selected_pyr[2], oclMat &data_cost, oclMat &data_cost_selected,
oclMat &temp, oclMat &out, const oclMat &left, const oclMat &right, oclMat &disp);
// Dispatch table indexed by msg_type. Only slots 3 (short) and 5 (float) are populated -
// presumably the numeric values of CV_16S and CV_32F; every other slot is a null pointer,
// so callers must validate msg_type before indexing.
const static csbp_operator_t operators[] = {0, 0, 0, csbp_operator<short>, 0, csbp_operator<float>, 0, 0};
// Computes the disparity map for a stereo pair.
// Validates msg_type, then forwards to the csbp_operator instantiation stored
// in the `operators` table for that message type, handing over this object's
// internal work buffers.
void cv::ocl::StereoConstantSpaceBP::operator()(const oclMat &left, const oclMat &right, oclMat &disp)
{
    CV_Assert(msg_type == CV_32F || msg_type == CV_16S);

    const csbp_operator_t impl = operators[msg_type];
    impl(*this, u, d, l, r, disp_selected_pyr, data_cost, data_cost_selected, temp, out,
         left, right, disp);
}
#endif /* !defined (HAVE_OPENCL) */

@ -72,28 +72,21 @@ namespace stereoBM
////////////////////////////////////////////////////////////////////////
static void prefilter_xsobel(const oclMat &input, oclMat &output, int prefilterCap)
{
Context *clCxt = input.clCxt;
String kernelName = "prefilter_xsobel";
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereobm, kernelName);
size_t blockSize = 1;
size_t globalThreads[3] = { input.cols, input.rows, 1 };
size_t localThreads[3] = { blockSize, blockSize, 1 };
openCLVerifyKernel(clCxt, kernel, localThreads);
openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&input.data));
openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&output.data));
openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&input.rows));
openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&input.cols));
openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&prefilterCap));
openCLSafeCall(clEnqueueNDRangeKernel((cl_command_queue)clCxt->oclCommandQueue(), kernel, 3, NULL,
globalThreads, localThreads, 0, NULL, NULL));
clFinish((cl_command_queue)clCxt->oclCommandQueue());
openCLSafeCall(clReleaseKernel(kernel));
std::vector< std::pair<size_t, const void *> > args;
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&input.data));
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&output.data));
args.push_back(std::make_pair(sizeof(cl_int), (void *)&input.rows));
args.push_back(std::make_pair(sizeof(cl_int), (void *)&input.cols));
args.push_back(std::make_pair(sizeof(cl_int), (void *)&prefilterCap));
openCLExecuteKernel(Context::getContext(), &stereobm, kernelName,
globalThreads, localThreads, args, -1, -1);
}
//////////////////////////////////////////////////////////////////////////
//////////////////////////////common////////////////////////////////////
@ -113,16 +106,13 @@ static void stereo_bm(const oclMat &left, const oclMat &right, oclMat &disp,
{
int winsz2 = winSize >> 1;
Context *clCxt = left.clCxt;
String kernelName = "stereoKernel";
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereobm, kernelName);
disp.setTo(Scalar_<unsigned char>::all(0));
minSSD_buf.setTo(Scalar_<unsigned int>::all(0xFFFFFFFF));
size_t minssd_step = minSSD_buf.step / minSSD_buf.elemSize();
size_t local_mem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * winsz2)) *
size_t local_mem_size = (N_DISPARITIES * (BLOCK_W + 2 * winsz2)) *
sizeof(cl_uint);
//size_t blockSize = 1;
size_t localThreads[] = { BLOCK_W, 1,1};
@ -131,26 +121,23 @@ static void stereo_bm(const oclMat &left, const oclMat &right, oclMat &disp,
1
};
openCLVerifyKernel(clCxt, kernel, localThreads);
openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&left.data));
openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&right.data));
openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&minSSD_buf.data));
openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&minssd_step));
openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&disp.data));
openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&disp.step));
openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&left.cols));
openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&left.rows));
openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&left.step));
openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&maxdisp));
openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&winsz2));
openCLSafeCall(clSetKernelArg(kernel, 11, local_mem_size, (void *)NULL));
openCLSafeCall(clEnqueueNDRangeKernel((cl_command_queue)clCxt->oclCommandQueue(), kernel, 2, NULL,
globalThreads, localThreads, 0, NULL, NULL));
clFinish((cl_command_queue)clCxt->oclCommandQueue());
openCLSafeCall(clReleaseKernel(kernel));
std::vector< std::pair<size_t, const void *> > args;
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&left.data));
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&right.data));
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&minSSD_buf.data));
args.push_back(std::make_pair(sizeof(cl_int), (void *)&minssd_step));
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&disp.data));
args.push_back(std::make_pair(sizeof(cl_int), (void *)&disp.step));
args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.cols));
args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.rows));
args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.step));
args.push_back(std::make_pair(sizeof(cl_int), (void *)&maxdisp));
args.push_back(std::make_pair(local_mem_size, (void *)NULL));
char opt [128];
sprintf(opt, "-D radius=%d", winsz2);
openCLExecuteKernel(Context::getContext(), &stereobm, kernelName,
globalThreads, localThreads, args, -1, -1, opt);
}
////////////////////////////////////////////////////////////////////////////
///////////////////////////////postfilter_textureness///////////////////////
@ -158,10 +145,7 @@ static void stereo_bm(const oclMat &left, const oclMat &right, oclMat &disp,
static void postfilter_textureness(oclMat &left, int winSize,
float avergeTexThreshold, oclMat &disparity)
{
Context *clCxt = left.clCxt;
String kernelName = "textureness_kernel";
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereobm, kernelName);
size_t blockSize = 1;
size_t localThreads[] = { BLOCK_W, blockSize ,1};
@ -172,22 +156,19 @@ static void postfilter_textureness(oclMat &left, int winSize,
size_t local_mem_size = (localThreads[0] + localThreads[0] + (winSize / 2) * 2) * sizeof(float);
openCLVerifyKernel(clCxt, kernel, localThreads);
openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&disparity.data));
openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&disparity.rows));
openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&disparity.cols));
openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&disparity.step));
openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&left.data));
openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&left.rows));
openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&left.cols));
openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&winSize));
openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_float), (void *)&avergeTexThreshold));
openCLSafeCall(clSetKernelArg(kernel, 9, local_mem_size, NULL));
openCLSafeCall(clEnqueueNDRangeKernel((cl_command_queue)clCxt->oclCommandQueue(), kernel, 2, NULL,
globalThreads, localThreads, 0, NULL, NULL));
clFinish((cl_command_queue)clCxt->oclCommandQueue());
openCLSafeCall(clReleaseKernel(kernel));
std::vector< std::pair<size_t, const void *> > args;
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&disparity.data));
args.push_back(std::make_pair(sizeof(cl_int), (void *)&disparity.rows));
args.push_back(std::make_pair(sizeof(cl_int), (void *)&disparity.cols));
args.push_back(std::make_pair(sizeof(cl_int), (void *)&disparity.step));
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&left.data));
args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.rows));
args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.cols));
args.push_back(std::make_pair(sizeof(cl_int), (void *)&winSize));
args.push_back(std::make_pair(sizeof(cl_float), (void *)&avergeTexThreshold));
args.push_back(std::make_pair(local_mem_size, (void*)NULL));
openCLExecuteKernel(Context::getContext(), &stereobm, kernelName,
globalThreads, localThreads, args, -1, -1);
}
//////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////operator/////////////////////////////////

@ -0,0 +1,475 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jin Ma, jin@multicorewareinc.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace std;
using namespace cv;
using namespace cv::ocl;
namespace cv
{
namespace ocl
{
///////////////////////////OpenCL kernel strings///////////////////////////
// Declared here, defined in another translation unit.
// Presumably the OpenCL program source for the TV-L1 kernels - confirm against the definition.
extern const char* tvl1flow;
}
}
// Default-constructs the TV-L1 optical flow object with its default parameter set.
cv::ocl::OpticalFlowDual_TVL1_OCL::OpticalFlowDual_TVL1_OCL()
{
    // Floating-point algorithm parameters.
    tau     = 0.25;
    lambda  = 0.15;
    theta   = 0.3;
    epsilon = 0.01;

    // Pyramid depth and loop counts.
    nscales    = 5;
    warps      = 5;
    iterations = 300;

    // By default the flow is computed from scratch rather than refined from flowx/flowy.
    useInitialFlow = false;
}
// Computes the optical flow between I0 and I1 over an image pyramid.
//
// I0, I1       - input images of identical size and type, CV_8UC1 or CV_32FC1
// flowx, flowy - output flow components (CV_32FC1, same size as I0). When
//                useInitialFlow is set they must already hold an initial flow
//                of that size and type; otherwise they are (re)allocated here.
//
// Side effect: if a pyramid level would be smaller than 16 pixels in either
// dimension, the member `nscales` is reduced to the number of usable levels and
// stays reduced after the call.
void cv::ocl::OpticalFlowDual_TVL1_OCL::operator()(const oclMat& I0, const oclMat& I1, oclMat& flowx, oclMat& flowy)
{
    CV_Assert( I0.type() == CV_8UC1 || I0.type() == CV_32FC1 );
    CV_Assert( I0.size() == I1.size() );
    CV_Assert( I0.type() == I1.type() );
    CV_Assert( !useInitialFlow || (flowx.size() == I0.size() && flowx.type() == CV_32FC1 && flowy.size() == flowx.size() && flowy.type() == flowx.type()) );
    CV_Assert( nscales > 0 );

    // allocate memory for the pyramid structure
    I0s.resize(nscales);
    I1s.resize(nscales);
    u1s.resize(nscales);
    u2s.resize(nscales);

    // Level 0 works on float images: 8-bit input is converted as-is, other
    // (float) input is scaled by 255.
    //I0s_step == I1s_step
    I0.convertTo(I0s[0], CV_32F, I0.depth() == CV_8U ? 1.0 : 255.0);
    I1.convertTo(I1s[0], CV_32F, I1.depth() == CV_8U ? 1.0 : 255.0);

    if (!useInitialFlow)
    {
        flowx.create(I0.size(), CV_32FC1);
        flowy.create(I0.size(), CV_32FC1);
    }

    // Level 0 of the flow pyramid is the caller's output matrices.
    //u1s_step != u2s_step
    u1s[0] = flowx;
    u2s[0] = flowy;

    // Scratch buffers, allocated once at full resolution.
    I1x_buf.create(I0.size(), CV_32FC1);
    I1y_buf.create(I0.size(), CV_32FC1);
    I1w_buf.create(I0.size(), CV_32FC1);
    I1wx_buf.create(I0.size(), CV_32FC1);
    I1wy_buf.create(I0.size(), CV_32FC1);
    grad_buf.create(I0.size(), CV_32FC1);
    rho_c_buf.create(I0.size(), CV_32FC1);
    p11_buf.create(I0.size(), CV_32FC1);
    p12_buf.create(I0.size(), CV_32FC1);
    p21_buf.create(I0.size(), CV_32FC1);
    p22_buf.create(I0.size(), CV_32FC1);
    diff_buf.create(I0.size(), CV_32FC1);

    // create the scales
    for (int s = 1; s < nscales; ++s)
    {
        ocl::pyrDown(I0s[s - 1], I0s[s]);
        ocl::pyrDown(I1s[s - 1], I1s[s]);

        // Stop building the pyramid once a level gets too small; level s is not used.
        if (I0s[s].cols < 16 || I0s[s].rows < 16)
        {
            nscales = s;
            break;
        }

        if (useInitialFlow)
        {
            ocl::pyrDown(u1s[s - 1], u1s[s]);
            ocl::pyrDown(u2s[s - 1], u2s[s]);

            // Halving the resolution halves the flow magnitude. Each component
            // must be scaled from itself: u2s[s] = 0.5 * u2s[s], not 0.5 * u1s[s].
            multiply(0.5, u1s[s], u1s[s]);
            multiply(0.5, u2s[s], u2s[s]);
        }
    }

    // pyramidal structure for computing the optical flow
    for (int s = nscales - 1; s >= 0; --s)
    {
        // compute the optical flow at the current scale
        procOneScale(I0s[s], I1s[s], u1s[s], u2s[s]);

        // if this was the last scale, finish now
        if (s == 0)
            break;

        // otherwise, upsample the optical flow
        // zoom the optical flow for the next finer scale
        ocl::resize(u1s[s], u1s[s - 1], I0s[s - 1].size());
        ocl::resize(u2s[s], u2s[s - 1], I0s[s - 1].size());

        // scale the optical flow with the appropriate zoom factor
        multiply(2, u1s[s - 1], u1s[s - 1]);
        multiply(2, u2s[s - 1], u2s[s - 1]);
    }
}
// Host-side launchers for the OpenCL kernels of the TV-L1 solver. Each function
// packs its arguments and runs one kernel from the tvl1flow program.
namespace ocl_tvl1flow
{
// Runs "centeredGradientKernel" on src, writing into dx and dy.
// Only dx's step is forwarded for both outputs.
void centeredGradient(const oclMat &src, oclMat &dx, oclMat &dy);
// Runs "warpBackwardKernel". I1, I1x and I1y are bound as textures; the other
// matrices are passed as plain buffers. Requires image2d support.
void warpBackward(const oclMat &I0, const oclMat &I1, oclMat &I1x, oclMat &I1y,
oclMat &u1, oclMat &u2, oclMat &I1w, oclMat &I1wx, oclMat &I1wy,
oclMat &grad, oclMat &rho);
// Runs "estimateUKernel". The `error` matrix is the one procOneScale sums to
// evaluate its stopping criterion; l_t is lambda * theta at the call site.
void estimateU(oclMat &I1wx, oclMat &I1wy, oclMat &grad,
oclMat &rho_c, oclMat &p11, oclMat &p12,
oclMat &p21, oclMat &p22, oclMat &u1,
oclMat &u2, oclMat &error, float l_t, float theta);
// Runs "estimateDualVariablesKernel"; taut is tau / theta at the call site.
void estimateDualVariables(oclMat &u1, oclMat &u2,
oclMat &p11, oclMat &p12, oclMat &p21, oclMat &p22, float taut);
}
void cv::ocl::OpticalFlowDual_TVL1_OCL::procOneScale(const oclMat &I0, const oclMat &I1, oclMat &u1, oclMat &u2)
{
using namespace ocl_tvl1flow;
const double scaledEpsilon = epsilon * epsilon * I0.size().area();
CV_DbgAssert( I1.size() == I0.size() );
CV_DbgAssert( I1.type() == I0.type() );
CV_DbgAssert( u1.empty() || u1.size() == I0.size() );
CV_DbgAssert( u2.size() == u1.size() );
if (u1.empty())
{
u1.create(I0.size(), CV_32FC1);
u1.setTo(Scalar::all(0));
u2.create(I0.size(), CV_32FC1);
u2.setTo(Scalar::all(0));
}
oclMat I1x = I1x_buf(Rect(0, 0, I0.cols, I0.rows));
oclMat I1y = I1y_buf(Rect(0, 0, I0.cols, I0.rows));
centeredGradient(I1, I1x, I1y);
oclMat I1w = I1w_buf(Rect(0, 0, I0.cols, I0.rows));
oclMat I1wx = I1wx_buf(Rect(0, 0, I0.cols, I0.rows));
oclMat I1wy = I1wy_buf(Rect(0, 0, I0.cols, I0.rows));
oclMat grad = grad_buf(Rect(0, 0, I0.cols, I0.rows));
oclMat rho_c = rho_c_buf(Rect(0, 0, I0.cols, I0.rows));
oclMat p11 = p11_buf(Rect(0, 0, I0.cols, I0.rows));
oclMat p12 = p12_buf(Rect(0, 0, I0.cols, I0.rows));
oclMat p21 = p21_buf(Rect(0, 0, I0.cols, I0.rows));
oclMat p22 = p22_buf(Rect(0, 0, I0.cols, I0.rows));
p11.setTo(Scalar::all(0));
p12.setTo(Scalar::all(0));
p21.setTo(Scalar::all(0));
p22.setTo(Scalar::all(0));
oclMat diff = diff_buf(Rect(0, 0, I0.cols, I0.rows));
const float l_t = static_cast<float>(lambda * theta);
const float taut = static_cast<float>(tau / theta);
for (int warpings = 0; warpings < warps; ++warpings)
{
warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c);
double error = numeric_limits<double>::max();
for (int n = 0; error > scaledEpsilon && n < iterations; ++n)
{
estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22,
u1, u2, diff, l_t, static_cast<float>(theta));
error = ocl::sum(diff)[0];
estimateDualVariables(u1, u2, p11, p12, p21, p22, taut);
}
}
}
// Drops every internally held buffer: the per-level pyramids and all scratch
// matrices.
void cv::ocl::OpticalFlowDual_TVL1_OCL::collectGarbage()
{
    // Scratch buffers used while processing a level.
    norm_buf.release();
    diff_buf.release();

    p22_buf.release();
    p21_buf.release();
    p12_buf.release();
    p11_buf.release();

    rho_c_buf.release();
    grad_buf.release();

    I1wy_buf.release();
    I1wx_buf.release();
    I1w_buf.release();
    I1y_buf.release();
    I1x_buf.release();

    // Image and flow pyramids.
    u2s.clear();
    u1s.clear();
    I1s.clear();
    I0s.clear();
}
// Launches "centeredGradientKernel" over every pixel of src, with dx and dy as
// the output buffers. Steps are passed to the kernel in elements, not bytes,
// and only dx's step is forwarded for both outputs.
void ocl_tvl1flow::centeredGradient(const oclMat &src, oclMat &dx, oclMat &dy)
{
    // Byte steps converted to element steps.
    int srcPitch = src.step / src.elemSize();
    int dxPitch  = dx.step / dx.elemSize();

    // One work-item per pixel, in 32x8 work-groups.
    size_t localThreads[3]  = {32, 8, 1};
    size_t globalThreads[3] = {static_cast<size_t>(src.cols), static_cast<size_t>(src.rows), 1};

    // Argument order must match the kernel signature.
    vector< pair<size_t, const void *> > args;
    args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data));
    args.push_back(make_pair(sizeof(cl_int), (void *)&src.cols));
    args.push_back(make_pair(sizeof(cl_int), (void *)&src.rows));
    args.push_back(make_pair(sizeof(cl_int), (void *)&srcPitch));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&dx.data));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&dy.data));
    args.push_back(make_pair(sizeof(cl_int), (void *)&dxPitch));

    String kernelName = "centeredGradientKernel";
    Context *clCxt = src.clCxt;
    openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThreads, localThreads, args, -1, -1);
}
// Launches "estimateDualVariablesKernel" with one work-item per element of u1.
// Steps and offsets are passed in elements; p12, p21 and p22 are addressed
// with p11's step.
void ocl_tvl1flow::estimateDualVariables(oclMat &u1, oclMat &u2, oclMat &p11, oclMat &p12, oclMat &p21, oclMat &p22, float taut)
{
    // Byte steps converted to element steps.
    int u1Pitch  = u1.step / u1.elemSize();
    int u2Pitch  = u2.step / u2.elemSize();
    int p11Pitch = p11.step / p11.elemSize();

    // Split each flow matrix's byte offset into a row index and an
    // element column index.
    int u1RowOff = u1.offset / u1.step;
    int u1ColOff = u1.offset % u1.step;
    u1ColOff = u1ColOff / u1.elemSize();

    int u2RowOff = u2.offset / u2.step;
    int u2ColOff = u2.offset % u2.step;
    u2ColOff = u2ColOff / u2.elemSize();

    size_t localThread[]  = {32, 8, 1};
    size_t globalThread[] = {static_cast<size_t>(u1.cols), static_cast<size_t>(u1.rows), 1};

    // Argument order must match the kernel signature.
    vector< pair<size_t, const void *> > args;
    args.push_back(make_pair(sizeof(cl_mem), (void *)&u1.data));
    args.push_back(make_pair(sizeof(cl_int), (void *)&u1.cols));
    args.push_back(make_pair(sizeof(cl_int), (void *)&u1.rows));
    args.push_back(make_pair(sizeof(cl_int), (void *)&u1Pitch));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&u2.data));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&p11.data));
    args.push_back(make_pair(sizeof(cl_int), (void *)&p11Pitch));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&p12.data));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&p21.data));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&p22.data));
    args.push_back(make_pair(sizeof(cl_float), (void *)&taut));
    args.push_back(make_pair(sizeof(cl_int), (void *)&u2Pitch));
    args.push_back(make_pair(sizeof(cl_int), (void *)&u1ColOff));
    args.push_back(make_pair(sizeof(cl_int), (void *)&u1RowOff));
    args.push_back(make_pair(sizeof(cl_int), (void *)&u2ColOff));
    args.push_back(make_pair(sizeof(cl_int), (void *)&u2RowOff));

    String kernelName = "estimateDualVariablesKernel";
    Context *clCxt = u1.clCxt;
    openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThread, localThread, args, -1, -1);
}
// Launches "estimateUKernel" with one work-item per element of I1wx. Steps and
// offsets are passed in elements; I1wy, grad, rho_c, the p** matrices and
// `error` are addressed with I1wx's step (no separate step is forwarded).
void ocl_tvl1flow::estimateU(oclMat &I1wx, oclMat &I1wy, oclMat &grad,
                             oclMat &rho_c, oclMat &p11, oclMat &p12,
                             oclMat &p21, oclMat &p22, oclMat &u1,
                             oclMat &u2, oclMat &error, float l_t, float theta)
{
    // Byte steps converted to element steps.
    int I1wxPitch = I1wx.step / I1wx.elemSize();
    int u1Pitch   = u1.step / u1.elemSize();
    int u2Pitch   = u2.step / u2.elemSize();

    // Split each flow matrix's byte offset into a row index and an
    // element column index.
    int u1RowOff = u1.offset / u1.step;
    int u1ColOff = u1.offset % u1.step;
    u1ColOff = u1ColOff / u1.elemSize();

    int u2RowOff = u2.offset / u2.step;
    int u2ColOff = u2.offset % u2.step;
    u2ColOff = u2ColOff / u2.elemSize();

    size_t localThread[]  = {32, 8, 1};
    size_t globalThread[] = {static_cast<size_t>(I1wx.cols), static_cast<size_t>(I1wx.rows), 1};

    // Argument order must match the kernel signature.
    vector< pair<size_t, const void *> > args;
    args.push_back(make_pair(sizeof(cl_mem), (void *)&I1wx.data));
    args.push_back(make_pair(sizeof(cl_int), (void *)&I1wx.cols));
    args.push_back(make_pair(sizeof(cl_int), (void *)&I1wx.rows));
    args.push_back(make_pair(sizeof(cl_int), (void *)&I1wxPitch));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&I1wy.data));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&grad.data));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&rho_c.data));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&p11.data));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&p12.data));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&p21.data));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&p22.data));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&u1.data));
    args.push_back(make_pair(sizeof(cl_int), (void *)&u1Pitch));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&u2.data));
    args.push_back(make_pair(sizeof(cl_mem), (void *)&error.data));
    args.push_back(make_pair(sizeof(cl_float), (void *)&l_t));
    args.push_back(make_pair(sizeof(cl_float), (void *)&theta));
    args.push_back(make_pair(sizeof(cl_int), (void *)&u2Pitch));
    args.push_back(make_pair(sizeof(cl_int), (void *)&u1ColOff));
    args.push_back(make_pair(sizeof(cl_int), (void *)&u1RowOff));
    args.push_back(make_pair(sizeof(cl_int), (void *)&u2ColOff));
    args.push_back(make_pair(sizeof(cl_int), (void *)&u2RowOff));

    String kernelName = "estimateUKernel";
    Context *clCxt = I1wx.clCxt;
    openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThread, localThread, args, -1, -1);
}
void ocl_tvl1flow::warpBackward(const oclMat &I0, const oclMat &I1, oclMat &I1x, oclMat &I1y, oclMat &u1, oclMat &u2, oclMat &I1w, oclMat &I1wx, oclMat &I1wy, oclMat &grad, oclMat &rho)
{
Context* clCxt = I0.clCxt;
const bool isImgSupported = support_image2d(clCxt);
CV_Assert(isImgSupported);
int u1ElementSize = u1.elemSize();
int u1Step = u1.step/u1ElementSize;
int u2ElementSize = u2.elemSize();
int u2Step = u2.step/u2ElementSize;
int I0ElementSize = I0.elemSize();
int I0Step = I0.step/I0ElementSize;
int I1w_element_size = I1w.elemSize();
int I1w_step = I1w.step/I1w_element_size;
int u1_offset_y = u1.offset/u1.step;
int u1_offset_x = u1.offset%u1.step;
u1_offset_x = u1_offset_x/u1.elemSize();
int u2_offset_y = u2.offset/u2.step;
int u2_offset_x = u2.offset%u2.step;
u2_offset_x = u2_offset_x/u2.elemSize();
size_t localThread[] = {32, 8, 1};
size_t globalThread[] =
{
I0.cols,
I0.rows,
1
};
cl_mem I1_tex;
cl_mem I1x_tex;
cl_mem I1y_tex;
I1_tex = bindTexture(I1);
I1x_tex = bindTexture(I1x);
I1y_tex = bindTexture(I1y);
String kernelName = "warpBackwardKernel";
vector< pair<size_t, const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void*)&I0.data));
args.push_back( make_pair( sizeof(cl_int), (void*)&I0Step));
args.push_back( make_pair( sizeof(cl_int), (void*)&I0.cols));
args.push_back( make_pair( sizeof(cl_int), (void*)&I0.rows));
args.push_back( make_pair( sizeof(cl_mem), (void*)&I1_tex));
args.push_back( make_pair( sizeof(cl_mem), (void*)&I1x_tex));
args.push_back( make_pair( sizeof(cl_mem), (void*)&I1y_tex));
args.push_back( make_pair( sizeof(cl_mem), (void*)&u1.data));
args.push_back( make_pair( sizeof(cl_int), (void*)&u1Step));
args.push_back( make_pair( sizeof(cl_mem), (void*)&u2.data));
args.push_back( make_pair( sizeof(cl_mem), (void*)&I1w.data));
args.push_back( make_pair( sizeof(cl_mem), (void*)&I1wx.data));
args.push_back( make_pair( sizeof(cl_mem), (void*)&I1wy.data));
args.push_back( make_pair( sizeof(cl_mem), (void*)&grad.data));
args.push_back( make_pair( sizeof(cl_mem), (void*)&rho.data));
args.push_back( make_pair( sizeof(cl_int), (void*)&I1w_step));
args.push_back( make_pair( sizeof(cl_int), (void*)&u2Step));
args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_x));
args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_y));
args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_x));
args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_y));
openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThread, localThread, args, -1, -1);
}

@ -1,120 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// Intel License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_TEST_INTERPOLATION_HPP__
#define __OPENCV_TEST_INTERPOLATION_HPP__
// Reads channel c of the element at (y, x) from src, resolving out-of-range
// coordinates according to border_type. With cv::BORDER_CONSTANT an
// out-of-range read yields borderVal's channel c (saturated to T); every other
// border type maps the coordinates back into the image via
// cv::borderInterpolate.
template <typename T> T readVal(const cv::Mat &src, int y, int x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
{
    if (border_type != cv::BORDER_CONSTANT)
    {
        const int row = cv::borderInterpolate(y, src.rows, border_type);
        const int col = cv::borderInterpolate(x, src.cols, border_type);
        return src.at<T>(row, col * src.channels() + c);
    }

    const bool insideImage = y >= 0 && y < src.rows && x >= 0 && x < src.cols;
    if (insideImage)
        return src.at<T>(y, x * src.channels() + c);

    return cv::saturate_cast<T>(borderVal.val[c]);
}
// Samples an image by flooring the coordinates and reading that single element.
template <typename T> struct NearestInterpolator
{
    // Returns channel c of src at (floor(y), floor(x)), with border handling
    // delegated to readVal.
    static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
    {
        const int row = cvFloor(y);
        const int col = cvFloor(x);
        return readVal<T>(src, row, col, c, border_type, borderVal);
    }
};
// Samples an image with a weighted sum of the four elements surrounding the
// requested position.
template <typename T> struct LinearInterpolator
{
    // Returns channel c of src at (y, x). The coordinates are shifted by -0.5
    // first; border handling is delegated to readVal and the result is
    // saturated to T.
    static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
    {
        x -= 0.5f;
        y -= 0.5f;

        // Integer corners of the cell containing (x, y).
        const int left   = cvFloor(x);
        const int top    = cvFloor(y);
        const int right  = left + 1;
        const int bottom = top + 1;

        // Each neighbour is weighted by its distance to the opposite corner.
        const float wLeft   = right - x;
        const float wRight  = x - left;
        const float wTop    = bottom - y;
        const float wBottom = y - top;

        float acc = 0;
        acc += readVal<T>(src, top,    left,  c, border_type, borderVal) * (wLeft  * wTop);
        acc += readVal<T>(src, top,    right, c, border_type, borderVal) * (wRight * wTop);
        acc += readVal<T>(src, bottom, left,  c, border_type, borderVal) * (wLeft  * wBottom);
        acc += readVal<T>(src, bottom, right, c, border_type, borderVal) * (wRight * wBottom);

        return cv::saturate_cast<T>(acc);
    }
};
// Samples an image from a 4x4 neighbourhood using a cubic polynomial applied
// first along each row and then across the four row results.
template <typename T> struct CubicInterpolator
{
    // Evaluates the cubic through the four samples p[0..3] at parameter x.
    static float getValue(float p[4], float x)
    {
        return p[1] + 0.5 * x * (p[2] - p[0] + x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] + x * (3.0 * (p[1] - p[2]) + p[3] - p[0])));
    }

    // Evaluates the 4x4 patch: each row is interpolated at x, then the four
    // row results are interpolated at y.
    static float getValue(float p[4][4], float x, float y)
    {
        float rowResults[4];
        for (int r = 0; r < 4; ++r)
            rowResults[r] = getValue(p[r], x);
        return getValue(rowResults, y);
    }

    // Returns channel c of src at (y, x). The patch spans rows iy-2..iy+1 and
    // columns ix-2..ix+1 around the rounded coordinates; border handling is
    // delegated to readVal and the result is saturated to T.
    static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
    {
        const int ix = cvRound(x);
        const int iy = cvRound(y);

        float vals[4][4];
        for (int r = 0; r < 4; ++r)
            for (int k = 0; k < 4; ++k)
                vals[r][k] = readVal<T>(src, iy - 2 + r, ix - 2 + k, c, border_type, borderVal);

        // Position inside the patch, rescaled by 1/4 before evaluation.
        const float px = (x - ix + 2.0) / 4.0;
        const float py = (y - iy + 2.0) / 4.0;

        return cv::saturate_cast<T>(getValue(vals, px, py));
    }
};
#endif // __OPENCV_TEST_INTERPOLATION_HPP__

@ -68,9 +68,7 @@
#include "opencv2/ocl.hpp"
#include "utility.hpp"
#include "interpolation.hpp"
#include "opencv2/core/private.hpp"
#endif

@ -12,6 +12,7 @@
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
@ -21,6 +22,7 @@
// Jiang Liyuan,jlyuan001.good@163.com
// Rock Li, Rock.Li@amd.com
// Zailong Wu, bullet@yeah.net
// Yao Wang, bitwangyaoyao@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
@ -87,14 +89,13 @@ PARAM_TEST_CASE(ArithmTestBase, MatType, bool)
int maskx;
int masky;
//src mat with roi
cv::Mat mat1_roi;
cv::Mat mat2_roi;
cv::Mat mask_roi;
cv::Mat dst_roi;
cv::Mat dst1_roi; //bak
//std::vector<cv::ocl::Info> oclinfo;
//ocl dst mat for testing
cv::ocl::oclMat gdst_whole;
cv::ocl::oclMat gdst1_whole; //bak
@ -125,10 +126,6 @@ PARAM_TEST_CASE(ArithmTestBase, MatType, bool)
val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
//int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
//CV_Assert(devnums > 0);
////if you want to use undefault device, set it here
////setDevice(oclinfo[0]);
}
void random_roi()
@ -175,14 +172,22 @@ PARAM_TEST_CASE(ArithmTestBase, MatType, bool)
gmask = mask_roi; //end
}
// Checks that `dst` and the contents of `gdst_whole` (converted to a cv::Mat)
// agree within `threshold`, using EXPECT_MAT_NEAR.
// NOTE(review): presumably `dst` is the CPU reference and `gdst_whole` the
// whole OpenCL destination matrix - confirm against the fixture set-up.
void Near(double threshold = 0.)
{
EXPECT_MAT_NEAR(dst, Mat(gdst_whole), threshold);
}
// Same check as Near(), applied to the second output pair: `dst1` against
// `gdst1_whole` (converted to a cv::Mat), within `threshold`.
void Near1(double threshold = 0.)
{
EXPECT_MAT_NEAR(dst1, Mat(gdst1_whole), threshold);
}
};
////////////////////////////////lut/////////////////////////////////////////////////
struct Lut : ArithmTestBase {};
#define VARNAME(A) string(#A);
TEST_P(Lut, Mat)
{
@ -203,20 +208,12 @@ TEST_P(Lut, Mat)
cv::LUT(mat1_roi, mat2_roi, dst_roi);
cv::ocl::LUT(gmat1, gmat2, gdst);
cv::Mat cpu_dst;
gdst_whole.download (cpu_dst);
char s[1024];
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 0, s);
Near(0);
}
}
////////////////////////////////exp/////////////////////////////////////////////////
struct Exp : ArithmTestBase {};
TEST_P(Exp, Mat)
@ -227,20 +224,12 @@ TEST_P(Exp, Mat)
cv::exp(mat1_roi, dst_roi);
cv::ocl::exp(gmat1, gdst);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char s[1024];
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 2, s);
Near(2);
}
}
////////////////////////////////log/////////////////////////////////////////////////
struct Log : ArithmTestBase {};
TEST_P(Log, Mat)
@ -249,24 +238,14 @@ TEST_P(Log, Mat)
{
random_roi();
cv::log(mat1_roi, dst_roi);
cv::ocl::log(gmat1, gdst);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char s[1024];
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 1, s);
Near(1);
}
}
////////////////////////////////add/////////////////////////////////////////////////
struct Add : ArithmTestBase {};
TEST_P(Add, Mat)
@ -277,12 +256,7 @@ TEST_P(Add, Mat)
cv::add(mat1_roi, mat2_roi, dst_roi);
cv::ocl::add(gmat1, gmat2, gdst);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char s[1024];
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, s);
Near(0);
}
}
@ -294,14 +268,10 @@ TEST_P(Add, Mat_Mask)
cv::add(mat1_roi, mat2_roi, dst_roi, mask_roi);
cv::ocl::add(gmat1, gmat2, gdst, gmask);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char s[1024];
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, s);
Near(0);
}
}
TEST_P(Add, Scalar)
{
for(int j = 0; j < LOOP_TIMES; j++)
@ -310,12 +280,7 @@ TEST_P(Add, Scalar)
cv::add(mat1_roi, val, dst_roi);
cv::ocl::add(gmat1, val, gdst);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char s[1024];
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, s);
Near(1e-5);
}
}
@ -327,12 +292,7 @@ TEST_P(Add, Scalar_Mask)
cv::add(mat1_roi, val, dst_roi, mask_roi);
cv::ocl::add(gmat1, val, gdst, gmask);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char s[1024];
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, s);
Near(1e-5);
}
}
@ -349,12 +309,7 @@ TEST_P(Sub, Mat)
cv::subtract(mat1_roi, mat2_roi, dst_roi);
cv::ocl::subtract(gmat1, gmat2, gdst);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char s[1024];
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, s);
Near(0);
}
}
@ -366,14 +321,10 @@ TEST_P(Sub, Mat_Mask)
cv::subtract(mat1_roi, mat2_roi, dst_roi, mask_roi);
cv::ocl::subtract(gmat1, gmat2, gdst, gmask);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char s[1024];
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, s);
Near(0);
}
}
TEST_P(Sub, Scalar)
{
for(int j = 0; j < LOOP_TIMES; j++)
@ -382,12 +333,7 @@ TEST_P(Sub, Scalar)
cv::subtract(mat1_roi, val, dst_roi);
cv::ocl::subtract(gmat1, val, gdst);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char s[1024];
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, s);
Near(1e-5);
}
}
@ -399,12 +345,7 @@ TEST_P(Sub, Scalar_Mask)
cv::subtract(mat1_roi, val, dst_roi, mask_roi);
cv::ocl::subtract(gmat1, val, gdst, gmask);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char s[1024];
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, s);
Near(1e-5);
}
}
@ -421,12 +362,7 @@ TEST_P(Mul, Mat)
cv::multiply(mat1_roi, mat2_roi, dst_roi);
cv::ocl::multiply(gmat1, gmat2, gdst);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char s[1024];
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, s);
Near(0);
}
}
@ -441,12 +377,7 @@ TEST_P(Mul, Mat_Scalar)
cv::multiply(mat1_roi, mat2_roi, dst_roi, s);
cv::ocl::multiply(gmat1, gmat2, gdst, s);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 0.001, sss);
Near(.001);
}
}
@ -462,13 +393,7 @@ TEST_P(Div, Mat)
cv::divide(mat1_roi, mat2_roi, dst_roi);
cv::ocl::divide(gmat1, gmat2, gdst);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 1, sss);
Near(1);
}
}
@ -483,13 +408,7 @@ TEST_P(Div, Mat_Scalar)
cv::divide(mat1_roi, mat2_roi, dst_roi, s);
cv::ocl::divide(gmat1, gmat2, gdst, s);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 0.001, sss);
Near(.001);
}
}
@ -504,13 +423,7 @@ TEST_P(Absdiff, Mat)
cv::absdiff(mat1_roi, mat2_roi, dst_roi);
cv::ocl::absdiff(gmat1, gmat2, gdst);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 0, sss);
Near(0);
}
}
@ -522,13 +435,7 @@ TEST_P(Absdiff, Mat_Scalar)
cv::absdiff(mat1_roi, val, dst_roi);
cv::ocl::absdiff(gmat1, val, gdst);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
Near(1e-5);
}
}
@ -544,16 +451,8 @@ TEST_P(CartToPolar, angleInDegree)
cv::cartToPolar(mat1_roi, mat2_roi, dst_roi, dst1_roi, 1);
cv::ocl::cartToPolar(gmat1, gmat2, gdst, gdst1, 1);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
cv::Mat cpu_dst1;
gdst1_whole.download(cpu_dst1);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 0.5, sss);
EXPECT_MAT_NEAR(dst1, cpu_dst1, 0.5, sss);
Near(.5);
Near1(.5);
}
}
@ -565,22 +464,12 @@ TEST_P(CartToPolar, angleInRadians)
cv::cartToPolar(mat1_roi, mat2_roi, dst_roi, dst1_roi, 0);
cv::ocl::cartToPolar(gmat1, gmat2, gdst, gdst1, 0);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
cv::Mat cpu_dst1;
gdst1_whole.download(cpu_dst1);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 0.5, sss);
EXPECT_MAT_NEAR(dst1, cpu_dst1, 0.5, sss);
Near(.5);
Near1(.5);
}
}
struct PolarToCart : ArithmTestBase {};
TEST_P(PolarToCart, angleInDegree)
@ -591,17 +480,8 @@ TEST_P(PolarToCart, angleInDegree)
cv::polarToCart(mat1_roi, mat2_roi, dst_roi, dst1_roi, 1);
cv::ocl::polarToCart(gmat1, gmat2, gdst, gdst1, 1);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
cv::Mat cpu_dst1;
gdst1_whole.download(cpu_dst1);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 0.5, sss);
EXPECT_MAT_NEAR(dst1, cpu_dst1, 0.5, sss);
Near(.5);
Near1(.5);
}
}
@ -613,17 +493,8 @@ TEST_P(PolarToCart, angleInRadians)
cv::polarToCart(mat1_roi, mat2_roi, dst_roi, dst1_roi, 0);
cv::ocl::polarToCart(gmat1, gmat2, gdst, gdst1, 0);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
cv::Mat cpu_dst1;
gdst1_whole.download(cpu_dst1);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 0.5, sss);
EXPECT_MAT_NEAR(dst1, cpu_dst1, 0.5, sss);
Near(.5);
Near1(.5);
}
}
@ -640,19 +511,11 @@ TEST_P(Magnitude, Mat)
cv::magnitude(mat1_roi, mat2_roi, dst_roi);
cv::ocl::magnitude(gmat1, gmat2, gdst);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
Near(1e-5);
}
}
struct Transpose : ArithmTestBase {};
TEST_P(Transpose, Mat)
@ -663,20 +526,11 @@ TEST_P(Transpose, Mat)
cv::transpose(mat1_roi, dst_roi);
cv::ocl::transpose(gmat1, gdst);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
Near(1e-5);
}
}
struct Flip : ArithmTestBase {};
TEST_P(Flip, X)
@ -687,13 +541,7 @@ TEST_P(Flip, X)
cv::flip(mat1_roi, dst_roi, 0);
cv::ocl::flip(gmat1, gdst, 0);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
Near(1e-5);
}
}
@ -705,13 +553,7 @@ TEST_P(Flip, Y)
cv::flip(mat1_roi, dst_roi, 1);
cv::ocl::flip(gmat1, gdst, 1);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
Near(1e-5);
}
}
@ -723,18 +565,11 @@ TEST_P(Flip, BOTH)
cv::flip(mat1_roi, dst_roi, -1);
cv::ocl::flip(gmat1, gdst, -1);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
Near(1e-5);
}
}
struct MinMax : ArithmTestBase {};
TEST_P(MinMax, MAT)
@ -765,12 +600,8 @@ TEST_P(MinMax, MAT)
double minVal_, maxVal_;
cv::ocl::minMax(gmat1, &minVal_, &maxVal_);
//check results
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_DOUBLE_EQ(minVal_, minVal) << sss;
EXPECT_DOUBLE_EQ(maxVal_, maxVal) << sss;
EXPECT_DOUBLE_EQ(minVal_, minVal);
EXPECT_DOUBLE_EQ(maxVal_, maxVal);
}
}
@ -803,12 +634,8 @@ TEST_P(MinMax, MASK)
double minVal_, maxVal_;
cv::ocl::minMax(gmat1, &minVal_, &maxVal_, gmask);
//check results
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_DOUBLE_EQ(minVal_, minVal) << sss;
EXPECT_DOUBLE_EQ(maxVal_, maxVal) << sss;
EXPECT_DOUBLE_EQ(minVal_, minVal);
EXPECT_DOUBLE_EQ(maxVal_, maxVal);
}
}
@ -919,17 +746,13 @@ TEST_P(MinMaxLoc, MAT)
error1 = ::abs(mat1_roi.at<double>(maxLoc_) - mat1_roi.at<double>(maxLoc));
}
//check results
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_DOUBLE_EQ(minVal_, minVal) << sss;
EXPECT_DOUBLE_EQ(maxVal_, maxVal) << sss;
EXPECT_DOUBLE_EQ(minlocVal_, minlocVal) << sss;
EXPECT_DOUBLE_EQ(maxlocVal_, maxlocVal) << sss;
EXPECT_DOUBLE_EQ(minVal_, minVal);
EXPECT_DOUBLE_EQ(maxVal_, maxVal);
EXPECT_DOUBLE_EQ(minlocVal_, minlocVal);
EXPECT_DOUBLE_EQ(maxlocVal_, maxlocVal);
EXPECT_DOUBLE_EQ(error0, 0.0) << sss;
EXPECT_DOUBLE_EQ(error1, 0.0) << sss;
EXPECT_DOUBLE_EQ(error0, 0.0);
EXPECT_DOUBLE_EQ(error1, 0.0);
}
}
@ -1040,17 +863,13 @@ TEST_P(MinMaxLoc, MASK)
error1 = ::abs(mat1_roi.at<double>(maxLoc_) - mat1_roi.at<double>(maxLoc));
}
//check results
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_DOUBLE_EQ(minVal_, minVal) << sss;
EXPECT_DOUBLE_EQ(maxVal_, maxVal) << sss;
EXPECT_DOUBLE_EQ(minlocVal_, minlocVal) << sss;
EXPECT_DOUBLE_EQ(maxlocVal_, maxlocVal) << sss;
EXPECT_DOUBLE_EQ(minVal_, minVal);
EXPECT_DOUBLE_EQ(maxVal_, maxVal);
EXPECT_DOUBLE_EQ(minlocVal_, minlocVal);
EXPECT_DOUBLE_EQ(maxlocVal_, maxlocVal);
EXPECT_DOUBLE_EQ(error0, 0.0) << sss;
EXPECT_DOUBLE_EQ(error1, 0.0) << sss;
EXPECT_DOUBLE_EQ(error0, 0.0);
EXPECT_DOUBLE_EQ(error1, 0.0);
}
}
@ -1064,14 +883,12 @@ TEST_P(Sum, MAT)
random_roi();
Scalar cpures = cv::sum(mat1_roi);
Scalar gpures = cv::ocl::sum(gmat1);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
//check results
EXPECT_NEAR(cpures[0], gpures[0], 0.1) << sss;
EXPECT_NEAR(cpures[1], gpures[1], 0.1) << sss;
EXPECT_NEAR(cpures[2], gpures[2], 0.1) << sss;
EXPECT_NEAR(cpures[3], gpures[3], 0.1) << sss;
EXPECT_NEAR(cpures[0], gpures[0], 0.1);
EXPECT_NEAR(cpures[1], gpures[1], 0.1);
EXPECT_NEAR(cpures[2], gpures[2], 0.1);
EXPECT_NEAR(cpures[3], gpures[3], 0.1);
}
}
@ -1086,11 +903,7 @@ TEST_P(CountNonZero, MAT)
int cpures = cv::countNonZero(mat1_roi);
int gpures = cv::ocl::countNonZero(gmat1);
//check results
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_DOUBLE_EQ((double)cpures, (double)gpures) << sss;
EXPECT_DOUBLE_EQ((double)cpures, (double)gpures);
}
}
@ -1112,13 +925,7 @@ TEST_P(Phase, Mat)
random_roi();
cv::phase(mat1_roi, mat2_roi, dst_roi, angelInDegrees ? true : false);
cv::ocl::phase(gmat1, gmat2, gdst, angelInDegrees ? true : false);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-2, sss);
Near(1e-2);
}
}
}
@ -1135,13 +942,7 @@ TEST_P(Bitwise_and, Mat)
cv::bitwise_and(mat1_roi, mat2_roi, dst_roi);
cv::ocl::bitwise_and(gmat1, gmat2, gdst);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss);
Near(0);
}
}
@ -1153,15 +954,10 @@ TEST_P(Bitwise_and, Mat_Mask)
cv::bitwise_and(mat1_roi, mat2_roi, dst_roi, mask_roi);
cv::ocl::bitwise_and(gmat1, gmat2, gdst, gmask);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss);
Near(0);
}
}
TEST_P(Bitwise_and, Scalar)
{
for(int j = 0; j < LOOP_TIMES; j++)
@ -1170,14 +966,7 @@ TEST_P(Bitwise_and, Scalar)
cv::bitwise_and(mat1_roi, val, dst_roi);
cv::ocl::bitwise_and(gmat1, val, gdst);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
Near(1e-5);
}
}
@ -1189,14 +978,7 @@ TEST_P(Bitwise_and, Scalar_Mask)
cv::bitwise_and(mat1_roi, val, dst_roi, mask_roi);
cv::ocl::bitwise_and(gmat1, val, gdst, gmask);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char *sss = new char[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
delete[] sss;
Near(1e-5);
}
}
@ -1214,13 +996,7 @@ TEST_P(Bitwise_or, Mat)
cv::bitwise_or(mat1_roi, mat2_roi, dst_roi);
cv::ocl::bitwise_or(gmat1, gmat2, gdst);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss);
Near(0);
}
}
@ -1232,15 +1008,10 @@ TEST_P(Bitwise_or, Mat_Mask)
cv::bitwise_or(mat1_roi, mat2_roi, dst_roi, mask_roi);
cv::ocl::bitwise_or(gmat1, gmat2, gdst, gmask);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss);
Near(0);
}
}
TEST_P(Bitwise_or, Scalar)
{
for(int j = 0; j < LOOP_TIMES; j++)
@ -1249,13 +1020,7 @@ TEST_P(Bitwise_or, Scalar)
cv::bitwise_or(mat1_roi, val, dst_roi);
cv::ocl::bitwise_or(gmat1, val, gdst);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
Near(1e-5);
}
}
@ -1267,13 +1032,7 @@ TEST_P(Bitwise_or, Scalar_Mask)
cv::bitwise_or(mat1_roi, val, dst_roi, mask_roi);
cv::ocl::bitwise_or(gmat1, val, gdst, gmask);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
Near(1e-5);
}
}
@ -1291,13 +1050,7 @@ TEST_P(Bitwise_xor, Mat)
cv::bitwise_xor(mat1_roi, mat2_roi, dst_roi);
cv::ocl::bitwise_xor(gmat1, gmat2, gdst);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss);
Near(0);
}
}
@ -1309,15 +1062,10 @@ TEST_P(Bitwise_xor, Mat_Mask)
cv::bitwise_xor(mat1_roi, mat2_roi, dst_roi, mask_roi);
cv::ocl::bitwise_xor(gmat1, gmat2, gdst, gmask);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss);
Near(0);
}
}
TEST_P(Bitwise_xor, Scalar)
{
for(int j = 0; j < LOOP_TIMES; j++)
@ -1326,13 +1074,7 @@ TEST_P(Bitwise_xor, Scalar)
cv::bitwise_xor(mat1_roi, val, dst_roi);
cv::ocl::bitwise_xor(gmat1, val, gdst);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
Near(1e-5);
}
}
@ -1344,13 +1086,7 @@ TEST_P(Bitwise_xor, Scalar_Mask)
cv::bitwise_xor(mat1_roi, val, dst_roi, mask_roi);
cv::ocl::bitwise_xor(gmat1, val, gdst, gmask);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
Near(1e-5);
}
}
@ -1367,13 +1103,7 @@ TEST_P(Bitwise_not, Mat)
cv::bitwise_not(mat1_roi, dst_roi);
cv::ocl::bitwise_not(gmat1, gdst);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss);
Near(0);
}
}
@ -1390,7 +1120,7 @@ TEST_P(Compare, Mat)
}
int cmp_codes[] = {CMP_EQ, CMP_GT, CMP_GE, CMP_LT, CMP_LE, CMP_NE};
const char *cmp_str[] = {"CMP_EQ", "CMP_GT", "CMP_GE", "CMP_LT", "CMP_LE", "CMP_NE"};
//const char *cmp_str[] = {"CMP_EQ", "CMP_GT", "CMP_GE", "CMP_LT", "CMP_LE", "CMP_NE"};
int cmp_num = sizeof(cmp_codes) / sizeof(int);
for (int i = 0; i < cmp_num; ++i)
@ -1402,13 +1132,7 @@ TEST_P(Compare, Mat)
cv::compare(mat1_roi, mat2_roi, dst_roi, cmp_codes[i]);
cv::ocl::compare(gmat1, gmat2, gdst, cmp_codes[i]);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "cmptype=%s, roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", cmp_str[i], roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 0.0, sss);
Near(0);
}
}
@ -1430,14 +1154,7 @@ TEST_P(Pow, Mat)
double p = 4.5;
cv::pow(mat1_roi, p, dst_roi);
cv::ocl::pow(gmat1, p, gdst);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 1, sss);
Near(1);
}
}
@ -1448,36 +1165,18 @@ TEST_P(MagnitudeSqr, Mat)
{
for(int j = 0; j < LOOP_TIMES; j++)
{
// random_roi();
// int64 start, end;
// start = cv::getTickCount();
random_roi();
for(int i = 0; i < mat1.rows; ++i)
for(int j = 0; j < mat1.cols; ++j)
{
float val1 = mat1.at<float>(i, j);
float val2 = mat2.at<float>(i, j);
((float *)(dst.data))[i * dst.step / 4 + j] = val1 * val1 + val2 * val2;
// float val1 =((float *)( mat1.data))[(i*mat1.step/8 +j)*2];
//
// float val2 =((float *)( mat1.data))[(i*mat1.step/8 +j)*2+ 1 ];
// ((float *)(dst.data))[i*dst.step/4 +j]= val1 * val1 +val2 * val2;
}
// end = cv::getTickCount();
cv::ocl::oclMat clmat1(mat1), clmat2(mat2), cldst;
cv::ocl::magnitudeSqr(clmat1, clmat2, cldst);
cv::Mat cpu_dst;
cldst.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 1, sss);
cv::ocl::oclMat clmat1(mat1), clmat2(mat2);
cv::ocl::magnitudeSqr(clmat1, clmat2, gdst);
Near(1);
}
}
@ -1498,21 +1197,13 @@ TEST_P(AddWeighted, Mat)
cv::ocl::addWeighted(gmat1, alpha, gmat2, beta, gama, gdst);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
Near(1e-5);
}
}
//********test****************
INSTANTIATE_TEST_CASE_P(Arithm, Lut, Combine(

@ -1,3 +1,47 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Nathan, liujun@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include <iomanip>
@ -33,20 +77,14 @@ void blendLinearGold(const cv::Mat &img1, const cv::Mat &img2, const cv::Mat &we
PARAM_TEST_CASE(Blend, cv::Size, MatType/*, UseRoi*/)
{
//std::vector<cv::ocl::Info> oclinfo;
cv::Size size;
int type;
bool useRoi;
virtual void SetUp()
{
//devInfo = GET_PARAM(0);
size = GET_PARAM(0);
type = GET_PARAM(1);
/*useRoi = GET_PARAM(3);*/
//int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
//CV_Assert(devnums > 0);
}
};
@ -59,12 +97,9 @@ TEST_P(Blend, Accuracy)
cv::Mat weights1 = randomMat(size, CV_32F, 0, 1);
cv::Mat weights2 = randomMat(size, CV_32F, 0, 1);
cv::ocl::oclMat gimg1(size, type), gimg2(size, type), gweights1(size, CV_32F), gweights2(size, CV_32F);
cv::ocl::oclMat dst(size, type);
gimg1.upload(img1);
gimg2.upload(img2);
gweights1.upload(weights1);
gweights2.upload(weights2);
cv::ocl::oclMat gimg1(img1), gimg2(img2), gweights1(weights1), gweights2(weights2);
cv::ocl::oclMat dst;
cv::ocl::blendLinear(gimg1, gimg2, gweights1, gweights2, dst);
cv::Mat result;
cv::Mat result_gold;
@ -74,10 +109,10 @@ TEST_P(Blend, Accuracy)
else
blendLinearGold<float>(img1, img2, weights1, weights2, result_gold);
EXPECT_MAT_NEAR(result_gold, result, CV_MAT_DEPTH(type) == CV_8U ? 1.f : 1e-5f, 0);
EXPECT_MAT_NEAR(result_gold, result, CV_MAT_DEPTH(type) == CV_8U ? 1.f : 1e-5f);
}
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Blend, Combine(
INSTANTIATE_TEST_CASE_P(OCL_ImgProc, Blend, Combine(
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC4))
));

@ -7,12 +7,16 @@
// copy or use the software.
//
//
// Intel License Agreement
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware inc., all rights reserved.
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Nathan, liujun@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
@ -21,12 +25,12 @@
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// and/or other oclMaterials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote products
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,

@ -129,9 +129,69 @@ TEST_P(StereoMatchBP, Regression)
bp(d_left, d_right, d_disp);
d_disp.download(disp);
disp.convertTo(disp, disp_gold.depth());
EXPECT_MAT_NEAR(disp_gold, disp, 0.0, "");
EXPECT_MAT_NEAR(disp_gold, disp, 0.0);
}
// Registers the StereoMatchBP suite under the OCL_Calib3D prefix with exactly
// one parameter combination (every testing::Values() list has a single entry).
// NOTE(review): the StereoMatchBP fixture is declared outside this view; confirm
// which positional value maps to which fixture field before changing any of them.
INSTANTIATE_TEST_CASE_P(OCL_Calib3D, StereoMatchBP, testing::Combine(testing::Values(64),
testing::Values(8),testing::Values(2),testing::Values(25.0f),
testing::Values(0.1f),testing::Values(15.0f),testing::Values(1.0f)));
//////////////////////////////////////////////////////////////////////////
// ConstSpaceBeliefPropagation
// Fixture for the constant-space belief propagation stereo test.
// The ten test parameters are copied, in positional order, into the members
// below by SetUp() so that the test bodies can refer to them by name.
PARAM_TEST_CASE(StereoMatchConstSpaceBP, int, int, int, int, float, float, float, float, int, int)
{
    // Parameters 0..3
    int ndisp_, iters_, levels_, nr_plane_;
    // Parameters 4..7
    float max_data_term_, data_weight_, max_disc_term_, disc_single_jump_;
    // Parameters 8..9
    int min_disp_th_, msg_type_;

    // Unpacks the current parameter tuple into the named members.
    virtual void SetUp()
    {
        ndisp_            = GET_PARAM(0);
        iters_            = GET_PARAM(1);
        levels_           = GET_PARAM(2);
        nr_plane_         = GET_PARAM(3);

        max_data_term_    = GET_PARAM(4);
        data_weight_      = GET_PARAM(5);
        max_disc_term_    = GET_PARAM(6);
        disc_single_jump_ = GET_PARAM(7);

        min_disp_th_      = GET_PARAM(8);
        msg_type_         = GET_PARAM(9);
    }
};
// Regression test for ocl::StereoConstantSpaceBP.
// Loads the aloe stereo pair and its reference disparity image, runs the
// OpenCL matcher configured from the fixture parameters, and checks that the
// downloaded disparity map is similar to the reference within 1e-4.
TEST_P(StereoMatchConstSpaceBP, Regression)
{
    Mat left_image  = readImage("csstereobp/aloe-L.png");
    Mat right_image = readImage("csstereobp/aloe-R.png");
    Mat disp_gold   = readImage("csstereobp/aloe-disp.png", IMREAD_GRAYSCALE);

    // Stop right away when the test data is missing; nothing below is
    // meaningful on empty images.
    ASSERT_FALSE(left_image.empty());
    ASSERT_FALSE(right_image.empty());
    ASSERT_FALSE(disp_gold.empty());

    ocl::oclMat d_left, d_right;
    ocl::oclMat d_disp;
    d_left.upload(left_image);
    d_right.upload(right_image);

    // Pass every fixture parameter through. The last two arguments used to be
    // hard-coded as (0, CV_32F), which silently ignored min_disp_th_ and
    // msg_type_; the instantiation supplies those same values, so the current
    // run is unchanged while new parameter combinations now take effect.
    ocl::StereoConstantSpaceBP bp(ndisp_, iters_, levels_, nr_plane_, max_data_term_, data_weight_,
                                  max_disc_term_, disc_single_jump_, min_disp_th_, msg_type_);
    bp(d_left, d_right, d_disp);

    Mat disp;
    d_disp.download(disp);
    // Bring the result to the reference image's depth before comparing.
    disp.convertTo(disp, disp_gold.depth());

    EXPECT_MAT_SIMILAR(disp_gold, disp, 1e-4);
}
// Registers the StereoMatchConstSpaceBP suite under the OCL_Calib3D prefix with
// a single parameter combination. Values are positional and are read by the
// fixture's SetUp() in this order:
//   ndisp_ = 128, iters_ = 16, levels_ = 4, nr_plane_ = 4,
//   max_data_term_ = 30.0f, data_weight_ = 1.0f, max_disc_term_ = 160.0f,
//   disc_single_jump_ = 10.0f, min_disp_th_ = 0, msg_type_ = CV_32F
INSTANTIATE_TEST_CASE_P(OCL_Calib3D, StereoMatchConstSpaceBP, testing::Combine(testing::Values(128),
testing::Values(16),testing::Values(4), testing::Values(4), testing::Values(30.0f),
testing::Values(1.0f),testing::Values(160.0f),
testing::Values(10.0f), testing::Values(0), testing::Values(CV_32F)));
#endif // HAVE_OPENCL

@ -103,7 +103,7 @@ PARAM_TEST_CASE(CvtColor, cv::Size, MatDepth)
cv::cvtColor(src, dst_gold, CVTCODE(name));\
cv::Mat dst_mat;\
dst.download(dst_mat);\
EXPECT_MAT_NEAR(dst_gold, dst_mat, 1e-5, "");\
EXPECT_MAT_NEAR(dst_gold, dst_mat, 1e-5);\
}
//add new ones here using macro
@ -144,7 +144,7 @@ TEST_P(CvtColor_Gray2RGB, Accuracy)
cv::cvtColor(src, dst_gold, code);
cv::Mat dst_mat;
dst.download(dst_mat);
EXPECT_MAT_NEAR(dst_gold, dst_mat, 1e-5, "");
EXPECT_MAT_NEAR(dst_gold, dst_mat, 1e-5);
}
@ -174,7 +174,7 @@ TEST_P(CvtColor_YUV420, Accuracy)
cv::Mat dst_mat;
dst.download(dst_mat);
MAT_DIFF(dst_mat, dst_gold);
EXPECT_MAT_NEAR(dst_gold, dst_mat, 1e-5, "");
EXPECT_MAT_NEAR(dst_gold, dst_mat, 1e-5);
}
INSTANTIATE_TEST_CASE_P(OCL_ImgProc, CvtColor, testing::Combine(

@ -47,27 +47,16 @@
#include "precomp.hpp"
#include <iomanip>
///////////////////////////////////////////////////////////////////////////////
/// ColumnSum
#ifdef HAVE_OPENCL
////////////////////////////////////////////////////////////////////////
// ColumnSum
PARAM_TEST_CASE(ColumnSum, cv::Size, bool )
PARAM_TEST_CASE(ColumnSum, cv::Size)
{
cv::Size size;
cv::Mat src;
bool useRoi;
//std::vector<cv::ocl::Info> oclinfo;
virtual void SetUp()
{
size = GET_PARAM(0);
useRoi = GET_PARAM(1);
//int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
//CV_Assert(devnums > 0);
}
};
@ -99,8 +88,7 @@ TEST_P(ColumnSum, Accuracy)
}
}
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ColumnSum, testing::Combine(
DIFFERENT_SIZES, testing::Values(Inverse(false), Inverse(true))));
INSTANTIATE_TEST_CASE_P(OCL_ImgProc, ColumnSum, DIFFERENT_SIZES);
#endif

@ -68,7 +68,7 @@ TEST_P(Dft, C2C)
cv::dft(a, b_gold, dft_flags);
cv::ocl::dft(cv::ocl::oclMat(a), d_b, a.size(), dft_flags);
EXPECT_MAT_NEAR(b_gold, cv::Mat(d_b), a.size().area() * 1e-4, "");
EXPECT_MAT_NEAR(b_gold, cv::Mat(d_b), a.size().area() * 1e-4);
}
TEST_P(Dft, R2C)
@ -81,11 +81,11 @@ TEST_P(Dft, R2C)
cv::dft(a, b_gold, cv::DFT_COMPLEX_OUTPUT | dft_flags);
b_gold_roi = b_gold(cv::Rect(0, 0, d_b.cols, d_b.rows));
EXPECT_MAT_NEAR(b_gold_roi, cv::Mat(d_b), a.size().area() * 1e-4, "");
EXPECT_MAT_NEAR(b_gold_roi, cv::Mat(d_b), a.size().area() * 1e-4);
cv::Mat c_gold;
cv::dft(b_gold, c_gold, cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT | cv::DFT_SCALE);
EXPECT_MAT_NEAR(b_gold_roi, cv::Mat(d_b), a.size().area() * 1e-4, "");
EXPECT_MAT_NEAR(b_gold_roi, cv::Mat(d_b), a.size().area() * 1e-4);
}
TEST_P(Dft, R2CthenC2R)
@ -95,7 +95,7 @@ TEST_P(Dft, R2CthenC2R)
cv::ocl::oclMat d_b, d_c;
cv::ocl::dft(cv::ocl::oclMat(a), d_b, a.size(), 0);
cv::ocl::dft(d_b, d_c, a.size(), cv::DFT_SCALE | cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT);
EXPECT_MAT_NEAR(a, d_c, a.size().area() * 1e-4, "");
EXPECT_MAT_NEAR(a, d_c, a.size().area() * 1e-4);
}

@ -12,6 +12,7 @@
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
@ -19,6 +20,7 @@
// Jia Haipeng, jiahaipeng95@gmail.com
// Zero Lin, Zero.Lin@amd.com
// Zhang Ying, zhangying913@gmail.com
// Yao Wang, bitwangyaoyao@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
@ -55,191 +57,93 @@ using namespace testing;
using namespace std;
PARAM_TEST_CASE(FilterTestBase, MatType, bool)
PARAM_TEST_CASE(FilterTestBase,
MatType,
cv::Size, // kernel size
cv::Size, // dx,dy
int // border type, or iteration
)
{
int type;
cv::Scalar val;
//src mat
cv::Mat mat1;
cv::Mat mat2;
cv::Mat mask;
cv::Mat dst;
cv::Mat dst1; //bak, for two outputs
// set up roi
int roicols;
int roirows;
int src1x;
int src1y;
int src2x;
int src2y;
int dstx;
int dsty;
int maskx;
int masky;
//src mat with roi
cv::Mat mat1_roi;
cv::Mat mat2_roi;
cv::Mat mask_roi;
cv::Mat dst_roi;
cv::Mat dst1_roi; //bak
//std::vector<cv::ocl::Info> oclinfo;
//ocl dst mat for testing
cv::ocl::oclMat gdst_whole;
cv::ocl::oclMat gdst1_whole; //bak
//ocl mat with roi
cv::ocl::oclMat gmat1;
cv::ocl::oclMat gmat2;
cv::ocl::oclMat gdst;
cv::ocl::oclMat gdst1; //bak
cv::ocl::oclMat gmask;
virtual void SetUp()
{
type = GET_PARAM(0);
cv::RNG &rng = TS::ptr()->get_rng();
cv::Size size(MWIDTH, MHEIGHT);
mat1 = randomMat(rng, size, type, 5, 16, false);
mat2 = randomMat(rng, size, type, 5, 16, false);
dst = randomMat(rng, size, type, 5, 16, false);
dst1 = randomMat(rng, size, type, 5, 16, false);
mask = randomMat(rng, size, CV_8UC1, 0, 2, false);
cv::threshold(mask, mask, 0.5, 255., CV_8UC1);
val = cv::Scalar(rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0), rng.uniform(-10.0, 10.0));
}
void random_roi()
{
#ifdef RANDOMROI
//randomize ROI
cv::RNG &rng = TS::ptr()->get_rng();
roicols = rng.uniform(1, mat1.cols);
roirows = rng.uniform(1, mat1.rows);
roicols = rng.uniform(2, mat1.cols);
roirows = rng.uniform(2, mat1.rows);
src1x = rng.uniform(0, mat1.cols - roicols);
src1y = rng.uniform(0, mat1.rows - roirows);
src2x = rng.uniform(0, mat2.cols - roicols);
src2y = rng.uniform(0, mat2.rows - roirows);
dstx = rng.uniform(0, dst.cols - roicols);
dsty = rng.uniform(0, dst.rows - roirows);
maskx = rng.uniform(0, mask.cols - roicols);
masky = rng.uniform(0, mask.rows - roirows);
#else
roicols = mat1.cols;
roirows = mat1.rows;
src1x = 0;
src1y = 0;
src2x = 0;
src2y = 0;
dstx = 0;
dsty = 0;
maskx = 0;
masky = 0;
#endif
mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
mat2_roi = mat2(Rect(src2x, src2y, roicols, roirows));
mask_roi = mask(Rect(maskx, masky, roicols, roirows));
dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
dst1_roi = dst1(Rect(dstx, dsty, roicols, roirows));
gdst_whole = dst;
gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
gdst1_whole = dst1;
gdst1 = gdst1_whole(Rect(dstx, dsty, roicols, roirows));
gmat1 = mat1_roi;
gmat2 = mat2_roi;
gmask = mask_roi;
}
void Init(int mat_type)
{
cv::Size size(MWIDTH, MHEIGHT);
mat1 = randomMat(size, mat_type, 5, 16);
dst = randomMat(size, mat_type, 5, 16);
}
void Near(double threshold)
{
EXPECT_MAT_NEAR(dst, Mat(gdst_whole), threshold);
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
// blur
PARAM_TEST_CASE(Blur, MatType, cv::Size, int)
struct Blur : FilterTestBase
{
int type;
cv::Size ksize;
int bordertype;
//src mat
cv::Mat mat1;
cv::Mat dst;
// set up roi
int roicols;
int roirows;
int src1x;
int src1y;
int dstx;
int dsty;
//src mat with roi
cv::Mat mat1_roi;
cv::Mat dst_roi;
//std::vector<cv::ocl::Info> oclinfo;
//ocl dst mat for testing
cv::ocl::oclMat gdst_whole;
//ocl mat with roi
cv::ocl::oclMat gmat1;
cv::ocl::oclMat gdst;
virtual void SetUp()
{
type = GET_PARAM(0);
ksize = GET_PARAM(1);
bordertype = GET_PARAM(2);
cv::RNG &rng = TS::ptr()->get_rng();
cv::Size size(MWIDTH, MHEIGHT);
mat1 = randomMat(rng, size, type, 5, 16, false);
dst = randomMat(rng, size, type, 5, 16, false);
//int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
//CV_Assert(devnums > 0);
////if you want to use undefault device, set it here
////setDevice(oclinfo[0]);
}
void random_roi()
{
#ifdef RANDOMROI
//randomize ROI
cv::RNG &rng = TS::ptr()->get_rng();
roicols = rng.uniform(2, mat1.cols);
roirows = rng.uniform(2, mat1.rows);
src1x = rng.uniform(0, mat1.cols - roicols);
src1y = rng.uniform(0, mat1.rows - roirows);
dstx = rng.uniform(0, dst.cols - roicols);
dsty = rng.uniform(0, dst.rows - roirows);
#else
roicols = mat1.cols;
roirows = mat1.rows;
src1x = 0;
src1y = 0;
dstx = 0;
dsty = 0;
#endif
mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
gdst_whole = dst;
gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
gmat1 = mat1_roi;
bordertype = GET_PARAM(3);
Init(type);
}
};
TEST_P(Blur, Mat)
@ -247,116 +151,36 @@ TEST_P(Blur, Mat)
for(int j = 0; j < LOOP_TIMES; j++)
{
random_roi();
cv::blur(mat1_roi, dst_roi, ksize, Point(-1, -1), bordertype);
cv::ocl::blur(gmat1, gdst, ksize, Point(-1, -1), bordertype);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d", roicols, roirows, src1x, src1y, dstx, dsty);
EXPECT_MAT_NEAR(dst, cpu_dst, 1.0, sss);
Near(1.0);
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////
//Laplacian
PARAM_TEST_CASE(LaplacianTestBase, MatType, int)
struct Laplacian : FilterTestBase
{
int type;
int ksize;
//src mat
cv::Mat mat;
cv::Mat dst;
// set up roi
int roicols;
int roirows;
int srcx;
int srcy;
int dstx;
int dsty;
//src mat with roi
cv::Mat mat_roi;
cv::Mat dst_roi;
//std::vector<cv::ocl::Info> oclinfo;
//ocl dst mat for testing
cv::ocl::oclMat gdst_whole;
//ocl mat with roi
cv::ocl::oclMat gmat;
cv::ocl::oclMat gdst;
cv::Size ksize;
virtual void SetUp()
{
type = GET_PARAM(0);
ksize = GET_PARAM(1);
cv::RNG &rng = TS::ptr()->get_rng();
cv::Size size(MWIDTH, MHEIGHT);
mat = randomMat(rng, size, type, 5, 16, false);
dst = randomMat(rng, size, type, 5, 16, false);
//int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
//CV_Assert(devnums > 0);
////if you want to use undefault device, set it here
////setDevice(oclinfo[0]);
}
void random_roi()
{
#ifdef RANDOMROI
//randomize ROI
cv::RNG &rng = TS::ptr()->get_rng();
roicols = rng.uniform(2, mat.cols);
roirows = rng.uniform(2, mat.rows);
srcx = rng.uniform(0, mat.cols - roicols);
srcy = rng.uniform(0, mat.rows - roirows);
dstx = rng.uniform(0, dst.cols - roicols);
dsty = rng.uniform(0, dst.rows - roirows);
#else
roicols = mat.cols;
roirows = mat.rows;
srcx = 0;
srcy = 0;
dstx = 0;
dsty = 0;
#endif
mat_roi = mat(Rect(srcx, srcy, roicols, roirows));
dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
gdst_whole = dst;
gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
gmat = mat_roi;
Init(type);
}
};
struct Laplacian : LaplacianTestBase {};
TEST_P(Laplacian, Accuracy)
{
for(int j = 0; j < LOOP_TIMES; j++)
{
random_roi();
cv::Laplacian(mat_roi, dst_roi, -1, ksize, 1);
cv::ocl::Laplacian(gmat, gdst, -1, ksize, 1);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d", roicols, roirows, srcx, srcy, dstx, dsty);
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
cv::Laplacian(mat1_roi, dst_roi, -1, ksize.width, 1);
cv::ocl::Laplacian(gmat1, gdst, -1, ksize.width, 1);
Near(1e-5);
}
}
@ -364,8 +188,7 @@ TEST_P(Laplacian, Accuracy)
/////////////////////////////////////////////////////////////////////////////////////////////////
// erode & dilate
PARAM_TEST_CASE(ErodeDilateBase, MatType, int)
struct ErodeDilate : FilterTestBase
{
int type;
int iterations;
@ -373,210 +196,54 @@ PARAM_TEST_CASE(ErodeDilateBase, MatType, int)
//erode or dilate kernel
cv::Mat kernel;
//src mat
cv::Mat mat1;
cv::Mat dst;
// set up roi
int roicols;
int roirows;
int src1x;
int src1y;
int dstx;
int dsty;
//src mat with roi
cv::Mat mat1_roi;
cv::Mat dst_roi;
//std::vector<cv::ocl::Info> oclinfo;
//ocl dst mat for testing
cv::ocl::oclMat gdst_whole;
//ocl mat with roi
cv::ocl::oclMat gmat1;
cv::ocl::oclMat gdst;
virtual void SetUp()
{
type = GET_PARAM(0);
iterations = GET_PARAM(1);
cv::RNG &rng = TS::ptr()->get_rng();
cv::Size size(MWIDTH, MHEIGHT);
mat1 = randomMat(rng, size, type, 5, 16, false);
dst = randomMat(rng, size, type, 5, 16, false);
iterations = GET_PARAM(3);
Init(type);
// rng.fill(kernel, cv::RNG::UNIFORM, cv::Scalar::all(0), cv::Scalar::all(3));
kernel = randomMat(rng, Size(3, 3), CV_8UC1, 0, 3, false);
}
void random_roi()
{
#ifdef RANDOMROI
//randomize ROI
cv::RNG &rng = TS::ptr()->get_rng();
roicols = rng.uniform(2, mat1.cols);
roirows = rng.uniform(2, mat1.rows);
src1x = rng.uniform(0, mat1.cols - roicols);
src1y = rng.uniform(0, mat1.rows - roirows);
dstx = rng.uniform(0, dst.cols - roicols);
dsty = rng.uniform(0, dst.rows - roirows);
#else
roicols = mat1.cols;
roirows = mat1.rows;
src1x = 0;
src1y = 0;
dstx = 0;
dsty = 0;
#endif
mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
gdst_whole = dst;
gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
gmat1 = mat1_roi;
kernel = randomMat(Size(3, 3), CV_8UC1, 0, 3);
}
};
// erode
struct Erode : ErodeDilateBase {};
TEST_P(Erode, Mat)
TEST_P(ErodeDilate, Mat)
{
for(int j = 0; j < LOOP_TIMES; j++)
{
random_roi();
cv::erode(mat1_roi, dst_roi, kernel, Point(-1, -1), iterations);
cv::ocl::erode(gmat1, gdst, kernel, Point(-1, -1), iterations);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d", roicols, roirows, src1x, src1y, dstx, dsty);
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
Near(1e-5);
}
}
// dilate
struct Dilate : ErodeDilateBase {};
TEST_P(Dilate, Mat)
{
for(int j = 0; j < LOOP_TIMES; j++)
{
random_roi();
cv::erode(mat1_roi, dst_roi, kernel, Point(-1, -1), iterations);
cv::ocl::erode(gmat1, gdst, kernel, Point(-1, -1), iterations);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d", roicols, roirows, src1x, src1y, dstx, dsty);
EXPECT_MAT_NEAR(dst, cpu_dst, 1e-5, sss);
cv::dilate(mat1_roi, dst_roi, kernel, Point(-1, -1), iterations);
cv::ocl::dilate(gmat1, gdst, kernel, Point(-1, -1), iterations);
Near(1e-5);
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////
// Sobel
PARAM_TEST_CASE(Sobel, MatType, int, int, int, int)
struct Sobel : FilterTestBase
{
int type;
int dx, dy, ksize, bordertype;
//src mat
cv::Mat mat1;
cv::Mat dst;
// set up roi
int roicols;
int roirows;
int src1x;
int src1y;
int dstx;
int dsty;
//src mat with roi
cv::Mat mat1_roi;
cv::Mat dst_roi;
//std::vector<cv::ocl::Info> oclinfo;
//ocl dst mat for testing
cv::ocl::oclMat gdst_whole;
//ocl mat with roi
cv::ocl::oclMat gmat1;
cv::ocl::oclMat gdst;
virtual void SetUp()
{
type = GET_PARAM(0);
dx = GET_PARAM(1);
dy = GET_PARAM(2);
ksize = GET_PARAM(3);
bordertype = GET_PARAM(4);
cv::RNG &rng = TS::ptr()->get_rng();
cv::Size size(MWIDTH, MHEIGHT);
mat1 = randomMat(rng, size, type, 5, 16, false);
dst = randomMat(rng, size, type, 5, 16, false);
//int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
//CV_Assert(devnums > 0);
////if you want to use undefault device, set it here
////setDevice(oclinfo[0]);
}
void random_roi()
{
#ifdef RANDOMROI
//randomize ROI
cv::RNG &rng = TS::ptr()->get_rng();
roicols = rng.uniform(2, mat1.cols);
roirows = rng.uniform(2, mat1.rows);
src1x = rng.uniform(0, mat1.cols - roicols);
src1y = rng.uniform(0, mat1.rows - roirows);
dstx = rng.uniform(0, dst.cols - roicols);
dsty = rng.uniform(0, dst.rows - roirows);
#else
roicols = mat1.cols;
roirows = mat1.rows;
src1x = 0;
src1y = 0;
dstx = 0;
dsty = 0;
#endif
mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
gdst_whole = dst;
gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
gmat1 = mat1_roi;
Size s = GET_PARAM(1);
ksize = s.width;
s = GET_PARAM(2);
dx = s.width;
dy = s.height;
bordertype = GET_PARAM(3);
Init(type);
}
};
TEST_P(Sobel, Mat)
@ -584,103 +251,29 @@ TEST_P(Sobel, Mat)
for(int j = 0; j < LOOP_TIMES; j++)
{
random_roi();
cv::Sobel(mat1_roi, dst_roi, -1, dx, dy, ksize, /*scale*/0.00001,/*delta*/0, bordertype);
cv::ocl::Sobel(gmat1, gdst, -1, dx, dy, ksize,/*scale*/0.00001,/*delta*/0, bordertype);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d", roicols, roirows, src1x, src1y, dstx, dsty);
EXPECT_MAT_NEAR(dst, cpu_dst, 1, sss);
Near(1);
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////
// Scharr
PARAM_TEST_CASE(Scharr, MatType, int, int, int)
struct Scharr : FilterTestBase
{
int type;
int dx, dy, bordertype;
//src mat
cv::Mat mat1;
cv::Mat dst;
// set up roi
int roicols;
int roirows;
int src1x;
int src1y;
int dstx;
int dsty;
//src mat with roi
cv::Mat mat1_roi;
cv::Mat dst_roi;
//std::vector<cv::ocl::Info> oclinfo;
//ocl dst mat for testing
cv::ocl::oclMat gdst_whole;
//ocl mat with roi
cv::ocl::oclMat gmat1;
cv::ocl::oclMat gdst;
virtual void SetUp()
{
type = GET_PARAM(0);
dx = GET_PARAM(1);
dy = GET_PARAM(2);
Size s = GET_PARAM(2);
dx = s.width;
dy = s.height;
bordertype = GET_PARAM(3);
dx = 1;
dy = 0;
cv::RNG &rng = TS::ptr()->get_rng();
cv::Size size(MWIDTH, MHEIGHT);
mat1 = randomMat(rng, size, type, 5, 16, false);
dst = randomMat(rng, size, type, 5, 16, false);
//int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
//CV_Assert(devnums > 0);
////if you want to use undefault device, set it here
////setDevice(oclinfo[0]);
Init(type);
}
void random_roi()
{
#ifdef RANDOMROI
//randomize ROI
cv::RNG &rng = TS::ptr()->get_rng();
roicols = rng.uniform(2, mat1.cols);
roirows = rng.uniform(2, mat1.rows);
src1x = rng.uniform(0, mat1.cols - roicols);
src1y = rng.uniform(0, mat1.rows - roirows);
dstx = rng.uniform(0, dst.cols - roicols);
dsty = rng.uniform(0, dst.rows - roirows);
#else
roicols = mat1.cols;
roirows = mat1.rows;
src1x = 0;
src1y = 0;
dstx = 0;
dsty = 0;
#endif
mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
gdst_whole = dst;
gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
gmat1 = mat1_roi;
}
};
TEST_P(Scharr, Mat)
@ -688,16 +281,9 @@ TEST_P(Scharr, Mat)
for(int j = 0; j < LOOP_TIMES; j++)
{
random_roi();
cv::Scharr(mat1_roi, dst_roi, -1, dx, dy, /*scale*/1,/*delta*/0, bordertype);
cv::ocl::Scharr(gmat1, gdst, -1, dx, dy,/*scale*/1,/*delta*/0, bordertype);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d", roicols, roirows, src1x, src1y, dstx, dsty);
EXPECT_MAT_NEAR(dst, cpu_dst, 1, sss);
Near(1);
}
}
@ -705,89 +291,23 @@ TEST_P(Scharr, Mat)
/////////////////////////////////////////////////////////////////////////////////////////////////
// GaussianBlur
PARAM_TEST_CASE(GaussianBlur, MatType, cv::Size, int)
struct GaussianBlur : FilterTestBase
{
int type;
cv::Size ksize;
int bordertype;
double sigma1, sigma2;
//src mat
cv::Mat mat1;
cv::Mat dst;
// set up roi
int roicols;
int roirows;
int src1x;
int src1y;
int dstx;
int dsty;
//src mat with roi
cv::Mat mat1_roi;
cv::Mat dst_roi;
//std::vector<cv::ocl::Info> oclinfo;
//ocl dst mat for testing
cv::ocl::oclMat gdst_whole;
//ocl mat with roi
cv::ocl::oclMat gmat1;
cv::ocl::oclMat gdst;
virtual void SetUp()
{
type = GET_PARAM(0);
ksize = GET_PARAM(1);
bordertype = GET_PARAM(2);
bordertype = GET_PARAM(3);
Init(type);
cv::RNG &rng = TS::ptr()->get_rng();
cv::Size size(MWIDTH, MHEIGHT);
sigma1 = rng.uniform(0.1, 1.0);
sigma2 = rng.uniform(0.1, 1.0);
mat1 = randomMat(rng, size, type, 5, 16, false);
dst = randomMat(rng, size, type, 5, 16, false);
//int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
//CV_Assert(devnums > 0);
////if you want to use undefault device, set it here
////setDevice(oclinfo[0]);
}
void random_roi()
{
#ifdef RANDOMROI
//randomize ROI
cv::RNG &rng = TS::ptr()->get_rng();
roicols = rng.uniform(2, mat1.cols);
roirows = rng.uniform(2, mat1.rows);
src1x = rng.uniform(0, mat1.cols - roicols);
src1y = rng.uniform(0, mat1.rows - roirows);
dstx = rng.uniform(0, dst.cols - roicols);
dsty = rng.uniform(0, dst.rows - roirows);
#else
roicols = mat1.cols;
roirows = mat1.rows;
src1x = 0;
src1y = 0;
dstx = 0;
dsty = 0;
#endif
mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
gdst_whole = dst;
gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
gmat1 = mat1_roi;
}
};
TEST_P(GaussianBlur, Mat)
@ -795,53 +315,53 @@ TEST_P(GaussianBlur, Mat)
for(int j = 0; j < LOOP_TIMES; j++)
{
random_roi();
cv::GaussianBlur(mat1_roi, dst_roi, ksize, sigma1, sigma2, bordertype);
cv::ocl::GaussianBlur(gmat1, gdst, ksize, sigma1, sigma2, bordertype);
cv::Mat cpu_dst;
gdst_whole.download(cpu_dst);
char sss[1024];
sprintf(sss, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d", roicols, roirows, src1x, src1y, dstx, dsty);
EXPECT_MAT_NEAR(dst, cpu_dst, 1.0, sss);
Near(1);
}
}
INSTANTIATE_TEST_CASE_P(Filter, Blur, Combine(Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4),
INSTANTIATE_TEST_CASE_P(Filter, Blur, Combine(
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4),
Values(cv::Size(3, 3), cv::Size(5, 5), cv::Size(7, 7)),
Values(Size(0, 0)), //not use
Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE, (MatType)cv::BORDER_REFLECT, (MatType)cv::BORDER_REFLECT_101)));
INSTANTIATE_TEST_CASE_P(Filters, Laplacian, Combine(
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
Values(1, 3)));
INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1)));
//INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine(Values(CV_8UC1, CV_8UC1), Values(false)));
INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1)));
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
Values(Size(3, 3)),
Values(Size(0, 0)), //not use
Values(0))); //not use
//INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine(Values(CV_8UC1, CV_8UC1), Values(false)));
INSTANTIATE_TEST_CASE_P(Filter, ErodeDilate, Combine(
Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
Values(Size(0, 0)), //not use
Values(Size(0, 0)), //not use
Values(1)));
INSTANTIATE_TEST_CASE_P(Filter, Sobel, Combine(Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
Values(1, 2), Values(0, 1), Values(3, 5), Values((MatType)cv::BORDER_CONSTANT,
(MatType)cv::BORDER_REPLICATE)));
INSTANTIATE_TEST_CASE_P(Filter, Sobel, Combine(
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
Values(Size(3, 3), Size(5, 5)),
Values(Size(1, 0), Size(1, 1), Size(2, 0), Size(2, 1)),
Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE)));
INSTANTIATE_TEST_CASE_P(Filter, Scharr, Combine(
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4), Values(0, 1), Values(0, 1),
Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE)));
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4),
Values(Size(0, 0)), //not use
Values(Size(0, 1), Size(1, 0)),
Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE)));
INSTANTIATE_TEST_CASE_P(Filter, GaussianBlur, Combine(
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4),
Values(cv::Size(3, 3), cv::Size(5, 5)),
Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE)));
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4),
Values(Size(3, 3), Size(5, 5)),
Values(Size(0, 0)), //not use
Values((MatType)cv::BORDER_CONSTANT, (MatType)cv::BORDER_REPLICATE)));

@ -53,13 +53,12 @@ PARAM_TEST_CASE(Gemm, int, cv::Size, int)
int type;
cv::Size mat_size;
int flags;
//vector<cv::ocl::Info> info;
virtual void SetUp()
{
type = GET_PARAM(0);
mat_size = GET_PARAM(1);
flags = GET_PARAM(2);
//cv::ocl::getDevice(info);
}
};

@ -12,10 +12,12 @@
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
// Sen Liu, swjutls1987@126.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
@ -61,40 +63,31 @@ struct getRect
}
};
PARAM_TEST_CASE(HaarTestBase, int, int)
PARAM_TEST_CASE(Haar, double, int)
{
//std::vector<cv::ocl::Info> oclinfo;
cv::ocl::OclCascadeClassifier cascade, nestedCascade;
cv::ocl::OclCascadeClassifierBuf cascadebuf;
cv::CascadeClassifier cpucascade, cpunestedCascade;
// Mat img;
double scale;
int index;
int flags;
virtual void SetUp()
{
scale = 1.0;
index = 0;
scale = GET_PARAM(0);
flags = GET_PARAM(1);
string cascadeName = workdir + "../../data/haarcascades/haarcascade_frontalface_alt.xml";
if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)))
if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)) || (!cascadebuf.load( cascadeName )))
{
cout << "ERROR: Could not load classifier cascade" << endl;
return;
}
//int devnums = getDevice(oclinfo);
//CV_Assert(devnums>0);
////if you want to use undefault device, set it here
////setDevice(oclinfo[0]);
//cv::ocl::setBinpath("E:\\");
}
};
////////////////////////////////faceDetect/////////////////////////////////////////////////
struct Haar : HaarTestBase {};
TEST_F(Haar, FaceDetect)
TEST_P(Haar, FaceDetect)
{
string imgName = workdir + "lena.jpg";
Mat img = imread( imgName, 1 );
@ -105,59 +98,65 @@ TEST_F(Haar, FaceDetect)
return ;
}
//int i = 0;
//double t = 0;
vector<Rect> faces, oclfaces;
// const static Scalar colors[] = { CV_RGB(0, 0, 255),
// CV_RGB(0, 128, 255),
// CV_RGB(0, 255, 255),
// CV_RGB(0, 255, 0),
// CV_RGB(255, 128, 0),
// CV_RGB(255, 255, 0),
// CV_RGB(255, 0, 0),
// CV_RGB(255, 0, 255)
// } ;
Mat gray, smallImg(cvRound (img.rows / scale), cvRound(img.cols / scale), CV_8UC1 );
MemStorage storage(cvCreateMemStorage(0));
cvtColor( img, gray, COLOR_BGR2GRAY );
resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
equalizeHist( smallImg, smallImg );
cv::ocl::oclMat image;
CvSeq *_objects;
image.upload(smallImg);
_objects = cascade.oclHaarDetectObjects( image, storage, 1.1,
3, 0
| CV_HAAR_SCALE_IMAGE
, Size(30, 30), Size(0, 0) );
3, flags, Size(30, 30), Size(0, 0) );
vector<CvAvgComp> vecAvgComp;
Seq<CvAvgComp>(_objects).copyTo(vecAvgComp);
oclfaces.resize(vecAvgComp.size());
std::transform(vecAvgComp.begin(), vecAvgComp.end(), oclfaces.begin(), getRect());
cpucascade.detectMultiScale( smallImg, faces, 1.1,
3, 0
| CV_HAAR_SCALE_IMAGE
, Size(30, 30), Size(0, 0) );
cpucascade.detectMultiScale( smallImg, faces, 1.1, 3,
flags,
Size(30, 30), Size(0, 0) );
EXPECT_EQ(faces.size(), oclfaces.size());
/* for( vector<Rect>::const_iterator r = faces.begin(); r != faces.end(); r++, i++ )
}
TEST_P(Haar, FaceDetectUseBuf)
{
string imgName = workdir + "lena.jpg";
Mat img = imread( imgName, 1 );
if(img.empty())
{
Mat smallImgROI;
Point center;
Scalar color = colors[i%8];
int radius;
center.x = cvRound((r->x + r->width*0.5)*scale);
center.y = cvRound((r->y + r->height*0.5)*scale);
radius = cvRound((r->width + r->height)*0.25*scale);
circle( img, center, radius, color, 3, 8, 0 );
} */
//namedWindow("result");
//imshow("result",img);
//waitKey(0);
//destroyAllWindows();
std::cout << "Couldn't read " << imgName << std::endl;
return ;
}
vector<Rect> faces, oclfaces;
Mat gray, smallImg(cvRound (img.rows / scale), cvRound(img.cols / scale), CV_8UC1 );
MemStorage storage(cvCreateMemStorage(0));
cvtColor( img, gray, CV_BGR2GRAY );
resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
equalizeHist( smallImg, smallImg );
cv::ocl::oclMat image;
image.upload(smallImg);
cascadebuf.detectMultiScale( image, oclfaces, 1.1, 3,
flags,
Size(30, 30), Size(0, 0) );
cascadebuf.release();
cpucascade.detectMultiScale( smallImg, faces, 1.1, 3,
flags,
Size(30, 30), Size(0, 0) );
EXPECT_EQ(faces.size(), oclfaces.size());
}
INSTANTIATE_TEST_CASE_P(FaceDetect, Haar,
Combine(Values(1.0),
Values(CV_HAAR_SCALE_IMAGE, 0)));
#endif // HAVE_OPENCL

@ -240,12 +240,11 @@ TEST_P(HOG, Detect)
}
}
char s[100] = {0};
EXPECT_MAT_NEAR(cv::Mat(d_comp), cv::Mat(comp), 3, s);
EXPECT_MAT_NEAR(cv::Mat(d_comp), cv::Mat(comp), 3);
}
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, HOG, testing::Combine(
INSTANTIATE_TEST_CASE_P(OCL_ObjDetect, HOG, testing::Combine(
testing::Values(cv::Size(64, 128), cv::Size(48, 96)),
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4))));

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save