From eb7ff99f8012c9d4c5c7f911d00d922982ac8a8f Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 8 Nov 2013 01:08:36 +0400 Subject: [PATCH 01/28] fixed cv::remap and cv::convertMaps for map types CV_16SC2 && CV_16UC1 --- modules/imgproc/src/imgwarp.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index 39cc043db9..1ae73291f7 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -2935,7 +2935,10 @@ public: if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) ) { bufxy = (*m1)(Rect(x, y, bcols, brows)); - bufa = (*m2)(Rect(x, y, bcols, brows)); + + const ushort* sA = (const ushort*)(m2->data + m2->step*(y+y1)) + x; + for( x1 = 0; x1 < bcols; x1++ ) + A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1)); } else if( planar_input ) { @@ -3242,7 +3245,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, { for( x = 0; x < size.width; x++ ) { - int fxy = src2 ? src2[x] : 0; + int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1) : 0; dst1f[x] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale; dst2f[x] = src1[x*2+1] + (fxy >> INTER_BITS)*scale; } @@ -3251,7 +3254,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2, { for( x = 0; x < size.width; x++ ) { - int fxy = src2 ? src2[x] : 0; + int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1): 0; dst1f[x*2] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale; dst1f[x*2+1] = src1[x*2+1] + (fxy >> INTER_BITS)*scale; } From e8d2a9752b7f6671386c10cf3af0006951b23dfc Mon Sep 17 00:00:00 2001 From: Vladimir Bystricky Date: Fri, 13 Dec 2013 17:25:16 +0400 Subject: [PATCH 02/28] Add support Creative Senz3D camera by Intel Perceptual Computing SDK --- CMakeLists.txt | 7 +- cmake/OpenCVFindIntelPerCSDK.cmake | 51 ++ cmake/OpenCVFindLibsVideo.cmake | 6 + cmake/templates/cvconfig.h.in | 3 + modules/highgui/CMakeLists.txt | 6 + .../include/opencv2/highgui/highgui_c.h | 25 +- modules/highgui/src/cap.cpp | 12 + modules/highgui/src/cap_intelperc.cpp | 699 ++++++++++++++++++ modules/highgui/src/precomp.hpp | 1 + modules/highgui/test/test_precomp.hpp | 1 + samples/cpp/intelperc_capture.cpp | 379 ++++++++++ 11 files changed, 1188 insertions(+), 2 deletions(-) create mode 100644 cmake/OpenCVFindIntelPerCSDK.cmake create mode 100644 modules/highgui/src/cap_intelperc.cpp create mode 100644 samples/cpp/intelperc_capture.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ebaf45e56a..229b0689af 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -163,7 +163,7 @@ OCV_OPTION(WITH_XINE "Include Xine support (GPL)" OFF OCV_OPTION(WITH_OPENCL "Include OpenCL Runtime support" ON IF (NOT IOS) ) OCV_OPTION(WITH_OPENCLAMDFFT "Include AMD OpenCL FFT library support" ON IF (NOT ANDROID AND NOT IOS) ) OCV_OPTION(WITH_OPENCLAMDBLAS "Include AMD OpenCL BLAS library support" ON IF (NOT ANDROID AND NOT IOS) ) - +OCV_OPTION(WITH_INTELPERC "Include Intel Perceptual Computing support" OFF IF WIN32 ) # OpenCV build components # =================================================== @@ -829,6 +829,11 @@ if(DEFINED WITH_XINE) status(" Xine:" HAVE_XINE THEN "YES (ver ${ALIASOF_libxine_VERSION})" ELSE NO) endif(DEFINED WITH_XINE) +if(DEFINED WITH_INTELPERC) + status(" Intel PerC:" HAVE_INTELPERC THEN "YES" ELSE NO) +endif(DEFINED WITH_INTELPERC) + + # ========================== Other third-party libraries ========================== status("") status(" Other third-party libraries:") diff --git a/cmake/OpenCVFindIntelPerCSDK.cmake 
b/cmake/OpenCVFindIntelPerCSDK.cmake
new file mode 100644
index 0000000000..2d45c6e227
--- /dev/null
+++ b/cmake/OpenCVFindIntelPerCSDK.cmake
@@ -0,0 +1,51 @@
+# Main variables:
+# INTELPERC_LIBRARY and INTELPERC_INCLUDES to link Intel Perceptual Computing SDK modules
+# HAVE_INTELPERC for conditional compilation OpenCV with/without Intel Perceptual Computing SDK
+
+if(NOT "${INTELPERC_LIB_DIR}" STREQUAL "${INTELPERC_LIB_DIR_INTERNAL}")
+    unset(INTELPERC_LIBRARY CACHE)
+    unset(INTELPERC_LIB_DIR CACHE)
+endif()
+
+if(NOT "${INTELPERC_INCLUDE_DIR}" STREQUAL "${INTELPERC_INCLUDE_DIR_INTERNAL}")
+    unset(INTELPERC_INCLUDES CACHE)
+    unset(INTELPERC_INCLUDE_DIR CACHE)
+endif()
+
+if(WIN32)
+    if(NOT (MSVC64 OR MINGW64))
+        find_file(INTELPERC_INCLUDES "pxcsession.h" PATHS "$ENV{PCSDK_DIR}include" DOC "Intel Perceptual Computing SDK interface header")
+        find_library(INTELPERC_LIBRARY "libpxc.lib" PATHS "$ENV{PCSDK_DIR}lib/Win32" DOC "Intel Perceptual Computing SDK library")
+    else()
+        find_file(INTELPERC_INCLUDES "pxcsession.h" PATHS "$ENV{PCSDK_DIR}include" DOC "Intel Perceptual Computing SDK interface header")
+        find_library(INTELPERC_LIBRARY "libpxc.lib" PATHS "$ENV{PCSDK_DIR}/lib/x64" DOC "Intel Perceptual Computing SDK library")
+    endif()
+endif()
+
+if(INTELPERC_LIBRARY AND INTELPERC_INCLUDES)
+    set(HAVE_INTELPERC TRUE)
+endif() #if(INTELPERC_LIBRARY AND INTELPERC_INCLUDES)
+
+get_filename_component(INTELPERC_LIB_DIR "${INTELPERC_LIBRARY}" PATH)
+get_filename_component(INTELPERC_INCLUDE_DIR "${INTELPERC_INCLUDES}" PATH)
+
+if(HAVE_INTELPERC)
+    set(INTELPERC_LIB_DIR "${INTELPERC_LIB_DIR}" CACHE PATH "Path to Intel Perceptual Computing SDK interface libraries" FORCE)
+    set(INTELPERC_INCLUDE_DIR "${INTELPERC_INCLUDE_DIR}" CACHE PATH "Path to Intel Perceptual Computing SDK interface headers" FORCE)
+endif()
+
+if(INTELPERC_LIBRARY)
+    set(INTELPERC_LIB_DIR_INTERNAL "${INTELPERC_LIB_DIR}" CACHE INTERNAL "This is the value of the last time INTELPERC_LIB_DIR was set successfully." FORCE)
+else()
+    message( WARNING, " Intel Perceptual Computing SDK library directory (set by INTELPERC_LIB_DIR variable) is not found or does not have Intel Perceptual Computing SDK libraries." )
+endif()
+
+if(INTELPERC_INCLUDES)
+    set(INTELPERC_INCLUDE_DIR_INTERNAL "${INTELPERC_INCLUDE_DIR}" CACHE INTERNAL "This is the value of the last time INTELPERC_INCLUDE_DIR was set successfully." FORCE)
+else()
+    message( WARNING, " Intel Perceptual Computing SDK include directory (set by INTELPERC_INCLUDE_DIR variable) is not found or does not have Intel Perceptual Computing SDK include files."
) +endif() + +mark_as_advanced(FORCE INTELPERC_LIBRARY) +mark_as_advanced(FORCE INTELPERC_INCLUDES) + diff --git a/cmake/OpenCVFindLibsVideo.cmake b/cmake/OpenCVFindLibsVideo.cmake index 00ed56ad31..22b58f5ef1 100644 --- a/cmake/OpenCVFindLibsVideo.cmake +++ b/cmake/OpenCVFindLibsVideo.cmake @@ -250,3 +250,9 @@ if (NOT IOS) set(HAVE_QTKIT YES) endif() endif() + +# --- Intel Perceptual Computing SSDK --- +ocv_clear_vars(HAVE_INTELPERC) +if(WITH_INTELPERC) + include("${OpenCV_SOURCE_DIR}/cmake/OpenCVFindIntelPerCSDK.cmake") +endif(WITH_INTELPERC) diff --git a/cmake/templates/cvconfig.h.in b/cmake/templates/cvconfig.h.in index 88c307dd98..f52c5e457c 100644 --- a/cmake/templates/cvconfig.h.in +++ b/cmake/templates/cvconfig.h.in @@ -158,6 +158,9 @@ /* Xine video library */ #cmakedefine HAVE_XINE +/* Intel Perceptual Computing SDK library */ +#cmakedefine HAVE_INTELPERC + /* Define to 1 if your processor stores words with the most significant byte first (like Motorola and SPARC, unlike Intel and VAX). */ #cmakedefine WORDS_BIGENDIAN diff --git a/modules/highgui/CMakeLists.txt b/modules/highgui/CMakeLists.txt index c3ad7ca740..5c86a2fcd1 100644 --- a/modules/highgui/CMakeLists.txt +++ b/modules/highgui/CMakeLists.txt @@ -218,6 +218,12 @@ elseif(HAVE_QTKIT) list(APPEND HIGHGUI_LIBRARIES "-framework QTKit" "-framework QuartzCore" "-framework AppKit") endif() +if(HAVE_INTELPERC) + list(APPEND highgui_srcs src/cap_intelperc.cpp) + ocv_include_directories(${INTELPERC_INCLUDE_DIR}) + list(APPEND HIGHGUI_LIBRARIES ${INTELPERC_LIBRARY}) +endif(HAVE_INTELPERC) + if(IOS) add_definitions(-DHAVE_IOS=1) list(APPEND highgui_srcs src/ios_conversions.mm src/cap_ios_abstract_camera.mm src/cap_ios_photo_camera.mm src/cap_ios_video_camera.mm) diff --git a/modules/highgui/include/opencv2/highgui/highgui_c.h b/modules/highgui/include/opencv2/highgui/highgui_c.h index 9204ee81f4..99f453385d 100644 --- a/modules/highgui/include/opencv2/highgui/highgui_c.h +++ b/modules/highgui/include/opencv2/highgui/highgui_c.h @@ -312,7 +312,9 @@ enum CV_CAP_AVFOUNDATION = 1200, // AVFoundation framework for iOS (OS X Lion will have the same API) - CV_CAP_GIGANETIX = 1300 // Smartek Giganetix GigEVisionSDK + CV_CAP_GIGANETIX = 1300, // Smartek Giganetix GigEVisionSDK + + CV_CAP_INTELPERC = 1500 // Intel Perceptual Computing SDK }; /* start capturing frames from camera: index = camera_index + domain_offset (CV_CAP_*) */ @@ -468,6 +470,19 @@ enum CV_CAP_PROP_GIGA_FRAME_HEIGH_MAX = 10004, CV_CAP_PROP_GIGA_FRAME_SENS_WIDTH = 10005, CV_CAP_PROP_GIGA_FRAME_SENS_HEIGH = 10006 + + ,CV_CAP_PROP_INTELPERC_PROFILE_COUNT = 11001, + CV_CAP_PROP_INTELPERC_PROFILE_IDX = 11002, + CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE = 11003, + CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE = 11004, + CV_CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD = 11005, + CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ = 11006, + CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT = 11007, + + // Intel PerC streams + CV_CAP_INTELPERC_DEPTH_STREAM = 1 << 31, + CV_CAP_INTELPERC_IMAGE_STREAM = 1 << 30, + CV_CAP_INTELPERC_STREAMS_MASK = CV_CAP_INTELPERC_DEPTH_STREAM + CV_CAP_INTELPERC_IMAGE_STREAM, }; enum @@ -548,6 +563,14 @@ enum CV_CAP_ANDROID_ANTIBANDING_OFF }; +enum +{ + CV_CAP_INTELPERC_DEPTH_MAP = 0, // Each pixel is a 16-bit integer. The value indicates the distance from an object to the camera's XY plane or the Cartesian depth. 
+ CV_CAP_INTELPERC_UVDEPTH_MAP = 1, // Each pixel contains two 32-bit floating point values in the range of 0-1, representing the mapping of depth coordinates to the color coordinates. + CV_CAP_INTELPERC_IR_MAP = 2, // Each pixel is a 16-bit integer. The value indicates the intensity of the reflected laser beam. + CV_CAP_INTELPERC_IMAGE = 3, +}; + /* retrieve or set capture properties */ CVAPI(double) cvGetCaptureProperty( CvCapture* capture, int property_id ); CVAPI(int) cvSetCaptureProperty( CvCapture* capture, int property_id, double value ); diff --git a/modules/highgui/src/cap.cpp b/modules/highgui/src/cap.cpp index bbfcc85964..f3dc8b9787 100644 --- a/modules/highgui/src/cap.cpp +++ b/modules/highgui/src/cap.cpp @@ -155,6 +155,9 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index) #endif #ifdef HAVE_GIGE_API CV_CAP_GIGANETIX, +#endif +#ifdef HAVE_INTELPERC + CV_CAP_INTELPERC, #endif -1 }; @@ -193,6 +196,7 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index) defined(HAVE_AVFOUNDATION) || \ defined(HAVE_ANDROID_NATIVE_CAMERA) || \ defined(HAVE_GIGE_API) || \ + defined(HAVE_INTELPERC) || \ (0) // local variable to memorize the captured device CvCapture *capture; @@ -341,6 +345,14 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index) return capture; break; // CV_CAP_GIGANETIX #endif + +#ifdef HAVE_INTELPERC + case CV_CAP_INTELPERC: + capture = cvCreateCameraCapture_IntelPerC(index); + if (capture) + return capture; + break; // CV_CAP_INTEL_PERC +#endif } } diff --git a/modules/highgui/src/cap_intelperc.cpp b/modules/highgui/src/cap_intelperc.cpp new file mode 100644 index 0000000000..d562dc0c8e --- /dev/null +++ b/modules/highgui/src/cap_intelperc.cpp @@ -0,0 +1,699 @@ +#include "precomp.hpp" + +#ifdef HAVE_INTELPERC + +#if defined TBB_INTERFACE_VERSION && TBB_INTERFACE_VERSION < 5000 +# undef HAVE_TBB +#endif + +#include "pxcsession.h" +#include "pxcsmartptr.h" +#include "pxccapture.h" + +class CvIntelPerCStreamBase +{ +protected: + struct FrameInternal + { + IplImage* retrieveFrame() + { + if (m_mat.empty()) + return NULL; + m_iplHeader = IplImage(m_mat); + return &m_iplHeader; + } + cv::Mat m_mat; + private: + IplImage m_iplHeader; + }; +public: + CvIntelPerCStreamBase() + : m_profileIdx(-1) + , m_frameIdx(0) + , m_timeStampStartNS(0) + { + } + virtual ~CvIntelPerCStreamBase() + { + } + + bool isValid() + { + return (m_device.IsValid() && m_stream.IsValid()); + } + bool grabFrame() + { + if (!m_stream.IsValid()) + return false; + if (-1 == m_profileIdx) + { + if (!setProperty(CV_CAP_PROP_INTELPERC_PROFILE_IDX, 0)) + return false; + } + PXCSmartPtr pxcImage; PXCSmartSP sp; + if (PXC_STATUS_NO_ERROR > m_stream->ReadStreamAsync(&pxcImage, &sp)) + return false; + if (PXC_STATUS_NO_ERROR > sp->Synchronize()) + return false; + if (0 == m_timeStampStartNS) + m_timeStampStartNS = pxcImage->QueryTimeStamp(); + m_timeStamp = (double)((pxcImage->QueryTimeStamp() - m_timeStampStartNS) / 10000); + m_frameIdx++; + return prepareIplImage(pxcImage); + } + int getProfileIDX() const + { + return m_profileIdx; + } +public: + virtual bool initStream(PXCSession *session) = 0; + virtual double getProperty(int propIdx) + { + double ret = 0.0; + switch (propIdx) + { + case CV_CAP_PROP_INTELPERC_PROFILE_COUNT: + ret = (double)m_profiles.size(); + break; + case CV_CAP_PROP_FRAME_WIDTH : + if ((0 <= m_profileIdx) && (m_profileIdx < m_profiles.size())) + ret = (double)m_profiles[m_profileIdx].imageInfo.width; + break; + case CV_CAP_PROP_FRAME_HEIGHT : + if ((0 <= m_profileIdx) && (m_profileIdx < 
m_profiles.size())) + ret = (double)m_profiles[m_profileIdx].imageInfo.height; + break; + case CV_CAP_PROP_FPS : + if ((0 <= m_profileIdx) && (m_profileIdx < m_profiles.size())) + { + ret = ((double)m_profiles[m_profileIdx].frameRateMin.numerator / (double)m_profiles[m_profileIdx].frameRateMin.denominator + + (double)m_profiles[m_profileIdx].frameRateMax.numerator / (double)m_profiles[m_profileIdx].frameRateMax.denominator) / 2.0; + } + break; + case CV_CAP_PROP_POS_FRAMES: + ret = (double)m_frameIdx; + break; + case CV_CAP_PROP_POS_MSEC: + ret = m_timeStamp; + break; + }; + return ret; + } + virtual bool setProperty(int propIdx, double propVal) + { + bool isSet = false; + switch (propIdx) + { + case CV_CAP_PROP_INTELPERC_PROFILE_IDX: + { + int propValInt = (int)propVal; + if ((0 <= propValInt) && (propValInt < m_profiles.size())) + { + if (m_profileIdx != propValInt) + { + m_profileIdx = propValInt; + if (m_stream.IsValid()) + m_stream->SetProfile(&m_profiles[m_profileIdx]); + m_frameIdx = 0; + m_timeStampStartNS = 0; + } + isSet = true; + } + } + break; + }; + return isSet; + } +protected: + PXCSmartPtr m_device; + bool initDevice(PXCSession *session) + { + if (NULL == session) + return false; + + pxcStatus sts = PXC_STATUS_NO_ERROR; + PXCSession::ImplDesc templat; + memset(&templat,0,sizeof(templat)); + templat.group = PXCSession::IMPL_GROUP_SENSOR; + templat.subgroup= PXCSession::IMPL_SUBGROUP_VIDEO_CAPTURE; + + for (int modidx = 0; PXC_STATUS_NO_ERROR <= sts; modidx++) + { + PXCSession::ImplDesc desc; + sts = session->QueryImpl(&templat, modidx, &desc); + if (PXC_STATUS_NO_ERROR > sts) + break; + + PXCSmartPtr capture; + sts = session->CreateImpl(&desc, &capture); + if (!capture.IsValid()) + continue; + + /* enumerate devices */ + for (int devidx = 0; PXC_STATUS_NO_ERROR <= sts; devidx++) + { + PXCSmartPtr device; + sts = capture->CreateDevice(devidx, &device); + if (PXC_STATUS_NO_ERROR <= sts) + { + m_device = device.ReleasePtr(); + return true; + } + } + } + return false; + } + + PXCSmartPtr m_stream; + void initStreamImpl(PXCImage::ImageType type) + { + if (!m_device.IsValid()) + return; + + pxcStatus sts = PXC_STATUS_NO_ERROR; + /* enumerate streams */ + for (int streamidx = 0; PXC_STATUS_NO_ERROR <= sts; streamidx++) + { + PXCCapture::Device::StreamInfo sinfo; + sts = m_device->QueryStream(streamidx, &sinfo); + if (PXC_STATUS_NO_ERROR > sts) + break; + if (PXCCapture::VideoStream::CUID != sinfo.cuid) + continue; + if (type != sinfo.imageType) + continue; + + sts = m_device->CreateStream(streamidx, &m_stream); + if (PXC_STATUS_NO_ERROR == sts) + break; + m_stream.ReleaseRef(); + } + } +protected: + std::vector m_profiles; + int m_profileIdx; + int m_frameIdx; + pxcU64 m_timeStampStartNS; + double m_timeStamp; + void enumProfiles() + { + m_profiles.clear(); + if (!m_stream.IsValid()) + return; + pxcStatus sts = PXC_STATUS_NO_ERROR; + for (int profidx = 0; PXC_STATUS_NO_ERROR <= sts; profidx++) + { + PXCCapture::VideoStream::ProfileInfo pinfo; + sts = m_stream->QueryProfile(profidx, &pinfo); + if (PXC_STATUS_NO_ERROR > sts) + break; + m_profiles.push_back(pinfo); + } + } + virtual bool prepareIplImage(PXCImage *pxcImage) = 0; +}; + +class CvIntelPerCStreamImage + : public CvIntelPerCStreamBase +{ +public: + CvIntelPerCStreamImage() + { + } + virtual ~CvIntelPerCStreamImage() + { + } + + virtual bool initStream(PXCSession *session) + { + if (!initDevice(session)) + return false; + initStreamImpl(PXCImage::IMAGE_TYPE_COLOR); + if (!m_stream.IsValid()) + return false; + enumProfiles(); 
+ return true; + } + virtual double getProperty(int propIdx) + { + switch (propIdx) + { + case CV_CAP_PROP_BRIGHTNESS: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_BRIGHTNESS, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_CONTRAST: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_CONTRAST, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_SATURATION: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_SATURATION, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_HUE: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_HUE, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_GAMMA: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_GAMMA, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_SHARPNESS: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_SHARPNESS, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_GAIN: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_GAIN, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_BACKLIGHT: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_BACK_LIGHT_COMPENSATION, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_EXPOSURE: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_EXPOSURE, &fret)) + return (double)fret; + return 0.0; + } + break; + //Add image stream specific properties + } + return CvIntelPerCStreamBase::getProperty(propIdx); + } + virtual bool setProperty(int propIdx, double propVal) + { + switch (propIdx) + { + case CV_CAP_PROP_BRIGHTNESS: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_BRIGHTNESS, (float)propVal)); + } + break; + case CV_CAP_PROP_CONTRAST: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_CONTRAST, (float)propVal)); + } + break; + case CV_CAP_PROP_SATURATION: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_SATURATION, (float)propVal)); + } + break; + case CV_CAP_PROP_HUE: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_HUE, (float)propVal)); + } + break; + case CV_CAP_PROP_GAMMA: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_GAMMA, (float)propVal)); + } + 
break; + case CV_CAP_PROP_SHARPNESS: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_SHARPNESS, (float)propVal)); + } + break; + case CV_CAP_PROP_GAIN: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_GAIN, (float)propVal)); + } + break; + case CV_CAP_PROP_BACKLIGHT: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_BACK_LIGHT_COMPENSATION, (float)propVal)); + } + break; + case CV_CAP_PROP_EXPOSURE: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_EXPOSURE, (float)propVal)); + } + break; + //Add image stream specific properties + } + return CvIntelPerCStreamBase::setProperty(propIdx, propVal); + } +public: + IplImage* retrieveFrame() + { + return m_frame.retrieveFrame(); + } +protected: + FrameInternal m_frame; + bool prepareIplImage(PXCImage *pxcImage) + { + if (NULL == pxcImage) + return false; + PXCImage::ImageInfo info; + pxcImage->QueryInfo(&info); + + PXCImage::ImageData data; + pxcImage->AcquireAccess(PXCImage::ACCESS_READ, PXCImage::COLOR_FORMAT_RGB24, &data); + + if (PXCImage::SURFACE_TYPE_SYSTEM_MEMORY != data.type) + return false; + + cv::Mat temp(info.height, info.width, CV_8UC3, data.planes[0], data.pitches[0]); + temp.copyTo(m_frame.m_mat); + + pxcImage->ReleaseAccess(&data); + return true; + } +}; + +class CvIntelPerCStreamDepth + : public CvIntelPerCStreamBase +{ +public: + CvIntelPerCStreamDepth() + { + } + virtual ~CvIntelPerCStreamDepth() + { + } + + virtual bool initStream(PXCSession *session) + { + if (!initDevice(session)) + return false; + initStreamImpl(PXCImage::IMAGE_TYPE_DEPTH); + if (!m_stream.IsValid()) + return false; + enumProfiles(); + return true; + } + virtual double getProperty(int propIdx) + { + switch (propIdx) + { + case CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_DEPTH_LOW_CONFIDENCE_VALUE, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_DEPTH_SATURATION_VALUE, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD: + { + if (!m_device.IsValid()) + return 0.0; + float fret = 0.0f; + if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_DEPTH_CONFIDENCE_THRESHOLD, &fret)) + return (double)fret; + return 0.0; + } + break; + case CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ: + { + if (!m_device.IsValid()) + return 0.0f; + PXCPointF32 ptf; + if (PXC_STATUS_NO_ERROR == m_device->QueryPropertyAsPoint(PXCCapture::Device::PROPERTY_DEPTH_FOCAL_LENGTH, &ptf)) + return (double)ptf.x; + return 0.0; + } + break; + case CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT: + { + if (!m_device.IsValid()) + return 0.0f; + PXCPointF32 ptf; + if (PXC_STATUS_NO_ERROR == m_device->QueryPropertyAsPoint(PXCCapture::Device::PROPERTY_DEPTH_FOCAL_LENGTH, &ptf)) + return (double)ptf.y; + return 0.0; + } + break; + //Add depth stream sepcific properties + } + return 
CvIntelPerCStreamBase::getProperty(propIdx); + } + virtual bool setProperty(int propIdx, double propVal) + { + switch (propIdx) + { + case CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_DEPTH_LOW_CONFIDENCE_VALUE, (float)propVal)); + } + break; + case CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_DEPTH_SATURATION_VALUE, (float)propVal)); + } + break; + case CV_CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD: + { + if (!m_device.IsValid()) + return false; + return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_DEPTH_CONFIDENCE_THRESHOLD, (float)propVal)); + } + break; + //Add depth stream sepcific properties + } + return CvIntelPerCStreamBase::setProperty(propIdx, propVal); + } +public: + IplImage* retrieveDepthFrame() + { + return m_frameDepth.retrieveFrame(); + } + IplImage* retrieveIRFrame() + { + return m_frameIR.retrieveFrame(); + } + IplImage* retrieveUVFrame() + { + return m_frameUV.retrieveFrame(); + } +protected: + FrameInternal m_frameDepth; + FrameInternal m_frameIR; + FrameInternal m_frameUV; + + bool prepareIplImage(PXCImage *pxcImage) + { + if (NULL == pxcImage) + return false; + PXCImage::ImageInfo info; + pxcImage->QueryInfo(&info); + + PXCImage::ImageData data; + pxcImage->AcquireAccess(PXCImage::ACCESS_READ, &data); + + if (PXCImage::SURFACE_TYPE_SYSTEM_MEMORY != data.type) + return false; + + if (PXCImage::COLOR_FORMAT_DEPTH != data.format) + return false; + + { + cv::Mat temp(info.height, info.width, CV_16SC1, data.planes[0], data.pitches[0]); + temp.copyTo(m_frameDepth.m_mat); + } + { + cv::Mat temp(info.height, info.width, CV_16SC1, data.planes[1], data.pitches[1]); + temp.copyTo(m_frameIR.m_mat); + } + { + cv::Mat temp(info.height, info.width, CV_32FC2, data.planes[2], data.pitches[2]); + temp.copyTo(m_frameUV.m_mat); + } + + pxcImage->ReleaseAccess(&data); + return true; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +class CvCapture_IntelPerC : public CvCapture +{ +public: + CvCapture_IntelPerC(int /*index*/) + : m_contextOpened(false) + { + pxcStatus sts = PXCSession_Create(&m_session); + if (PXC_STATUS_NO_ERROR > sts) + return; + m_contextOpened = m_imageStream.initStream(m_session); + m_contextOpened &= m_depthStream.initStream(m_session); + } + virtual ~CvCapture_IntelPerC(){} + + virtual double getProperty(int propIdx) + { + double propValue = 0; + int purePropIdx = propIdx & ~CV_CAP_INTELPERC_STREAMS_MASK; + if (CV_CAP_INTELPERC_IMAGE_STREAM == (propIdx & CV_CAP_INTELPERC_STREAMS_MASK)) + { + propValue = m_imageStream.getProperty(purePropIdx); + } + else if (CV_CAP_INTELPERC_DEPTH_STREAM == (propIdx & CV_CAP_INTELPERC_STREAMS_MASK)) + { + propValue = m_depthStream.getProperty(purePropIdx); + } + return propValue; + } + virtual bool setProperty(int propIdx, double propVal) + { + bool isSet = false; + int purePropIdx = propIdx & ~CV_CAP_INTELPERC_STREAMS_MASK; + if (CV_CAP_INTELPERC_IMAGE_STREAM == (propIdx & CV_CAP_INTELPERC_STREAMS_MASK)) + { + isSet = m_imageStream.setProperty(purePropIdx, propVal); + } + else if (CV_CAP_INTELPERC_DEPTH_STREAM == (propIdx & CV_CAP_INTELPERC_STREAMS_MASK)) + { + isSet = m_depthStream.setProperty(purePropIdx, propVal); + } + return 
isSet; + } + + bool grabFrame() + { + if (!isOpened()) + return false; + + bool isGrabbed = false; + if (m_depthStream.isValid()) + isGrabbed = m_depthStream.grabFrame(); + if ((m_imageStream.isValid()) && (-1 != m_imageStream.getProfileIDX())) + isGrabbed &= m_imageStream.grabFrame(); + + return isGrabbed; + } + + virtual IplImage* retrieveFrame(int outputType) + { + IplImage* image = 0; + switch (outputType) + { + case CV_CAP_INTELPERC_DEPTH_MAP: + image = m_depthStream.retrieveDepthFrame(); + break; + case CV_CAP_INTELPERC_UVDEPTH_MAP: + image = m_depthStream.retrieveUVFrame(); + break; + case CV_CAP_INTELPERC_IR_MAP: + image = m_depthStream.retrieveIRFrame(); + break; + case CV_CAP_INTELPERC_IMAGE: + image = m_imageStream.retrieveFrame(); + break; + } + CV_Assert(NULL != image); + return image; + } + + bool isOpened() const + { + return m_contextOpened; + } +protected: + bool m_contextOpened; + + PXCSmartPtr m_session; + CvIntelPerCStreamImage m_imageStream; + CvIntelPerCStreamDepth m_depthStream; +}; + + +CvCapture* cvCreateCameraCapture_IntelPerC(int index) +{ + CvCapture_IntelPerC* capture = new CvCapture_IntelPerC(index); + + if( capture->isOpened() ) + return capture; + + delete capture; + return 0; +} + + +#endif //HAVE_INTELPERC diff --git a/modules/highgui/src/precomp.hpp b/modules/highgui/src/precomp.hpp index dcd4afdc01..88ba8e4b20 100644 --- a/modules/highgui/src/precomp.hpp +++ b/modules/highgui/src/precomp.hpp @@ -127,6 +127,7 @@ CvCapture* cvCreateFileCapture_OpenNI( const char* filename ); CvCapture* cvCreateCameraCapture_Android( int index ); CvCapture* cvCreateCameraCapture_XIMEA( int index ); CvCapture* cvCreateCameraCapture_AVFoundation(int index); +CvCapture* cvCreateCameraCapture_IntelPerC(int index); CVAPI(int) cvHaveImageReader(const char* filename); diff --git a/modules/highgui/test/test_precomp.hpp b/modules/highgui/test/test_precomp.hpp index 7e9f4c63af..e166d9d80c 100644 --- a/modules/highgui/test/test_precomp.hpp +++ b/modules/highgui/test/test_precomp.hpp @@ -34,6 +34,7 @@ defined(HAVE_XIMEA) || \ defined(HAVE_AVFOUNDATION) || \ defined(HAVE_GIGE_API) || \ + defined(HAVE_INTELPERC) || \ (0) //defined(HAVE_ANDROID_NATIVE_CAMERA) || - enable after #1193 # define BUILD_WITH_CAMERA_SUPPORT 1 diff --git a/samples/cpp/intelperc_capture.cpp b/samples/cpp/intelperc_capture.cpp new file mode 100644 index 0000000000..7744377c5a --- /dev/null +++ b/samples/cpp/intelperc_capture.cpp @@ -0,0 +1,379 @@ +// testOpenCVCam.cpp : Defines the entry point for the console application. +// + +#include +#include "opencv2/highgui/highgui.hpp" +//#include "opencv2/imgproc/imgproc.hpp" + +#include + +using namespace cv; +using namespace std; + +static bool g_printStreamSetting = false; +static int g_imageStreamProfileIdx = -1; +static int g_depthStreamProfileIdx = -1; +static bool g_irStreamShow = false; +static double g_imageBrightness = -DBL_MAX; +static double g_imageContrast = -DBL_MAX; +static bool g_printTiming = false; +static bool g_showClosedPoint = false; + + +static int g_closedDepthPoint[2]; + +static void printUsage(char *arg0) +{ + char *filename = arg0; + while (*filename) + filename++; + while ((arg0 <= filename) && ('\\' != *filename) && ('//' != *filename)) + filename--; + filename++; + + cout << "This program demonstrates usage of camera supported\nby Intel Perceptual computing SDK." 
<< endl << endl; + cout << "usage: " << filename << "[-ps] [-isp IDX] [-dsp IDX]\n [-ir] [-imb VAL] [-imc VAL]" << endl << endl; + cout << " -ps, print streams setting and profiles" << endl; + cout << " -isp IDX, set profile index of the image stream" << endl; + cout << " -dsp IDX, set profile index of the depth stream" << endl; + cout << " -ir, show data from IR stream" << endl; + cout << " -imb VAL, set brighness value for a image stream" << endl; + cout << " -imc VAL, set contrast value for a image stream" << endl; + cout << " -pts, print frame index and frame time" << endl; + cout << " --show-closed, print frame index and frame time" << endl; + cout << endl; +} + +static void parseCMDLine(int argc, char* argv[]) +{ + if( argc == 1 ) + { + printUsage(argv[0]); + } + else + { + for( int i = 1; i < argc; i++ ) + { + if ((0 == strcmp(argv[i], "--help")) || (0 == strcmp( argv[i], "-h"))) + { + printUsage(argv[0]); + exit(0); + } + else if ((0 == strcmp( argv[i], "--print-streams")) || (0 == strcmp( argv[i], "-ps"))) + { + g_printStreamSetting = true; + } + else if ((0 == strcmp( argv[i], "--image-stream-prof")) || (0 == strcmp( argv[i], "-isp"))) + { + g_imageStreamProfileIdx = atoi(argv[++i]); + } + else if ((0 == strcmp( argv[i], "--depth-stream-prof")) || (0 == strcmp( argv[i], "-dsp"))) + { + g_depthStreamProfileIdx = atoi(argv[++i]); + } + else if (0 == strcmp( argv[i], "-ir")) + { + g_irStreamShow = true; + } + else if (0 == strcmp( argv[i], "-imb")) + { + g_imageBrightness = atof(argv[++i]); + } + else if (0 == strcmp( argv[i], "-imc")) + { + g_imageContrast = atof(argv[++i]); + } + else if (0 == strcmp(argv[i], "-pts")) + { + g_printTiming = true; + } + else if (0 == strcmp(argv[i], "--show-closed")) + { + g_showClosedPoint = true; + } + else + { + cout << "Unsupported command line argument: " << argv[i] << "." << endl; + exit(-1); + } + } + if (g_closedDepthPoint && (-1 == g_depthStreamProfileIdx)) + { + cerr << "For --show-closed depth profile has be selected" << endl; + exit(-1); + } + } +} + +static void printStreamProperties(VideoCapture &capture) +{ + size_t profilesCount = (size_t)capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_INTELPERC_PROFILE_COUNT); + cout << "Image stream." 
<< endl; + cout << " Brightness = " << capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_BRIGHTNESS) << endl; + cout << " Contrast = " << capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_CONTRAST) << endl; + cout << " Saturation = " << capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_SATURATION) << endl; + cout << " Hue = " << capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_HUE) << endl; + cout << " Gamma = " << capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_GAMMA) << endl; + cout << " Sharpness = " << capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_SHARPNESS) << endl; + cout << " Gain = " << capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_GAIN) << endl; + cout << " Backligh = " << capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_BACKLIGHT) << endl; + cout << "Image streams profiles:" << endl; + for (size_t i = 0; i < profilesCount; i++) + { + capture.set(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_INTELPERC_PROFILE_IDX, (double)i); + cout << " Profile[" << i << "]: "; + cout << "width = " << + (int)capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_FRAME_WIDTH); + cout << ", height = " << + (int)capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_FRAME_HEIGHT); + cout << ", fps = " << + capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_FPS); + cout << endl; + } + + profilesCount = (size_t)capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_INTELPERC_PROFILE_COUNT); + cout << "Depth stream." << endl; + cout << " Low confidence value = " << capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE) << endl; + cout << " Saturation value = " << capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE) << endl; + cout << " Confidence threshold = " << capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD) << endl; + cout << " Focal length = (" << capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ) << ", " + << capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT) << ")" << endl; + cout << "Depth streams profiles:" << endl; + for (size_t i = 0; i < profilesCount; i++) + { + capture.set(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_INTELPERC_PROFILE_IDX, (double)i); + cout << " Profile[" << i << "]: "; + cout << "width = " << + (int)capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_FRAME_WIDTH); + cout << ", height = " << + (int)capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_FRAME_HEIGHT); + cout << ", fps = " << + capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_FPS); + cout << endl; + } +} + +static void imshowImage(const char *winname, Mat &image, VideoCapture &capture) +{ + if (g_closedDepthPoint) + { + Mat uvMap; + if (capture.retrieve(uvMap, CV_CAP_INTELPERC_UVDEPTH_MAP)) + { + float *uvmap = (float *)uvMap.ptr() + 2 * (g_closedDepthPoint[0] * uvMap.cols + g_closedDepthPoint[1]); + int x = (int)((*uvmap) * image.cols); uvmap++; + int y = (int)((*uvmap) * image.rows); + + if ((0 <= x) && (0 <= y)) + { + static const int pointSize = 4; + for (int row = y; row < min(y + pointSize, image.rows); row++) + { + uchar* ptrDst = image.ptr(row) + x * 3 + 2;//+2 -> Red + for (int col = 0; col < min(pointSize, image.cols - x); col++, ptrDst+=3) + { + *ptrDst = 255; + } + } + } + } + } + imshow(winname, image); +} +static void imshowIR(const char *winname, Mat &ir) +{ + Mat image; + if (g_showClosedPoint) + { + 
image.create(ir.rows, ir.cols, CV_8UC3); + for (int row = 0; row < ir.rows; row++) + { + uchar* ptrDst = image.ptr(row); + short* ptrSrc = (short*)ir.ptr(row); + for (int col = 0; col < ir.cols; col++, ptrSrc++) + { + uchar val = (uchar) ((*ptrSrc) >> 2); + *ptrDst = val; ptrDst++; + *ptrDst = val; ptrDst++; + *ptrDst = val; ptrDst++; + } + } + + static const int pointSize = 4; + for (int row = g_closedDepthPoint[0]; row < min(g_closedDepthPoint[0] + pointSize, image.rows); row++) + { + uchar* ptrDst = image.ptr(row) + g_closedDepthPoint[1] * 3 + 2;//+2 -> Red + for (int col = 0; col < min(pointSize, image.cols - g_closedDepthPoint[1]); col++, ptrDst+=3) + { + *ptrDst = 255; + } + } + } + else + { + image.create(ir.rows, ir.cols, CV_8UC1); + for (int row = 0; row < ir.rows; row++) + { + uchar* ptrDst = image.ptr(row); + short* ptrSrc = (short*)ir.ptr(row); + for (int col = 0; col < ir.cols; col++, ptrSrc++, ptrDst++) + { + *ptrDst = (uchar) ((*ptrSrc) >> 2); + } + } + } + + imshow(winname, image); +} +static void imshowDepth(const char *winname, Mat &depth, VideoCapture &capture) +{ + short lowValue = (short)capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE); + short saturationValue = (short)capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE); + + Mat image; + if (g_showClosedPoint) + { + image.create(depth.rows, depth.cols, CV_8UC3); + for (int row = 0; row < depth.rows; row++) + { + uchar* ptrDst = image.ptr(row); + short* ptrSrc = (short*)depth.ptr(row); + for (int col = 0; col < depth.cols; col++, ptrSrc++) + { + if ((lowValue == (*ptrSrc)) || (saturationValue == (*ptrSrc))) + { + *ptrDst = 0; ptrDst++; + *ptrDst = 0; ptrDst++; + *ptrDst = 0; ptrDst++; + } + else + { + uchar val = (uchar) ((*ptrSrc) >> 2); + *ptrDst = val; ptrDst++; + *ptrDst = val; ptrDst++; + *ptrDst = val; ptrDst++; + } + } + } + + static const int pointSize = 4; + for (int row = g_closedDepthPoint[0]; row < min(g_closedDepthPoint[0] + pointSize, image.rows); row++) + { + uchar* ptrDst = image.ptr(row) + g_closedDepthPoint[1] * 3 + 2;//+2 -> Red + for (int col = 0; col < min(pointSize, image.cols - g_closedDepthPoint[1]); col++, ptrDst+=3) + { + *ptrDst = 255; + } + } + } + else + { + image.create(depth.rows, depth.cols, CV_8UC1); + for (int row = 0; row < depth.rows; row++) + { + uchar* ptrDst = image.ptr(row); + short* ptrSrc = (short*)depth.ptr(row); + for (int col = 0; col < depth.cols; col++, ptrSrc++, ptrDst++) + { + if ((lowValue == (*ptrSrc)) || (saturationValue == (*ptrSrc))) + *ptrDst = 0; + else + *ptrDst = (uchar) ((*ptrSrc) >> 2); + } + } + } + imshow(winname, image); +} + +int _tmain(int argc, char* argv[]) +{ + parseCMDLine(argc, argv); + + VideoCapture capture; + capture.open(CV_CAP_INTELPERC); + if (!capture.isOpened()) + { + cerr << "Can not open a capture object." << endl; + return -1; + } + + if (g_printStreamSetting) + printStreamProperties(capture); + + if (-1 != g_imageStreamProfileIdx) + { + if (!capture.set(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_INTELPERC_PROFILE_IDX, (double)g_imageStreamProfileIdx)) + { + cerr << "Can not setup a image stream." << endl; + return -1; + } + } + if (-1 != g_depthStreamProfileIdx) + { + if (!capture.set(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_INTELPERC_PROFILE_IDX, (double)g_depthStreamProfileIdx)) + { + cerr << "Can not setup a depth stream." 
<< endl; + return -1; + } + } + else if (g_irStreamShow) + { + if (!capture.set(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_INTELPERC_PROFILE_IDX, 0.0)) + { + cerr << "Can not setup a IR stream." << endl; + return -1; + } + } + else + { + cout << "Streams not selected" << endl; + return 0; + } + + //Setup additional properies only after set profile of the stream + if ( (-10000.0 < g_imageBrightness) && (g_imageBrightness < 10000.0)) + capture.set(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_BRIGHTNESS, g_imageBrightness); + if ( (0 < g_imageContrast) && (g_imageContrast < 10000.0)) + capture.set(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_BRIGHTNESS, g_imageContrast); + + int frame = 0; + for(;;frame++) + { + Mat bgrImage; + Mat depthImage; + Mat irImage; + + if (!capture.grab()) + { + cout << "Can not grab images." << endl; + return -1; + } + + if ((-1 != g_depthStreamProfileIdx) && (capture.retrieve(depthImage, CV_CAP_INTELPERC_DEPTH_MAP))) + { + if (g_closedDepthPoint) + { + double minVal = 0.0; double maxVal = 0.0; + minMaxIdx(depthImage, &minVal, &maxVal, g_closedDepthPoint); + } + imshowDepth("depth image", depthImage, capture); + } + if ((g_irStreamShow) && (capture.retrieve(irImage, CV_CAP_INTELPERC_IR_MAP))) + imshowIR("ir image", irImage); + if ((-1 != g_imageStreamProfileIdx) && (capture.retrieve(bgrImage, CV_CAP_INTELPERC_IMAGE))) + imshowImage("color image", bgrImage, capture); + + if (g_printTiming) + { + cout << "Image frame: " << capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_POS_FRAMES) + << ", Depth(IR) frame: " << capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_POS_FRAMES) << endl; + cout << "Image frame: " << capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_POS_MSEC) + << ", Depth(IR) frame: " << capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_POS_MSEC) << endl; + } + if( waitKey(30) >= 0 ) + break; + } + + return 0; +} + From f44de302a00a8d29be61c9b4e5ef41f5c3279f31 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Sat, 14 Dec 2013 22:48:01 -0200 Subject: [PATCH 03/28] cv::completeSymm fixed to work with any OpenCV data type and multiple channels. 
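
[Editorial sketch, not part of the commit: a self-contained illustration of the byte-wise mirroring used in the diff below. Copying elemSize() bytes per element makes the routine independent of depth and channel count; the helper name mirrorLowerToUpper and the CV_64FC3 test matrix are illustrative choices only, assuming OpenCV 2.4-era core headers.]

    #include <cstring>
    #include <opencv2/core/core.hpp>

    // Mirrors the lower triangle into the upper one, element by element,
    // which is the effect of the patched cv::completeSymm(m, true).
    static void mirrorLowerToUpper(cv::Mat& m)
    {
        CV_Assert(m.dims <= 2 && m.rows == m.cols);
        const size_t step = m.step, esz = m.elemSize();
        uchar* data = m.data;
        for (int i = 0; i < m.rows; i++)
            for (int j = i + 1; j < m.cols; j++)          // upper triangle, j > i
                std::memcpy(data + i*step + j*esz,        // destination element (i, j)
                            data + j*step + i*esz,        // source element (j, i)
                            esz);
    }

    int main()
    {
        cv::Mat m(4, 4, CV_64FC3);                        // any depth/channel count works
        cv::randu(m, cv::Scalar::all(0), cv::Scalar::all(1));
        mirrorLowerToUpper(m);
        return 0;
    }
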
--- modules/core/src/matrix.cpp | 37 +++++++++++-------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp index 5a3600b9b3..517ee9dacb 100644 --- a/modules/core/src/matrix.cpp +++ b/modules/core/src/matrix.cpp @@ -2032,39 +2032,24 @@ void cv::transpose( InputArray _src, OutputArray _dst ) } +////////////////////////////////////// completeSymm ///////////////////////////////////////// + void cv::completeSymm( InputOutputArray _m, bool LtoR ) { Mat m = _m.getMat(); - CV_Assert( m.dims <= 2 ); + size_t step = m.step, esz = m.elemSize(); + CV_Assert( m.dims <= 2 && m.rows == m.cols ); - int i, j, nrows = m.rows, type = m.type(); - int j0 = 0, j1 = nrows; - CV_Assert( m.rows == m.cols ); + int rows = m.rows; + int j0 = 0, j1 = rows; - if( type == CV_32FC1 || type == CV_32SC1 ) + uchar* data = m.data; + for( int i = 0; i < rows; i++ ) { - int* data = (int*)m.data; - size_t step = m.step/sizeof(data[0]); - for( i = 0; i < nrows; i++ ) - { - if( !LtoR ) j1 = i; else j0 = i+1; - for( j = j0; j < j1; j++ ) - data[i*step + j] = data[j*step + i]; - } + if( !LtoR ) j1 = i; else j0 = i+1; + for( int j = j0; j < j1; j++ ) + memcpy(data + (i*step + j*esz), data + (j*step + i*esz), esz); } - else if( type == CV_64FC1 ) - { - double* data = (double*)m.data; - size_t step = m.step/sizeof(data[0]); - for( i = 0; i < nrows; i++ ) - { - if( !LtoR ) j1 = i; else j0 = i+1; - for( j = j0; j < j1; j++ ) - data[i*step + j] = data[j*step + i]; - } - } - else - CV_Error( CV_StsUnsupportedFormat, "" ); } From 12c25b93108c255cffc3d243406b75656d29095d Mon Sep 17 00:00:00 2001 From: StevenPuttemans Date: Mon, 16 Dec 2013 11:05:53 +0100 Subject: [PATCH 04/28] Fixed suggestion of bugfix 3431 Seems correct to me and builds fine --- .../ml/introduction_to_svm/introduction_to_svm.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp b/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp index 480229b53f..1c8dbd24a6 100644 --- a/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp +++ b/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp @@ -32,13 +32,13 @@ int main() for (int i = 0; i < image.rows; ++i) for (int j = 0; j < image.cols; ++j) { - Mat sampleMat = (Mat_(1,2) << i,j); + Mat sampleMat = (Mat_(1,2) << j,i); float response = SVM.predict(sampleMat); if (response == 1) - image.at(j, i) = green; + image.at(i,j) = green; else if (response == -1) - image.at(j, i) = blue; + image.at(i,j) = blue; } // Show the training data From 4088013251e9e30fe43b57d41a63bce08f967030 Mon Sep 17 00:00:00 2001 From: Vladimir Bystricky Date: Tue, 17 Dec 2013 12:00:40 +0400 Subject: [PATCH 05/28] Add set/get depth generator properties by default. Add documentation --- doc/user_guide/ug_intelperc.rst | 80 +++++++++++++++++++ doc/user_guide/user_guide.rst | 1 + .../include/opencv2/highgui/highgui_c.h | 6 +- modules/highgui/src/cap_intelperc.cpp | 33 ++++++-- samples/cpp/intelperc_capture.cpp | 68 ++++++++-------- 5 files changed, 144 insertions(+), 44 deletions(-) create mode 100644 doc/user_guide/ug_intelperc.rst diff --git a/doc/user_guide/ug_intelperc.rst b/doc/user_guide/ug_intelperc.rst new file mode 100644 index 0000000000..d00a2f9009 --- /dev/null +++ b/doc/user_guide/ug_intelperc.rst @@ -0,0 +1,80 @@ +******* +HighGUI +******* + +.. 
highlight:: cpp
+
+Using Creative Senz3D and other Intel Perceptual Computing SDK compatible depth sensors
+========================================================================================
+
+Depth sensors compatible with the Intel Perceptual Computing SDK are supported through the ``VideoCapture`` class. The depth map, RGB image and some other output formats can be retrieved via the familiar ``VideoCapture`` interface.
+
+In order to use a depth sensor with OpenCV you should do the following preliminary steps:
+
+#.
+  Install the Intel Perceptual Computing SDK (available from http://www.intel.com/software/perceptual).
+
+#.
+  Configure OpenCV with Intel Perceptual Computing SDK support by setting the ``WITH_INTELPERC`` flag in CMake. If the Intel Perceptual Computing SDK is found in its install folders, OpenCV will be built with Intel Perceptual Computing SDK support (see the ``INTELPERC`` status in the CMake log). If CMake does not find the Intel Perceptual Computing SDK installation folder automatically, set the corresponding CMake variables ``INTELPERC_LIB_DIR`` and ``INTELPERC_INCLUDE_DIR`` to the proper values.
+
+#.
+  Build OpenCV.
+
+VideoCapture can retrieve the following data:
+
+#.
+  data provided by the depth generator:
+    * ``CV_CAP_INTELPERC_DEPTH_MAP`` - each pixel is a 16-bit integer. The value indicates the distance from an object to the camera's XY plane or the Cartesian depth. (CV_16UC1)
+    * ``CV_CAP_INTELPERC_UVDEPTH_MAP`` - each pixel contains two 32-bit floating point values in the range of 0-1, representing the mapping of depth coordinates to the color coordinates. (CV_32FC2)
+    * ``CV_CAP_INTELPERC_IR_MAP`` - each pixel is a 16-bit integer. The value indicates the intensity of the reflected laser beam. (CV_16UC1)
+#.
+  data provided by the RGB image generator:
+    * ``CV_CAP_INTELPERC_IMAGE`` - color image. (CV_8UC3)
+
+In order to get the depth map from the depth sensor use ``VideoCapture::operator >>``, e.g. ::
+
+    VideoCapture capture( CV_CAP_INTELPERC );
+    for(;;)
+    {
+        Mat depthMap;
+        capture >> depthMap;
+
+        if( waitKey( 30 ) >= 0 )
+            break;
+    }
+
+For getting several data maps use ``VideoCapture::grab`` and ``VideoCapture::retrieve``, e.g. ::
+
+    VideoCapture capture(CV_CAP_INTELPERC);
+    for(;;)
+    {
+        Mat depthMap;
+        Mat image;
+        Mat irImage;
+
+        capture.grab();
+
+        capture.retrieve( depthMap, CV_CAP_INTELPERC_DEPTH_MAP );
+        capture.retrieve( image, CV_CAP_INTELPERC_IMAGE );
+        capture.retrieve( irImage, CV_CAP_INTELPERC_IR_MAP);
+
+        if( waitKey( 30 ) >= 0 )
+            break;
+    }
+
+For setting and getting properties of the sensor's data generators use the ``VideoCapture::set`` and ``VideoCapture::get`` methods respectively, e.g. ::
+
+    VideoCapture capture( CV_CAP_INTELPERC );
+    capture.set( CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_INTELPERC_PROFILE_IDX, 0 );
+    cout << "FPS " << capture.get( CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_FPS ) << endl;
+
+Since two types of sensor data generators are supported (an image generator and a depth generator), there are two flags that should be used to set/get properties of the needed generator:
+
+* CV_CAP_INTELPERC_IMAGE_GENERATOR -- a flag for access to the image generator properties.
+
+* CV_CAP_INTELPERC_DEPTH_GENERATOR -- a flag for access to the depth generator properties. This flag is assumed by default if neither of the two flags is specified.
+
+For more information please refer to the usage example intelperc_capture.cpp_ in the ``opencv/samples/cpp`` folder.
+
+..
_intelperc_capture.cpp: https://github.com/Itseez/opencv/tree/master/samples/cpp/intelperc_capture.cpp + diff --git a/doc/user_guide/user_guide.rst b/doc/user_guide/user_guide.rst index de9edcb683..76cf756f85 100644 --- a/doc/user_guide/user_guide.rst +++ b/doc/user_guide/user_guide.rst @@ -9,3 +9,4 @@ OpenCV User Guide ug_features2d.rst ug_highgui.rst ug_traincascade.rst + ug_intelperc.rst diff --git a/modules/highgui/include/opencv2/highgui/highgui_c.h b/modules/highgui/include/opencv2/highgui/highgui_c.h index 99f453385d..862fa053a6 100644 --- a/modules/highgui/include/opencv2/highgui/highgui_c.h +++ b/modules/highgui/include/opencv2/highgui/highgui_c.h @@ -480,9 +480,9 @@ enum CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT = 11007, // Intel PerC streams - CV_CAP_INTELPERC_DEPTH_STREAM = 1 << 31, - CV_CAP_INTELPERC_IMAGE_STREAM = 1 << 30, - CV_CAP_INTELPERC_STREAMS_MASK = CV_CAP_INTELPERC_DEPTH_STREAM + CV_CAP_INTELPERC_IMAGE_STREAM, + CV_CAP_INTELPERC_DEPTH_GENERATOR = 1 << 31, + CV_CAP_INTELPERC_IMAGE_GENERATOR = 1 << 30, + CV_CAP_INTELPERC_GENERATORS_MASK = CV_CAP_INTELPERC_DEPTH_GENERATOR + CV_CAP_INTELPERC_IMAGE_GENERATOR, }; enum diff --git a/modules/highgui/src/cap_intelperc.cpp b/modules/highgui/src/cap_intelperc.cpp index d562dc0c8e..910a6f748a 100644 --- a/modules/highgui/src/cap_intelperc.cpp +++ b/modules/highgui/src/cap_intelperc.cpp @@ -195,6 +195,11 @@ protected: int m_frameIdx; pxcU64 m_timeStampStartNS; double m_timeStamp; + + virtual bool validProfile(const PXCCapture::VideoStream::ProfileInfo& /*pinfo*/) + { + return true; + } void enumProfiles() { m_profiles.clear(); @@ -207,7 +212,8 @@ protected: sts = m_stream->QueryProfile(profidx, &pinfo); if (PXC_STATUS_NO_ERROR > sts) break; - m_profiles.push_back(pinfo); + if (validProfile(pinfo)) + m_profiles.push_back(pinfo); } } virtual bool prepareIplImage(PXCImage *pxcImage) = 0; @@ -552,6 +558,11 @@ public: { return m_frameUV.retrieveFrame(); } +protected: + virtual bool validProfile(const PXCCapture::VideoStream::ProfileInfo& pinfo) + { + return (PXCImage::COLOR_FORMAT_DEPTH == pinfo.imageInfo.format); + } protected: FrameInternal m_frameDepth; FrameInternal m_frameIR; @@ -609,12 +620,16 @@ public: virtual double getProperty(int propIdx) { double propValue = 0; - int purePropIdx = propIdx & ~CV_CAP_INTELPERC_STREAMS_MASK; - if (CV_CAP_INTELPERC_IMAGE_STREAM == (propIdx & CV_CAP_INTELPERC_STREAMS_MASK)) + int purePropIdx = propIdx & ~CV_CAP_INTELPERC_GENERATORS_MASK; + if (CV_CAP_INTELPERC_IMAGE_GENERATOR == (propIdx & CV_CAP_INTELPERC_GENERATORS_MASK)) { propValue = m_imageStream.getProperty(purePropIdx); } - else if (CV_CAP_INTELPERC_DEPTH_STREAM == (propIdx & CV_CAP_INTELPERC_STREAMS_MASK)) + else if (CV_CAP_INTELPERC_DEPTH_GENERATOR == (propIdx & CV_CAP_INTELPERC_GENERATORS_MASK)) + { + propValue = m_depthStream.getProperty(purePropIdx); + } + else { propValue = m_depthStream.getProperty(purePropIdx); } @@ -623,12 +638,16 @@ public: virtual bool setProperty(int propIdx, double propVal) { bool isSet = false; - int purePropIdx = propIdx & ~CV_CAP_INTELPERC_STREAMS_MASK; - if (CV_CAP_INTELPERC_IMAGE_STREAM == (propIdx & CV_CAP_INTELPERC_STREAMS_MASK)) + int purePropIdx = propIdx & ~CV_CAP_INTELPERC_GENERATORS_MASK; + if (CV_CAP_INTELPERC_IMAGE_GENERATOR == (propIdx & CV_CAP_INTELPERC_GENERATORS_MASK)) { isSet = m_imageStream.setProperty(purePropIdx, propVal); } - else if (CV_CAP_INTELPERC_DEPTH_STREAM == (propIdx & CV_CAP_INTELPERC_STREAMS_MASK)) + else if (CV_CAP_INTELPERC_DEPTH_GENERATOR == (propIdx & 
CV_CAP_INTELPERC_GENERATORS_MASK)) + { + isSet = m_depthStream.setProperty(purePropIdx, propVal); + } + else { isSet = m_depthStream.setProperty(purePropIdx, propVal); } diff --git a/samples/cpp/intelperc_capture.cpp b/samples/cpp/intelperc_capture.cpp index 7744377c5a..30471c3471 100644 --- a/samples/cpp/intelperc_capture.cpp +++ b/samples/cpp/intelperc_capture.cpp @@ -107,48 +107,48 @@ static void parseCMDLine(int argc, char* argv[]) static void printStreamProperties(VideoCapture &capture) { - size_t profilesCount = (size_t)capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_INTELPERC_PROFILE_COUNT); + size_t profilesCount = (size_t)capture.get(CV_CAP_INTELPERC_IMAGE_GENERATOR | CV_CAP_PROP_INTELPERC_PROFILE_COUNT); cout << "Image stream." << endl; - cout << " Brightness = " << capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_BRIGHTNESS) << endl; - cout << " Contrast = " << capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_CONTRAST) << endl; - cout << " Saturation = " << capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_SATURATION) << endl; - cout << " Hue = " << capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_HUE) << endl; - cout << " Gamma = " << capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_GAMMA) << endl; - cout << " Sharpness = " << capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_SHARPNESS) << endl; - cout << " Gain = " << capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_GAIN) << endl; - cout << " Backligh = " << capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_BACKLIGHT) << endl; + cout << " Brightness = " << capture.get(CV_CAP_INTELPERC_IMAGE_GENERATOR | CV_CAP_PROP_BRIGHTNESS) << endl; + cout << " Contrast = " << capture.get(CV_CAP_INTELPERC_IMAGE_GENERATOR | CV_CAP_PROP_CONTRAST) << endl; + cout << " Saturation = " << capture.get(CV_CAP_INTELPERC_IMAGE_GENERATOR | CV_CAP_PROP_SATURATION) << endl; + cout << " Hue = " << capture.get(CV_CAP_INTELPERC_IMAGE_GENERATOR | CV_CAP_PROP_HUE) << endl; + cout << " Gamma = " << capture.get(CV_CAP_INTELPERC_IMAGE_GENERATOR | CV_CAP_PROP_GAMMA) << endl; + cout << " Sharpness = " << capture.get(CV_CAP_INTELPERC_IMAGE_GENERATOR | CV_CAP_PROP_SHARPNESS) << endl; + cout << " Gain = " << capture.get(CV_CAP_INTELPERC_IMAGE_GENERATOR | CV_CAP_PROP_GAIN) << endl; + cout << " Backligh = " << capture.get(CV_CAP_INTELPERC_IMAGE_GENERATOR | CV_CAP_PROP_BACKLIGHT) << endl; cout << "Image streams profiles:" << endl; for (size_t i = 0; i < profilesCount; i++) { - capture.set(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_INTELPERC_PROFILE_IDX, (double)i); + capture.set(CV_CAP_INTELPERC_IMAGE_GENERATOR | CV_CAP_PROP_INTELPERC_PROFILE_IDX, (double)i); cout << " Profile[" << i << "]: "; cout << "width = " << - (int)capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_FRAME_WIDTH); + (int)capture.get(CV_CAP_INTELPERC_IMAGE_GENERATOR | CV_CAP_PROP_FRAME_WIDTH); cout << ", height = " << - (int)capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_FRAME_HEIGHT); + (int)capture.get(CV_CAP_INTELPERC_IMAGE_GENERATOR | CV_CAP_PROP_FRAME_HEIGHT); cout << ", fps = " << - capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_FPS); + capture.get(CV_CAP_INTELPERC_IMAGE_GENERATOR | CV_CAP_PROP_FPS); cout << endl; } - profilesCount = (size_t)capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_INTELPERC_PROFILE_COUNT); + profilesCount = (size_t)capture.get(CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_INTELPERC_PROFILE_COUNT); cout << "Depth stream." 
<< endl; - cout << " Low confidence value = " << capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE) << endl; - cout << " Saturation value = " << capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE) << endl; - cout << " Confidence threshold = " << capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD) << endl; - cout << " Focal length = (" << capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ) << ", " - << capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT) << ")" << endl; + cout << " Low confidence value = " << capture.get(CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE) << endl; + cout << " Saturation value = " << capture.get(CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE) << endl; + cout << " Confidence threshold = " << capture.get(CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD) << endl; + cout << " Focal length = (" << capture.get(CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ) << ", " + << capture.get(CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT) << ")" << endl; cout << "Depth streams profiles:" << endl; for (size_t i = 0; i < profilesCount; i++) { - capture.set(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_INTELPERC_PROFILE_IDX, (double)i); + capture.set(CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_INTELPERC_PROFILE_IDX, (double)i); cout << " Profile[" << i << "]: "; cout << "width = " << - (int)capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_FRAME_WIDTH); + (int)capture.get(CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_FRAME_WIDTH); cout << ", height = " << - (int)capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_FRAME_HEIGHT); + (int)capture.get(CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_FRAME_HEIGHT); cout << ", fps = " << - capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_FPS); + capture.get(CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_FPS); cout << endl; } } @@ -227,8 +227,8 @@ static void imshowIR(const char *winname, Mat &ir) } static void imshowDepth(const char *winname, Mat &depth, VideoCapture &capture) { - short lowValue = (short)capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE); - short saturationValue = (short)capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE); + short lowValue = (short)capture.get(CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE); + short saturationValue = (short)capture.get(CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE); Mat image; if (g_showClosedPoint) @@ -302,7 +302,7 @@ int _tmain(int argc, char* argv[]) if (-1 != g_imageStreamProfileIdx) { - if (!capture.set(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_INTELPERC_PROFILE_IDX, (double)g_imageStreamProfileIdx)) + if (!capture.set(CV_CAP_INTELPERC_IMAGE_GENERATOR | CV_CAP_PROP_INTELPERC_PROFILE_IDX, (double)g_imageStreamProfileIdx)) { cerr << "Can not setup a image stream." 
<< endl; return -1; @@ -310,7 +310,7 @@ int _tmain(int argc, char* argv[]) } if (-1 != g_depthStreamProfileIdx) { - if (!capture.set(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_INTELPERC_PROFILE_IDX, (double)g_depthStreamProfileIdx)) + if (!capture.set(CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_INTELPERC_PROFILE_IDX, (double)g_depthStreamProfileIdx)) { cerr << "Can not setup a depth stream." << endl; return -1; @@ -318,7 +318,7 @@ int _tmain(int argc, char* argv[]) } else if (g_irStreamShow) { - if (!capture.set(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_INTELPERC_PROFILE_IDX, 0.0)) + if (!capture.set(CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_INTELPERC_PROFILE_IDX, 0.0)) { cerr << "Can not setup a IR stream." << endl; return -1; @@ -332,9 +332,9 @@ int _tmain(int argc, char* argv[]) //Setup additional properies only after set profile of the stream if ( (-10000.0 < g_imageBrightness) && (g_imageBrightness < 10000.0)) - capture.set(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_BRIGHTNESS, g_imageBrightness); + capture.set(CV_CAP_INTELPERC_IMAGE_GENERATOR | CV_CAP_PROP_BRIGHTNESS, g_imageBrightness); if ( (0 < g_imageContrast) && (g_imageContrast < 10000.0)) - capture.set(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_BRIGHTNESS, g_imageContrast); + capture.set(CV_CAP_INTELPERC_IMAGE_GENERATOR | CV_CAP_PROP_BRIGHTNESS, g_imageContrast); int frame = 0; for(;;frame++) @@ -365,10 +365,10 @@ int _tmain(int argc, char* argv[]) if (g_printTiming) { - cout << "Image frame: " << capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_POS_FRAMES) - << ", Depth(IR) frame: " << capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_POS_FRAMES) << endl; - cout << "Image frame: " << capture.get(CV_CAP_INTELPERC_IMAGE_STREAM | CV_CAP_PROP_POS_MSEC) - << ", Depth(IR) frame: " << capture.get(CV_CAP_INTELPERC_DEPTH_STREAM | CV_CAP_PROP_POS_MSEC) << endl; + cout << "Image frame: " << capture.get(CV_CAP_INTELPERC_IMAGE_GENERATOR | CV_CAP_PROP_POS_FRAMES) + << ", Depth(IR) frame: " << capture.get(CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_POS_FRAMES) << endl; + cout << "Image frame: " << capture.get(CV_CAP_INTELPERC_IMAGE_GENERATOR | CV_CAP_PROP_POS_MSEC) + << ", Depth(IR) frame: " << capture.get(CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_POS_MSEC) << endl; } if( waitKey(30) >= 0 ) break; From de431609db6444aa39ffde0e82966b4fbd3182e8 Mon Sep 17 00:00:00 2001 From: krodyush Date: Tue, 17 Dec 2013 14:01:01 +0400 Subject: [PATCH 06/28] optimize Dx and Dy calcualtion to make it as single opencl kernel --- modules/ocl/src/imgproc.cpp | 156 ++++++--- modules/ocl/src/opencl/imgproc_sobel3.cl | 389 ++++++++++++++++++----- 2 files changed, 417 insertions(+), 128 deletions(-) diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp index c25dddd4dd..3ce7ba62ac 100644 --- a/modules/ocl/src/imgproc.cpp +++ b/modules/ocl/src/imgproc.cpp @@ -1033,67 +1033,117 @@ namespace cv else scale = 1. 
/ scale; - if (ksize > 0) + const int sobel_lsz = 16; + if((src.type() == CV_8UC1 || src.type() == CV_32FC1) && + (ksize==3 || ksize==5 || ksize==7 || ksize==-1) && + src.wholerows > sobel_lsz + (ksize>>1) && + src.wholecols > sobel_lsz + (ksize>>1)) { - Context* clCxt = Context::getContext(); - if(clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE) && src.type() == CV_8UC1 && - src.cols % 8 == 0 && src.rows % 8 == 0 && - ksize==3 && - (borderType ==cv::BORDER_REFLECT || - borderType == cv::BORDER_REPLICATE || - borderType ==cv::BORDER_REFLECT101 || - borderType ==cv::BORDER_WRAP)) + Dx.create(src.size(), CV_32FC1); + Dy.create(src.size(), CV_32FC1); + + CV_Assert(Dx.rows == Dy.rows && Dx.cols == Dy.cols); + + size_t lt2[3] = {sobel_lsz, sobel_lsz, 1}; + size_t gt2[3] = {lt2[0]*(1 + (src.cols-1) / lt2[0]), lt2[1]*(1 + (src.rows-1) / lt2[1]), 1}; + + unsigned int src_pitch = src.step; + unsigned int Dx_pitch = Dx.step; + unsigned int Dy_pitch = Dy.step; + + int src_offset_x = (src.offset % src.step) / src.elemSize(); + int src_offset_y = src.offset / src.step; + + float _scale = scale; + + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data )); + args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch )); + + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_x )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_y )); + + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dx.data )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dx.offset )); + args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&Dx_pitch )); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dy.data )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dy.offset )); + args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&Dy_pitch )); + + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.wholecols )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.wholerows )); + + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dx.cols )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dx.rows )); + + args.push_back( std::make_pair( sizeof(cl_float), (void *)&_scale )); + + string option = cv::format("-D BLK_X=%d -D BLK_Y=%d",(int)lt2[0],(int)lt2[1]); + switch(src.type()) { - Dx.create(src.size(), CV_32FC1); - Dy.create(src.size(), CV_32FC1); - - const unsigned int block_x = 8; - const unsigned int block_y = 8; - - unsigned int src_pitch = src.step; - unsigned int dst_pitch = Dx.cols; - - float _scale = scale; - - std::vector > args; - args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data )); - args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dx.data )); - args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dy.data )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows )); - args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch )); - args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&dst_pitch )); - args.push_back( std::make_pair( sizeof(cl_float) , (void *)&_scale )); - size_t gt2[3] = {src.cols, src.rows, 1}, lt2[3] = {block_x, block_y, 1}; - - string option = "-D BLK_X=8 -D BLK_Y=8"; - switch(borderType) - { - case cv::BORDER_REPLICATE: - option += " -D BORDER_REPLICATE"; - break; - case cv::BORDER_REFLECT: - option += " -D BORDER_REFLECT"; - break; - case cv::BORDER_REFLECT101: - option += " -D 
BORDER_REFLECT101"; - break; - case cv::BORDER_WRAP: - option += " -D BORDER_WRAP"; - break; - } - openCLExecuteKernel(src.clCxt, &imgproc_sobel3, "sobel3", gt2, lt2, args, -1, -1, option.c_str() ); + case CV_8UC1: + option += " -D SRCTYPE=uchar"; + break; + case CV_32FC1: + option += " -D SRCTYPE=float"; + break; } - else + switch(borderType) { - Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType); - Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType); + case cv::BORDER_CONSTANT: + option += " -D BORDER_CONSTANT"; + break; + case cv::BORDER_REPLICATE: + option += " -D BORDER_REPLICATE"; + break; + case cv::BORDER_REFLECT: + option += " -D BORDER_REFLECT"; + break; + case cv::BORDER_REFLECT101: + option += " -D BORDER_REFLECT_101"; + break; + case cv::BORDER_WRAP: + option += " -D BORDER_WRAP"; + break; + default: + CV_Error(CV_StsBadFlag, "BORDER type is not supported!"); + break; } + + string kernel_name; + switch(ksize) + { + case -1: + option += " -D SCHARR"; + kernel_name = "sobel3"; + break; + case 3: + kernel_name = "sobel3"; + break; + case 5: + kernel_name = "sobel5"; + break; + case 7: + kernel_name = "sobel7"; + break; + default: + CV_Error(CV_StsBadFlag, "Kernel size is not supported!"); + break; + } + openCLExecuteKernel(src.clCxt, &imgproc_sobel3, kernel_name, gt2, lt2, args, -1, -1, option.c_str() ); } else { - Scharr(src, Dx, CV_32F, 1, 0, scale, 0, borderType); - Scharr(src, Dy, CV_32F, 0, 1, scale, 0, borderType); + if (ksize > 0) + { + Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType); + Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType); + } + else + { + Scharr(src, Dx, CV_32F, 1, 0, scale, 0, borderType); + Scharr(src, Dy, CV_32F, 0, 1, scale, 0, borderType); + } } CV_Assert(Dx.offset == 0 && Dy.offset == 0); } diff --git a/modules/ocl/src/opencl/imgproc_sobel3.cl b/modules/ocl/src/opencl/imgproc_sobel3.cl index d6a995f552..8356fce018 100644 --- a/modules/ocl/src/opencl/imgproc_sobel3.cl +++ b/modules/ocl/src/opencl/imgproc_sobel3.cl @@ -1,45 +1,97 @@ /////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////Macro for border type//////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////// -#ifdef BORDER_REPLICATE -//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr)) -#endif -#ifdef BORDER_REFLECT -//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? 
-(i)-1+((b_edge)<<1) : (addr)) +#ifdef BORDER_CONSTANT +//CCCCCC|abcdefgh|CCCCCCC +#define EXTRAPOLATE(x, maxV) +#elif defined BORDER_REPLICATE +//aaaaaa|abcdefgh|hhhhhhh +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = max(min((x), (maxV) - 1), 0); \ + } +#elif defined BORDER_WRAP +//cdefgh|abcdefgh|abcdefg +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = ( (x) + (maxV) ) % (maxV); \ + } +#elif defined BORDER_REFLECT +//fedcba|abcdefgh|hgfedcb +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = min( mad24((maxV)-1,2,-(x))+1 , max((x),-(x)-1) ); \ + } +#elif defined BORDER_REFLECT_101 +//gfedcb|abcdefgh|gfedcba +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = min( mad24((maxV)-1,2,-(x)), max((x),-(x)) ); \ + } +#else +#error No extrapolation method #endif -#ifdef BORDER_REFLECT101 -//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr)) -#endif +#define SRC(_x,_y) convert_float(((global SRCTYPE*)(Src+(_y)*SrcPitch))[_x]) -#ifdef BORDER_WRAP -//BORDER_WRAP: cdefgh|abcdefgh|abcdefg -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr)) +#ifdef BORDER_CONSTANT +//CCCCCC|abcdefgh|CCCCCCC +#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? (const_v) : SRC((_x),(_y)) +#else +#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y)) #endif +#define DSTX(_x,_y) (((global float*)(DstX+DstXOffset+(_y)*DstXPitch))[_x]) +#define DSTY(_x,_y) (((global float*)(DstY+DstYOffset+(_y)*DstYPitch))[_x]) + +#define INIT_AND_READ_LOCAL_SOURCE(width, height, fill_const, kernel_border) \ + int srcX = x + srcOffsetX - (kernel_border); \ + int srcY = y + srcOffsetY - (kernel_border); \ + int xb = srcX; \ + int yb = srcY; \ + \ + EXTRAPOLATE(xb, (width)); \ + EXTRAPOLATE(yb, (height)); \ + lsmem[liy][lix] = ELEM(xb, yb, (width), (height), (fill_const) ); \ + \ + if(lix < ((kernel_border)*2)) \ + { \ + int xb = srcX+BLK_X; \ + EXTRAPOLATE(xb,(width)); \ + lsmem[liy][lix+BLK_X] = ELEM(xb, yb, (width), (height), (fill_const) ); \ + } \ + if(liy< ((kernel_border)*2)) \ + { \ + int yb = srcY+BLK_Y; \ + EXTRAPOLATE(yb, (height)); \ + lsmem[liy+BLK_Y][lix] = ELEM(xb, yb, (width), (height), (fill_const) ); \ + } \ + if(lix<((kernel_border)*2) && liy<((kernel_border)*2)) \ + { \ + int xb = srcX+BLK_X; \ + int yb = srcY+BLK_Y; \ + EXTRAPOLATE(xb,(width)); \ + EXTRAPOLATE(yb,(height)); \ + lsmem[liy+BLK_Y][lix+BLK_X] = ELEM(xb, yb, (width), (height), (fill_const) ); \ + } + __kernel void sobel3( __global uchar* Src, - __global float* DstX, - __global float* DstY, - int width, int height, - uint srcStride, uint dstStride, - float scale + const uint SrcPitch, + const int srcOffsetX, + const int srcOffsetY, + __global uchar* DstX, + const int DstXOffset, + const uint DstXPitch, + __global uchar* DstY, + const int DstYOffset, + const uint DstYPitch, + int width, + int height, + int dstWidth, + int dstHeight, + float scale ) { __local float lsmem[BLK_Y+2][BLK_X+2]; @@ -47,62 +99,249 @@ __kernel void sobel3( int lix = get_local_id(0); int liy = get_local_id(1); - int gix = 
get_group_id(0); - int giy = get_group_id(1); - - int id_x = get_global_id(0); - int id_y = get_global_id(1); - - lsmem[liy+1][lix+1] = convert_float(Src[ id_y * srcStride + id_x ]); - - int id_y_h = ADDR_H(id_y-1, 0,height); - int id_y_b = ADDR_B(id_y+1, height,id_y+1); - - int id_x_l = ADDR_L(id_x-1, 0,width); - int id_x_r = ADDR_R(id_x+1, width,id_x+1); - - if(liy==0) - { - lsmem[0][lix+1]=convert_float(Src[ id_y_h * srcStride + id_x ]); - - if(lix==0) - lsmem[0][0]=convert_float(Src[ id_y_h * srcStride + id_x_l ]); - else if(lix==BLK_X-1) - lsmem[0][BLK_X+1]=convert_float(Src[ id_y_h * srcStride + id_x_r ]); - } - else if(liy==BLK_Y-1) - { - lsmem[BLK_Y+1][lix+1]=convert_float(Src[ id_y_b * srcStride + id_x ]); - - if(lix==0) - lsmem[BLK_Y+1][0]=convert_float(Src[ id_y_b * srcStride + id_x_l ]); - else if(lix==BLK_X-1) - lsmem[BLK_Y+1][BLK_X+1]=convert_float(Src[ id_y_b * srcStride + id_x_r ]); - } - - if(lix==0) - lsmem[liy+1][0] = convert_float(Src[ id_y * srcStride + id_x_l ]); - else if(lix==BLK_X-1) - lsmem[liy+1][BLK_X+1] = convert_float(Src[ id_y * srcStride + id_x_r ]); + int x = (int)get_global_id(0); + int y = (int)get_global_id(1); + INIT_AND_READ_LOCAL_SOURCE(width, height, 0, 1) barrier(CLK_LOCAL_MEM_FENCE); + if( x >= dstWidth || y >=dstHeight ) return; + float u1 = lsmem[liy][lix]; float u2 = lsmem[liy][lix+1]; float u3 = lsmem[liy][lix+2]; float m1 = lsmem[liy+1][lix]; - float m2 = lsmem[liy+1][lix+1]; float m3 = lsmem[liy+1][lix+2]; float b1 = lsmem[liy+2][lix]; float b2 = lsmem[liy+2][lix+1]; float b3 = lsmem[liy+2][lix+2]; - //m2 * scale;// - float dx = mad(2.0f, m3 - m1, u3 - u1 + b3 - b1 ); - DstX[ id_y * dstStride + id_x ] = dx * scale; + //calc and store dx and dy;// +#ifdef SCHARR + DSTX(x,y) = mad(10.0f, m3 - m1, 3.0f * (u3 - u1 + b3 - b1)) * scale; + DSTY(x,y) = mad(10.0f, b2 - u2, 3.0f * (b1 - u1 + b3 - u3)) * scale; +#else + DSTX(x,y) = mad(2.0f, m3 - m1, u3 - u1 + b3 - b1) * scale; + DSTY(x,y) = mad(2.0f, b2 - u2, b1 - u1 + b3 - u3) * scale; +#endif +} + +__kernel void sobel5( + __global uchar* Src, + const uint SrcPitch, + const int srcOffsetX, + const int srcOffsetY, + __global uchar* DstX, + const int DstXOffset, + const uint DstXPitch, + __global uchar* DstY, + const int DstYOffset, + const uint DstYPitch, + int width, + int height, + int dstWidth, + int dstHeight, + float scale + ) +{ + __local float lsmem[BLK_Y+4][BLK_X+4]; + + int lix = get_local_id(0); + int liy = get_local_id(1); + + int x = (int)get_global_id(0); + int y = (int)get_global_id(1); + + INIT_AND_READ_LOCAL_SOURCE(width, height, 0, 2) + barrier(CLK_LOCAL_MEM_FENCE); + + if( x >= dstWidth || y >=dstHeight ) return; + + float t1 = lsmem[liy][lix]; + float t2 = lsmem[liy][lix+1]; + float t3 = lsmem[liy][lix+2]; + float t4 = lsmem[liy][lix+3]; + float t5 = lsmem[liy][lix+4]; + + float u1 = lsmem[liy+1][lix]; + float u2 = lsmem[liy+1][lix+1]; + float u3 = lsmem[liy+1][lix+2]; + float u4 = lsmem[liy+1][lix+3]; + float u5 = lsmem[liy+1][lix+4]; + + float m1 = lsmem[liy+2][lix]; + float m2 = lsmem[liy+2][lix+1]; + float m4 = lsmem[liy+2][lix+3]; + float m5 = lsmem[liy+2][lix+4]; + + float l1 = lsmem[liy+3][lix]; + float l2 = lsmem[liy+3][lix+1]; + float l3 = lsmem[liy+3][lix+2]; + float l4 = lsmem[liy+3][lix+3]; + float l5 = lsmem[liy+3][lix+4]; + + float b1 = lsmem[liy+4][lix]; + float b2 = lsmem[liy+4][lix+1]; + float b3 = lsmem[liy+4][lix+2]; + float b4 = lsmem[liy+4][lix+3]; + float b5 = lsmem[liy+4][lix+4]; + + //calc and store dx and dy;// + DSTX(x,y) = scale * + mad(12.0f, m4 - m2, + 
mad(6.0f, m5 - m1, + mad(8.0f, u4 - u2 + l4 - l2, + mad(4.0f, u5 - u1 + l5 - l1, + mad(2.0f, t4 - t2 + b4 - b2, t5 - t1 + b5 - b1 ) + ) + ) + ) + ); + + DSTY(x,y) = scale * + mad(12.0f, l3 - u3, + mad(6.0f, b3 - t3, + mad(8.0f, l2 - u2 + l4 - u4, + mad(4.0f, b2 - t2 + b4 - t4, + mad(2.0f, l1 - u1 + l5 - u5, b1 - t1 + b5 - t5 ) + ) + ) + ) + ); +} + +__kernel void sobel7( + __global uchar* Src, + const uint SrcPitch, + const int srcOffsetX, + const int srcOffsetY, + __global uchar* DstX, + const int DstXOffset, + const uint DstXPitch, + __global uchar* DstY, + const int DstYOffset, + const uint DstYPitch, + int width, + int height, + int dstWidth, + int dstHeight, + float scale + ) +{ + __local float lsmem[BLK_Y+6][BLK_X+6]; + + int lix = get_local_id(0); + int liy = get_local_id(1); + + int x = (int)get_global_id(0); + int y = (int)get_global_id(1); + + INIT_AND_READ_LOCAL_SOURCE(width, height, 0, 3) + barrier(CLK_LOCAL_MEM_FENCE); + + if( x >= dstWidth || y >=dstHeight ) return; + + float tt1 = lsmem[liy][lix]; + float tt2 = lsmem[liy][lix+1]; + float tt3 = lsmem[liy][lix+2]; + float tt4 = lsmem[liy][lix+3]; + float tt5 = lsmem[liy][lix+4]; + float tt6 = lsmem[liy][lix+5]; + float tt7 = lsmem[liy][lix+6]; + + float t1 = lsmem[liy+1][lix]; + float t2 = lsmem[liy+1][lix+1]; + float t3 = lsmem[liy+1][lix+2]; + float t4 = lsmem[liy+1][lix+3]; + float t5 = lsmem[liy+1][lix+4]; + float t6 = lsmem[liy+1][lix+5]; + float t7 = lsmem[liy+1][lix+6]; + + float u1 = lsmem[liy+2][lix]; + float u2 = lsmem[liy+2][lix+1]; + float u3 = lsmem[liy+2][lix+2]; + float u4 = lsmem[liy+2][lix+3]; + float u5 = lsmem[liy+2][lix+4]; + float u6 = lsmem[liy+2][lix+5]; + float u7 = lsmem[liy+2][lix+6]; + + float m1 = lsmem[liy+3][lix]; + float m2 = lsmem[liy+3][lix+1]; + float m3 = lsmem[liy+3][lix+2]; + float m5 = lsmem[liy+3][lix+4]; + float m6 = lsmem[liy+3][lix+5]; + float m7 = lsmem[liy+3][lix+6]; + + float l1 = lsmem[liy+4][lix]; + float l2 = lsmem[liy+4][lix+1]; + float l3 = lsmem[liy+4][lix+2]; + float l4 = lsmem[liy+4][lix+3]; + float l5 = lsmem[liy+4][lix+4]; + float l6 = lsmem[liy+4][lix+5]; + float l7 = lsmem[liy+4][lix+6]; + + float b1 = lsmem[liy+5][lix]; + float b2 = lsmem[liy+5][lix+1]; + float b3 = lsmem[liy+5][lix+2]; + float b4 = lsmem[liy+5][lix+3]; + float b5 = lsmem[liy+5][lix+4]; + float b6 = lsmem[liy+5][lix+5]; + float b7 = lsmem[liy+5][lix+6]; + + float bb1 = lsmem[liy+6][lix]; + float bb2 = lsmem[liy+6][lix+1]; + float bb3 = lsmem[liy+6][lix+2]; + float bb4 = lsmem[liy+6][lix+3]; + float bb5 = lsmem[liy+6][lix+4]; + float bb6 = lsmem[liy+6][lix+5]; + float bb7 = lsmem[liy+6][lix+6]; + + //calc and store dx and dy + DSTX(x,y) = scale * + mad(100.0f, m5 - m3, + mad(80.0f, m6 - m2, + mad(20.0f, m7 - m1, + mad(75.0f, u5 - u3 + l5 - l3, + mad(60.0f, u6 - u2 + l6 - l2, + mad(15.0f, u7 - u1 + l7 - l1, + mad(30.0f, t5 - t3 + b5 - b3, + mad(24.0f, t6 - t2 + b6 - b2, + mad(6.0f, t7 - t1 + b7 - b1, + mad(5.0f, tt5 - tt3 + bb5 - bb3, + mad(4.0f, tt6 - tt2 + bb6 - bb2, tt7 - tt1 + bb7 - bb1 ) + ) + ) + ) + ) + ) + ) + ) + ) + ) + ); - float dy = mad(2.0f, b2 - u2, b1 - u1 + b3 - u3); - DstY[ id_y * dstStride + id_x ] = dy * scale; -} \ No newline at end of file + DSTY(x,y) = scale * + mad(100.0f, l4 - u4, + mad(80.0f, b4 - t4, + mad(20.0f, bb4 - tt4, + mad(75.0f, l5 - u5 + l3 - u3, + mad(60.0f, b5 - t5 + b3 - t3, + mad(15.0f, bb5 - tt5 + bb3 - tt3, + mad(30.0f, l6 - u6 + l2 - u2, + mad(24.0f, b6 - t6 + b2 - t2, + mad(6.0f, bb6 - tt6 + bb2 - tt2, + mad(5.0f, l7 - u7 + l1 - u1, + mad(4.0f, b7 - t7 + b1 - 
t1, bb7 - tt7 + bb1 - tt1 ) + ) + ) + ) + ) + ) + ) + ) + ) + ) + ); +} From a63576e76d43a57524307a817079f5a87b7460b8 Mon Sep 17 00:00:00 2001 From: krodyush Date: Tue, 17 Dec 2013 14:02:57 +0400 Subject: [PATCH 07/28] HOST side optimization for GFFT --- modules/ocl/include/opencv2/ocl/ocl.hpp | 2 + modules/ocl/src/gftt.cpp | 366 +++++++++++++----------- modules/ocl/src/opencl/imgproc_gftt.cl | 200 +++---------- 3 files changed, 243 insertions(+), 325 deletions(-) diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index af42136303..d771aea875 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -1381,8 +1381,10 @@ namespace cv oclMat Dx_; oclMat Dy_; oclMat eig_; + oclMat eig_minmax_; oclMat minMaxbuf_; oclMat tmpCorners_; + oclMat counter_; }; inline GoodFeaturesToTrackDetector_OCL::GoodFeaturesToTrackDetector_OCL(int maxCorners_, double qualityLevel_, double minDistance_, diff --git a/modules/ocl/src/gftt.cpp b/modules/ocl/src/gftt.cpp index 541b1d6ef9..658e1a912a 100644 --- a/modules/ocl/src/gftt.cpp +++ b/modules/ocl/src/gftt.cpp @@ -48,154 +48,142 @@ using namespace cv; using namespace cv::ocl; +// currently sort procedure on the host is more efficient static bool use_cpu_sorter = true; -namespace +// compact structure for corners +struct DefCorner { -enum SortMethod + float eig; //eigenvalue of corner + short x; //x coordinate of corner point + short y; //y coordinate of corner point +} ; + +// compare procedure for corner +//it is used for sort on the host side +struct DefCornerCompare { - CPU_STL, - BITONIC, - SELECTION -}; - -const int GROUP_SIZE = 256; - -template -struct Sorter -{ - //typedef EigType; -}; - -//TODO(pengx): optimize GPU sorter's performance thus CPU sorter is removed. 
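For reference, the host-side sorting path this patch enables (use_cpu_sorter == true) reduces to reading the packed 8-byte corner records back from the device and ordering them by descending eigenvalue with std::sort. Below is a minimal, self-contained sketch of that idea; it is an illustration only, not code from the patch, and PackedCorner/ByEigDescending merely stand in for the patch's DefCorner/DefCornerCompare.

// Host-side sort of packed corner records by descending eigenvalue.
#include <algorithm>
#include <cstdio>
#include <vector>

struct PackedCorner            // stand-in for DefCorner: 8 bytes per corner
{
    float eig;                 // corner response (eigenvalue)
    short x, y;                // pixel coordinates
};

struct ByEigDescending         // stand-in for DefCornerCompare
{
    bool operator()(const PackedCorner &a, const PackedCorner &b) const
    {
        return a.eig > b.eig;  // strongest corners first
    }
};

int main()
{
    std::vector<PackedCorner> corners;
    PackedCorner a = { 0.2f, 10, 4 }, b = { 0.9f, 3, 7 }, c = { 0.5f, 8, 1 };
    corners.push_back(a); corners.push_back(b); corners.push_back(c);

    std::sort(corners.begin(), corners.end(), ByEigDescending());

    for (size_t i = 0; i < corners.size(); ++i)
        std::printf("eig=%.2f at (%d, %d)\n", corners[i].eig, corners[i].x, corners[i].y);
    return 0;                  // prints 0.90, 0.50, 0.20 in that order
}

Keeping the whole record in 8 bytes means a single buffer read-back and a cache-friendly sort, which is consistent with the patch's own note that the host sort is currently more efficient than the GPU bitonic/selection sorters it replaces.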
-template<> -struct Sorter -{ - typedef oclMat EigType; - static cv::Mutex cs; - static Mat mat_eig; - - //prototype - static int clfloat2Gt(cl_float2 pt1, cl_float2 pt2) - { - float v1 = mat_eig.at(cvRound(pt1.s[1]), cvRound(pt1.s[0])); - float v2 = mat_eig.at(cvRound(pt2.s[1]), cvRound(pt2.s[0])); - return v1 > v2; - } - static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count) + bool operator()(const DefCorner a, const DefCorner b) const { - cv::AutoLock lock(cs); - //temporarily use STL's sort function - Mat mat_corners = corners; - mat_eig = eig_tex; - std::sort(mat_corners.begin(), mat_corners.begin() + count, clfloat2Gt); - corners = mat_corners; + return a.eig > b.eig; } }; -cv::Mutex Sorter::cs; -cv::Mat Sorter::mat_eig; -template<> -struct Sorter +// sort corner point using opencl bitonicosrt implementation +static void sortCorners_caller(oclMat& corners, const int count) { - typedef TextureCL EigType; - - static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count) + Context * cxt = Context::getContext(); + int GS = count/2; + int LS = min(255,GS); + size_t globalThreads[3] = {GS, 1, 1}; + size_t localThreads[3] = {LS, 1, 1}; + + // 2^numStages should be equal to count or the output is invalid + int numStages = 0; + for(int i = count; i > 1; i >>= 1) { - Context * cxt = Context::getContext(); - size_t globalThreads[3] = {count / 2, 1, 1}; - size_t localThreads[3] = {GROUP_SIZE, 1, 1}; - - // 2^numStages should be equal to count or the output is invalid - int numStages = 0; - for(int i = count; i > 1; i >>= 1) - { - ++numStages; - } - const int argc = 5; - std::vector< std::pair > args(argc); - std::string kernelname = "sortCorners_bitonicSort"; - args[0] = std::make_pair(sizeof(cl_mem), (void *)&eig_tex); - args[1] = std::make_pair(sizeof(cl_mem), (void *)&corners.data); - args[2] = std::make_pair(sizeof(cl_int), (void *)&count); - for(int stage = 0; stage < numStages; ++stage) - { - args[3] = std::make_pair(sizeof(cl_int), (void *)&stage); - for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage) - { - args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage); - openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1); - } - } + ++numStages; } -}; - -template<> -struct Sorter -{ - typedef TextureCL EigType; - - static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count) + const int argc = 4; + std::vector< std::pair > args(argc); + std::string kernelname = "sortCorners_bitonicSort"; + args[0] = std::make_pair(sizeof(cl_mem), (void *)&corners.data); + args[1] = std::make_pair(sizeof(cl_int), (void *)&count); + for(int stage = 0; stage < numStages; ++stage) { - Context * cxt = Context::getContext(); - - size_t globalThreads[3] = {count, 1, 1}; - size_t localThreads[3] = {GROUP_SIZE, 1, 1}; - - std::vector< std::pair > args; - //local - std::string kernelname = "sortCorners_selectionSortLocal"; - int lds_size = GROUP_SIZE * sizeof(cl_float2); - args.push_back( std::make_pair( sizeof(cl_mem), (void*)&eig_tex) ); - args.push_back( std::make_pair( sizeof(cl_mem), (void*)&corners.data) ); - args.push_back( std::make_pair( sizeof(cl_int), (void*)&count) ); - args.push_back( std::make_pair( lds_size, (void*)NULL) ); - - openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1); - - //final - kernelname = "sortCorners_selectionSortFinal"; - args.pop_back(); - openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, 
globalThreads, localThreads, args, -1, -1); + args[2] = std::make_pair(sizeof(cl_int), (void *)&stage); + for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage) + { + args[3] = std::make_pair(sizeof(cl_int), (void *)&passOfStage); + openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1); + } } -}; +} -int findCorners_caller( - const TextureCL& eig, - const float threshold, - const oclMat& mask, - oclMat& corners, - const int max_count) +// find corners on matrix and put it into array +void findCorners_caller( + const oclMat& eig_mat, //input matrix worth eigenvalues + oclMat& eigMinMax, //input with min and max values of eigenvalues + const float qualityLevel, + const oclMat& mask, + oclMat& corners, //output array with detected corners + oclMat& counter) //output value with number of detected corners, have to be 0 before call { + string opt; std::vector k; Context * cxt = Context::getContext(); std::vector< std::pair > args; - std::string kernelname = "findCorners"; const int mask_strip = mask.step / mask.elemSize1(); - oclMat g_counter(1, 1, CV_32SC1); - g_counter.setTo(0); + args.push_back(make_pair( sizeof(cl_mem), (void*)&(eig_mat.data))); - args.push_back(make_pair( sizeof(cl_mem), (void*)&eig )); + int src_pitch = (int)eig_mat.step; + args.push_back(make_pair( sizeof(cl_int), (void*)&src_pitch )); args.push_back(make_pair( sizeof(cl_mem), (void*)&mask.data )); args.push_back(make_pair( sizeof(cl_mem), (void*)&corners.data )); args.push_back(make_pair( sizeof(cl_int), (void*)&mask_strip)); - args.push_back(make_pair( sizeof(cl_float), (void*)&threshold )); - args.push_back(make_pair( sizeof(cl_int), (void*)&eig.rows )); - args.push_back(make_pair( sizeof(cl_int), (void*)&eig.cols )); - args.push_back(make_pair( sizeof(cl_int), (void*)&max_count )); - args.push_back(make_pair( sizeof(cl_mem), (void*)&g_counter.data )); - - size_t globalThreads[3] = {eig.cols, eig.rows, 1}; + args.push_back(make_pair( sizeof(cl_mem), (void*)&eigMinMax.data )); + args.push_back(make_pair( sizeof(cl_float), (void*)&qualityLevel )); + args.push_back(make_pair( sizeof(cl_int), (void*)&eig_mat.rows )); + args.push_back(make_pair( sizeof(cl_int), (void*)&eig_mat.cols )); + args.push_back(make_pair( sizeof(cl_int), (void*)&corners.cols )); + args.push_back(make_pair( sizeof(cl_mem), (void*)&counter.data )); + + size_t globalThreads[3] = {eig_mat.cols, eig_mat.rows, 1}; size_t localThreads[3] = {16, 16, 1}; + if(!mask.empty()) + opt += " -D WITH_MASK=1"; - const char * opt = mask.empty() ? 
"" : "-D WITH_MASK"; - openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1, opt); - return std::min(Mat(g_counter).at(0), max_count); + openCLExecuteKernel(cxt, &imgproc_gftt, "findCorners", globalThreads, localThreads, args, -1, -1, opt.c_str()); +} + + +static void minMaxEig_caller(const oclMat &src, oclMat &dst, oclMat & tozero) +{ + size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits; + CV_Assert(groupnum != 0); + + int dbsize = groupnum * 2 * src.elemSize(); + + ensureSizeIsEnough(1, dbsize, CV_8UC1, dst); + + cl_mem dst_data = reinterpret_cast(dst.data); + + int all_cols = src.step / src.elemSize(); + int pre_cols = (src.offset % src.step) / src.elemSize(); + int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / src.elemSize() - 1; + int invalid_cols = pre_cols + sec_cols; + int cols = all_cols - invalid_cols , elemnum = cols * src.rows; + int offset = src.offset / src.elemSize(); + + {// first parallel pass + vector > args; + args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_data )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&cols )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&invalid_cols )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&offset)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&elemnum)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&groupnum)); + size_t globalThreads[3] = {groupnum * 256, 1, 1}; + size_t localThreads[3] = {256, 1, 1}; + openCLExecuteKernel(src.clCxt, &arithm_minMax, "arithm_op_minMax", globalThreads, localThreads, + args, -1, -1, "-D T=float -D DEPTH_5"); + } + + {// run final "serial" kernel to find accumulate results from threads and reset corner counter + vector > args; + args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_data )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&groupnum )); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&tozero.data )); + size_t globalThreads[3] = {1, 1, 1}; + size_t localThreads[3] = {1, 1, 1}; + openCLExecuteKernel(src.clCxt, &imgproc_gftt, "arithm_op_minMax_final", globalThreads, localThreads, + args, -1, -1); + } } -}//unnamed namespace void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, oclMat& corners, const oclMat& mask) { @@ -205,67 +193,99 @@ void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, ensureSizeIsEnough(image.size(), CV_32F, eig_); if (useHarrisDetector) - cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK); + cornerHarris_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK); else cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3); - double maxVal = 0; - minMax(eig_, NULL, &maxVal); + ensureSizeIsEnough(1,1, CV_32SC1, counter_); + + // find max eigenvalue and reset detected counters + minMaxEig_caller(eig_,eig_minmax_,counter_); - ensureSizeIsEnough(1, std::max(1000, static_cast(image.size().area() * 0.05)), CV_32FC2, tmpCorners_); + // allocate buffer for kernels + int corner_array_size = std::max(1024, static_cast(image.size().area() * 0.05)); - Ptr eig_tex = bindTexturePtr(eig_); - int total = findCorners_caller( - *eig_tex, - static_cast(maxVal * qualityLevel), + if(!use_cpu_sorter) + { // round to 2^n + unsigned int n=1; + for(n=1;n<(unsigned int)corner_array_size;n<<=1); + corner_array_size = (int)n; + + ensureSizeIsEnough(1, corner_array_size , CV_32FC2, tmpCorners_); + + // set to 0 to 
be able use bitonic sort on whole 2^n array + tmpCorners_.setTo(0); + } + else + { + ensureSizeIsEnough(1, corner_array_size , CV_32FC2, tmpCorners_); + } + + int total = tmpCorners_.cols; // by default the number of corner is full array + vector tmp(tmpCorners_.cols); // input buffer with corner for HOST part of algorithm + + //find points with high eigenvalue and put it into the output array + findCorners_caller( + eig_, + eig_minmax_, + static_cast(qualityLevel), mask, tmpCorners_, - tmpCorners_.cols); + counter_); + + if(!use_cpu_sorter) + {// sort detected corners on deivce side + sortCorners_caller(tmpCorners_, corner_array_size); + } + else + {// send non-blocking request to read real non-zero number of corners to sort it on the HOST side + openCLVerifyCall(clEnqueueReadBuffer(getClCommandQueue(counter_.clCxt), (cl_mem)counter_.data, CL_FALSE, 0,sizeof(int), &total, 0, NULL, NULL)); + } + + //blocking read whole corners array (sorted or not sorted) + openCLReadBuffer(tmpCorners_.clCxt,(cl_mem)tmpCorners_.data,&tmp[0],tmpCorners_.cols*sizeof(DefCorner)); if (total == 0) - { + {// check for trivial case corners.release(); return; } + if(use_cpu_sorter) - { - Sorter::sortCorners_caller(eig_, tmpCorners_, total); - } - else - { - //if total is power of 2 - if(((total - 1) & (total)) == 0) - { - Sorter::sortCorners_caller(*eig_tex, tmpCorners_, total); - } - else - { - Sorter::sortCorners_caller(*eig_tex, tmpCorners_, total); - } + {// sort detected corners on cpu side. + tmp.resize(total); + cv::sort(tmp,DefCornerCompare()); } + //estimate maximal size of final output array + int total_max = maxCorners > 0 ? std::min(maxCorners, total) : total; + int D2 = (int)ceil(minDistance * minDistance); + // allocate output buffer + vector tmp2; + tmp2.reserve(total_max); + + if (minDistance < 1) - { - Rect roi_range(0, 0, maxCorners > 0 ? std::min(maxCorners, total) : total, 1); - tmpCorners_(roi_range).copyTo(corners); + {// we have not distance restriction. then just copy with conversion maximal allowed points into output array + for(int i=0;i0.0f;++i) + { + tmp2.push_back(Point2f(tmp[i].x,tmp[i].y)); + } } else - { - vector tmp(total); - downloadPoints(tmpCorners_, tmp); - - vector tmp2; - tmp2.reserve(total); - + {// we have distance restriction. then start coping to output array from the first element and check distance for each next one const int cell_size = cvRound(minDistance); const int grid_width = (image.cols + cell_size - 1) / cell_size; const int grid_height = (image.rows + cell_size - 1) / cell_size; - std::vector< std::vector > grid(grid_width * grid_height); + std::vector< std::vector > grid(grid_width * grid_height); - for (int i = 0; i < total; ++i) + for (int i = 0; i < total ; ++i) { - Point2f p = tmp[i]; + DefCorner p = tmp[i]; + + if(p.eig<=0.0f) + break; // condition to stop that is needed for GPU bitonic sort usage. 
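The distance filtering that follows buckets every accepted corner into a cell of roughly minDistance pixels, so each new candidate only has to be compared against corners in the 3x3 block of cells around it rather than the whole accepted list. A simplified standalone sketch of that grid test is below; it is an illustration only, and the names, image size and candidate points are made up for the example.

// Grid-based minimum-distance rejection: only the 3x3 cell neighbourhood
// around a candidate is scanned before accepting it.
#include <algorithm>
#include <cstdio>
#include <vector>

struct Pt { float x, y; };

static bool farEnough(const std::vector< std::vector<Pt> > &grid,
                      int gridW, int gridH, float cell, float minDist2, Pt p)
{
    int cx = (int)(p.x / cell), cy = (int)(p.y / cell);
    for (int yy = std::max(0, cy - 1); yy <= std::min(gridH - 1, cy + 1); ++yy)
        for (int xx = std::max(0, cx - 1); xx <= std::min(gridW - 1, cx + 1); ++xx)
            for (size_t j = 0; j < grid[yy * gridW + xx].size(); ++j)
            {
                float dx = p.x - grid[yy * gridW + xx][j].x;
                float dy = p.y - grid[yy * gridW + xx][j].y;
                if (dx * dx + dy * dy < minDist2)
                    return false;          // too close to an already accepted corner
            }
    return true;
}

int main()
{
    const float minDist = 10.f, cell = minDist;
    const int W = 64, H = 48;
    const int gridW = (W + (int)cell - 1) / (int)cell;
    const int gridH = (H + (int)cell - 1) / (int)cell;
    std::vector< std::vector<Pt> > grid(gridW * gridH);

    Pt candidates[] = { {5, 5}, {8, 7}, {40, 30} };   // already sorted by response
    int accepted = 0;
    for (int i = 0; i < 3; ++i)
        if (farEnough(grid, gridW, gridH, cell, minDist * minDist, candidates[i]))
        {
            int cx = (int)(candidates[i].x / cell), cy = (int)(candidates[i].y / cell);
            grid[cy * gridW + cx].push_back(candidates[i]);
            ++accepted;
        }
    std::printf("accepted %d of 3 candidates\n", accepted);   // expects 2
    return 0;
}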
bool good = true; @@ -287,40 +307,42 @@ void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, { for (int xx = x1; xx <= x2; xx++) { - vector& m = grid[yy * grid_width + xx]; - - if (!m.empty()) + vector& m = grid[yy * grid_width + xx]; + if (m.empty()) + continue; + for(size_t j = 0; j < m.size(); j++) { - for(size_t j = 0; j < m.size(); j++) + int dx = p.x - m[j].x; + int dy = p.y - m[j].y; + + if (dx * dx + dy * dy < D2) { - float dx = p.x - m[j].x; - float dy = p.y - m[j].y; - - if (dx * dx + dy * dy < minDistance * minDistance) - { - good = false; - goto break_out; - } + good = false; + goto break_out_; } } } } - break_out: + break_out_: if(good) { - grid[y_cell * grid_width + x_cell].push_back(p); + grid[y_cell * grid_width + x_cell].push_back(Point2i(p.x,p.y)); - tmp2.push_back(p); + tmp2.push_back(Point2f(p.x,p.y)); if (maxCorners > 0 && tmp2.size() == static_cast(maxCorners)) break; } } - corners.upload(Mat(1, static_cast(tmp2.size()), CV_32FC2, &tmp2[0])); } + int final_size = static_cast(tmp2.size()); + if(final_size>0) + corners.upload(Mat(1, final_size, CV_32FC2, &tmp2[0])); + else + corners.release(); } void cv::ocl::GoodFeaturesToTrackDetector_OCL::downloadPoints(const oclMat &points, vector &points_v) { diff --git a/modules/ocl/src/opencl/imgproc_gftt.cl b/modules/ocl/src/opencl/imgproc_gftt.cl index 80bdec08ff..4d5356cfbd 100644 --- a/modules/ocl/src/opencl/imgproc_gftt.cl +++ b/modules/ocl/src/opencl/imgproc_gftt.cl @@ -46,33 +46,26 @@ #ifndef WITH_MASK #define WITH_MASK 0 #endif - -__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; - -inline float ELEM_INT2(image2d_t _eig, int _x, int _y) -{ - return read_imagef(_eig, sampler, (int2)(_x, _y)).x; -} - -inline float ELEM_FLT2(image2d_t _eig, float2 pt) -{ - return read_imagef(_eig, sampler, pt).x; -} +//macro to read eigenvalue matrix +#define GET_SRC_32F(_x, _y) ((__global const float*)(eig + (_y)*eig_pitch))[_x] __kernel void findCorners ( - image2d_t eig, - __global const char * mask, - __global float2 * corners, - const int mask_strip,// in pixels - const float threshold, - const int rows, - const int cols, - const int max_count, - __global int * g_counter + __global const char* eig, + const int eig_pitch, + __global const char* mask, + __global float2* corners, + const int mask_strip,// in pixels + __global const float* pMinMax, + const float qualityLevel, + const int rows, + const int cols, + const int max_count, + __global int* g_counter ) { + float threshold = qualityLevel*pMinMax[1]; const int j = get_global_id(0); const int i = get_global_id(1); @@ -82,39 +75,42 @@ __kernel #endif ) { - const float val = ELEM_INT2(eig, j, i); + const float val = GET_SRC_32F(j, i); if (val > threshold) { float maxVal = val; + maxVal = fmax(GET_SRC_32F(j - 1, i - 1), maxVal); + maxVal = fmax(GET_SRC_32F(j , i - 1), maxVal); + maxVal = fmax(GET_SRC_32F(j + 1, i - 1), maxVal); - maxVal = fmax(ELEM_INT2(eig, j - 1, i - 1), maxVal); - maxVal = fmax(ELEM_INT2(eig, j , i - 1), maxVal); - maxVal = fmax(ELEM_INT2(eig, j + 1, i - 1), maxVal); - - maxVal = fmax(ELEM_INT2(eig, j - 1, i), maxVal); - maxVal = fmax(ELEM_INT2(eig, j + 1, i), maxVal); + maxVal = fmax(GET_SRC_32F(j - 1, i), maxVal); + maxVal = fmax(GET_SRC_32F(j + 1, i), maxVal); - maxVal = fmax(ELEM_INT2(eig, j - 1, i + 1), maxVal); - maxVal = fmax(ELEM_INT2(eig, j , i + 1), maxVal); - maxVal = fmax(ELEM_INT2(eig, j + 1, i + 1), maxVal); + maxVal = fmax(GET_SRC_32F(j - 1, i + 1), maxVal); + maxVal = 
fmax(GET_SRC_32F(j , i + 1), maxVal); + maxVal = fmax(GET_SRC_32F(j + 1, i + 1), maxVal); if (val == maxVal) { const int ind = atomic_inc(g_counter); if (ind < max_count) - corners[ind] = (float2)(j, i); + {// pack and store eigenvalue and its coordinates + corners[ind].x = val; + corners[ind].y = as_float(j|(i<<16)); + } } } } } +#undef GET_SRC_32F + //bitonic sort __kernel void sortCorners_bitonicSort ( - image2d_t eig, __global float2 * corners, const int count, const int stage, @@ -140,8 +136,8 @@ __kernel const float2 leftPt = corners[leftId]; const float2 rightPt = corners[rightId]; - const float leftVal = ELEM_FLT2(eig, leftPt); - const float rightVal = ELEM_FLT2(eig, rightPt); + const float leftVal = leftPt.x; + const float rightVal = rightPt.x; const bool compareResult = leftVal > rightVal; @@ -152,124 +148,22 @@ __kernel corners[rightId] = sortOrder ? greater : lesser; } -//selection sort for gfft -//kernel is ported from Bolt library: -//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl -// Local sort will firstly sort elements of each workgroup using selection sort -// its performance is O(n) -__kernel - void sortCorners_selectionSortLocal - ( - image2d_t eig, - __global float2 * corners, - const int count, - __local float2 * scratch - ) +// this is simple short serial kernel that makes some short reduction and initialization work +// it makes HOST like work to avoid additional sync with HOST to do this short work +// data - input/output float2. +// input data are sevral (min,max) pairs +// output data is one reduced (min,max) pair +// g_counter - counter that have to be initialized by 0 for next findCorner call. +__kernel void arithm_op_minMax_final(__global float * data, int groupnum,__global int * g_counter) { - int i = get_local_id(0); // index in workgroup - int numOfGroups = get_num_groups(0); // index in workgroup - int groupID = get_group_id(0); - int wg = get_local_size(0); // workgroup size = block size - int n; // number of elements to be processed for this work group - - int offset = groupID * wg; - int same = 0; - corners += offset; - n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg; - float2 pt1, pt2; - - pt1 = corners[min(i, n)]; - scratch[i] = pt1; - barrier(CLK_LOCAL_MEM_FENCE); - - if(i >= n) + g_counter[0] = 0; + float minVal = data[0]; + float maxVal = data[groupnum]; + for(int i=1;i val1) - pos++;//calculate the rank of this element in this work group - else - { - if(val1 > val2) - continue; - else - { - // val1 and val2 are same - same++; - } - } - } - for (int j=0; j< same; j++) - corners[pos + j] = pt1; -} -__kernel - void sortCorners_selectionSortFinal - ( - image2d_t eig, - __global float2 * corners, - const int count - ) -{ - const int i = get_local_id(0); // index in workgroup - const int numOfGroups = get_num_groups(0); // index in workgroup - const int groupID = get_group_id(0); - const int wg = get_local_size(0); // workgroup size = block size - int pos = 0, same = 0; - const int offset = get_group_id(0) * wg; - const int remainder = count - wg*(numOfGroups-1); - - if((offset + i ) >= count) - return; - float2 pt1, pt2; - pt1 = corners[groupID*wg + i]; - - float val1 = ELEM_FLT2(eig, pt1); - float val2; - - for(int j=0; j val2) - break; - else - { - //Increment only if the value is not the same. - if( val2 > val1 ) - pos++; - else - same++; - } - } - } - - for(int k=0; k val2) - break; - else - { - //Don't increment if the value is the same. 
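The updated findCorners kernel above packs each detection into a float2 whose .x is the eigenvalue and whose .y is as_float(x | (y << 16)); that is what lets the host reinterpret the same buffer as DefCorner records and lets the bitonic kernel compare plain .x values instead of sampling a texture. A host-only sketch of the packing is below; it is an illustration, and it assumes the usual little-endian, padding-free layout that the trick itself relies on.

// Demonstrates the float2 <-> {float eig; short x; short y;} packing.
#include <cstdio>
#include <cstring>

struct PackedCorner { float eig; short x, y; };    // host-side view (DefCorner)

int main()
{
    // What the kernel writes for a corner at (j, i) = (640, 360) with eig = 0.75:
    float f2[2];
    f2[0] = 0.75f;                                  // corners[ind].x = val
    unsigned int xy = 640u | (360u << 16);          // j | (i << 16)
    std::memcpy(&f2[1], &xy, sizeof(float));        // as_float(j | (i << 16))

    // What the host reads back from the same 8 bytes:
    PackedCorner c;
    std::memcpy(&c, f2, sizeof(c));
    std::printf("eig=%.2f x=%d y=%d\n", c.eig, c.x, c.y);   // eig=0.75 x=640 y=360
    return 0;
}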
- //Two elements are same if (*userComp)(jData, iData) and (*userComp)(iData, jData) are both false - if(val2 > val1) - pos++; - else - same++; - } - } - for (int j=0; j< same; j++) - corners[pos + j] = pt1; -} + data[0] = minVal; + data[1] = maxVal; +} \ No newline at end of file From 917b883cf0d703c8e5ce3bb17df7755cf4a291f3 Mon Sep 17 00:00:00 2001 From: krodyush Date: Tue, 17 Dec 2013 14:04:10 +0400 Subject: [PATCH 08/28] remove extra calculations from haar to be consistent with native implementation --- modules/ocl/src/haar.cpp | 69 +++++++++++++++------- modules/ocl/src/opencl/haarobjectdetect.cl | 26 ++++---- 2 files changed, 62 insertions(+), 33 deletions(-) diff --git a/modules/ocl/src/haar.cpp b/modules/ocl/src/haar.cpp index 25d376a4e1..e334ad913b 100644 --- a/modules/ocl/src/haar.cpp +++ b/modules/ocl/src/haar.cpp @@ -866,16 +866,17 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS if(gcascade->is_stump_based && gsum.clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE)) { - //setup local group size - localThreads[0] = 8; - localThreads[1] = 16; + //setup local group size for "pixel step" = 1 + localThreads[0] = 16; + localThreads[1] = 32; localThreads[2] = 1; - //init maximal number of workgroups + //calc maximal number of workgroups int WGNumX = 1+(sizev[0].width /(localThreads[0])); int WGNumY = 1+(sizev[0].height/(localThreads[1])); int WGNumZ = loopcount; - int WGNum = 0; //accurate number of non -empty workgroups + int WGNumTotal = 0; //accurate number of non-empty workgroups + int WGNumSampled = 0; //accurate number of workgroups processed only 1/4 part of all pixels. it is made for large images with scale <= 2 oclMat oclWGInfo(1,sizeof(cl_int4) * WGNumX*WGNumY*WGNumZ,CV_8U); { cl_int4* pWGInfo = (cl_int4*)clEnqueueMapBuffer(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,true,CL_MAP_WRITE, 0, oclWGInfo.step, 0,0,0,&status); @@ -895,12 +896,16 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS if(gx>=(Width-cascade->orig_window_size.width)) continue; // no data to process + if(scaleinfo[z].factor<=2) + { + WGNumSampled++; + } // save no-empty workgroup info into array - pWGInfo[WGNum].s[0] = scaleinfo[z].width_height; - pWGInfo[WGNum].s[1] = (gx << 16) | gy; - pWGInfo[WGNum].s[2] = scaleinfo[z].imgoff; - memcpy(&(pWGInfo[WGNum].s[3]),&(scaleinfo[z].factor),sizeof(float)); - WGNum++; + pWGInfo[WGNumTotal].s[0] = scaleinfo[z].width_height; + pWGInfo[WGNumTotal].s[1] = (gx << 16) | gy; + pWGInfo[WGNumTotal].s[2] = scaleinfo[z].imgoff; + memcpy(&(pWGInfo[WGNumTotal].s[3]),&(scaleinfo[z].factor),sizeof(float)); + WGNumTotal++; } } } @@ -908,13 +913,8 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS pWGInfo = NULL; } - // setup global sizes to have linear array of workgroups with WGNum size - globalThreads[0] = localThreads[0]*WGNum; - globalThreads[1] = localThreads[1]; - globalThreads[2] = 1; - #define NODE_SIZE 12 - // pack node info to have less memory loads + // pack node info to have less memory loads on the device side oclMat oclNodesPK(1,sizeof(cl_int) * NODE_SIZE * nodenum,CV_8U); { cl_int status; @@ -963,8 +963,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS options += format(" -D WND_SIZE_X=%d",cascade->orig_window_size.width); options += format(" -D WND_SIZE_Y=%d",cascade->orig_window_size.height); options += format(" -D STUMP_BASED=%d",gcascade->is_stump_based); - options += format(" -D LSx=%d",localThreads[0]); - 
options += format(" -D LSy=%d",localThreads[1]); options += format(" -D SPLITNODE=%d",splitnode); options += format(" -D SPLITSTAGE=%d",splitstage); options += format(" -D OUTPUTSZ=%d",outputsz); @@ -972,8 +970,39 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS // init candiate global count by 0 int pattern = 0; openCLSafeCall(clEnqueueWriteBuffer(qu, candidatebuffer, 1, 0, 1 * sizeof(pattern),&pattern, 0, NULL, NULL)); - // execute face detector - openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, localThreads, args, -1, -1, options.c_str()); + + if(WGNumTotal>WGNumSampled) + {// small images and each pixel is processed + // setup global sizes to have linear array of workgroups with WGNum size + int pixelstep = 1; + size_t LS[3]={localThreads[0]/pixelstep,localThreads[1]/pixelstep,1}; + globalThreads[0] = LS[0]*(WGNumTotal-WGNumSampled); + globalThreads[1] = LS[1]; + globalThreads[2] = 1; + string options1 = options; + options1 += format(" -D PIXEL_STEP=%d",pixelstep); + options1 += format(" -D WGSTART=%d",WGNumSampled); + options1 += format(" -D LSx=%d",LS[0]); + options1 += format(" -D LSy=%d",LS[1]); + // execute face detector + openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, LS, args, -1, -1, options1.c_str()); + } + if(WGNumSampled>0) + {// large images each 4th pixel is processed + // setup global sizes to have linear array of workgroups with WGNum size + int pixelstep = 2; + size_t LS[3]={localThreads[0]/pixelstep,localThreads[1]/pixelstep,1}; + globalThreads[0] = LS[0]*WGNumSampled; + globalThreads[1] = LS[1]; + globalThreads[2] = 1; + string options2 = options; + options2 += format(" -D PIXEL_STEP=%d",pixelstep); + options2 += format(" -D WGSTART=%d",0); + options2 += format(" -D LSx=%d",LS[0]); + options2 += format(" -D LSy=%d",LS[1]); + // execute face detector + openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, LS, args, -1, -1, options2.c_str()); + } //read candidate buffer back and put it into host list openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz ); assert(candidate[0]> 16)&0xFFFF; int GroupY = (WGInfo.y >> 0 )& 0xFFFF; int Width = (WGInfo.x >> 16)&0xFFFF; @@ -140,8 +138,8 @@ __kernel void gpuRunHaarClassifierCascadePacked( int ImgOffset = WGInfo.z; float ScaleFactor = as_float(WGInfo.w); -#define DATA_SIZE_X (LSx+WND_SIZE_X) -#define DATA_SIZE_Y (LSy+WND_SIZE_Y) +#define DATA_SIZE_X (PIXEL_STEP*LSx+WND_SIZE_X) +#define DATA_SIZE_Y (PIXEL_STEP*LSy+WND_SIZE_Y) #define DATA_SIZE (DATA_SIZE_X*DATA_SIZE_Y) local int SumL[DATA_SIZE]; @@ -165,9 +163,11 @@ __kernel void gpuRunHaarClassifierCascadePacked( int4 info1 = p; int4 info2 = pq; - { - int xl = lid_x; - int yl = lid_y; + // calc processed ROI coordinate in local mem + int xl = lid_x*PIXEL_STEP; + int yl = lid_y*PIXEL_STEP; + + {// calc variance_norm_factor for all stages int OffsetLocal = yl * DATA_SIZE_X + xl; int OffsetGlobal = (GroupY+yl)* pixelstep + (GroupX+xl); @@ -194,13 +194,13 @@ __kernel void gpuRunHaarClassifierCascadePacked( int result = (1.0f>0.0f); for(int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++ ) - {// iterate until candidate is exist + {// iterate until candidate is valid float stage_sum = 0.0f; __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*) ((__global 
uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier)); + int lcl_off = (yl*DATA_SIZE_X)+(xl); int stagecount = stageinfo->count; float stagethreshold = stageinfo->threshold; - int lcl_off = (lid_y*DATA_SIZE_X)+(lid_x); for(int nodeloop = 0; nodeloop < stagecount; nodecounter++,nodeloop++ ) { // simple macro to extract shorts from int @@ -212,7 +212,7 @@ __kernel void gpuRunHaarClassifierCascadePacked( int4 n1 = pN[1]; int4 n2 = pN[2]; float nodethreshold = as_float(n2.y) * variance_norm_factor; - // calc sum of intensity pixels according to node information + // calc sum of intensity pixels according to classifier node information float classsum = (SumL[M0(n0.x)+lcl_off] - SumL[M1(n0.x)+lcl_off] - SumL[M0(n0.y)+lcl_off] + SumL[M1(n0.y)+lcl_off]) * as_float(n1.z) + (SumL[M0(n0.z)+lcl_off] - SumL[M1(n0.z)+lcl_off] - SumL[M0(n0.w)+lcl_off] + SumL[M1(n0.w)+lcl_off]) * as_float(n1.w) + @@ -228,8 +228,8 @@ __kernel void gpuRunHaarClassifierCascadePacked( int index = 1+atomic_inc((volatile global int*)candidate); //get index to write global data with face info if(index Date: Tue, 17 Dec 2013 14:06:14 +0400 Subject: [PATCH 09/28] Changes the datatype of the angle of the gradient for Intel platforms. --- modules/ocl/src/hog.cpp | 24 ++++++++++++++++++++---- modules/ocl/src/opencl/objdetect_hog.cl | 18 +++++++++++++----- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/modules/ocl/src/hog.cpp b/modules/ocl/src/hog.cpp index 68f3949a84..1f8afe5590 100644 --- a/modules/ocl/src/hog.cpp +++ b/modules/ocl/src/hog.cpp @@ -76,6 +76,11 @@ namespace cv int cdescr_width; int cdescr_height; + // A shift value and type that allows qangle to be different + // sizes on different hardware + int qangle_step_shift; + int qangle_type; + void set_up_constants(int nbins, int block_stride_x, int block_stride_y, int nblocks_win_x, int nblocks_win_y); @@ -153,6 +158,7 @@ cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size blo hog_device_cpu = true; else hog_device_cpu = false; + } size_t cv::ocl::HOGDescriptor::getDescriptorSize() const @@ -213,7 +219,7 @@ void cv::ocl::HOGDescriptor::init_buffer(const oclMat &img, Size win_stride) effect_size = img.size(); grad.create(img.size(), CV_32FC2); - qangle.create(img.size(), CV_8UC2); + qangle.create(img.size(), hog::qangle_type); const size_t block_hist_size = getBlockHistogramSize(); const Size blocks_per_img = numPartsWithin(img.size(), block_size, block_stride); @@ -1607,6 +1613,16 @@ void cv::ocl::device::hog::set_up_constants(int nbins, int descr_size = descr_width * nblocks_win_y; cdescr_size = descr_size; + + qangle_type = CV_8UC2; + qangle_step_shift = 0; + // Some Intel devices have low single-byte access performance, + // so we change the datatype here. + if (Context::getContext()->supportsFeature(FEATURE_CL_INTEL_DEVICE)) + { + qangle_type = CV_32SC2; + qangle_step_shift = 2; + } } void cv::ocl::device::hog::compute_hists(int nbins, @@ -1628,7 +1644,7 @@ void cv::ocl::device::hog::compute_hists(int nbins, int blocks_total = img_block_width * img_block_height; int grad_quadstep = grad.step >> 2; - int qangle_step = qangle.step; + int qangle_step = qangle.step >> qangle_step_shift; int blocks_in_group = 4; size_t localThreads[3] = { blocks_in_group * 24, 2, 1 }; @@ -1892,7 +1908,7 @@ void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, char correctGamma = (correct_gamma) ? 
1 : 0; int img_step = img.step; int grad_quadstep = grad.step >> 3; - int qangle_step = qangle.step >> 1; + int qangle_step = qangle.step >> (1 + qangle_step_shift); args.push_back( make_pair( sizeof(cl_int), (void *)&height)); args.push_back( make_pair( sizeof(cl_int), (void *)&width)); @@ -1927,7 +1943,7 @@ void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, char correctGamma = (correct_gamma) ? 1 : 0; int img_step = img.step >> 2; int grad_quadstep = grad.step >> 3; - int qangle_step = qangle.step >> 1; + int qangle_step = qangle.step >> (1 + qangle_step_shift); args.push_back( make_pair( sizeof(cl_int), (void *)&height)); args.push_back( make_pair( sizeof(cl_int), (void *)&width)); diff --git a/modules/ocl/src/opencl/objdetect_hog.cl b/modules/ocl/src/opencl/objdetect_hog.cl index 0d2f26f966..60d7346e5a 100644 --- a/modules/ocl/src/opencl/objdetect_hog.cl +++ b/modules/ocl/src/opencl/objdetect_hog.cl @@ -50,6 +50,14 @@ #define NTHREADS 256 #define CV_PI_F 3.1415926535897932384626433832795f +#ifdef INTEL_DEVICE +#define QANGLE_TYPE int +#define QANGLE_TYPE2 int2 +#else +#define QANGLE_TYPE uchar +#define QANGLE_TYPE2 uchar2 +#endif + //---------------------------------------------------------------------------- // Histogram computation // 12 threads for a cell, 12x4 threads per block @@ -59,7 +67,7 @@ __kernel void compute_hists_lut_kernel( const int cnbins, const int cblock_hist_size, const int img_block_width, const int blocks_in_group, const int blocks_total, const int grad_quadstep, const int qangle_step, - __global const float* grad, __global const uchar* qangle, + __global const float* grad, __global const QANGLE_TYPE* qangle, __global const float* gauss_w_lut, __global float* block_hists, __local float* smem) { @@ -86,7 +94,7 @@ __kernel void compute_hists_lut_kernel( __global const float* grad_ptr = (gid < blocks_total) ? grad + offset_y * grad_quadstep + (offset_x << 1) : grad; - __global const uchar* qangle_ptr = (gid < blocks_total) ? + __global const QANGLE_TYPE* qangle_ptr = (gid < blocks_total) ? 
qangle + offset_y * qangle_step + (offset_x << 1) : qangle; __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + @@ -101,7 +109,7 @@ __kernel void compute_hists_lut_kernel( for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y) { float2 vote = (float2) (grad_ptr[0], grad_ptr[1]); - uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]); + QANGLE_TYPE2 bin = (QANGLE_TYPE2) (qangle_ptr[0], qangle_ptr[1]); grad_ptr += grad_quadstep; qangle_ptr += qangle_step; @@ -558,7 +566,7 @@ __kernel void extract_descrs_by_cols_kernel( __kernel void compute_gradients_8UC4_kernel( const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step, - const __global uchar4 * img, __global float * grad, __global uchar * qangle, + const __global uchar4 * img, __global float * grad, __global QANGLE_TYPE * qangle, const float angle_scale, const char correct_gamma, const int cnbins) { const int x = get_global_id(0); @@ -660,7 +668,7 @@ __kernel void compute_gradients_8UC4_kernel( __kernel void compute_gradients_8UC1_kernel( const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step, - __global const uchar * img, __global float * grad, __global uchar * qangle, + __global const uchar * img, __global float * grad, __global QANGLE_TYPE * qangle, const float angle_scale, const char correct_gamma, const int cnbins) { const int x = get_global_id(0); From dfe7c98090402018318d86b9059cbe63a831df53 Mon Sep 17 00:00:00 2001 From: krodyush Date: Tue, 17 Dec 2013 14:09:06 +0400 Subject: [PATCH 10/28] optimize separable filter by Added "sep_filter_singlepass" kernel that performs separable filtering in one kernel call Added appropriate host part - sepFilter2D_SinglePass function and SingleStepSeparableFilterEngine_GPU class Changed function declarations to enable their usage --- modules/ocl/include/opencv2/ocl/ocl.hpp | 7 +- modules/ocl/src/filtering.cpp | 191 +++++++++++++++++- .../opencl/filtering_sep_filter_singlepass.cl | 185 +++++++++++++++++ 3 files changed, 369 insertions(+), 14 deletions(-) create mode 100644 modules/ocl/src/opencl/filtering_sep_filter_singlepass.cl diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index af42136303..d144a042e8 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -706,17 +706,17 @@ namespace cv //! returns the separable linear filter engine CV_EXPORTS Ptr createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel, - const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT); + const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1)); //! returns the separable filter engine with the specified filters CV_EXPORTS Ptr createSeparableFilter_GPU(const Ptr &rowFilter, const Ptr &columnFilter); //! returns the Gaussian filter engine - CV_EXPORTS Ptr createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT); + CV_EXPORTS Ptr createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1)); //! 
returns filter engine for the generalized Sobel operator - CV_EXPORTS Ptr createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT ); + CV_EXPORTS Ptr createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT, Size imgSize = Size(-1,-1) ); //! applies Laplacian operator to the image // supports only ksize = 1 and ksize = 3 @@ -869,7 +869,6 @@ namespace cv CV_EXPORTS void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT); CV_EXPORTS void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &Dx, oclMat &Dy, int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT); - /////////////////////////////////// ML /////////////////////////////////////////// //! Compute closest centers for each lines in source and lable it after center's index diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp index 4f9802cb71..20895abee3 100644 --- a/modules/ocl/src/filtering.cpp +++ b/modules/ocl/src/filtering.cpp @@ -739,6 +739,135 @@ void cv::ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &ke f->apply(src, dst); } +const int optimizedSepFilterLocalSize = 16; +static void sepFilter2D_SinglePass(const oclMat &src, oclMat &dst, + const Mat &row_kernel, const Mat &col_kernel, int bordertype = BORDER_DEFAULT) +{ + size_t lt2[3] = {optimizedSepFilterLocalSize, optimizedSepFilterLocalSize, 1}; + size_t gt2[3] = {lt2[0]*(1 + (src.cols-1) / lt2[0]), lt2[1]*(1 + (src.rows-1) / lt2[1]), 1}; + + unsigned int src_pitch = src.step; + unsigned int dst_pitch = dst.step; + + int src_offset_x = (src.offset % src.step) / src.elemSize(); + int src_offset_y = src.offset / src.step; + + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data )); + args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch )); + + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_x )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_y )); + + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.offset )); + args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&dst_pitch )); + + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.wholecols )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.wholerows )); + + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows )); + + string option = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d",(int)lt2[0], (int)lt2[1], + row_kernel.rows / 2, col_kernel.rows / 2 ); + + option += " -D KERNEL_MATRIX_X="; + for(int i=0; i( &row_kernel.at(i) ) ); + option += "0x0"; + + option += " -D KERNEL_MATRIX_Y="; + for(int i=0; i( &col_kernel.at(i) ) ); + option += "0x0"; + + switch(src.type()) + { + case CV_8UC1: + option += " -D SRCTYPE=uchar -D CONVERT_SRCTYPE=convert_float -D WORKTYPE=float"; + break; + case CV_32FC1: + option += " -D SRCTYPE=float -D CONVERT_SRCTYPE= -D WORKTYPE=float"; + break; + case CV_8UC2: + option += " -D SRCTYPE=uchar2 -D CONVERT_SRCTYPE=convert_float2 -D WORKTYPE=float2"; + break; + case CV_32FC2: + option += " -D SRCTYPE=float2 -D CONVERT_SRCTYPE= -D WORKTYPE=float2"; + break; + case CV_8UC3: + option += " -D SRCTYPE=uchar3 -D CONVERT_SRCTYPE=convert_float3 -D WORKTYPE=float3"; + break; 
+ case CV_32FC3: + option += " -D SRCTYPE=float3 -D CONVERT_SRCTYPE= -D WORKTYPE=float3"; + break; + case CV_8UC4: + option += " -D SRCTYPE=uchar4 -D CONVERT_SRCTYPE=convert_float4 -D WORKTYPE=float4"; + break; + case CV_32FC4: + option += " -D SRCTYPE=float4 -D CONVERT_SRCTYPE= -D WORKTYPE=float4"; + break; + default: + CV_Error(CV_StsUnsupportedFormat, "Image type is not supported!"); + break; + } + switch(dst.type()) + { + case CV_8UC1: + option += " -D DSTTYPE=uchar -D CONVERT_DSTTYPE=convert_uchar_sat"; + break; + case CV_8UC2: + option += " -D DSTTYPE=uchar2 -D CONVERT_DSTTYPE=convert_uchar2_sat"; + break; + case CV_8UC3: + option += " -D DSTTYPE=uchar3 -D CONVERT_DSTTYPE=convert_uchar3_sat"; + break; + case CV_8UC4: + option += " -D DSTTYPE=uchar4 -D CONVERT_DSTTYPE=convert_uchar4_sat"; + break; + case CV_32FC1: + option += " -D DSTTYPE=float -D CONVERT_DSTTYPE="; + break; + case CV_32FC2: + option += " -D DSTTYPE=float2 -D CONVERT_DSTTYPE="; + break; + case CV_32FC3: + option += " -D DSTTYPE=float3 -D CONVERT_DSTTYPE="; + break; + case CV_32FC4: + option += " -D DSTTYPE=float4 -D CONVERT_DSTTYPE="; + break; + default: + CV_Error(CV_StsUnsupportedFormat, "Image type is not supported!"); + break; + } + switch(bordertype) + { + case cv::BORDER_CONSTANT: + option += " -D BORDER_CONSTANT"; + break; + case cv::BORDER_REPLICATE: + option += " -D BORDER_REPLICATE"; + break; + case cv::BORDER_REFLECT: + option += " -D BORDER_REFLECT"; + break; + case cv::BORDER_REFLECT101: + option += " -D BORDER_REFLECT_101"; + break; + case cv::BORDER_WRAP: + option += " -D BORDER_WRAP"; + break; + default: + CV_Error(CV_StsBadFlag, "BORDER type is not supported!"); + break; + } + + openCLExecuteKernel(src.clCxt, &filtering_sep_filter_singlepass, "sep_filter_singlepass", gt2, lt2, args, + -1, -1, option.c_str() ); +} + //////////////////////////////////////////////////////////////////////////////////////////////////// // SeparableFilter @@ -788,6 +917,35 @@ Ptr cv::ocl::createSeparableFilter_GPU(const Ptr(new SeparableFilterEngine_GPU(rowFilter, columnFilter)); } +namespace +{ +class SingleStepSeparableFilterEngine_GPU : public FilterEngine_GPU +{ +public: + SingleStepSeparableFilterEngine_GPU( const Mat &rowKernel_, const Mat &columnKernel_, const int btype ) + { + bordertype = btype; + rowKernel = rowKernel_; + columnKernel = columnKernel_; + } + + virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1)) + { + normalizeROI(roi, Size(rowKernel.rows, columnKernel.rows), Point(-1,-1), src.size()); + + oclMat srcROI = src(roi); + oclMat dstROI = dst(roi); + + sepFilter2D_SinglePass(src, dst, rowKernel, columnKernel, bordertype); + } + + Mat rowKernel; + Mat columnKernel; + int bordertype; +}; +} + + static void GPUFilterBox(const oclMat &src, oclMat &dst, Size &ksize, const Point anchor, const int borderType) { @@ -1241,17 +1399,30 @@ Ptr cv::ocl::getLinearColumnFilter_GPU(int /*bufType*/, in } Ptr cv::ocl::createSeparableLinearFilter_GPU(int srcType, int dstType, - const Mat &rowKernel, const Mat &columnKernel, const Point &anchor, double delta, int bordertype) + const Mat &rowKernel, const Mat &columnKernel, const Point &anchor, double delta, int bordertype, Size imgSize ) { int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType); int cn = CV_MAT_CN(srcType); int bdepth = std::max(std::max(sdepth, ddepth), CV_32F); int bufType = CV_MAKETYPE(bdepth, cn); - Ptr rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, bordertype); - Ptr columnFilter = 
getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, bordertype, delta); + //if image size is non-degenerate and large enough + //and if filter support is reasonable to satisfy larger local memory requirements, + //then we can use single pass routine to avoid extra runtime calls overhead + if( rowKernel.rows <= 21 && columnKernel.rows <= 21 && + (rowKernel.rows & 1) == 1 && (columnKernel.rows & 1) == 1 && + imgSize.width > optimizedSepFilterLocalSize + (rowKernel.rows>>1) && + imgSize.height > optimizedSepFilterLocalSize + (columnKernel.rows>>1) ) + { + return Ptr(new SingleStepSeparableFilterEngine_GPU(rowKernel, columnKernel, bordertype)); + } + else + { + Ptr rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, bordertype); + Ptr columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, bordertype, delta); - return createSeparableFilter_GPU(rowFilter, columnFilter); + return createSeparableFilter_GPU(rowFilter, columnFilter); + } } void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernelX, const Mat &kernelY, Point anchor, double delta, int bordertype) @@ -1275,16 +1446,16 @@ void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels())); - Ptr f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype); + Ptr f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype, src.size()); f->apply(src, dst); } -Ptr cv::ocl::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int borderType) +Ptr cv::ocl::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int borderType, Size imgSize ) { Mat kx, ky; getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F); return createSeparableLinearFilter_GPU(srcType, dstType, - kx, ky, Point(-1, -1), 0, borderType); + kx, ky, Point(-1, -1), 0, borderType, imgSize); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1354,7 +1525,7 @@ void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, d //////////////////////////////////////////////////////////////////////////////////////////////////// // Gaussian Filter -Ptr cv::ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int bordertype) +Ptr cv::ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int bordertype, Size imgSize) { int depth = CV_MAT_DEPTH(type); @@ -1381,7 +1552,7 @@ Ptr cv::ocl::createGaussianFilter_GPU(int type, Size ksize, do else ky = getGaussianKernel(ksize.height, sigma2, std::max(depth, CV_32F)); - return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1, -1), 0.0, bordertype); + return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1, -1), 0.0, bordertype, imgSize); } void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double sigma1, double sigma2, int bordertype) @@ -1417,7 +1588,7 @@ void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double si dst.create(src.size(), src.type()); - Ptr f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype); + Ptr f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype, src.size()); f->apply(src, dst); } diff --git a/modules/ocl/src/opencl/filtering_sep_filter_singlepass.cl 
b/modules/ocl/src/opencl/filtering_sep_filter_singlepass.cl new file mode 100644 index 0000000000..c6555bff0f --- /dev/null +++ b/modules/ocl/src/opencl/filtering_sep_filter_singlepass.cl @@ -0,0 +1,185 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2013, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////Macro for border type//////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef BORDER_CONSTANT +//CCCCCC|abcdefgh|CCCCCCC +#define EXTRAPOLATE(x, maxV) +#elif defined BORDER_REPLICATE +//aaaaaa|abcdefgh|hhhhhhh +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = max(min((x), (maxV) - 1), 0); \ + } +#elif defined BORDER_WRAP +//cdefgh|abcdefgh|abcdefg +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = ( (x) + (maxV) ) % (maxV); \ + } +#elif defined BORDER_REFLECT +//fedcba|abcdefgh|hgfedcb +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \ + } +#elif defined BORDER_REFLECT_101 +//gfedcb|abcdefgh|gfedcba +#define EXTRAPOLATE(x, maxV) \ + { \ + (x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \ + } +#else +#error No extrapolation method +#endif + +#define SRC(_x,_y) CONVERT_SRCTYPE(((global SRCTYPE*)(Src+(_y)*SrcPitch))[_x]) + +#ifdef BORDER_CONSTANT +//CCCCCC|abcdefgh|CCCCCCC +#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? 
(const_v) : SRC((_x),(_y)) +#else +#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y)) +#endif + +#define DST(_x,_y) (((global DSTTYPE*)(Dst+DstOffset+(_y)*DstPitch))[_x]) + +//horizontal and vertical filter kernels +//should be defined on host during compile time to avoid overhead +__constant uint mat_kernelX[] = {KERNEL_MATRIX_X}; +__constant uint mat_kernelY[] = {KERNEL_MATRIX_Y}; + +__kernel __attribute__((reqd_work_group_size(BLK_X,BLK_Y,1))) void sep_filter_singlepass + ( + __global uchar* Src, + const uint SrcPitch, + const int srcOffsetX, + const int srcOffsetY, + __global uchar* Dst, + const int DstOffset, + const uint DstPitch, + int width, + int height, + int dstWidth, + int dstHeight + ) +{ + //RADIUSX, RADIUSY are filter dimensions + //BLK_X, BLK_Y are local wrogroup sizes + //all these should be defined on host during compile time + //first lsmem array for source pixels used in first pass, + //second lsmemDy for storing first pass results + __local WORKTYPE lsmem[BLK_Y+2*RADIUSY][BLK_X+2*RADIUSX]; + __local WORKTYPE lsmemDy[BLK_Y][BLK_X+2*RADIUSX]; + + //get local and global ids - used as image and local memory array indexes + int lix = get_local_id(0); + int liy = get_local_id(1); + + int x = (int)get_global_id(0); + int y = (int)get_global_id(1); + + //calculate pixel position in source image taking image offset into account + int srcX = x + srcOffsetX - RADIUSX; + int srcY = y + srcOffsetY - RADIUSY; + int xb = srcX; + int yb = srcY; + + //extrapolate coordinates, if needed + //and read my own source pixel into local memory + //with account for extra border pixels, which will be read by starting workitems + int clocY = liy; + int cSrcY = srcY; + do + { + int yb = cSrcY; + EXTRAPOLATE(yb, (height)); + + int clocX = lix; + int cSrcX = srcX; + do + { + int xb = cSrcX; + EXTRAPOLATE(xb,(width)); + lsmem[clocY][clocX] = ELEM(xb, yb, (width), (height), 0 ); + + clocX += BLK_X; + cSrcX += BLK_X; + } + while(clocX < BLK_X+(RADIUSX*2)); + + clocY += BLK_Y; + cSrcY += BLK_Y; + } + while(clocY < BLK_Y+(RADIUSY*2)); + barrier(CLK_LOCAL_MEM_FENCE); + + //do vertical filter pass + //and store intermediate results to second local memory array + int i; + WORKTYPE sum = 0.0f; + int clocX = lix; + do + { + sum = 0.0f; + for(i=0; i<=2*RADIUSY; i++) + sum = mad(lsmem[liy+i][clocX], as_float(mat_kernelY[i]), sum); + lsmemDy[liy][clocX] = sum; + clocX += BLK_X; + } + while(clocX < BLK_X+(RADIUSX*2)); + barrier(CLK_LOCAL_MEM_FENCE); + + //if this pixel happened to be out of image borders because of global size rounding, + //then just return + if( x >= dstWidth || y >=dstHeight ) return; + + //do second horizontal filter pass + //and calculate final result + sum = 0.0f; + for(i=0; i<=2*RADIUSX; i++) + sum = mad(lsmemDy[liy][lix+i], as_float(mat_kernelX[i]), sum); + + //store result into destination image + DST(x,y) = CONVERT_DSTTYPE(sum); +} From fffac2f0859dcd526c5fa2f8999b3477d5463a75 Mon Sep 17 00:00:00 2001 From: krodyush Date: Tue, 17 Dec 2013 14:12:33 +0400 Subject: [PATCH 11/28] optimize SURF by Inlining and customizing sampling functions to reduce memory traffic and compute Improve calcOrientation implementation. Using more efficient rounding routines. 
Removing unnecessary use of local memory --- modules/nonfree/src/opencl/surf.cl | 408 ++++++++++++++++------------- modules/nonfree/src/surf.ocl.cpp | 22 +- 2 files changed, 235 insertions(+), 195 deletions(-) diff --git a/modules/nonfree/src/opencl/surf.cl b/modules/nonfree/src/opencl/surf.cl index 02f77c224d..405e48f02c 100644 --- a/modules/nonfree/src/opencl/surf.cl +++ b/modules/nonfree/src/opencl/surf.cl @@ -12,6 +12,7 @@ // // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2013, Intel Corporation, all rights reserved. // Third party copyrights are property of their respective owners. // // @Authors @@ -66,8 +67,8 @@ uint read_sumTex(IMAGE_INT32 img, sampler_t sam, int2 coord, int rows, int cols, uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int cols, int elemPerRow) { #ifdef DISABLE_IMAGE2D - int x = clamp(convert_int_rte(coord.x), 0, cols - 1); - int y = clamp(convert_int_rte(coord.y), 0, rows - 1); + int x = clamp(round(coord.x), 0, cols - 1); + int y = clamp(round(coord.y), 0, rows - 1); return img[elemPerRow * y + x]; #else return (uchar)read_imageui(img, sam, coord).x; @@ -98,6 +99,7 @@ __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAM #define CV_PI_F 3.14159265f #endif + // Use integral image to calculate haar wavelets. // N = 2 // for simple haar paatern @@ -114,89 +116,10 @@ float icvCalcHaarPatternSum_2( F d = 0; - int2 dx1 = convert_int2_rte(ratio * src[0]); - int2 dy1 = convert_int2_rte(ratio * src[1]); - int2 dx2 = convert_int2_rte(ratio * src[2]); - int2 dy2 = convert_int2_rte(ratio * src[3]); - - F t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow ); - d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x)); - - t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow ); - d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y)); - - return (float)d; -} - -// N = 3 -float icvCalcHaarPatternSum_3( - IMAGE_INT32 sumTex, - __constant float4 *src, - int oldSize, - int newSize, - int y, int x, - int rows, int cols, int elemPerRow) -{ - - float ratio = (float)newSize / oldSize; - - F d = 0; - - int4 dx1 = convert_int4_rte(ratio * src[0]); - int4 dy1 = convert_int4_rte(ratio * src[1]); - int4 dx2 = convert_int4_rte(ratio * src[2]); - int4 dy2 = convert_int4_rte(ratio * src[3]); - - F t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow ); - d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x)); - - t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + 
dx1.y, y + dy1.y), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow ); - d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y)); - - t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy1.z), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy2.z), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy1.z), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy2.z), rows, cols, elemPerRow ); - d += t * src[4].z / ((dx2.z - dx1.z) * (dy2.z - dy1.z)); - - return (float)d; -} - -// N = 4 -float icvCalcHaarPatternSum_4( - IMAGE_INT32 sumTex, - __constant float4 *src, - int oldSize, - int newSize, - int y, int x, - int rows, int cols, int elemPerRow) -{ - - float ratio = (float)newSize / oldSize; - - F d = 0; - - int4 dx1 = convert_int4_rte(ratio * src[0]); - int4 dy1 = convert_int4_rte(ratio * src[1]); - int4 dx2 = convert_int4_rte(ratio * src[2]); - int4 dy2 = convert_int4_rte(ratio * src[3]); + int2 dx1 = convert_int2(round(ratio * src[0])); + int2 dy1 = convert_int2(round(ratio * src[1])); + int2 dx2 = convert_int2(round(ratio * src[2])); + int2 dy2 = convert_int2(round(ratio * src[3])); F t = 0; t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow ); @@ -212,30 +135,12 @@ float icvCalcHaarPatternSum_4( t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow ); d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y)); - t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy1.z), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy2.z), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy1.z), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy2.z), rows, cols, elemPerRow ); - d += t * src[4].z / ((dx2.z - dx1.z) * (dy2.z - dy1.z)); - - t = 0; - t += read_sumTex( sumTex, sampler, (int2)(x + dx1.w, y + dy1.w), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.w, y + dy2.w), rows, cols, elemPerRow ); - t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.w, y + dy1.w), rows, cols, elemPerRow ); - t += read_sumTex( sumTex, sampler, (int2)(x + dx2.w, y + dy2.w), rows, cols, elemPerRow ); - d += t * src[4].w / ((dx2.w - dx1.w) * (dy2.w - dy1.w)); - return (float)d; } //////////////////////////////////////////////////////////////////////// // Hessian -__constant float4 c_DX[5] = { (float4)(0, 3, 6, 0), (float4)(2, 2, 2, 0), (float4)(3, 6, 9, 0), (float4)(7, 7, 7, 0), (float4)(1, -2, 1, 0) }; -__constant float4 c_DY[5] = { (float4)(2, 2, 2, 0), (float4)(0, 3, 6, 0), (float4)(7, 7, 7, 0), (float4)(3, 6, 9, 0), (float4)(1, -2, 1, 0) }; -__constant float4 c_DXY[5] = { (float4)(1, 5, 1, 5), (float4)(1, 1, 5, 5), (float4)(4, 8, 4, 8), (float4)(4, 4, 8, 8), (float4)(1, -1, -1, 1) };// Use integral image to calculate haar wavelets. - __inline int calcSize(int octave, int layer) { /* Wavelet size at first layer of first octave. */ @@ -250,6 +155,24 @@ __inline int calcSize(int octave, int layer) return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave; } +// Calculate a derivative in an axis-aligned direction (x or y). 
The "plus1" +// boxes contribute 1 * (area), and the "minus2" box contributes -2 * (area). +// So the final computation is plus1a + plus1b - 2 * minus2. The corners are +// labeled A, B, C, and D, with A being the top left, B being top right, C +// being bottom left, and D being bottom right. +F calcAxisAlignedDerivative( + int plus1a_A, int plus1a_B, int plus1a_C, int plus1a_D, F plus1a_scale, + int plus1b_A, int plus1b_B, int plus1b_C, int plus1b_D, F plus1b_scale, + int minus2_A, int minus2_B, int minus2_C, int minus2_D, F minus2_scale) +{ + F plus1a = plus1a_A - plus1a_B - plus1a_C + plus1a_D; + F plus1b = plus1b_A - plus1b_B - plus1b_C + plus1b_D; + F minus2 = minus2_A - minus2_B - minus2_C + minus2_D; + + return (plus1a / plus1a_scale - + 2.0f * minus2 / minus2_scale + + plus1b / plus1b_scale); +} //calculate targeted layer per-pixel determinant and trace with an integral image __kernel void icvCalcLayerDetAndTrace( @@ -264,7 +187,7 @@ __kernel void icvCalcLayerDetAndTrace( int c_octave, int c_layer_rows, int sumTex_step -) + ) { det_step /= sizeof(*det); trace_step /= sizeof(*trace); @@ -288,16 +211,103 @@ __kernel void icvCalcLayerDetAndTrace( if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j) { - const float dx = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step); - const float dy = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step); - const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step); + int x = j << c_octave; + int y = i << c_octave; + + float ratio = (float)size / 9; + + // Precompute some commonly used values, which are used to offset + // texture coordinates in the integral image. + int r1 = round(ratio); + int r2 = round(ratio * 2.0f); + int r3 = round(ratio * 3.0f); + int r4 = round(ratio * 4.0f); + int r5 = round(ratio * 5.0f); + int r6 = round(ratio * 6.0f); + int r7 = round(ratio * 7.0f); + int r8 = round(ratio * 8.0f); + int r9 = round(ratio * 9.0f); + + // Calculate the approximated derivative in the x-direction + F d = 0; + { + // Some of the pixels needed to compute the derivative are + // repeated, so we only don't duplicate the fetch here. 
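For reference, the box arithmetic that calcAxisAlignedDerivative folds together is the standard integral-image identity: the sum over any axis-aligned rectangle comes from the four summed-area-table corners, with + at the top-left and bottom-right corners and - at the other two, and the response is plus1a/area_a - 2*minus2/area_m + plus1b/area_b. A minimal standalone C++ sketch, using a toy all-ones 4x4 image and a simplified three-lobe layout rather than the kernel's r2..r9 offsets (boxSum and the lobe coordinates are assumptions made for illustration, not part of the patch):

    #include <cstdio>

    // Box sum from a summed-area table S (stride = table width):
    // +S(top-left) - S(top-right) - S(bottom-left) + S(bottom-right),
    // with (x1,y1) inclusive and (x2,y2) exclusive corners.
    static float boxSum(const int* S, int stride, int x1, int y1, int x2, int y2)
    {
        return (float)(S[y1*stride + x1] - S[y1*stride + x2]
                     - S[y2*stride + x1] + S[y2*stride + x2]);
    }

    int main()
    {
        // 5x5 summed-area table of an all-ones 4x4 image: S(x,y) = x*y.
        int S[25];
        for (int y = 0; y < 5; ++y)
            for (int x = 0; x < 5; ++x)
                S[y*5 + x] = x * y;

        float plus1a = boxSum(S, 5, 0, 0, 1, 4);  // left lobe,   area 1*4
        float minus2 = boxSum(S, 5, 1, 0, 3, 4);  // middle lobe, area 2*4
        float plus1b = boxSum(S, 5, 3, 0, 4, 4);  // right lobe,  area 1*4

        float dxx = plus1a / 4.0f - 2.0f * minus2 / 8.0f + plus1b / 4.0f;
        printf("dxx = %f\n", dxx);  // 0 on a constant image, as expected
        return 0;
    }

A constant image yielding a zero response is a quick sanity check that the per-lobe area normalization and the +1/+1/-2 weighting are balanced.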
+ int t02 = read_sumTex( sumTex, sampler, (int2)(x, y + r2), c_img_rows, c_img_cols, sumTex_step ); + int t07 = read_sumTex( sumTex, sampler, (int2)(x, y + r7), c_img_rows, c_img_cols, sumTex_step ); + int t32 = read_sumTex( sumTex, sampler, (int2)(x + r3, y + r2), c_img_rows, c_img_cols, sumTex_step ); + int t37 = read_sumTex( sumTex, sampler, (int2)(x + r3, y + r7), c_img_rows, c_img_cols, sumTex_step ); + int t62 = read_sumTex( sumTex, sampler, (int2)(x + r6, y + r2), c_img_rows, c_img_cols, sumTex_step ); + int t67 = read_sumTex( sumTex, sampler, (int2)(x + r6, y + r7), c_img_rows, c_img_cols, sumTex_step ); + int t92 = read_sumTex( sumTex, sampler, (int2)(x + r9, y + r2), c_img_rows, c_img_cols, sumTex_step ); + int t97 = read_sumTex( sumTex, sampler, (int2)(x + r9, y + r7), c_img_rows, c_img_cols, sumTex_step ); + + d = calcAxisAlignedDerivative(t02, t07, t32, t37, (r3) * (r7 - r2), + t62, t67, t92, t97, (r9 - r6) * (r7 - r2), + t32, t37, t62, t67, (r6 - r3) * (r7 - r2)); + } + const float dx = (float)d; + + // Calculate the approximated derivative in the y-direction + d = 0; + { + // Some of the pixels needed to compute the derivative are + // repeated, so we only don't duplicate the fetch here. + int t20 = read_sumTex( sumTex, sampler, (int2)(x + r2, y), c_img_rows, c_img_cols, sumTex_step ); + int t23 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r3), c_img_rows, c_img_cols, sumTex_step ); + int t70 = read_sumTex( sumTex, sampler, (int2)(x + r7, y), c_img_rows, c_img_cols, sumTex_step ); + int t73 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r3), c_img_rows, c_img_cols, sumTex_step ); + int t26 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r6), c_img_rows, c_img_cols, sumTex_step ); + int t76 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r6), c_img_rows, c_img_cols, sumTex_step ); + int t29 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r9), c_img_rows, c_img_cols, sumTex_step ); + int t79 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r9), c_img_rows, c_img_cols, sumTex_step ); + + d = calcAxisAlignedDerivative(t20, t23, t70, t73, (r7 - r2) * (r3), + t26, t29, t76, t79, (r7 - r2) * (r9 - r6), + t23, t26, t73, t76, (r7 - r2) * (r6 - r3)); + } + const float dy = (float)d; + + // Calculate the approximated derivative in the xy-direction + d = 0; + { + // There's no saving us here, we just have to get all of the pixels in + // separate fetches + F t = 0; + t += read_sumTex( sumTex, sampler, (int2)(x + r1, y + r1), c_img_rows, c_img_cols, sumTex_step ); + t -= read_sumTex( sumTex, sampler, (int2)(x + r1, y + r4), c_img_rows, c_img_cols, sumTex_step ); + t -= read_sumTex( sumTex, sampler, (int2)(x + r4, y + r1), c_img_rows, c_img_cols, sumTex_step ); + t += read_sumTex( sumTex, sampler, (int2)(x + r4, y + r4), c_img_rows, c_img_cols, sumTex_step ); + d += t / ((r4 - r1) * (r4 - r1)); + + t = 0; + t += read_sumTex( sumTex, sampler, (int2)(x + r5, y + r1), c_img_rows, c_img_cols, sumTex_step ); + t -= read_sumTex( sumTex, sampler, (int2)(x + r5, y + r4), c_img_rows, c_img_cols, sumTex_step ); + t -= read_sumTex( sumTex, sampler, (int2)(x + r8, y + r1), c_img_rows, c_img_cols, sumTex_step ); + t += read_sumTex( sumTex, sampler, (int2)(x + r8, y + r4), c_img_rows, c_img_cols, sumTex_step ); + d -= t / ((r8 - r5) * (r4 - r1)); + + t = 0; + t += read_sumTex( sumTex, sampler, (int2)(x + r1, y + r5), c_img_rows, c_img_cols, sumTex_step ); + t -= read_sumTex( sumTex, sampler, (int2)(x + r1, y + r8), c_img_rows, c_img_cols, sumTex_step ); + t -= read_sumTex( sumTex, 
sampler, (int2)(x + r4, y + r5), c_img_rows, c_img_cols, sumTex_step ); + t += read_sumTex( sumTex, sampler, (int2)(x + r4, y + r8), c_img_rows, c_img_cols, sumTex_step ); + d -= t / ((r4 - r1) * (r8 - r5)); + + t = 0; + t += read_sumTex( sumTex, sampler, (int2)(x + r5, y + r5), c_img_rows, c_img_cols, sumTex_step ); + t -= read_sumTex( sumTex, sampler, (int2)(x + r5, y + r8), c_img_rows, c_img_cols, sumTex_step ); + t -= read_sumTex( sumTex, sampler, (int2)(x + r8, y + r5), c_img_rows, c_img_cols, sumTex_step ); + t += read_sumTex( sumTex, sampler, (int2)(x + r8, y + r8), c_img_rows, c_img_cols, sumTex_step ); + d += t / ((r8 - r5) * (r8 - r5)); + } + const float dxy = (float)d; det [j + margin + det_step * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy; trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy; } } - //////////////////////////////////////////////////////////////////////// // NONMAX @@ -309,10 +319,10 @@ bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int ro float d = 0; - int dx1 = convert_int_rte(ratio * c_DM[0]); - int dy1 = convert_int_rte(ratio * c_DM[1]); - int dx2 = convert_int_rte(ratio * c_DM[2]); - int dy2 = convert_int_rte(ratio * c_DM[3]); + int dx1 = round(ratio * c_DM[0]); + int dy1 = round(ratio * c_DM[1]); + int dx2 = round(ratio * c_DM[2]); + int dy2 = round(ratio * c_DM[3]); float t = 0; @@ -572,7 +582,7 @@ void icvFindMaximaInLayer( } // solve 3x3 linear system Ax=b for floating point input -inline bool solve3x3_float(volatile __local const float4 *A, volatile __local const float *b, volatile __local float *x) +inline bool solve3x3_float(const float4 *A, const float *b, float *x) { float det = A[0].x * (A[1].y * A[2].z - A[1].z * A[2].y) - A[0].y * (A[1].x * A[2].z - A[1].z * A[2].x) @@ -651,7 +661,7 @@ void icvInterpolateKeypoint( if (get_local_id(0) == 0 && get_local_id(1) == 0 && get_local_id(2) == 0) { - volatile __local float dD[3]; + float dD[3]; //dx dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]); @@ -660,7 +670,7 @@ void icvInterpolateKeypoint( //ds dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]); - volatile __local float4 H[3]; + float4 H[3]; //dxx H[0].x = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2]; @@ -681,7 +691,7 @@ void icvInterpolateKeypoint( //dss H[2].z = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1]; - volatile __local float x[3]; + float x[3]; if (solve3x3_float(H, dD, x)) { @@ -711,7 +721,7 @@ void icvInterpolateKeypoint( sampled in a circle of radius 6s using wavelets of size 4s. 
We ensure the gradient wavelet size is even to ensure the wavelet pattern is balanced and symmetric around its center */ - const int grad_wav_size = 2 * convert_int_rte(2.0f * s); + const int grad_wav_size = 2 * round(2.0f * s); // check when grad_wav_size is too big if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size) @@ -737,9 +747,12 @@ void icvInterpolateKeypoint( //////////////////////////////////////////////////////////////////////// // Orientation -#define ORI_SEARCH_INC 5 -#define ORI_WIN 60 -#define ORI_SAMPLES 113 +#define ORI_WIN 60 +#define ORI_SAMPLES 113 + +// The distance between samples in the beginning of the the reduction +#define ORI_RESPONSE_REDUCTION_WIDTH 48 +#define ORI_RESPONSE_ARRAY_SIZE (ORI_RESPONSE_REDUCTION_WIDTH * 2) __constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6}; __constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0}; @@ -833,12 +846,15 @@ void icvCalcOrientation( __global float* featureDir = keypoints + ANGLE_ROW * keypoints_step; - volatile __local float s_X[128]; - volatile __local float s_Y[128]; - volatile __local float s_angle[128]; + __local float s_X[ORI_SAMPLES]; + __local float s_Y[ORI_SAMPLES]; + __local float s_angle[ORI_SAMPLES]; - volatile __local float s_sumx[32 * 4]; - volatile __local float s_sumy[32 * 4]; + // Need to allocate enough to make the reduction work without accessing + // past the end of the array. + __local float s_sumx[ORI_RESPONSE_ARRAY_SIZE]; + __local float s_sumy[ORI_RESPONSE_ARRAY_SIZE]; + __local float s_mod[ORI_RESPONSE_ARRAY_SIZE]; /* The sampling intervals and wavelet sized for selecting an orientation and building the keypoint descriptor are defined relative to 's' */ @@ -849,28 +865,60 @@ void icvCalcOrientation( sampled in a circle of radius 6s using wavelets of size 4s. We ensure the gradient wavelet size is even to ensure the wavelet pattern is balanced and symmetric around its center */ - const int grad_wav_size = 2 * convert_int_rte(2.0f * s); + const int grad_wav_size = 2 * round(2.0f * s); // check when grad_wav_size is too big if ((c_img_rows + 1) < grad_wav_size || (c_img_cols + 1) < grad_wav_size) return; // Calc X, Y, angle and store it to shared memory - const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0); + const int tid = get_local_id(0); + // Initialize values that are only used as part of the reduction later. 
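The guarded initialization just below pads the response arrays: each of the ORI_LOCAL_SIZE (= 360 / ORI_SEARCH_INC = 72) work items accumulates one response vector for its candidate direction, and the arrays are sized ORI_RESPONSE_ARRAY_SIZE (= 96) so the halving schedule 48 -> 24 -> 12 -> 6 -> 3 never indexes past the end; the zeroed tail never outranks a real response. A serial host-side model of that reduction, with made-up response magnitudes (this sketch tracks only the s_mod-style magnitudes and is not part of the patch):

    #include <cstdio>

    int main()
    {
        const int ORI_LOCAL_SIZE = 72;                     // 360 / ORI_SEARCH_INC
        const int ORI_RESPONSE_REDUCTION_WIDTH = 48;
        const int N = ORI_RESPONSE_REDUCTION_WIDTH * 2;    // ORI_RESPONSE_ARRAY_SIZE

        float mod[N];
        for (int i = 0; i < ORI_LOCAL_SIZE; ++i)
            mod[i] = (float)((i * 37) % 71);               // fake response magnitudes
        for (int i = ORI_LOCAL_SIZE; i < N; ++i)
            mod[i] = 0.0f;                                 // padding, as in the kernel's guarded write

        // Same schedule as the kernel: every "thread" tid < t keeps the larger of
        // the pair (tid, tid + t); the barrier between steps is implicit here.
        for (int t = ORI_RESPONSE_REDUCTION_WIDTH; t >= 3; t /= 2)
            for (int tid = 0; tid < t; ++tid)
                if (mod[tid] < mod[tid + t])
                    mod[tid] = mod[tid + t];

        // Final serial pick over the three surviving candidates.
        int best = 0;
        if (mod[1] > mod[best]) best = 1;
        if (mod[2] > mod[best]) best = 2;
        printf("best slot = %d, magnitude = %.1f\n", best, mod[best]);
        return 0;
    }

The kernel carries s_sumx and s_sumy through the same steps so the winning (sumx, sumy) pair, not just its magnitude, is available for the final atan2.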
+ if (tid < ORI_RESPONSE_ARRAY_SIZE - ORI_LOCAL_SIZE) { + s_mod[tid + ORI_LOCAL_SIZE] = 0.0f; + } - float X = 0.0f, Y = 0.0f, angle = 0.0f; + float ratio = (float)grad_wav_size / 4; - if (tid < ORI_SAMPLES) + int r2 = round(ratio * 2.0); + int r4 = round(ratio * 4.0); + for (int i = tid; i < ORI_SAMPLES; i += ORI_LOCAL_SIZE ) { + float X = 0.0f, Y = 0.0f, angle = 0.0f; const float margin = (float)(grad_wav_size - 1) / 2.0f; - const int x = convert_int_rte(featureX[get_group_id(0)] + c_aptX[tid] * s - margin); - const int y = convert_int_rte(featureY[get_group_id(0)] + c_aptY[tid] * s - margin); + const int x = round(featureX[get_group_id(0)] + c_aptX[i] * s - margin); + const int y = round(featureY[get_group_id(0)] + c_aptY[i] * s - margin); if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size && - x >= 0 && x < (c_img_cols + 1) - grad_wav_size) + x >= 0 && x < (c_img_cols + 1) - grad_wav_size) { - X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step); - Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step); + + float apt = c_aptW[i]; + + // Compute the haar sum without fetching duplicate pixels. + float t00 = read_sumTex( sumTex, sampler, (int2)(x, y), c_img_rows, c_img_cols, sum_step); + float t02 = read_sumTex( sumTex, sampler, (int2)(x, y + r2), c_img_rows, c_img_cols, sum_step); + float t04 = read_sumTex( sumTex, sampler, (int2)(x, y + r4), c_img_rows, c_img_cols, sum_step); + float t20 = read_sumTex( sumTex, sampler, (int2)(x + r2, y), c_img_rows, c_img_cols, sum_step); + float t24 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r4), c_img_rows, c_img_cols, sum_step); + float t40 = read_sumTex( sumTex, sampler, (int2)(x + r4, y), c_img_rows, c_img_cols, sum_step); + float t42 = read_sumTex( sumTex, sampler, (int2)(x + r4, y + r2), c_img_rows, c_img_cols, sum_step); + float t44 = read_sumTex( sumTex, sampler, (int2)(x + r4, y + r4), c_img_rows, c_img_cols, sum_step); + + F t = t00 - t04 - t20 + t24; + X -= t / ((r2) * (r4)); + + t = t20 - t24 - t40 + t44; + X += t / ((r4 - r2) * (r4)); + + t = t00 - t02 - t40 + t42; + Y += t / ((r2) * (r4)); + + t = t02 - t04 - t42 + t44; + Y -= t / ((r4) * (r4 - r2)); + + X = apt*X; + Y = apt*Y; angle = atan2(Y, X); @@ -879,76 +927,61 @@ void icvCalcOrientation( angle *= 180.0f / CV_PI_F; } + + s_X[i] = X; + s_Y[i] = Y; + s_angle[i] = angle; } - s_X[tid] = X; - s_Y[tid] = Y; - s_angle[tid] = angle; barrier(CLK_LOCAL_MEM_FENCE); float bestx = 0, besty = 0, best_mod = 0; + float sumx = 0.0f, sumy = 0.0f; + const int dir = tid * ORI_SEARCH_INC; + #pragma unroll + for (int i = 0; i < ORI_SAMPLES; ++i) { + int angle = round(s_angle[i]); -#pragma unroll - for (int i = 0; i < 18; ++i) - { - const int dir = (i * 4 + get_local_id(1)) * ORI_SEARCH_INC; - - volatile float sumx = 0.0f, sumy = 0.0f; - int d = abs(convert_int_rte(s_angle[get_local_id(0)]) - dir); + int d = abs(angle - dir); if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2) { - sumx = s_X[get_local_id(0)]; - sumy = s_Y[get_local_id(0)]; + sumx += s_X[i]; + sumy += s_Y[i]; } - d = abs(convert_int_rte(s_angle[get_local_id(0) + 32]) - dir); - if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2) - { - sumx += s_X[get_local_id(0) + 32]; - sumy += s_Y[get_local_id(0) + 32]; - } - d = abs(convert_int_rte(s_angle[get_local_id(0) + 64]) - dir); - if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2) - { - sumx += s_X[get_local_id(0) + 64]; - sumy += s_Y[get_local_id(0) + 64]; - } - d = 
abs(convert_int_rte(s_angle[get_local_id(0) + 96]) - dir); - if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2) - { - sumx += s_X[get_local_id(0) + 96]; - sumy += s_Y[get_local_id(0) + 96]; - } - reduce_32_sum(s_sumx + get_local_id(1) * 32, &sumx, get_local_id(0)); - reduce_32_sum(s_sumy + get_local_id(1) * 32, &sumy, get_local_id(0)); + } + s_sumx[tid] = sumx; + s_sumy[tid] = sumy; + s_mod[tid] = sumx*sumx + sumy*sumy; + barrier(CLK_LOCAL_MEM_FENCE); - const float temp_mod = sumx * sumx + sumy * sumy; - if (temp_mod > best_mod) - { - best_mod = temp_mod; - bestx = sumx; - besty = sumy; + // This reduction searches for the longest wavelet response vector. The first + // step uses all of the work items in the workgroup to narrow the search + // down to the three candidates. It requires s_mod to have a few more + // elements alocated past the work-group size, which are pre-initialized to + // 0.0f above. + for(int t = ORI_RESPONSE_REDUCTION_WIDTH; t >= 3; t /= 2) { + if (tid < t) { + if (s_mod[tid] < s_mod[tid + t]) { + s_mod[tid] = s_mod[tid + t]; + s_sumx[tid] = s_sumx[tid + t]; + s_sumy[tid] = s_sumy[tid + t]; + } } barrier(CLK_LOCAL_MEM_FENCE); } - if (get_local_id(0) == 0) - { - s_X[get_local_id(1)] = bestx; - s_Y[get_local_id(1)] = besty; - s_angle[get_local_id(1)] = best_mod; - } - barrier(CLK_LOCAL_MEM_FENCE); - if (get_local_id(1) == 0 && get_local_id(0) == 0) + // Do the final reduction and write out the result. + if (tid == 0) { int bestIdx = 0; - if (s_angle[1] > s_angle[bestIdx]) + // The loop above narrowed the search of the longest vector to three + // possibilities. Pick the best here. + if (s_mod[1] > s_mod[bestIdx]) bestIdx = 1; - if (s_angle[2] > s_angle[bestIdx]) + if (s_mod[2] > s_mod[bestIdx]) bestIdx = 2; - if (s_angle[3] > s_angle[bestIdx]) - bestIdx = 3; - float kp_dir = atan2(s_Y[bestIdx], s_X[bestIdx]); + float kp_dir = atan2(s_sumy[bestIdx], s_sumx[bestIdx]); if (kp_dir < 0) kp_dir += 2.0f * CV_PI_F; kp_dir *= 180.0f / CV_PI_F; @@ -961,7 +994,6 @@ void icvCalcOrientation( } } - __kernel void icvSetUpright( __global float * keypoints, @@ -1035,8 +1067,8 @@ inline float linearFilter( float out = 0.0f; - const int x1 = convert_int_rtn(x); - const int y1 = convert_int_rtn(y); + const int x1 = round(x); + const int y1 = round(y); const int x2 = x1 + 1; const int y2 = y1 + 1; diff --git a/modules/nonfree/src/surf.ocl.cpp b/modules/nonfree/src/surf.ocl.cpp index c79c4b2e67..293fd84b56 100644 --- a/modules/nonfree/src/surf.ocl.cpp +++ b/modules/nonfree/src/surf.ocl.cpp @@ -46,6 +46,7 @@ #ifdef HAVE_OPENCV_OCL #include +#include #include "opencl_kernels.hpp" using namespace cv; @@ -55,18 +56,25 @@ namespace cv { namespace ocl { + // The number of degrees between orientation samples in calcOrientation + const static int ORI_SEARCH_INC = 5; + // The local size of the calcOrientation kernel + const static int ORI_LOCAL_SIZE = (360 / ORI_SEARCH_INC); + static void openCLExecuteKernelSURF(Context *clCxt, const cv::ocl::ProgramEntry* source, string kernelName, size_t globalThreads[3], size_t localThreads[3], std::vector< std::pair > &args, int channels, int depth) { - char optBuf [100] = {0}; - char * optBufPtr = optBuf; + std::stringstream optsStr; + optsStr << "-D ORI_LOCAL_SIZE=" << ORI_LOCAL_SIZE << " "; + optsStr << "-D ORI_SEARCH_INC=" << ORI_SEARCH_INC << " "; cl_kernel kernel; - kernel = openCLGetKernelFromSource(clCxt, source, kernelName, optBufPtr); + kernel = openCLGetKernelFromSource(clCxt, source, kernelName, optsStr.str().c_str()); size_t wave_size = 
queryWaveFrontSize(kernel); CV_Assert(clReleaseKernel(kernel) == CL_SUCCESS); - sprintf(optBufPtr, "-D WAVE_SIZE=%d", static_cast(wave_size)); - openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, optBufPtr); + optsStr << "-D WAVE_SIZE=" << wave_size; + openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, optsStr.str().c_str()); } + } } @@ -594,8 +602,8 @@ void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeat args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols)); args.push_back( std::make_pair( sizeof(cl_int), (void *)&surf_.sum.step)); - size_t localThreads[3] = {32, 4, 1}; - size_t globalThreads[3] = {nFeatures *localThreads[0], localThreads[1], 1}; + size_t localThreads[3] = {ORI_LOCAL_SIZE, 1, 1}; + size_t globalThreads[3] = {nFeatures * localThreads[0], 1, 1}; openCLExecuteKernelSURF(clCxt, &surf, kernelName, globalThreads, localThreads, args, -1, -1); } From cc08e00876b91ad626b60d88bee5aa3441ed546b Mon Sep 17 00:00:00 2001 From: Vladimir Bystricky Date: Tue, 17 Dec 2013 16:13:55 +0400 Subject: [PATCH 12/28] Fix notes about cmake files. Fix build warning. --- cmake/OpenCVFindIntelPerCSDK.cmake | 55 ++++--------------- cmake/OpenCVFindLibsVideo.cmake | 1 - cmake/templates/cvconfig.h.in | 6 +- doc/user_guide/ug_intelperc.rst | 7 +-- modules/highgui/CMakeLists.txt | 2 +- .../include/opencv2/highgui/highgui_c.h | 10 ++-- modules/highgui/src/cap_intelperc.cpp | 44 +++++++-------- samples/cpp/intelperc_capture.cpp | 18 +++--- 8 files changed, 54 insertions(+), 89 deletions(-) diff --git a/cmake/OpenCVFindIntelPerCSDK.cmake b/cmake/OpenCVFindIntelPerCSDK.cmake index 2d45c6e227..7243105601 100644 --- a/cmake/OpenCVFindIntelPerCSDK.cmake +++ b/cmake/OpenCVFindIntelPerCSDK.cmake @@ -1,51 +1,20 @@ # Main variables: -# INTELPERC_LIBRARY and INTELPERC_INCLUDES to link Intel Perceptial Computing SDK modules +# INTELPERC_LIBRARIES and INTELPERC_INCLUDE to link Intel Perceptial Computing SDK modules # HAVE_INTELPERC for conditional compilation OpenCV with/without Intel Perceptial Computing SDK -if(NOT "${INTELPERC_LIB_DIR}" STREQUAL "${INTELPERC_LIB_DIR_INTERNAL}") - unset(INTELPERC_LIBRARY CACHE) - unset(INTELPERC_LIB_DIR CACHE) -endif() - -if(NOT "${INTELPERC_INCLUDE_DIR}" STREQUAL "${INTELPERC_INCLUDE_DIR_INTERNAL}") - unset(INTELPERC_INCLUDES CACHE) - unset(INTELPERC_INCLUDE_DIR CACHE) -endif() - -if(WIN32) - if(NOT (MSVC64 OR MINGW64)) - find_file(INTELPERC_INCLUDES "pxcsession.h" PATHS "$ENV{PCSDK_DIR}include" DOC "Intel Perceptual Computing SDK interface header") - find_library(INTELPERC_LIBRARY "libpxc.lib" PATHS "$ENV{PCSDK_DIR}lib/Win32" DOC "Intel Perceptual Computing SDK library") - else() - find_file(INTELPERC_INCLUDES "pxcsession.h" PATHS "$ENV{PCSDK_DIR}include" DOC "Intel Perceptual Computing SDK interface header") - find_library(INTELPERC_LIBRARY "libpxc.lib" PATHS "$ENV{PCSDK_DIR}/lib/x64" DOC "Intel Perceptual Computing SDK library") - endif() -endif() - -if(INTELPERC_LIBRARY AND INTELPERC_INCLUDES) - set(HAVE_INTELPERC TRUE) -endif() #if(INTELPERC_LIBRARY AND INTELPERC_INCLUDES) - -get_filename_component(INTELPERC_LIB_DIR "${INTELPERC_LIBRARY}" PATH) -get_filename_component(INTELPERC_INCLUDE_DIR "${INTELPERC_INCLUDES}" PATH) - -if(HAVE_INTELPERC) - set(INTELPERC_LIB_DIR "${INTELPERC_LIB_DIR}" CACHE PATH "Path to Intel Perceptual Computing SDK interface libraries" FORCE) - set(INTELPERC_INCLUDE_DIR "${INTELPERC_INCLUDE_DIR}" CACHE PATH "Path to Intel 
Perceptual Computing SDK interface headers" FORCE) -endif() - -if(INTELPERC_LIBRARY) - set(INTELPERC_LIB_DIR_INTERNAL "${INTELPERC_LIB_DIR}" CACHE INTERNAL "This is the value of the last time INTELPERC_LIB_DIR was set successfully." FORCE) +if(X86_64) + find_path(INTELPERC_INCLUDE_DIR "pxcsession.h" PATHS "$ENV{PCSDK_DIR}include" DOC "Path to Intel Perceptual Computing SDK interface headers") + find_file(INTELPERC_LIBRARIES "libpxc.lib" PATHS "$ENV{PCSDK_DIR}lib/x64" DOC "Path to Intel Perceptual Computing SDK interface libraries") else() - message( WARNING, " Intel Perceptual Computing SDK library directory (set by INTELPERC_LIB_DIR variable) is not found or does not have Intel Perceptual Computing SDK libraries." ) + find_path(INTELPERC_INCLUDE_DIR "pxcsession.h" PATHS "$ENV{PCSDK_DIR}include" DOC "Path to Intel Perceptual Computing SDK interface headers") + find_file(INTELPERC_LIBRARIES "libpxc.lib" PATHS "$ENV{PCSDK_DIR}lib/Win32" DOC "Path to Intel Perceptual Computing SDK interface libraries") endif() -if(INTELPERC_INCLUDES) - set(INTELPERC_INCLUDE_DIR_INTERNAL "${INTELPERC_INCLUDE_DIR}" CACHE INTERNAL "This is the value of the last time INTELPERC_INCLUDE_DIR was set successfully." FORCE) +if(INTELPERC_INCLUDE_DIR AND INTELPERC_LIBRARIES) + set(HAVE_INTELPERC TRUE) else() - message( WARNING, " Intel Perceptual Computing SDK include directory (set by INTELPERC_INCLUDE_DIR variable) is not found or does not have Intel Perceptual Computing SDK include files." ) -endif() - -mark_as_advanced(FORCE INTELPERC_LIBRARY) -mark_as_advanced(FORCE INTELPERC_INCLUDES) + set(HAVE_INTELPERC FALSE) + message(WARNING "Intel Perceptual Computing SDK library directory (set by INTELPERC_LIB_DIR variable) is not found or does not have Intel Perceptual Computing SDK libraries.") +endif() #if(INTELPERC_INCLUDE_DIR AND INTELPERC_LIBRARIES) +mark_as_advanced(FORCE INTELPERC_LIBRARIES INTELPERC_INCLUDE_DIR) \ No newline at end of file diff --git a/cmake/OpenCVFindLibsVideo.cmake b/cmake/OpenCVFindLibsVideo.cmake index 22b58f5ef1..a5075b57f7 100644 --- a/cmake/OpenCVFindLibsVideo.cmake +++ b/cmake/OpenCVFindLibsVideo.cmake @@ -252,7 +252,6 @@ if (NOT IOS) endif() # --- Intel Perceptual Computing SSDK --- -ocv_clear_vars(HAVE_INTELPERC) if(WITH_INTELPERC) include("${OpenCV_SOURCE_DIR}/cmake/OpenCVFindIntelPerCSDK.cmake") endif(WITH_INTELPERC) diff --git a/cmake/templates/cvconfig.h.in b/cmake/templates/cvconfig.h.in index f52c5e457c..a6cee63684 100644 --- a/cmake/templates/cvconfig.h.in +++ b/cmake/templates/cvconfig.h.in @@ -85,6 +85,9 @@ /* Apple ImageIO Framework */ #cmakedefine HAVE_IMAGEIO +/* Intel Perceptual Computing SDK library */ +#cmakedefine HAVE_INTELPERC + /* Intel Integrated Performance Primitives */ #cmakedefine HAVE_IPP @@ -158,9 +161,6 @@ /* Xine video library */ #cmakedefine HAVE_XINE -/* Intel Perceptual Computing SDK library */ -#cmakedefine HAVE_INTELPERC - /* Define to 1 if your processor stores words with the most significant byte first (like Motorola and SPARC, unlike Intel and VAX). */ #cmakedefine WORDS_BIGENDIAN diff --git a/doc/user_guide/ug_intelperc.rst b/doc/user_guide/ug_intelperc.rst index d00a2f9009..71a7c5d90e 100644 --- a/doc/user_guide/ug_intelperc.rst +++ b/doc/user_guide/ug_intelperc.rst @@ -12,7 +12,7 @@ Depth sensors compatible with Intel Perceptual Computing SDK are supported throu In order to use depth sensor with OpenCV you should do the following preliminary steps: #. - Install Intel Perceptual Computing SDK (from here http://www.intel.com/software/perceptual). 
+ Install Intel Perceptual Computing SDK (from here http://www.intel.com/software/perceptual). #. Configure OpenCV with Intel Perceptual Computing SDK support by setting ``WITH_INTELPERC`` flag in CMake. If Intel Perceptual Computing SDK is found in install folders OpenCV will be built with Intel Perceptual Computing SDK library (see a status ``INTELPERC`` in CMake log). If CMake process doesn't find Intel Perceptual Computing SDK installation folder automatically, the user should change corresponding CMake variables ``INTELPERC_LIB_DIR`` and ``INTELPERC_INCLUDE_DIR`` to the proper value. @@ -56,7 +56,7 @@ For getting several data maps use ``VideoCapture::grab`` and ``VideoCapture::ret capture.retrieve( depthMap, CV_CAP_INTELPERC_DEPTH_MAP ); capture.retrieve( image, CV_CAP_INTELPERC_IMAGE ); - capture.retrieve( irImage, CV_CAP_INTELPERC_IR_MAP); + capture.retrieve( irImage, CV_CAP_INTELPERC_IR_MAP); if( waitKey( 30 ) >= 0 ) break; @@ -76,5 +76,4 @@ Since two types of sensor's data generators are supported (image generator and d For more information please refer to the example of usage intelperc_capture.cpp_ in ``opencv/samples/cpp`` folder. -.. _intelperc_capture.cpp: https://github.com/Itseez/opencv/tree/master/samples/cpp/intelperc_capture.cpp - +.. _intelperc_capture.cpp: https://github.com/Itseez/opencv/tree/master/samples/cpp/intelperc_capture.cpp \ No newline at end of file diff --git a/modules/highgui/CMakeLists.txt b/modules/highgui/CMakeLists.txt index 5c86a2fcd1..fd2eec6a1e 100644 --- a/modules/highgui/CMakeLists.txt +++ b/modules/highgui/CMakeLists.txt @@ -221,7 +221,7 @@ endif() if(HAVE_INTELPERC) list(APPEND highgui_srcs src/cap_intelperc.cpp) ocv_include_directories(${INTELPERC_INCLUDE_DIR}) - list(APPEND HIGHGUI_LIBRARIES ${INTELPERC_LIBRARY}) + list(APPEND HIGHGUI_LIBRARIES ${INTELPERC_LIBRARIES}) endif(HAVE_INTELPERC) if(IOS) diff --git a/modules/highgui/include/opencv2/highgui/highgui_c.h b/modules/highgui/include/opencv2/highgui/highgui_c.h index 862fa053a6..8a59197594 100644 --- a/modules/highgui/include/opencv2/highgui/highgui_c.h +++ b/modules/highgui/include/opencv2/highgui/highgui_c.h @@ -469,10 +469,10 @@ enum CV_CAP_PROP_GIGA_FRAME_WIDTH_MAX = 10003, CV_CAP_PROP_GIGA_FRAME_HEIGH_MAX = 10004, CV_CAP_PROP_GIGA_FRAME_SENS_WIDTH = 10005, - CV_CAP_PROP_GIGA_FRAME_SENS_HEIGH = 10006 + CV_CAP_PROP_GIGA_FRAME_SENS_HEIGH = 10006, - ,CV_CAP_PROP_INTELPERC_PROFILE_COUNT = 11001, - CV_CAP_PROP_INTELPERC_PROFILE_IDX = 11002, + CV_CAP_PROP_INTELPERC_PROFILE_COUNT = 11001, + CV_CAP_PROP_INTELPERC_PROFILE_IDX = 11002, CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE = 11003, CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE = 11004, CV_CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD = 11005, @@ -480,8 +480,8 @@ enum CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT = 11007, // Intel PerC streams - CV_CAP_INTELPERC_DEPTH_GENERATOR = 1 << 31, - CV_CAP_INTELPERC_IMAGE_GENERATOR = 1 << 30, + CV_CAP_INTELPERC_DEPTH_GENERATOR = 1 << 29, + CV_CAP_INTELPERC_IMAGE_GENERATOR = 1 << 28, CV_CAP_INTELPERC_GENERATORS_MASK = CV_CAP_INTELPERC_DEPTH_GENERATOR + CV_CAP_INTELPERC_IMAGE_GENERATOR, }; diff --git a/modules/highgui/src/cap_intelperc.cpp b/modules/highgui/src/cap_intelperc.cpp index 910a6f748a..18b3b9d0c0 100644 --- a/modules/highgui/src/cap_intelperc.cpp +++ b/modules/highgui/src/cap_intelperc.cpp @@ -63,7 +63,7 @@ public: } int getProfileIDX() const { - return m_profileIdx; + return m_profileIdx; } public: virtual bool initStream(PXCSession *session) = 0; @@ -132,29 +132,29 @@ protected: return 
false; pxcStatus sts = PXC_STATUS_NO_ERROR; - PXCSession::ImplDesc templat; - memset(&templat,0,sizeof(templat)); - templat.group = PXCSession::IMPL_GROUP_SENSOR; - templat.subgroup= PXCSession::IMPL_SUBGROUP_VIDEO_CAPTURE; + PXCSession::ImplDesc templat; + memset(&templat,0,sizeof(templat)); + templat.group = PXCSession::IMPL_GROUP_SENSOR; + templat.subgroup= PXCSession::IMPL_SUBGROUP_VIDEO_CAPTURE; - for (int modidx = 0; PXC_STATUS_NO_ERROR <= sts; modidx++) + for (int modidx = 0; PXC_STATUS_NO_ERROR <= sts; modidx++) { PXCSession::ImplDesc desc; sts = session->QueryImpl(&templat, modidx, &desc); - if (PXC_STATUS_NO_ERROR > sts) + if (PXC_STATUS_NO_ERROR > sts) break; - + PXCSmartPtr capture; sts = session->CreateImpl(&desc, &capture); - if (!capture.IsValid()) + if (!capture.IsValid()) continue; - + /* enumerate devices */ - for (int devidx = 0; PXC_STATUS_NO_ERROR <= sts; devidx++) + for (int devidx = 0; PXC_STATUS_NO_ERROR <= sts; devidx++) { PXCSmartPtr device; sts = capture->CreateDevice(devidx, &device); - if (PXC_STATUS_NO_ERROR <= sts) + if (PXC_STATUS_NO_ERROR <= sts) { m_device = device.ReleasePtr(); return true; @@ -172,19 +172,19 @@ protected: pxcStatus sts = PXC_STATUS_NO_ERROR; /* enumerate streams */ - for (int streamidx = 0; PXC_STATUS_NO_ERROR <= sts; streamidx++) + for (int streamidx = 0; PXC_STATUS_NO_ERROR <= sts; streamidx++) { PXCCapture::Device::StreamInfo sinfo; sts = m_device->QueryStream(streamidx, &sinfo); - if (PXC_STATUS_NO_ERROR > sts) + if (PXC_STATUS_NO_ERROR > sts) break; - if (PXCCapture::VideoStream::CUID != sinfo.cuid) + if (PXCCapture::VideoStream::CUID != sinfo.cuid) continue; - if (type != sinfo.imageType) + if (type != sinfo.imageType) continue; - + sts = m_device->CreateStream(streamidx, &m_stream); - if (PXC_STATUS_NO_ERROR == sts) + if (PXC_STATUS_NO_ERROR == sts) break; m_stream.ReleaseRef(); } @@ -206,7 +206,7 @@ protected: if (!m_stream.IsValid()) return; pxcStatus sts = PXC_STATUS_NO_ERROR; - for (int profidx = 0; PXC_STATUS_NO_ERROR <= sts; profidx++) + for (int profidx = 0; PXC_STATUS_NO_ERROR <= sts; profidx++) { PXCCapture::VideoStream::ProfileInfo pinfo; sts = m_stream->QueryProfile(profidx, &pinfo); @@ -422,7 +422,7 @@ protected: return false; PXCImage::ImageInfo info; pxcImage->QueryInfo(&info); - + PXCImage::ImageData data; pxcImage->AcquireAccess(PXCImage::ACCESS_READ, PXCImage::COLOR_FORMAT_RGB24, &data); @@ -574,7 +574,7 @@ protected: return false; PXCImage::ImageInfo info; pxcImage->QueryInfo(&info); - + PXCImage::ImageData data; pxcImage->AcquireAccess(PXCImage::ACCESS_READ, &data); @@ -610,7 +610,7 @@ public: : m_contextOpened(false) { pxcStatus sts = PXCSession_Create(&m_session); - if (PXC_STATUS_NO_ERROR > sts) + if (PXC_STATUS_NO_ERROR > sts) return; m_contextOpened = m_imageStream.initStream(m_session); m_contextOpened &= m_depthStream.initStream(m_session); diff --git a/samples/cpp/intelperc_capture.cpp b/samples/cpp/intelperc_capture.cpp index 30471c3471..be032dead9 100644 --- a/samples/cpp/intelperc_capture.cpp +++ b/samples/cpp/intelperc_capture.cpp @@ -3,7 +3,6 @@ #include #include "opencv2/highgui/highgui.hpp" -//#include "opencv2/imgproc/imgproc.hpp" #include @@ -122,11 +121,11 @@ static void printStreamProperties(VideoCapture &capture) { capture.set(CV_CAP_INTELPERC_IMAGE_GENERATOR | CV_CAP_PROP_INTELPERC_PROFILE_IDX, (double)i); cout << " Profile[" << i << "]: "; - cout << "width = " << + cout << "width = " << (int)capture.get(CV_CAP_INTELPERC_IMAGE_GENERATOR | CV_CAP_PROP_FRAME_WIDTH); - cout << ", height = " << 
+ cout << ", height = " << (int)capture.get(CV_CAP_INTELPERC_IMAGE_GENERATOR | CV_CAP_PROP_FRAME_HEIGHT); - cout << ", fps = " << + cout << ", fps = " << capture.get(CV_CAP_INTELPERC_IMAGE_GENERATOR | CV_CAP_PROP_FPS); cout << endl; } @@ -143,11 +142,11 @@ static void printStreamProperties(VideoCapture &capture) { capture.set(CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_INTELPERC_PROFILE_IDX, (double)i); cout << " Profile[" << i << "]: "; - cout << "width = " << + cout << "width = " << (int)capture.get(CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_FRAME_WIDTH); - cout << ", height = " << + cout << ", height = " << (int)capture.get(CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_FRAME_HEIGHT); - cout << ", fps = " << + cout << ", fps = " << capture.get(CV_CAP_INTELPERC_DEPTH_GENERATOR | CV_CAP_PROP_FPS); cout << endl; } @@ -353,7 +352,7 @@ int _tmain(int argc, char* argv[]) { if (g_closedDepthPoint) { - double minVal = 0.0; double maxVal = 0.0; + double minVal = 0.0; double maxVal = 0.0; minMaxIdx(depthImage, &minVal, &maxVal, g_closedDepthPoint); } imshowDepth("depth image", depthImage, capture); @@ -375,5 +374,4 @@ int _tmain(int argc, char* argv[]) } return 0; -} - +} \ No newline at end of file From ea0c9b7f5c6fec72b46cf82f92cf303c0f3a20d8 Mon Sep 17 00:00:00 2001 From: krodyush Date: Tue, 17 Dec 2013 17:12:57 +0400 Subject: [PATCH 13/28] GFFT fix for linux build --- modules/ocl/src/gftt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ocl/src/gftt.cpp b/modules/ocl/src/gftt.cpp index 658e1a912a..a82196d78f 100644 --- a/modules/ocl/src/gftt.cpp +++ b/modules/ocl/src/gftt.cpp @@ -101,7 +101,7 @@ static void sortCorners_caller(oclMat& corners, const int count) } // find corners on matrix and put it into array -void findCorners_caller( +static void findCorners_caller( const oclMat& eig_mat, //input matrix worth eigenvalues oclMat& eigMinMax, //input with min and max values of eigenvalues const float qualityLevel, From 3a6d248bee93df66fc92c593d8a6ba6cc0214c95 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 17 Dec 2013 17:41:28 +0400 Subject: [PATCH 14/28] typo --- modules/core/doc/operations_on_arrays.rst | 2 +- modules/core/src/dxt.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/core/doc/operations_on_arrays.rst b/modules/core/doc/operations_on_arrays.rst index a312818daf..8c01a1010b 100644 --- a/modules/core/doc/operations_on_arrays.rst +++ b/modules/core/doc/operations_on_arrays.rst @@ -929,7 +929,7 @@ So, the function chooses an operation mode depending on the flags and size of th * When ``DFT_COMPLEX_OUTPUT`` is set, the output is a complex matrix of the same size as input. - * When ``DFT_COMPLEX_OUTPUT`` is not set, the output is a real matrix of the same size as input. In case of 2D transform, it uses the packed format as shown above. In case of a single 1D transform, it looks like the first row of the matrix above. In case of multiple 1D transforms (when using the ``DCT_ROWS`` flag), each row of the output matrix looks like the first row of the matrix above. + * When ``DFT_COMPLEX_OUTPUT`` is not set, the output is a real matrix of the same size as input. In case of 2D transform, it uses the packed format as shown above. In case of a single 1D transform, it looks like the first row of the matrix above. In case of multiple 1D transforms (when using the ``DFT_ROWS`` flag), each row of the output matrix looks like the first row of the matrix above. 
* If the input array is complex and either ``DFT_INVERSE`` or ``DFT_REAL_OUTPUT`` are not set, the output is a complex array of the same size as input. The function performs a forward or inverse 1D or 2D transform of the whole input array or each row of the input array independently, depending on the flags ``DFT_INVERSE`` and ``DFT_ROWS``. diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp index e6fed4eae7..033bf45120 100644 --- a/modules/core/src/dxt.cpp +++ b/modules/core/src/dxt.cpp @@ -2284,7 +2284,7 @@ void cv::dct( InputArray _src0, OutputArray _dst, int flags ) DCTFunc dct_func = dct_tbl[(int)inv + (depth == CV_64F)*2]; - if( (flags & DFT_ROWS) || src.rows == 1 || + if( (flags & DCT_ROWS) || src.rows == 1 || (src.cols == 1 && (src.isContinuous() && dst.isContinuous()))) { stage = end_stage = 0; @@ -2304,7 +2304,7 @@ void cv::dct( InputArray _src0, OutputArray _dst, int flags ) { len = src.cols; count = src.rows; - if( len == 1 && !(flags & DFT_ROWS) ) + if( len == 1 && !(flags & DCT_ROWS) ) { len = src.rows; count = 1; From 34c630faf4b88f7dafd23a8c1675867bd7bb8d78 Mon Sep 17 00:00:00 2001 From: krodyush Date: Tue, 17 Dec 2013 17:46:09 +0400 Subject: [PATCH 15/28] update doc to be consisted with headers --- modules/ocl/doc/image_filtering.rst | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/modules/ocl/doc/image_filtering.rst b/modules/ocl/doc/image_filtering.rst index 92a6c575f4..147ebc3da4 100644 --- a/modules/ocl/doc/image_filtering.rst +++ b/modules/ocl/doc/image_filtering.rst @@ -287,7 +287,7 @@ ocl::createSeparableLinearFilter_GPU ---------------------------------------- Creates a separable linear filter engine. -.. ocv:function:: Ptr ocl::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel, const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT) +.. ocv:function:: Ptr ocl::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel, const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1) ) :param srcType: Source array type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` source types are supported. @@ -303,6 +303,8 @@ Creates a separable linear filter engine. :param bordertype: Pixel extrapolation method. + :param imgSize: Source image size to choose optimal method for processing. + .. seealso:: :ocv:func:`ocl::getLinearRowFilter_GPU`, :ocv:func:`ocl::getLinearColumnFilter_GPU`, :ocv:func:`createSeparableLinearFilter` @@ -334,7 +336,7 @@ ocl::createDerivFilter_GPU ------------------------------ Creates a filter engine for the generalized Sobel operator. -.. ocv:function:: Ptr ocl::createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT ) +.. ocv:function:: Ptr ocl::createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT, Size imgSize = Size(-1,-1) ) :param srcType: Source image type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` source types are supported. @@ -348,6 +350,8 @@ Creates a filter engine for the generalized Sobel operator. :param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate`. + :param imgSize: Source image size to choose optimal method for processing. + .. 
seealso:: :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`createDerivFilter` @@ -405,7 +409,7 @@ ocl::createGaussianFilter_GPU --------------------------------- Creates a Gaussian filter engine. -.. ocv:function:: Ptr ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT) +.. ocv:function:: Ptr ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1) ) :param type: Source and destination image type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` are supported. @@ -417,6 +421,8 @@ Creates a Gaussian filter engine. :param bordertype: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate`. + :param imgSize: Source image size to choose optimal method for processing. + .. seealso:: :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`createGaussianFilter` ocl::GaussianBlur From 63ae0eeba592e8855dd602ae9b1b406e48374645 Mon Sep 17 00:00:00 2001 From: Vladimir Bystricky Date: Tue, 17 Dec 2013 18:39:52 +0400 Subject: [PATCH 16/28] Fix build errors --- doc/user_guide/ug_intelperc.rst | 2 +- samples/cpp/intelperc_capture.cpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/user_guide/ug_intelperc.rst b/doc/user_guide/ug_intelperc.rst index 71a7c5d90e..bae5f70146 100644 --- a/doc/user_guide/ug_intelperc.rst +++ b/doc/user_guide/ug_intelperc.rst @@ -5,7 +5,7 @@ HighGUI .. highlight:: cpp Using Creative Senz3D and other Intel Perceptual Computing SDK compatible depth sensors -====================================================== +======================================================================================= Depth sensors compatible with Intel Perceptual Computing SDK are supported through ``VideoCapture`` class. Depth map, RGB image and some other formats of output can be retrieved by using familiar interface of ``VideoCapture``. diff --git a/samples/cpp/intelperc_capture.cpp b/samples/cpp/intelperc_capture.cpp index be032dead9..24ab0170eb 100644 --- a/samples/cpp/intelperc_capture.cpp +++ b/samples/cpp/intelperc_capture.cpp @@ -1,7 +1,6 @@ // testOpenCVCam.cpp : Defines the entry point for the console application. 
// -#include #include "opencv2/highgui/highgui.hpp" #include From e719bee2b80639d09acafac9551fc20e9f082d2c Mon Sep 17 00:00:00 2001 From: Andrey Pavlenko Date: Wed, 18 Dec 2013 00:15:02 +0400 Subject: [PATCH 17/28] minor refactoring, no functional changes --- samples/ocl/facedetect.cpp | 50 +++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/samples/ocl/facedetect.cpp b/samples/ocl/facedetect.cpp index fbb08cb1e5..9fafbf3ce1 100644 --- a/samples/ocl/facedetect.cpp +++ b/samples/ocl/facedetect.cpp @@ -11,7 +11,10 @@ using namespace std; using namespace cv; + #define LOOP_NUM 10 +#define MAX_THREADS 10 + ///////////////////////////single-threading faces detecting/////////////////////////////// @@ -26,23 +29,23 @@ const static Scalar colors[] = { CV_RGB(0,0,255), } ; -int64 work_begin = 0; -int64 work_end = 0; +int64 work_begin[MAX_THREADS] = {0}; +int64 work_end[MAX_THREADS] = {0}; string inputName, outputName, cascadeName; -static void workBegin() +static void workBegin(int i = 0) { - work_begin = getTickCount(); + work_begin[i] = getTickCount(); } -static void workEnd() +static void workEnd(int i = 0) { - work_end += (getTickCount() - work_begin); + work_end[i] += (getTickCount() - work_begin[i]); } -static double getTime() +static double getTime(int i = 0) { - return work_end /((double)cvGetTickFrequency() * 1000.); + return work_end[i] /getTickFrequency() * 1000.; } @@ -96,7 +99,6 @@ static int facedetect_one_thread(bool useCPU, double scale ) } } - cvNamedWindow( "result", 1 ); if( capture ) { cout << "In capture ..." << endl; @@ -125,34 +127,34 @@ static int facedetect_one_thread(bool useCPU, double scale ) } else { - cout << "In image read" << endl; + cout << "In image read " << image.size() << endl; vector faces; vector ref_rst; double accuracy = 0.; - for(int i = 0; i <= LOOP_NUM; i ++) + cout << "loops: "; + for(int i = 0; i <= LOOP_NUM; i++) { - cout << "loop" << i << endl; + cout << i << ", "; if(useCPU) - detectCPU(image, faces, cpu_cascade, scale, i==0?false:true); + detectCPU(image, faces, cpu_cascade, scale, i!=0); else { - detect(image, faces, cascade, scale, i==0?false:true); + detect(image, faces, cascade, scale, i!=0); if(i == 0) { detectCPU(image, ref_rst, cpu_cascade, scale, false); accuracy = checkRectSimilarity(image.size(), ref_rst, faces); } } - if (i == LOOP_NUM) - { - if (useCPU) - cout << "average CPU time (noCamera) : "; - else - cout << "average GPU time (noCamera) : "; - cout << getTime() / LOOP_NUM << " ms" << endl; - cout << "accuracy value: " << accuracy <= 1700) -#define MAX_THREADS 10 - static void detectFaces(std::string fileName) { ocl::OclCascadeClassifier cascade; From dd71bef6f599b1a6130eb9bdfb9ba4a707ca65d4 Mon Sep 17 00:00:00 2001 From: Vladimir Bystricky Date: Wed, 18 Dec 2013 09:59:24 +0400 Subject: [PATCH 18/28] Fix errors in example (samples/cpp/intelperc_capture.cpp) --- samples/cpp/intelperc_capture.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/samples/cpp/intelperc_capture.cpp b/samples/cpp/intelperc_capture.cpp index 24ab0170eb..b81a278cfe 100644 --- a/samples/cpp/intelperc_capture.cpp +++ b/samples/cpp/intelperc_capture.cpp @@ -20,12 +20,12 @@ static bool g_showClosedPoint = false; static int g_closedDepthPoint[2]; -static void printUsage(char *arg0) +static void printUsage(const char *arg0) { - char *filename = arg0; + const char *filename = arg0; while (*filename) filename++; - while ((arg0 <= filename) && ('\\' != *filename) && ('//' != *filename)) 
+ while ((arg0 <= filename) && ('\\' != *filename) && ('/' != *filename)) filename--; filename++; @@ -95,7 +95,7 @@ static void parseCMDLine(int argc, char* argv[]) exit(-1); } } - if (g_closedDepthPoint && (-1 == g_depthStreamProfileIdx)) + if (g_showClosedPoint && (-1 == g_depthStreamProfileIdx)) { cerr << "For --show-closed depth profile has be selected" << endl; exit(-1); @@ -153,7 +153,7 @@ static void printStreamProperties(VideoCapture &capture) static void imshowImage(const char *winname, Mat &image, VideoCapture &capture) { - if (g_closedDepthPoint) + if (g_showClosedPoint) { Mat uvMap; if (capture.retrieve(uvMap, CV_CAP_INTELPERC_UVDEPTH_MAP)) @@ -283,7 +283,7 @@ static void imshowDepth(const char *winname, Mat &depth, VideoCapture &capture) imshow(winname, image); } -int _tmain(int argc, char* argv[]) +int main(int argc, char* argv[]) { parseCMDLine(argc, argv); @@ -349,7 +349,7 @@ int _tmain(int argc, char* argv[]) if ((-1 != g_depthStreamProfileIdx) && (capture.retrieve(depthImage, CV_CAP_INTELPERC_DEPTH_MAP))) { - if (g_closedDepthPoint) + if (g_showClosedPoint) { double minVal = 0.0; double maxVal = 0.0; minMaxIdx(depthImage, &minVal, &maxVal, g_closedDepthPoint); From 66145ea06c68e427b19d3d0c2ae0103c96c333fe Mon Sep 17 00:00:00 2001 From: Vladimir Bystricky Date: Wed, 18 Dec 2013 10:55:09 +0400 Subject: [PATCH 19/28] Add CV_CAP_INTELPERC and CV_CAP_PROP_INTELPERC_ prefixes to const_ignore_list --- modules/java/generator/gen_java.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/java/generator/gen_java.py b/modules/java/generator/gen_java.py index 123daf70b8..c0aaed1918 100755 --- a/modules/java/generator/gen_java.py +++ b/modules/java/generator/gen_java.py @@ -18,6 +18,8 @@ class_ignore_list = ( const_ignore_list = ( "CV_CAP_OPENNI", "CV_CAP_PROP_OPENNI_", + "CV_CAP_INTELPERC", + "CV_CAP_PROP_INTELPERC_" "WINDOW_AUTOSIZE", "CV_WND_PROP_", "CV_WINDOW_", From 80d0593dbd62f9a2349a15f488f2b17547521534 Mon Sep 17 00:00:00 2001 From: Vladimir Bystricky Date: Wed, 18 Dec 2013 11:46:52 +0400 Subject: [PATCH 20/28] Delete end comma in enumerations --- modules/highgui/include/opencv2/highgui/highgui_c.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/highgui/include/opencv2/highgui/highgui_c.h b/modules/highgui/include/opencv2/highgui/highgui_c.h index 8a59197594..4f743ffec8 100644 --- a/modules/highgui/include/opencv2/highgui/highgui_c.h +++ b/modules/highgui/include/opencv2/highgui/highgui_c.h @@ -460,11 +460,11 @@ enum CV_CAP_PROP_IOS_DEVICE_EXPOSURE = 9002, CV_CAP_PROP_IOS_DEVICE_FLASH = 9003, CV_CAP_PROP_IOS_DEVICE_WHITEBALANCE = 9004, - CV_CAP_PROP_IOS_DEVICE_TORCH = 9005 + CV_CAP_PROP_IOS_DEVICE_TORCH = 9005, // Properties of cameras available through Smartek Giganetix Ethernet Vision interface /* --- Vladimir Litvinenko (litvinenko.vladimir@gmail.com) --- */ - ,CV_CAP_PROP_GIGA_FRAME_OFFSET_X = 10001, + CV_CAP_PROP_GIGA_FRAME_OFFSET_X = 10001, CV_CAP_PROP_GIGA_FRAME_OFFSET_Y = 10002, CV_CAP_PROP_GIGA_FRAME_WIDTH_MAX = 10003, CV_CAP_PROP_GIGA_FRAME_HEIGH_MAX = 10004, @@ -482,7 +482,7 @@ enum // Intel PerC streams CV_CAP_INTELPERC_DEPTH_GENERATOR = 1 << 29, CV_CAP_INTELPERC_IMAGE_GENERATOR = 1 << 28, - CV_CAP_INTELPERC_GENERATORS_MASK = CV_CAP_INTELPERC_DEPTH_GENERATOR + CV_CAP_INTELPERC_IMAGE_GENERATOR, + CV_CAP_INTELPERC_GENERATORS_MASK = CV_CAP_INTELPERC_DEPTH_GENERATOR + CV_CAP_INTELPERC_IMAGE_GENERATOR }; enum @@ -568,7 +568,7 @@ enum CV_CAP_INTELPERC_DEPTH_MAP = 0, // Each pixel is a 16-bit integer. 
The value indicates the distance from an object to the camera's XY plane or the Cartesian depth. CV_CAP_INTELPERC_UVDEPTH_MAP = 1, // Each pixel contains two 32-bit floating point values in the range of 0-1, representing the mapping of depth coordinates to the color coordinates. CV_CAP_INTELPERC_IR_MAP = 2, // Each pixel is a 16-bit integer. The value indicates the intensity of the reflected laser beam. - CV_CAP_INTELPERC_IMAGE = 3, + CV_CAP_INTELPERC_IMAGE = 3 }; /* retrieve or set capture properties */ From 1ae71fe205856d47c22c6e5b5f3aadebcee3504f Mon Sep 17 00:00:00 2001 From: krodyush Date: Wed, 18 Dec 2013 14:27:51 +0400 Subject: [PATCH 21/28] intel device guard was added because of perf degradation on some non intel platform. --- modules/ocl/src/filtering.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modules/ocl/src/filtering.cpp b/modules/ocl/src/filtering.cpp index 20895abee3..35aa226de6 100644 --- a/modules/ocl/src/filtering.cpp +++ b/modules/ocl/src/filtering.cpp @@ -1405,11 +1405,13 @@ Ptr cv::ocl::createSeparableLinearFilter_GPU(int srcType, int int cn = CV_MAT_CN(srcType); int bdepth = std::max(std::max(sdepth, ddepth), CV_32F); int bufType = CV_MAKETYPE(bdepth, cn); + Context* clCxt = Context::getContext(); //if image size is non-degenerate and large enough //and if filter support is reasonable to satisfy larger local memory requirements, //then we can use single pass routine to avoid extra runtime calls overhead - if( rowKernel.rows <= 21 && columnKernel.rows <= 21 && + if( clCxt && clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE) && + rowKernel.rows <= 21 && columnKernel.rows <= 21 && (rowKernel.rows & 1) == 1 && (columnKernel.rows & 1) == 1 && imgSize.width > optimizedSepFilterLocalSize + (rowKernel.rows>>1) && imgSize.height > optimizedSepFilterLocalSize + (columnKernel.rows>>1) ) From 8c6049867394da89e0b5ed3dd5dc98187a87a2b6 Mon Sep 17 00:00:00 2001 From: Andrey Pavlenko Date: Wed, 18 Dec 2013 17:02:39 +0400 Subject: [PATCH 22/28] adding timing for multi-threaded case --- samples/ocl/facedetect.cpp | 57 +++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/samples/ocl/facedetect.cpp b/samples/ocl/facedetect.cpp index 9fafbf3ce1..378105906e 100644 --- a/samples/ocl/facedetect.cpp +++ b/samples/ocl/facedetect.cpp @@ -30,7 +30,7 @@ const static Scalar colors[] = { CV_RGB(0,0,255), int64 work_begin[MAX_THREADS] = {0}; -int64 work_end[MAX_THREADS] = {0}; +int64 work_total[MAX_THREADS] = {0}; string inputName, outputName, cascadeName; static void workBegin(int i = 0) @@ -40,12 +40,12 @@ static void workBegin(int i = 0) static void workEnd(int i = 0) { - work_end[i] += (getTickCount() - work_begin[i]); + work_total[i] += (getTickCount() - work_begin[i]); } -static double getTime(int i = 0) +static double getTotalTime(int i = 0) { - return work_end[i] /getTickFrequency() * 1000.; + return work_total[i] /getTickFrequency() * 1000.; } @@ -152,7 +152,7 @@ static int facedetect_one_thread(bool useCPU, double scale ) cout << "average CPU time (noCamera) : "; else cout << "average GPU time (noCamera) : "; - cout << getTime() / LOOP_NUM << " ms" << endl; + cout << getTotalTime() / LOOP_NUM << " ms" << endl; cout << "accuracy value: " << accuracy <= 1700) -static void detectFaces(std::string fileName) +static void detectFaces(std::string fileName, int threadNum) { ocl::OclCascadeClassifier cascade; if(!cascade.load(cascadeName)) @@ -179,7 +179,7 @@ static void detectFaces(std::string fileName) Mat img = 
imread(fileName, CV_LOAD_IMAGE_COLOR); if (img.empty()) { - std::cout << "cann't open file " + fileName < oclfaces; - cascade.detectMultiScale(d_img, oclfaces, 1.1, 3, 0|CV_HAAR_SCALE_IMAGE, Size(30, 30), Size(0, 0)); + std::thread::id tid = std::this_thread::get_id(); + std::cout << '[' << threadNum << "] " + << "ThreadID = " << tid + << ", CommandQueue = " << *(void**)ocl::getClCommandQueuePtr() + << endl; + for(int i = 0; i <= LOOP_NUM; i++) + { + if(i>0) workBegin(threadNum); + cascade.detectMultiScale(d_img, oclfaces, 1.1, 3, 0|CV_HAAR_SCALE_IMAGE, Size(30, 30), Size(0, 0)); + if(i>0) workEnd(threadNum); + } + std::cout << '[' << threadNum << "] " << "Average time = " << getTotalTime(threadNum) / LOOP_NUM << " ms" << endl; for(unsigned int i = 0; i threads; for(int i = 0; i= 1 }"; CommandLineParser cmd(argc, argv, keys); @@ -314,8 +327,8 @@ void Draw(Mat& img, vector& faces, double scale) radius = cvRound((r->width + r->height)*0.25*scale); circle( img, center, radius, color, 3, 8, 0 ); } - imwrite( outputName, img ); - if(abs(scale-1.0)>.001) + if( !outputName.empty() ) imwrite( outputName, img ); + if( abs(scale-1.0)>.001 ) { resize(img, img, Size((int)(img.cols/scale), (int)(img.rows/scale))); } From a9687a341e63f969c01ee0ce74139c1a9dab2178 Mon Sep 17 00:00:00 2001 From: Andrey Pavlenko Date: Wed, 18 Dec 2013 17:27:39 +0400 Subject: [PATCH 23/28] adding more than 4 channels random Mats support if `Scalar:all` is used, Mats with 5+ channels cause errors --- modules/ts/src/ts_func.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/ts/src/ts_func.cpp b/modules/ts/src/ts_func.cpp index 5900637c33..44f3e483fd 100644 --- a/modules/ts/src/ts_func.cpp +++ b/modules/ts/src/ts_func.cpp @@ -116,7 +116,7 @@ Mat randomMat(RNG& rng, Size size, int type, double minVal, double maxVal, bool Mat m(size0, type); - rng.fill(m, RNG::UNIFORM, Scalar::all(minVal), Scalar::all(maxVal)); + rng.fill(m, RNG::UNIFORM, minVal, maxVal); if( size0 == size ) return m; return m(Rect((size0.width-size.width)/2, (size0.height-size.height)/2, size.width, size.height)); @@ -142,7 +142,7 @@ Mat randomMat(RNG& rng, const vector& size, int type, double minVal, double Mat m(dims, &size0[0], type); - rng.fill(m, RNG::UNIFORM, Scalar::all(minVal), Scalar::all(maxVal)); + rng.fill(m, RNG::UNIFORM, minVal, maxVal); if( eqsize ) return m; return m(&r[0]); From d449ba104a068c92a931a68d782245bbcb92af6c Mon Sep 17 00:00:00 2001 From: Vladimir Bystricky Date: Thu, 19 Dec 2013 10:29:19 +0400 Subject: [PATCH 24/28] Fix comment in the cmake file from SSDK to SDK --- cmake/OpenCVFindLibsVideo.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/OpenCVFindLibsVideo.cmake b/cmake/OpenCVFindLibsVideo.cmake index a5075b57f7..a797f04169 100644 --- a/cmake/OpenCVFindLibsVideo.cmake +++ b/cmake/OpenCVFindLibsVideo.cmake @@ -251,7 +251,7 @@ if (NOT IOS) endif() endif() -# --- Intel Perceptual Computing SSDK --- +# --- Intel Perceptual Computing SDK --- if(WITH_INTELPERC) include("${OpenCV_SOURCE_DIR}/cmake/OpenCVFindIntelPerCSDK.cmake") endif(WITH_INTELPERC) From 27c1bd27624f0a9c248cd05d6779cc6859d86892 Mon Sep 17 00:00:00 2001 From: krodyush Date: Thu, 19 Dec 2013 22:56:46 +0400 Subject: [PATCH 25/28] Improve ocl cvt_color performance for the following conversions: RGB<->BGR, RGB->Gray, RGB<->XYZ, RGB<->YCrCb, RGB<->YUV, and mRGBA<->RGBA. The improvement was done basically by processing more than 1 pixel by each work-item and using vector's operations. 
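For illustration, a minimal standalone sketch of the 4-pixels-per-work-item path for an 8-bit, 4-channel-interleaved (BGRA) source; the kernel and argument names here are hypothetical and not the shipped cvt_color.cl code, but the weights, the 14-bit descale and the vload16/vstore4 pattern follow the RGB2Gray path added by this patch:

    #define R2Y 4899                    // OpenCV's fixed-point BT.601 weights (14-bit shift)
    #define G2Y 9617
    #define B2Y 1868
    #define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))

    __kernel void bgra2gray_x4(__global const uchar *src, int src_step,   // steps in bytes
                               __global uchar *dst, int dst_step,
                               int cols, int rows)
    {
        const int x = get_global_id(0) * 4;             // 4 pixels per work-item
        const int y = get_global_id(1);
        if (y >= rows || x + 3 >= cols)
            return;

        // one vload16 fetches four interleaved 4-channel pixels (16 bytes)
        const uchar16 p = vload16(0, src + mad24(y, src_step, x * 4));
        const int4 b = convert_int4(p.s048c);           // channel 0 of each pixel (B, BGRA order assumed)
        const int4 g = convert_int4(p.s159d);           // channel 1 (G)
        const int4 r = convert_int4(p.s26ae);           // channel 2 (R)
        const int4 grey = DESCALE(r * R2Y + g * G2Y + b * B2Y, 14);

        // one vstore4 writes four grey pixels
        vstore4(convert_uchar4_sat(grey), 0, dst + mad24(y, dst_step, x));
    }

Such a kernel would be launched with a global size of (cols / 4, rows); the patch enables the wide path only when the device reports FEATURE_CL_INTEL_DEVICE and the row width is divisible by the chosen pixels_per_work_item (4 for CV_8U, 2 otherwise), falling back to one pixel per work-item in all other cases.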
new performance tests were added --- modules/ocl/perf/perf_color.cpp | 93 +++- modules/ocl/src/color.cpp | 184 ++++++- modules/ocl/src/opencl/cvt_color.cl | 817 ++++++++++++++++++++++++---- 3 files changed, 936 insertions(+), 158 deletions(-) diff --git a/modules/ocl/perf/perf_color.cpp b/modules/ocl/perf/perf_color.cpp index 8433315189..75e6820fcb 100644 --- a/modules/ocl/perf/perf_color.cpp +++ b/modules/ocl/perf/perf_color.cpp @@ -57,9 +57,39 @@ CV_ENUM(ConversionTypes, CV_RGB2GRAY, CV_RGB2BGR, CV_RGB2YUV, CV_YUV2RGB, CV_RGB CV_HLS2RGB, CV_BGR5652BGR, CV_BGR2BGR565, CV_RGBA2mRGBA, CV_mRGBA2RGBA, CV_YUV2RGB_NV12) typedef tuple > cvtColorParams; -typedef TestBaseWithParam cvtColorFixture; +typedef TestBaseWithParam cvtColorU8Fixture; +typedef TestBaseWithParam cvtColorF32Fixture; +typedef TestBaseWithParam cvtColorU16Fixture; -PERF_TEST_P(cvtColorFixture, cvtColor, testing::Combine( +#define RUN_CVT_PERF_TEST \ + cvtColorParams params = GetParam();\ + const Size srcSize = get<0>(params);\ + const tuple conversionParams = get<1>(params);\ + const int code = get<0>(conversionParams), scn = get<1>(conversionParams),\ + dcn = get<2>(conversionParams);\ +\ + Mat src(srcSize, CV_8UC(scn)), dst(srcSize, CV_8UC(scn));\ + declare.in(src, WARMUP_RNG).out(dst);\ +\ + if (RUN_OCL_IMPL)\ + {\ + ocl::oclMat oclSrc(src), oclDst(src.size(), dst.type());\ +\ + OCL_TEST_CYCLE() ocl::cvtColor(oclSrc, oclDst, code, dcn);\ + oclDst.download(dst);\ +\ + SANITY_CHECK(dst, 1);\ + }\ + else if (RUN_PLAIN_IMPL)\ + {\ + TEST_CYCLE() cv::cvtColor(src, dst, code, dcn);\ +\ + SANITY_CHECK(dst);\ + }\ + else\ + OCL_PERF_ELSE\ + +PERF_TEST_P(cvtColorU8Fixture, cvtColor, testing::Combine( testing::Values(Size(1000, 1002), Size(2000, 2004), Size(4000, 4008)), testing::Values( make_tuple(ConversionTypes(CV_RGB2GRAY), 3, 1), @@ -81,30 +111,41 @@ PERF_TEST_P(cvtColorFixture, cvtColor, testing::Combine( make_tuple(ConversionTypes(CV_YUV2RGB_NV12), 1, 3) ))) { - cvtColorParams params = GetParam(); - const Size srcSize = get<0>(params); - const tuple conversionParams = get<1>(params); - const int code = get<0>(conversionParams), scn = get<1>(conversionParams), - dcn = get<2>(conversionParams); - - Mat src(srcSize, CV_8UC(scn)), dst(srcSize, CV_8UC(scn)); - declare.in(src, WARMUP_RNG).out(dst); - - if (RUN_OCL_IMPL) - { - ocl::oclMat oclSrc(src), oclDst(src.size(), dst.type()); - - OCL_TEST_CYCLE() ocl::cvtColor(oclSrc, oclDst, code, dcn); - oclDst.download(dst); + RUN_CVT_PERF_TEST +} - SANITY_CHECK(dst, 1); - } - else if (RUN_PLAIN_IMPL) - { - TEST_CYCLE() cv::cvtColor(src, dst, code, dcn); +PERF_TEST_P(cvtColorF32Fixture, cvtColor, testing::Combine( + testing::Values(Size(1000, 1002), Size(2000, 2004), Size(4000, 4008)), + testing::Values( + make_tuple(ConversionTypes(CV_RGB2GRAY), 3, 1), + make_tuple(ConversionTypes(CV_RGB2BGR), 3, 3), + make_tuple(ConversionTypes(CV_RGB2YUV), 3, 3), + make_tuple(ConversionTypes(CV_YUV2RGB), 3, 3), + make_tuple(ConversionTypes(CV_RGB2YCrCb), 3, 3), + make_tuple(ConversionTypes(CV_YCrCb2RGB), 3, 3), + make_tuple(ConversionTypes(CV_RGB2XYZ), 3, 3), + make_tuple(ConversionTypes(CV_XYZ2RGB), 3, 3), + make_tuple(ConversionTypes(CV_RGB2HSV), 3, 3), + make_tuple(ConversionTypes(CV_HSV2RGB), 3, 3), + make_tuple(ConversionTypes(CV_RGB2HLS), 3, 3), + make_tuple(ConversionTypes(CV_HLS2RGB), 3, 3) + ))) +{ + RUN_CVT_PERF_TEST +} - SANITY_CHECK(dst); - } - else - OCL_PERF_ELSE +PERF_TEST_P(cvtColorU16Fixture, cvtColor, testing::Combine( + testing::Values(Size(1000, 1002), Size(2000, 2004), Size(4000, 
4008)), + testing::Values( + make_tuple(ConversionTypes(CV_RGB2GRAY), 3, 1), + make_tuple(ConversionTypes(CV_RGB2BGR), 3, 3), + make_tuple(ConversionTypes(CV_RGB2YUV), 3, 3), + make_tuple(ConversionTypes(CV_YUV2RGB), 3, 3), + make_tuple(ConversionTypes(CV_RGB2YCrCb), 3, 3), + make_tuple(ConversionTypes(CV_YCrCb2RGB), 3, 3), + make_tuple(ConversionTypes(CV_RGB2XYZ), 3, 3), + make_tuple(ConversionTypes(CV_XYZ2RGB), 3, 3) + ))) +{ + RUN_CVT_PERF_TEST } diff --git a/modules/ocl/src/color.cpp b/modules/ocl/src/color.cpp index 0af58643c9..e323934b4c 100644 --- a/modules/ocl/src/color.cpp +++ b/modules/ocl/src/color.cpp @@ -56,8 +56,19 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std:: { int src_offset = src.offset / src.elemSize1(), src_step = src.step1(); int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1(); + int pixels_per_work_item = 1; - std::string build_options = format("-D DEPTH_%d", src.depth()); + if (Context::getContext()->supportsFeature(FEATURE_CL_INTEL_DEVICE)) + { + if ((src.cols % 4 == 0) && (src.depth() == CV_8U)) + pixels_per_work_item = 4; + else if (src.cols % 2 == 0) + pixels_per_work_item = 2; + else + pixels_per_work_item = 1; + } + + std::string build_options = format("-D DEPTH_%d -D scn=%d -D bidx=%d -D pixels_per_work_item=%d", src.depth(), src.oclchannels(), bidx, pixels_per_work_item); if (!additionalOptions.empty()) build_options += additionalOptions; @@ -66,7 +77,6 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std:: args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows)); args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step)); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step)); - args.push_back( make_pair( sizeof(cl_int) , (void *)&bidx)); args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data)); args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset )); @@ -77,6 +87,73 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std:: if (!data2.empty()) args.push_back( make_pair( sizeof(cl_mem) , (void *)&data2.data )); + size_t gt[3] = { dst.cols/pixels_per_work_item, dst.rows, 1 }; +#ifdef ANDROID + size_t lt[3] = { 16, 10, 1 }; +#else + size_t lt[3] = { 16, 16, 1 }; +#endif + openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str()); +} + +static void toHSV_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName, + const std::string & additionalOptions = std::string(), + const oclMat & data1 = oclMat(), const oclMat & data2 = oclMat()) +{ + int src_offset = src.offset / src.elemSize1(), src_step = src.step1(); + int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1(); + + std::string build_options = format("-D DEPTH_%d -D scn=%d -D bidx=%d", src.depth(), src.oclchannels(), bidx); + if (!additionalOptions.empty()) + build_options += additionalOptions; + + vector > args; + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step)); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset )); + args.push_back( make_pair( 
sizeof(cl_int) , (void *)&dst_offset )); + + if (!data1.empty()) + args.push_back( make_pair( sizeof(cl_mem) , (void *)&data1.data )); + if (!data2.empty()) + args.push_back( make_pair( sizeof(cl_mem) , (void *)&data2.data )); + + size_t gt[3] = { dst.cols, dst.rows, 1 }; +#ifdef ANDROID + size_t lt[3] = { 16, 10, 1 }; +#else + size_t lt[3] = { 16, 16, 1 }; +#endif + openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str()); +} + +static void fromGray_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName, + const std::string & additionalOptions = std::string(), const oclMat & data = oclMat()) +{ + std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d", src.depth(), dst.channels(), bidx); + if (!additionalOptions.empty()) + build_options += additionalOptions; + + int src_offset = src.offset / src.elemSize1(), src_step = src.step1(); + int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1(); + + vector > args; + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step)); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_offset )); + + if (!data.empty()) + args.push_back( make_pair( sizeof(cl_mem) , (void *)&data.data )); + size_t gt[3] = { dst.cols, dst.rows, 1 }; #ifdef ANDROID size_t lt[3] = { 16, 10, 1 }; @@ -89,7 +166,50 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std:: static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName, const std::string & additionalOptions = std::string(), const oclMat & data = oclMat()) { - std::string build_options = format("-D DEPTH_%d -D dcn=%d", src.depth(), dst.channels()); + int src_offset = src.offset / src.elemSize1(), src_step = src.step1(); + int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1(); + int pixels_per_work_item = 1; + + if (Context::getContext()->supportsFeature(FEATURE_CL_INTEL_DEVICE)) + { + if ((src.cols % 4 == 0) && (src.depth() == CV_8U)) + pixels_per_work_item = 4; + else if (src.cols % 2 == 0) + pixels_per_work_item = 2; + else + pixels_per_work_item = 1; + } + + std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d -D pixels_per_work_item=%d", src.depth(), dst.channels(), bidx, pixels_per_work_item); + if (!additionalOptions.empty()) + build_options += additionalOptions; + + vector > args; + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step)); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_offset )); + + if (!data.empty()) + args.push_back( make_pair( sizeof(cl_mem) , (void *)&data.data )); + + size_t gt[3] = { dst.cols/pixels_per_work_item, dst.rows, 1 }; +#ifdef ANDROID + size_t lt[3] = { 16, 
10, 1 }; +#else + size_t lt[3] = { 16, 16, 1 }; +#endif + openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str()); +} + +static void toRGB_NV12_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName, + const std::string & additionalOptions = std::string(), const oclMat & data = oclMat()) +{ + std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d", src.depth(), dst.channels(), bidx); if (!additionalOptions.empty()) build_options += additionalOptions; @@ -101,7 +221,6 @@ static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::st args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows)); args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step)); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step)); - args.push_back( make_pair( sizeof(cl_int) , (void *)&bidx)); args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data)); args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset )); @@ -119,13 +238,46 @@ static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::st openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str()); } +static void fromHSV_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName, + const std::string & additionalOptions = std::string(), const oclMat & data = oclMat()) +{ + std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d", src.depth(), dst.channels(), bidx); + if (!additionalOptions.empty()) + build_options += additionalOptions; + + int src_offset = src.offset / src.elemSize1(), src_step = src.step1(); + int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1(); + + vector > args; + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step)); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_offset )); + + if (!data.empty()) + args.push_back( make_pair( sizeof(cl_mem) , (void *)&data.data )); + + size_t gt[3] = { dst.cols, dst.rows, 1 }; +#ifdef ANDROID + size_t lt[3] = { 16, 10, 1 }; +#else + size_t lt[3] = { 16, 16, 1 }; +#endif + openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str()); +} + static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse) { - std::string build_options = format("-D DEPTH_%d -D dcn=%d -D scn=%d -D %s", src.depth(), - dst.channels(), src.channels(), reverse ? "REVERSE" : "ORDER"); int src_offset = src.offset / src.elemSize1(), src_step = src.step1(); int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1(); + std::string build_options = format("-D DEPTH_%d -D dcn=%d -D scn=%d -D %s", + src.depth(), dst.channels(), src.channels(), reverse ? 
"REVERSE" : "ORDER"); + vector > args; args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols)); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows)); @@ -147,8 +299,8 @@ static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse) static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenbits, const std::string & kernelName) { - std::string build_options = format("-D DEPTH_%d -D greenbits=%d -D dcn=%d", - src.depth(), greenbits, dst.channels()); + std::string build_options = format("-D DEPTH_%d -D greenbits=%d -D dcn=%d -D bidx=%d", + src.depth(), greenbits, dst.channels(), bidx); int src_offset = src.offset >> 1, src_step = src.step >> 1; int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step / dst.elemSize1(); @@ -157,7 +309,6 @@ static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int gree args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows)); args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step)); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step)); - args.push_back( make_pair( sizeof(cl_int) , (void *)&bidx)); args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data)); args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset )); @@ -174,8 +325,8 @@ static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int gree static void toRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenbits, const std::string & kernelName) { - std::string build_options = format("-D DEPTH_%d -D greenbits=%d -D scn=%d", - src.depth(), greenbits, src.channels()); + std::string build_options = format("-D DEPTH_%d -D greenbits=%d -D scn=%d -D bidx=%d", + src.depth(), greenbits, src.channels(), bidx); int src_offset = (int)src.offset, src_step = (int)src.step; int dst_offset = dst.offset >> 1, dst_step = dst.step >> 1; @@ -184,7 +335,6 @@ static void toRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenb args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows)); args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step)); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step)); - args.push_back( make_pair( sizeof(cl_int) , (void *)&bidx)); args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data)); args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset )); @@ -272,7 +422,7 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn) CV_Assert(scn == 1); dcn = code == CV_GRAY2BGRA ? 4 : 3; dst.create(sz, CV_MAKETYPE(depth, dcn)); - toRGB_caller(src, dst, 0, "Gray2RGB"); + fromGray_caller(src, dst, 0, "Gray2RGB"); break; } case CV_BGR2YUV: case CV_RGB2YUV: @@ -303,7 +453,7 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn) Size dstSz(sz.width, sz.height * 2 / 3); dst.create(dstSz, CV_MAKETYPE(depth, dcn)); - toRGB_caller(src, dst, bidx, "YUV2RGBA_NV12"); + toRGB_NV12_caller(src, dst, bidx, "YUV2RGBA_NV12"); break; } case CV_BGR2YCrCb: case CV_RGB2YCrCb: @@ -460,11 +610,11 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn) initialized = true; } - fromRGB_caller(src, dst, bidx, kernelName, format(" -D hrange=%d", hrange), sdiv_data, hrange == 256 ? hdiv_data256 : hdiv_data180); + toHSV_caller(src, dst, bidx, kernelName, format(" -D hrange=%d", hrange), sdiv_data, hrange == 256 ? 
hdiv_data256 : hdiv_data180); return; } - fromRGB_caller(src, dst, bidx, kernelName, format(" -D hscale=%f", hrange*(1.f/360.f))); + toHSV_caller(src, dst, bidx, kernelName, format(" -D hscale=%f", hrange*(1.f/360.f))); break; } case CV_HSV2BGR: case CV_HSV2RGB: case CV_HSV2BGR_FULL: case CV_HSV2RGB_FULL: @@ -483,7 +633,7 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn) dst.create(sz, CV_MAKETYPE(depth, dcn)); std::string kernelName = std::string(is_hsv ? "HSV" : "HLS") + "2RGB"; - toRGB_caller(src, dst, bidx, kernelName, format(" -D hrange=%d -D hscale=%f", hrange, 6.f/hrange)); + fromHSV_caller(src, dst, bidx, kernelName, format(" -D hrange=%d -D hscale=%f", hrange, 6.f/hrange)); break; } case CV_RGBA2mRGBA: case CV_mRGBA2RGBA: diff --git a/modules/ocl/src/opencl/cvt_color.cl b/modules/ocl/src/opencl/cvt_color.cl index bf3b6cfa76..2313af1527 100644 --- a/modules/ocl/src/opencl/cvt_color.cl +++ b/modules/ocl/src/opencl/cvt_color.cl @@ -56,35 +56,59 @@ #ifdef DEPTH_0 #define DATA_TYPE uchar +#define VECTOR2 uchar2 +#define VECTOR4 uchar4 +#define VECTOR8 uchar8 +#define VECTOR16 uchar16 #define COEFF_TYPE int #define MAX_NUM 255 #define HALF_MAX 128 #define SAT_CAST(num) convert_uchar_sat_rte(num) +#define SAT_CAST2(num) convert_uchar2_sat(num) +#define SAT_CAST4(num) convert_uchar4_sat(num) #endif #ifdef DEPTH_2 #define DATA_TYPE ushort +#define VECTOR2 ushort2 +#define VECTOR4 ushort4 +#define VECTOR8 ushort8 +#define VECTOR16 ushort16 #define COEFF_TYPE int #define MAX_NUM 65535 #define HALF_MAX 32768 #define SAT_CAST(num) convert_ushort_sat_rte(num) +#define SAT_CAST2(num) convert_ushort2_sat(num) +#define SAT_CAST4(num) convert_ushort4_sat(num) #endif #ifdef DEPTH_5 #define DATA_TYPE float +#define VECTOR2 float2 +#define VECTOR4 float4 +#define VECTOR8 float8 +#define VECTOR16 float16 #define COEFF_TYPE float #define MAX_NUM 1.0f #define HALF_MAX 0.5f #define SAT_CAST(num) (num) #endif +#ifndef bidx + #define bidx 0 +#endif + +#ifndef pixels_per_work_item + #define pixels_per_work_item 1 +#endif + #define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n)) enum { yuv_shift = 14, xyz_shift = 12, - hsv_shift = 12, + hsv_shift = 12, R2Y = 4899, G2Y = 9617, B2Y = 1868, @@ -93,26 +117,84 @@ enum ///////////////////////////////////// RGB <-> GRAY ////////////////////////////////////// +__constant float c_RGB2GrayCoeffs_f[3] = { 0.114f, 0.587f, 0.299f }; +__constant int c_RGB2GrayCoeffs_i[3] = { B2Y, G2Y, R2Y }; + __kernel void RGB2Gray(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, + __global const DATA_TYPE* src, __global DATA_TYPE* dst, int src_offset, int dst_offset) { - int x = get_global_id(0); + int x = get_global_id(0) * pixels_per_work_item; int y = get_global_id(1); if (y < rows && x < cols) { int src_idx = mad24(y, src_step, src_offset + (x << 2)); int dst_idx = mad24(y, dst_step, dst_offset + x); + +#ifndef INTEL_DEVICE #ifdef DEPTH_5 dst[dst_idx] = src[src_idx + bidx] * 0.114f + src[src_idx + 1] * 0.587f + src[src_idx + (bidx^2)] * 0.299f; #else dst[dst_idx] = (DATA_TYPE)CV_DESCALE((src[src_idx + bidx] * B2Y + src[src_idx + 1] * G2Y + src[src_idx + (bidx^2)] * R2Y), yuv_shift); #endif +#else + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); + +#ifdef DEPTH_5 + __constant float * coeffs = c_RGB2GrayCoeffs_f; +#else + __constant int * coeffs = c_RGB2GrayCoeffs_i; +#endif + + if (1 == 
pixels_per_work_item) + { +#ifdef DEPTH_5 + *dst_ptr = src_ptr[bidx] * coeffs[0] + src_ptr[1] * coeffs[1] + src_ptr[(bidx^2)] *coeffs[2]; +#else + *dst_ptr = (DATA_TYPE)CV_DESCALE((src_ptr[bidx] * coeffs[0] + src_ptr[1] * coeffs[1] + src_ptr[(bidx^2)] * coeffs[2]), yuv_shift); +#endif + } + else if (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 c0 = r0.s04; + const float2 c1 = r0.s15; + const float2 c2 = r0.s26; + + const float2 Y = c0 * coeffs[bidx] + c1 * coeffs[1] + c2 * coeffs[bidx^2]; +#else + const int2 c0 = convert_int2(r0.s04); + const int2 c1 = convert_int2(r0.s15); + const int2 c2 = convert_int2(r0.s26); + + const int2 yi = CV_DESCALE(c0 * coeffs[bidx] + c1 * coeffs[1] + c2 * coeffs[bidx^2], yuv_shift); + const VECTOR2 Y = SAT_CAST2(yi); +#endif + + vstore2(Y, 0, dst_ptr); + } + else if (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + + const int4 c0 = convert_int4(r0.s048c); + const int4 c1 = convert_int4(r0.s159d); + const int4 c2 = convert_int4(r0.s26ae); + const int4 Y = CV_DESCALE(c0 * coeffs[bidx] + c1 * coeffs[1] + c2 * coeffs[bidx^2], yuv_shift); + + vstore4(SAT_CAST4(Y), 0, dst_ptr); +#endif + } +#endif //INTEL_DEVICE } } -__kernel void Gray2RGB(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void Gray2RGB(int cols, int rows, int src_step, int dst_step, __global const DATA_TYPE* src, __global DATA_TYPE* dst, int src_offset, int dst_offset) { @@ -140,10 +222,10 @@ __constant float c_RGB2YUVCoeffs_f[5] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877 __constant int c_RGB2YUVCoeffs_i[5] = { B2Y, G2Y, R2Y, 8061, 14369 }; __kernel void RGB2YUV(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, + __global const DATA_TYPE* src, __global DATA_TYPE* dst, int src_offset, int dst_offset) { - int x = get_global_id(0); + int x = get_global_id(0) * pixels_per_work_item; int y = get_global_id(1); if (y < rows && x < cols) @@ -151,24 +233,85 @@ __kernel void RGB2YUV(int cols, int rows, int src_step, int dst_step, x <<= 2; int src_idx = mad24(y, src_step, src_offset + x); int dst_idx = mad24(y, dst_step, dst_offset + x); - DATA_TYPE rgb[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] }; + + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); #ifdef DEPTH_5 __constant float * coeffs = c_RGB2YUVCoeffs_f; - DATA_TYPE Y = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx]; - DATA_TYPE Cr = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX; - DATA_TYPE Cb = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX; #else __constant int * coeffs = c_RGB2YUVCoeffs_i; - int delta = HALF_MAX * (1 << yuv_shift); - int Y = CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx], yuv_shift); - int Cr = CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift); - int Cb = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift); + const int delta = HALF_MAX * (1 << yuv_shift); +#endif + + if (1 == pixels_per_work_item) + { + const DATA_TYPE rgb[] = {src_ptr[0], src_ptr[1], src_ptr[2]}; + +#ifdef DEPTH_5 + float Y = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx]; + float U = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX; + float V = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX; +#else + int Y = CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx], yuv_shift); + int U = 
CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift); + int V = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift); #endif - dst[dst_idx] = SAT_CAST( Y ); - dst[dst_idx + 1] = SAT_CAST( Cr ); - dst[dst_idx + 2] = SAT_CAST( Cb ); + dst_ptr[0] = SAT_CAST( Y ); + dst_ptr[1] = SAT_CAST( U ); + dst_ptr[2] = SAT_CAST( V ); + } +#ifdef INTEL_DEVICE + else if (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 c0 = r0.s04; + const float2 c1 = r0.s15; + const float2 c2 = r0.s26; + + const float2 Y = (bidx == 0) ? (c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0]) : (c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2]); + const float2 U = (bidx == 0) ? ((c2 - Y) * coeffs[3] + HALF_MAX) : ((c0 - Y) * coeffs[3] + HALF_MAX); + const float2 V = (bidx == 0) ? ((c0 - Y) * coeffs[4] + HALF_MAX) : ((c2 - Y) * coeffs[4] + HALF_MAX); +#else + const int2 c0 = convert_int2(r0.s04); + const int2 c1 = convert_int2(r0.s15); + const int2 c2 = convert_int2(r0.s26); + + const int2 yi = (bidx == 0) ? CV_DESCALE(c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0], yuv_shift) : CV_DESCALE(c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2], yuv_shift); + const int2 ui = (bidx == 0) ? CV_DESCALE((c2 - yi) * coeffs[3] + delta, yuv_shift) : CV_DESCALE((c0 - yi) * coeffs[3] + delta, yuv_shift); + const int2 vi = (bidx == 0) ? CV_DESCALE((c0 - yi) * coeffs[4] + delta, yuv_shift) : CV_DESCALE((c2 - yi) * coeffs[4] + delta, yuv_shift); + + const VECTOR2 Y = SAT_CAST2(yi); + const VECTOR2 U = SAT_CAST2(ui); + const VECTOR2 V = SAT_CAST2(vi); +#endif + + vstore8((VECTOR8)(Y.s0, U.s0, V.s0, 0, Y.s1, U.s1, V.s1, 0), 0, dst_ptr); + } + else if (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + + const int4 c0 = convert_int4(r0.s048c); + const int4 c1 = convert_int4(r0.s159d); + const int4 c2 = convert_int4(r0.s26ae); + + const int4 yi = (bidx == 0) ? CV_DESCALE(c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0], yuv_shift) : CV_DESCALE(c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2], yuv_shift); + const int4 ui = (bidx == 0) ? CV_DESCALE((c2 - yi) * coeffs[3] + delta, yuv_shift) : CV_DESCALE((c0 - yi) * coeffs[3] + delta, yuv_shift); + const int4 vi = (bidx == 0) ? 
CV_DESCALE((c0 - yi) * coeffs[4] + delta, yuv_shift) : CV_DESCALE((c2 - yi) * coeffs[4] + delta, yuv_shift); + + const VECTOR4 Y = SAT_CAST4(yi); + const VECTOR4 U = SAT_CAST4(ui); + const VECTOR4 V = SAT_CAST4(vi); + + vstore16((VECTOR16)(Y.s0, U.s0, V.s0, 0, Y.s1, U.s1, V.s1, 0, Y.s2, U.s2, V.s2, 0, Y.s3, U.s3, V.s3, 0), 0, dst_ptr); +#endif + } +#endif //INTEL_DEVICE } } @@ -176,10 +319,10 @@ __constant float c_YUV2RGBCoeffs_f[5] = { 2.032f, -0.395f, -0.581f, 1.140f }; __constant int c_YUV2RGBCoeffs_i[5] = { 33292, -6472, -9519, 18678 }; __kernel void YUV2RGB(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, + __global const DATA_TYPE* src, __global DATA_TYPE* dst, int src_offset, int dst_offset) { - int x = get_global_id(0); + int x = get_global_id(0) * pixels_per_work_item; int y = get_global_id(1); if (y < rows && x < cols) @@ -187,26 +330,95 @@ __kernel void YUV2RGB(int cols, int rows, int src_step, int dst_step, x <<= 2; int src_idx = mad24(y, src_step, src_offset + x); int dst_idx = mad24(y, dst_step, dst_offset + x); - DATA_TYPE yuv[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] }; + + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); #ifdef DEPTH_5 __constant float * coeffs = c_YUV2RGBCoeffs_f; - float b = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[3]; - float g = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1]; - float r = yuv[0] + (yuv[1] - HALF_MAX) * coeffs[0]; #else __constant int * coeffs = c_YUV2RGBCoeffs_i; - int b = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[3], yuv_shift); - int g = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1], yuv_shift); - int r = yuv[0] + CV_DESCALE((yuv[1] - HALF_MAX) * coeffs[0], yuv_shift); #endif - dst[dst_idx + bidx] = SAT_CAST( b ); - dst[dst_idx + 1] = SAT_CAST( g ); - dst[dst_idx + (bidx^2)] = SAT_CAST( r ); + if (1 == pixels_per_work_item) + { + const DATA_TYPE yuv[] = {src_ptr[0], src_ptr[1], src_ptr[2]}; + +#ifdef DEPTH_5 + float B = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[3]; + float G = yuv[0] + (yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1]; + float R = yuv[0] + (yuv[1] - HALF_MAX) * coeffs[0]; +#else + int B = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[3], yuv_shift); + int G = yuv[0] + CV_DESCALE((yuv[2] - HALF_MAX) * coeffs[2] + (yuv[1] - HALF_MAX) * coeffs[1], yuv_shift); + int R = yuv[0] + CV_DESCALE((yuv[1] - HALF_MAX) * coeffs[0], yuv_shift); +#endif + + dst_ptr[bidx] = SAT_CAST( B ); + dst_ptr[1] = SAT_CAST( G ); + dst_ptr[(bidx^2)] = SAT_CAST( R ); #if dcn == 4 - dst[dst_idx + 3] = MAX_NUM; + dst_ptr[3] = MAX_NUM; +#endif + } +#ifdef INTEL_DEVICE + else if (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 Y = r0.s04; + const float2 U = r0.s15; + const float2 V = r0.s26; + + const float2 c0 = (bidx == 0) ? (Y + (V - HALF_MAX) * coeffs[3]) : (Y + (U - HALF_MAX) * coeffs[0]); + const float2 c1 = Y + (V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1]; + const float2 c2 = (bidx == 0) ? (Y + (U - HALF_MAX) * coeffs[0]) : (Y + (V - HALF_MAX) * coeffs[3]); +#else + const int2 Y = convert_int2(r0.s04); + const int2 U = convert_int2(r0.s15); + const int2 V = convert_int2(r0.s26); + + const int2 c0i = (bidx == 0) ? 
(Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift)) : (Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift)); + const int2 c1i = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift); + const int2 c2i = (bidx == 0) ? (Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift)) : (Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift)); + + const VECTOR2 c0 = SAT_CAST2(c0i); + const VECTOR2 c1 = SAT_CAST2(c1i); + const VECTOR2 c2 = SAT_CAST2(c2i); +#endif + +#if dcn == 4 + vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, MAX_NUM, c0.s1, c1.s1, c2.s1, MAX_NUM), 0, dst_ptr); +#else + vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0), 0, dst_ptr); +#endif + } + else if (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + + const int4 Y = convert_int4(r0.s048c); + const int4 U = convert_int4(r0.s159d); + const int4 V = convert_int4(r0.s26ae); + + const int4 c0i = (bidx == 0) ? (Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift)) : (Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift)); + const int4 c1i = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift); + const int4 c2i = (bidx == 0) ? (Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift)) : (Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift)); + + const VECTOR4 c0 = SAT_CAST4(c0i); + const VECTOR4 c1 = SAT_CAST4(c1i); + const VECTOR4 c2 = SAT_CAST4(c2i); + +#if dcn == 4 + vstore16((VECTOR16)(c0.s0, c1.s0, c2.s0, MAX_NUM, c0.s1, c1.s1, c2.s1, MAX_NUM, c0.s2, c1.s2, c2.s2, MAX_NUM, c0.s3, c1.s3, c2.s3, MAX_NUM), 0, dst_ptr); +#else + vstore16((VECTOR16)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0, c0.s2, c1.s2, c2.s2, 0, c0.s3, c1.s3, c2.s3, 0), 0, dst_ptr); +#endif #endif + } +#endif //INTEL_DEVICE } } @@ -218,7 +430,7 @@ __constant int ITUR_BT_601_CVR = 1673527; __constant int ITUR_BT_601_SHIFT = 20; __kernel void YUV2RGBA_NV12(int cols, int rows, int src_step, int dst_step, - int bidx, __global const uchar* src, __global uchar* dst, + __global const uchar* src, __global uchar* dst, int src_offset, int dst_offset) { const int x = get_global_id(0); @@ -275,10 +487,10 @@ __constant float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564 __constant int c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241}; __kernel void RGB2YCrCb(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, - int src_offset, int dst_offset) + __global const DATA_TYPE* src, __global DATA_TYPE* dst, + int src_offset, int dst_offset) { - int x = get_global_id(0); + int x = get_global_id(0) * pixels_per_work_item; int y = get_global_id(1); if (y < rows && x < cols) @@ -287,24 +499,83 @@ __kernel void RGB2YCrCb(int cols, int rows, int src_step, int dst_step, int src_idx = mad24(y, src_step, src_offset + x); int dst_idx = mad24(y, dst_step, dst_offset + x); - DATA_TYPE rgb[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] }; + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); #ifdef DEPTH_5 __constant float * coeffs = c_RGB2YCrCbCoeffs_f; - DATA_TYPE Y = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx]; - DATA_TYPE Cr = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX; - DATA_TYPE Cb = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX; #else __constant int * coeffs = c_RGB2YCrCbCoeffs_i; - int delta = HALF_MAX * (1 << yuv_shift); - int Y = CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + 
rgb[2] * coeffs[bidx], yuv_shift); - int Cr = CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift); - int Cb = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift); + const int delta = HALF_MAX * (1 << yuv_shift); #endif - dst[dst_idx] = SAT_CAST( Y ); - dst[dst_idx + 1] = SAT_CAST( Cr ); - dst[dst_idx + 2] = SAT_CAST( Cb ); + if (1 == pixels_per_work_item) + { + const DATA_TYPE rgb[] = {src_ptr[0], src_ptr[1], src_ptr[2]}; + +#ifdef DEPTH_5 + float Y = rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx]; + float Cr = (rgb[bidx^2] - Y) * coeffs[3] + HALF_MAX; + float Cb = (rgb[bidx] - Y) * coeffs[4] + HALF_MAX; +#else + int Y = CV_DESCALE(rgb[0] * coeffs[bidx^2] + rgb[1] * coeffs[1] + rgb[2] * coeffs[bidx], yuv_shift); + int Cr = CV_DESCALE((rgb[bidx^2] - Y) * coeffs[3] + delta, yuv_shift); + int Cb = CV_DESCALE((rgb[bidx] - Y) * coeffs[4] + delta, yuv_shift); +#endif + + dst_ptr[0] = SAT_CAST( Y ); + dst_ptr[1] = SAT_CAST( Cr ); + dst_ptr[2] = SAT_CAST( Cb ); + } +#ifdef INTEL_DEVICE + else if (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 c0 = r0.s04; + const float2 c1 = r0.s15; + const float2 c2 = r0.s26; + + const float2 Y = (bidx == 0) ? (c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0]) : (c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2]); + const float2 Cr = (bidx == 0) ? ((c2 - Y) * coeffs[3] + HALF_MAX) : ((c0 - Y) * coeffs[3] + HALF_MAX); + const float2 Cb = (bidx == 0) ? ((c0 - Y) * coeffs[4] + HALF_MAX) : ((c2 - Y) * coeffs[4] + HALF_MAX); +#else + const int2 c0 = convert_int2(r0.s04); + const int2 c1 = convert_int2(r0.s15); + const int2 c2 = convert_int2(r0.s26); + + const int2 yi = (bidx == 0) ? CV_DESCALE(c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0], yuv_shift) : CV_DESCALE(c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2], yuv_shift); + const int2 ui = (bidx == 0) ? CV_DESCALE((c2 - yi) * coeffs[3] + delta, yuv_shift) : CV_DESCALE((c0 - yi) * coeffs[3] + delta, yuv_shift); + const int2 vi = (bidx == 0) ? CV_DESCALE((c0 - yi) * coeffs[4] + delta, yuv_shift) : CV_DESCALE((c2 - yi) * coeffs[4] + delta, yuv_shift); + + const VECTOR2 Y = SAT_CAST2(yi); + const VECTOR2 Cr = SAT_CAST2(ui); + const VECTOR2 Cb = SAT_CAST2(vi); +#endif + + vstore8((VECTOR8)(Y.s0, Cr.s0, Cb.s0, 0, Y.s1, Cr.s1, Cb.s1, 0), 0, dst_ptr); + } + else if (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + const int4 c0 = convert_int4(r0.s048c); + const int4 c1 = convert_int4(r0.s159d); + const int4 c2 = convert_int4(r0.s26ae); + + const int4 yi = (bidx == 0) ? CV_DESCALE(c0 * coeffs[2] + c1 * coeffs[1] + c2 * coeffs[0], yuv_shift) : CV_DESCALE(c0 * coeffs[0] + c1 * coeffs[1] + c2 * coeffs[2], yuv_shift); + const int4 ui = (bidx == 0) ? CV_DESCALE((c2 - yi) * coeffs[3] + delta, yuv_shift) : CV_DESCALE((c0 - yi) * coeffs[3] + delta, yuv_shift); + const int4 vi = (bidx == 0) ? 
CV_DESCALE((c0 - yi) * coeffs[4] + delta, yuv_shift) : CV_DESCALE((c2 - yi) * coeffs[4] + delta, yuv_shift); + + const VECTOR4 Y = SAT_CAST4(yi); + const VECTOR4 Cr = SAT_CAST4(ui); + const VECTOR4 Cb = SAT_CAST4(vi); + + vstore16((VECTOR16)(Y.s0, Cr.s0, Cb.s0, 0, Y.s1, Cr.s1, Cb.s1, 0, Y.s2, Cr.s2, Cb.s2, 0, Y.s3, Cr.s3, Cb.s3, 0), 0, dst_ptr); +#endif + } +#endif //INTEL_DEVICE } } @@ -312,10 +583,10 @@ __constant float c_YCrCb2RGBCoeffs_f[4] = { 1.403f, -0.714f, -0.344f, 1.773f }; __constant int c_YCrCb2RGBCoeffs_i[4] = { 22987, -11698, -5636, 29049 }; __kernel void YCrCb2RGB(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, - int src_offset, int dst_offset) + __global const DATA_TYPE* src, __global DATA_TYPE* dst, + int src_offset, int dst_offset) { - int x = get_global_id(0); + int x = get_global_id(0) * pixels_per_work_item; int y = get_global_id(1); if (y < rows && x < cols) @@ -324,36 +595,104 @@ __kernel void YCrCb2RGB(int cols, int rows, int src_step, int dst_step, int src_idx = mad24(y, src_step, src_offset + x); int dst_idx = mad24(y, dst_step, dst_offset + x); - DATA_TYPE ycrcb[] = { src[src_idx], src[src_idx + 1], src[src_idx + 2] }; + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); + +#ifdef DEPTH_5 + __constant float * coeffs = c_YCrCb2RGBCoeffs_f; +#else + __constant int * coeffs = c_YCrCb2RGBCoeffs_i; +#endif + + if (1 == pixels_per_work_item) + { + const DATA_TYPE ycrcb[] = {src_ptr[0], src_ptr[1], src_ptr[2]}; #ifdef DEPTH_5 - __constant float * coeff = c_YCrCb2RGBCoeffs_f; - float r = ycrcb[0] + coeff[0] * (ycrcb[1] - HALF_MAX); - float g = ycrcb[0] + coeff[1] * (ycrcb[1] - HALF_MAX) + coeff[2] * (ycrcb[2] - HALF_MAX); - float b = ycrcb[0] + coeff[3] * (ycrcb[2] - HALF_MAX); + float B = ycrcb[0] + (ycrcb[2] - HALF_MAX) * coeffs[3]; + float G = ycrcb[0] + (ycrcb[2] - HALF_MAX) * coeffs[2] + (ycrcb[1] - HALF_MAX) * coeffs[1]; + float R = ycrcb[0] + (ycrcb[1] - HALF_MAX) * coeffs[0]; #else - __constant int * coeff = c_YCrCb2RGBCoeffs_i; - int r = ycrcb[0] + CV_DESCALE(coeff[0] * (ycrcb[1] - HALF_MAX), yuv_shift); - int g = ycrcb[0] + CV_DESCALE(coeff[1] * (ycrcb[1] - HALF_MAX) + coeff[2] * (ycrcb[2] - HALF_MAX), yuv_shift); - int b = ycrcb[0] + CV_DESCALE(coeff[3] * (ycrcb[2] - HALF_MAX), yuv_shift); + int B = ycrcb[0] + CV_DESCALE((ycrcb[2] - HALF_MAX) * coeffs[3], yuv_shift); + int G = ycrcb[0] + CV_DESCALE((ycrcb[2] - HALF_MAX) * coeffs[2] + (ycrcb[1] - HALF_MAX) * coeffs[1], yuv_shift); + int R = ycrcb[0] + CV_DESCALE((ycrcb[1] - HALF_MAX) * coeffs[0], yuv_shift); #endif - dst[dst_idx + (bidx^2)] = SAT_CAST(r); - dst[dst_idx + 1] = SAT_CAST(g); - dst[dst_idx + bidx] = SAT_CAST(b); + dst_ptr[bidx] = SAT_CAST( B ); + dst_ptr[1] = SAT_CAST( G ); + dst_ptr[(bidx^2)] = SAT_CAST( R ); #if dcn == 4 - dst[dst_idx + 3] = MAX_NUM; + dst_ptr[3] = MAX_NUM; +#endif + } +#ifdef INTEL_DEVICE + else if (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 Y = r0.s04; + const float2 Cr = r0.s15; + const float2 Cb = r0.s26; + + const float2 c0 = (bidx == 0) ? (Y + (Cb - HALF_MAX) * coeffs[3]) : (Y + (Cr - HALF_MAX) * coeffs[0]); + const float2 c1 = Y + (Cb - HALF_MAX) * coeffs[2] + (Cr - HALF_MAX) * coeffs[1]; + const float2 c2 = (bidx == 0) ? 
(Y + (Cr - HALF_MAX) * coeffs[0]) : (Y + (Cb - HALF_MAX) * coeffs[3]); +#else + const int2 Y = convert_int2(r0.s04); + const int2 Cr = convert_int2(r0.s15); + const int2 Cb = convert_int2(r0.s26); + + const int2 c0i = (bidx == 0) ? (Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[3], yuv_shift)) : (Y + CV_DESCALE((Cr - HALF_MAX) * coeffs[0], yuv_shift)); + const int2 c1i = Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[2] + (Cr - HALF_MAX) * coeffs[1], yuv_shift); + const int2 c2i = (bidx == 0) ? (Y + CV_DESCALE((Cr - HALF_MAX) * coeffs[0], yuv_shift)) : (Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[3], yuv_shift)); + + const VECTOR2 c0 = SAT_CAST2(c0i); + const VECTOR2 c1 = SAT_CAST2(c1i); + const VECTOR2 c2 = SAT_CAST2(c2i); +#endif + +#if dcn == 4 + vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, MAX_NUM, c0.s1, c1.s1, c2.s1, MAX_NUM), 0, dst_ptr); +#else + vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0), 0, dst_ptr); +#endif + } + else if (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + + const int4 Y = convert_int4(r0.s048c); + const int4 Cr = convert_int4(r0.s159d); + const int4 Cb = convert_int4(r0.s26ae); + + const int4 c0i = (bidx == 0) ? (Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[3], yuv_shift)) : (Y + CV_DESCALE((Cr - HALF_MAX) * coeffs[0], yuv_shift)); + const int4 c1i = Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[2] + (Cr - HALF_MAX) * coeffs[1], yuv_shift); + const int4 c2i = (bidx == 0) ? (Y + CV_DESCALE((Cr - HALF_MAX) * coeffs[0], yuv_shift)) : (Y + CV_DESCALE((Cb - HALF_MAX) * coeffs[3], yuv_shift)); + + const VECTOR4 c0 = SAT_CAST4(c0i); + const VECTOR4 c1 = SAT_CAST4(c1i); + const VECTOR4 c2 = SAT_CAST4(c2i); + +#if dcn == 4 + vstore16((VECTOR16)(c0.s0, c1.s0, c2.s0, MAX_NUM, c0.s1, c1.s1, c2.s1, MAX_NUM, c0.s2, c1.s2, c2.s2, MAX_NUM, c0.s3, c1.s3, c2.s3, MAX_NUM), 0, dst_ptr); +#else + vstore16((VECTOR16)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0, c0.s2, c1.s2, c2.s2, 0, c0.s3, c1.s3, c2.s3, 0), 0, dst_ptr); #endif +#endif + } +#endif //INTEL_DEVICE } } ///////////////////////////////////// RGB <-> XYZ ////////////////////////////////////// __kernel void RGB2XYZ(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, + __global const DATA_TYPE* src, __global DATA_TYPE* dst, int src_offset, int dst_offset, __constant COEFF_TYPE * coeffs) { - int dx = get_global_id(0); + int dx = get_global_id(0) * pixels_per_work_item; int dy = get_global_id(1); if (dy < rows && dx < cols) @@ -362,28 +701,85 @@ __kernel void RGB2XYZ(int cols, int rows, int src_step, int dst_step, int src_idx = mad24(dy, src_step, src_offset + dx); int dst_idx = mad24(dy, dst_step, dst_offset + dx); - DATA_TYPE r = src[src_idx], g = src[src_idx + 1], b = src[src_idx + 2]; + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); + + if (1 == pixels_per_work_item) + { + DATA_TYPE R = src_ptr[0], G = src_ptr[1], B = src_ptr[2]; #ifdef DEPTH_5 - float x = r * coeffs[0] + g * coeffs[1] + b * coeffs[2]; - float y = r * coeffs[3] + g * coeffs[4] + b * coeffs[5]; - float z = r * coeffs[6] + g * coeffs[7] + b * coeffs[8]; + float X = R * coeffs[0] + G * coeffs[1] + B * coeffs[2]; + float Y = R * coeffs[3] + G * coeffs[4] + B * coeffs[5]; + float Z = R * coeffs[6] + G * coeffs[7] + B * coeffs[8]; #else - int x = CV_DESCALE(r * coeffs[0] + g * coeffs[1] + b * coeffs[2], xyz_shift); - int y = CV_DESCALE(r * coeffs[3] + g * coeffs[4] + b * coeffs[5], 
xyz_shift); - int z = CV_DESCALE(r * coeffs[6] + g * coeffs[7] + b * coeffs[8], xyz_shift); + int X = CV_DESCALE(R * coeffs[0] + G * coeffs[1] + B * coeffs[2], xyz_shift); + int Y = CV_DESCALE(R * coeffs[3] + G * coeffs[4] + B * coeffs[5], xyz_shift); + int Z = CV_DESCALE(R * coeffs[6] + G * coeffs[7] + B * coeffs[8], xyz_shift); #endif - dst[dst_idx] = SAT_CAST(x); - dst[dst_idx + 1] = SAT_CAST(y); - dst[dst_idx + 2] = SAT_CAST(z); + + dst_ptr[0] = SAT_CAST( X ); + dst_ptr[1] = SAT_CAST( Y ); + dst_ptr[2] = SAT_CAST( Z ); + } +#ifdef INTEL_DEVICE + else if (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 R = r0.s04; + const float2 G = r0.s15; + const float2 B = r0.s26; + + const float2 X = R * coeffs[0] + G * coeffs[1] + B * coeffs[2]; + const float2 Y = R * coeffs[3] + G * coeffs[4] + B * coeffs[5]; + const float2 Z = R * coeffs[6] + G * coeffs[7] + B * coeffs[8]; +#else + const int2 R = convert_int2(r0.s04); + const int2 G = convert_int2(r0.s15); + const int2 B = convert_int2(r0.s26); + + const int2 xi = CV_DESCALE(R * coeffs[0] + G * coeffs[1] + B * coeffs[2], xyz_shift); + const int2 yi = CV_DESCALE(R * coeffs[3] + G * coeffs[4] + B * coeffs[5], xyz_shift); + const int2 zi = CV_DESCALE(R * coeffs[6] + G * coeffs[7] + B * coeffs[8], xyz_shift); + + const VECTOR2 X = SAT_CAST2(xi); + const VECTOR2 Y = SAT_CAST2(yi); + const VECTOR2 Z = SAT_CAST2(zi); +#endif + + vstore8((VECTOR8)(X.s0, Y.s0, Z.s0, 0, X.s1, Y.s1, Z.s1, 0), 0, dst_ptr); + } + else if (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + + const int4 R = convert_int4(r0.s048c); + const int4 G = convert_int4(r0.s159d); + const int4 B = convert_int4(r0.s26ae); + + const int4 xi = CV_DESCALE(R * coeffs[0] + G * coeffs[1] + B * coeffs[2], xyz_shift); + const int4 yi = CV_DESCALE(R * coeffs[3] + G * coeffs[4] + B * coeffs[5], xyz_shift); + const int4 zi = CV_DESCALE(R * coeffs[6] + G * coeffs[7] + B * coeffs[8], xyz_shift); + + const VECTOR4 X = SAT_CAST4(xi); + const VECTOR4 Y = SAT_CAST4(yi); + const VECTOR4 Z = SAT_CAST4(zi); + + vstore16((VECTOR16)(X.s0, Y.s0, Z.s0, 0, X.s1, Y.s1, Z.s1, 0, X.s2, Y.s2, Z.s2, 0, X.s3, Y.s3, Z.s3, 0), 0, dst_ptr); +#endif + } +#endif //INTEL_DEVICE } } __kernel void XYZ2RGB(int cols, int rows, int src_step, int dst_step, - int bidx, __global const DATA_TYPE* src, __global DATA_TYPE* dst, + __global const DATA_TYPE* src, __global DATA_TYPE* dst, int src_offset, int dst_offset, __constant COEFF_TYPE * coeffs) { - int dx = get_global_id(0); + int dx = get_global_id(0) * pixels_per_work_item; int dy = get_global_id(1); if (dy < rows && dx < cols) @@ -392,23 +788,88 @@ __kernel void XYZ2RGB(int cols, int rows, int src_step, int dst_step, int src_idx = mad24(dy, src_step, src_offset + dx); int dst_idx = mad24(dy, dst_step, dst_offset + dx); - DATA_TYPE x = src[src_idx], y = src[src_idx + 1], z = src[src_idx + 2]; + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); + + if (1 == pixels_per_work_item) + { + const DATA_TYPE X = src_ptr[0], Y = src_ptr[1], Z = src_ptr[2]; #ifdef DEPTH_5 - float b = x * coeffs[0] + y * coeffs[1] + z * coeffs[2]; - float g = x * coeffs[3] + y * coeffs[4] + z * coeffs[5]; - float r = x * coeffs[6] + y * coeffs[7] + z * coeffs[8]; + float B = X * coeffs[0] + Y * coeffs[1] + Z * coeffs[2]; + float G = X * coeffs[3] + Y * coeffs[4] + Z * coeffs[5]; + float R = X * coeffs[6] + Y * coeffs[7] + Z * 
coeffs[8]; #else - int b = CV_DESCALE(x * coeffs[0] + y * coeffs[1] + z * coeffs[2], xyz_shift); - int g = CV_DESCALE(x * coeffs[3] + y * coeffs[4] + z * coeffs[5], xyz_shift); - int r = CV_DESCALE(x * coeffs[6] + y * coeffs[7] + z * coeffs[8], xyz_shift); + int B = CV_DESCALE(X * coeffs[0] + Y * coeffs[1] + Z * coeffs[2], xyz_shift); + int G = CV_DESCALE(X * coeffs[3] + Y * coeffs[4] + Z * coeffs[5], xyz_shift); + int R = CV_DESCALE(X * coeffs[6] + Y * coeffs[7] + Z * coeffs[8], xyz_shift); #endif - dst[dst_idx] = SAT_CAST(b); - dst[dst_idx + 1] = SAT_CAST(g); - dst[dst_idx + 2] = SAT_CAST(r); + + dst_ptr[0] = SAT_CAST( B ); + dst_ptr[1] = SAT_CAST( G ); + dst_ptr[2] = SAT_CAST( R ); #if dcn == 4 - dst[dst_idx + 3] = MAX_NUM; + dst_ptr[3] = MAX_NUM; +#endif + } +#ifdef INTEL_DEVICE + else if (2 == pixels_per_work_item) + { + const VECTOR8 r0 = vload8(0, src_ptr); + +#ifdef DEPTH_5 + const float2 X = r0.s04; + const float2 Y = r0.s15; + const float2 Z = r0.s26; + + float2 B = X * coeffs[0] + Y * coeffs[1] + Z * coeffs[2]; + float2 G = X * coeffs[3] + Y * coeffs[4] + Z * coeffs[5]; + float2 R = X * coeffs[6] + Y * coeffs[7] + Z * coeffs[8]; +#else + const int2 xi = convert_int2(r0.s04); + const int2 yi = convert_int2(r0.s15); + const int2 zi = convert_int2(r0.s26); + + const int2 bi = CV_DESCALE(xi * coeffs[0] + yi * coeffs[1] + zi * coeffs[2], xyz_shift); + const int2 gi = CV_DESCALE(xi * coeffs[3] + yi * coeffs[4] + zi * coeffs[5], xyz_shift); + const int2 ri = CV_DESCALE(xi * coeffs[6] + yi * coeffs[7] + zi * coeffs[8], xyz_shift); + + const VECTOR2 R = SAT_CAST2(ri); + const VECTOR2 G = SAT_CAST2(gi); + const VECTOR2 B = SAT_CAST2(bi); +#endif + +#if dcn == 4 + vstore8((VECTOR8)(B.s0, G.s0, R.s0, MAX_NUM, B.s1, G.s1, R.s1, MAX_NUM), 0, dst_ptr); +#else + vstore8((VECTOR8)(B.s0, G.s0, R.s0, 0, B.s1, G.s1, R.s1, 0), 0, dst_ptr); +#endif + } + else if (4 == pixels_per_work_item) + { +#ifndef DEPTH_5 + const VECTOR16 r0 = vload16(0, src_ptr); + + const int4 xi = convert_int4(r0.s048c); + const int4 yi = convert_int4(r0.s159d); + const int4 zi = convert_int4(r0.s26ae); + + const int4 bi = CV_DESCALE(xi * coeffs[0] + yi * coeffs[1] + zi * coeffs[2], xyz_shift); + const int4 gi = CV_DESCALE(xi * coeffs[3] + yi * coeffs[4] + zi * coeffs[5], xyz_shift); + const int4 ri = CV_DESCALE(xi * coeffs[6] + yi * coeffs[7] + zi * coeffs[8], xyz_shift); + + const VECTOR4 R = SAT_CAST4(ri); + const VECTOR4 G = SAT_CAST4(gi); + const VECTOR4 B = SAT_CAST4(bi); + +#if dcn == 4 + vstore16((VECTOR16)(B.s0, G.s0, R.s0, MAX_NUM, B.s1, G.s1, R.s1, MAX_NUM, B.s2, G.s2, R.s2, MAX_NUM, B.s3, G.s3, R.s3, MAX_NUM), 0, dst_ptr); +#else + vstore16((VECTOR16)(B.s0, G.s0, R.s0, 0, B.s1, G.s1, R.s1, 0, B.s2, G.s2, R.s2, 0, B.s3, G.s3, R.s3, 0), 0, dst_ptr); #endif +#endif + } +#endif //INTEL_DEVICE } } @@ -427,6 +888,7 @@ __kernel void RGB(int cols, int rows, int src_step, int dst_step, int src_idx = mad24(y, src_step, src_offset + x); int dst_idx = mad24(y, dst_step, dst_offset + x); +#ifndef INTEL_DEVICE #ifdef REVERSE dst[dst_idx] = src[src_idx + 2]; dst[dst_idx + 1] = src[src_idx + 1]; @@ -443,13 +905,44 @@ __kernel void RGB(int cols, int rows, int src_step, int dst_step, #else dst[dst_idx + 3] = src[src_idx + 3]; #endif +#endif +#else + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); + + const VECTOR4 r0 = vload4(0, src_ptr); +#ifdef REVERSE + if (3 == dcn) + { + vstore4((VECTOR4)(r0.s210, 0), 0, dst_ptr); + } + else if (3 == scn) + { + 
vstore4((VECTOR4)(r0.s210, MAX_NUM), 0, dst_ptr); + } + else { + vstore4((VECTOR4)(r0.s2103), 0, dst_ptr); + } +#elif defined ORDER + if (3 == dcn) + { + vstore4((VECTOR4)(r0.s012, 0), 0, dst_ptr); + } + else if (3 == scn) + { + vstore4((VECTOR4)(r0.s012, MAX_NUM), 0, dst_ptr); + } + else { + vstore4(r0, 0, dst_ptr); + } +#endif #endif } } ///////////////////////////////////// RGB5x5 <-> RGB ////////////////////////////////////// -__kernel void RGB5x52RGB(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void RGB5x52RGB(int cols, int rows, int src_step, int dst_step, __global const ushort * src, __global uchar * dst, int src_offset, int dst_offset) { @@ -482,7 +975,7 @@ __kernel void RGB5x52RGB(int cols, int rows, int src_step, int dst_step, int bid } } -__kernel void RGB2RGB5x5(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void RGB2RGB5x5(int cols, int rows, int src_step, int dst_step, __global const uchar * src, __global ushort * dst, int src_offset, int dst_offset) { @@ -507,7 +1000,7 @@ __kernel void RGB2RGB5x5(int cols, int rows, int src_step, int dst_step, int bid ///////////////////////////////////// RGB5x5 <-> RGB ////////////////////////////////////// -__kernel void BGR5x52Gray(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void BGR5x52Gray(int cols, int rows, int src_step, int dst_step, __global const ushort * src, __global uchar * dst, int src_offset, int dst_offset) { @@ -532,7 +1025,7 @@ __kernel void BGR5x52Gray(int cols, int rows, int src_step, int dst_step, int bi } } -__kernel void Gray2BGR5x5(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void Gray2BGR5x5(int cols, int rows, int src_step, int dst_step, __global const uchar * src, __global ushort * dst, int src_offset, int dst_offset) { @@ -560,7 +1053,7 @@ __constant int sector_data[][3] = { {1, 3, 0}, { 1, 0, 2 }, { 3, 0, 1 }, { 0, 2, #ifdef DEPTH_0 -__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, __global const uchar * src, __global uchar * dst, int src_offset, int dst_offset, __constant int * sdiv_table, __constant int * hdiv_table) @@ -600,7 +1093,7 @@ __kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx, } } -__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, __global const uchar * src, __global uchar * dst, int src_offset, int dst_offset) { @@ -656,7 +1149,7 @@ __kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx, #elif defined DEPTH_5 -__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, __global const float * src, __global float * dst, int src_offset, int dst_offset) { @@ -698,7 +1191,7 @@ __kernel void RGB2HSV(int cols, int rows, int src_step, int dst_step, int bidx, } } -__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, __global const float * src, __global float * dst, int src_offset, int dst_offset) { @@ -758,7 +1251,7 @@ __kernel void HSV2RGB(int cols, int rows, int src_step, int dst_step, int bidx, #ifdef DEPTH_0 -__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, __global const uchar 
* src, __global uchar * dst, int src_offset, int dst_offset) { @@ -805,7 +1298,7 @@ __kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx, } } -__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, __global const uchar * src, __global uchar * dst, int src_offset, int dst_offset) { @@ -860,7 +1353,7 @@ __kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx, #elif defined DEPTH_5 -__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, __global const float * src, __global float * dst, int src_offset, int dst_offset) { @@ -907,7 +1400,7 @@ __kernel void RGB2HLS(int cols, int rows, int src_step, int dst_step, int bidx, } } -__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, __global const float * src, __global float * dst, int src_offset, int dst_offset) { @@ -968,10 +1461,10 @@ __kernel void HLS2RGB(int cols, int rows, int src_step, int dst_step, int bidx, #ifdef DEPTH_0 __kernel void RGBA2mRGBA(int cols, int rows, int src_step, int dst_step, - int bidx, __global const uchar * src, __global uchar * dst, + __global const uchar * src, __global uchar * dst, int src_offset, int dst_offset) { - int x = get_global_id(0); + int x = get_global_id(0) * pixels_per_work_item; int y = get_global_id(1); if (y < rows && x < cols) @@ -980,21 +1473,66 @@ __kernel void RGBA2mRGBA(int cols, int rows, int src_step, int dst_step, int src_idx = mad24(y, src_step, src_offset + x); int dst_idx = mad24(y, dst_step, dst_offset + x); - uchar v0 = src[src_idx], v1 = src[src_idx + 1]; - uchar v2 = src[src_idx + 2], v3 = src[src_idx + 3]; + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); + + if (1 == pixels_per_work_item) + { + const uchar4 r0 = vload4(0, src_ptr); - dst[dst_idx] = (v0 * v3 + HALF_MAX) / MAX_NUM; - dst[dst_idx + 1] = (v1 * v3 + HALF_MAX) / MAX_NUM; - dst[dst_idx + 2] = (v2 * v3 + HALF_MAX) / MAX_NUM; - dst[dst_idx + 3] = v3; + dst_ptr[0] = (r0.s0 * r0.s3 + HALF_MAX) / MAX_NUM; + dst_ptr[1] = (r0.s1 * r0.s3 + HALF_MAX) / MAX_NUM; + dst_ptr[2] = (r0.s2 * r0.s3 + HALF_MAX) / MAX_NUM; + dst_ptr[3] = r0.s3; + } +#ifdef INTEL_DEVICE + else if (2 == pixels_per_work_item) + { + const uchar8 r0 = vload8(0, src_ptr); + + const int2 v0 = convert_int2(r0.s04); + const int2 v1 = convert_int2(r0.s15); + const int2 v2 = convert_int2(r0.s26); + const int2 v3 = convert_int2(r0.s37); + + const int2 ri = (v0 * v3 + HALF_MAX) / MAX_NUM; + const int2 gi = (v1 * v3 + HALF_MAX) / MAX_NUM; + const int2 bi = (v2 * v3 + HALF_MAX) / MAX_NUM; + + const uchar2 r = convert_uchar2(ri); + const uchar2 g = convert_uchar2(gi); + const uchar2 b = convert_uchar2(bi); + + vstore8((uchar8)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1), 0, dst_ptr); + } + else if (4 == pixels_per_work_item) + { + const uchar16 r0 = vload16(0, src_ptr); + + const int4 v0 = convert_int4(r0.s048c); + const int4 v1 = convert_int4(r0.s159d); + const int4 v2 = convert_int4(r0.s26ae); + const int4 v3 = convert_int4(r0.s37bf); + + const int4 ri = (v0 * v3 + HALF_MAX) / MAX_NUM; + const int4 gi = (v1 * v3 + HALF_MAX) / MAX_NUM; + const int4 bi = (v2 * v3 + HALF_MAX) / MAX_NUM; + + const uchar4 r = convert_uchar4(ri); + const uchar4 g = 
convert_uchar4(gi); + const uchar4 b = convert_uchar4(bi); + + vstore16((uchar16)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1, r.s2, g.s2, b.s2, v3.s2, r.s3, g.s3, b.s3, v3.s3), 0, dst_ptr); + } +#endif //INTEL_DEVICE } } -__kernel void mRGBA2RGBA(int cols, int rows, int src_step, int dst_step, int bidx, +__kernel void mRGBA2RGBA(int cols, int rows, int src_step, int dst_step, __global const uchar * src, __global uchar * dst, int src_offset, int dst_offset) { - int x = get_global_id(0); + int x = get_global_id(0) * pixels_per_work_item; int y = get_global_id(1); if (y < rows && x < cols) @@ -1003,14 +1541,63 @@ __kernel void mRGBA2RGBA(int cols, int rows, int src_step, int dst_step, int bid int src_idx = mad24(y, src_step, src_offset + x); int dst_idx = mad24(y, dst_step, dst_offset + x); - uchar v0 = src[src_idx], v1 = src[src_idx + 1]; - uchar v2 = src[src_idx + 2], v3 = src[src_idx + 3]; - uchar v3_half = v3 / 2; + global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); + global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); + + if (1 == pixels_per_work_item) + { + const uchar4 r0 = vload4(0, src_ptr); + const uchar v3_half = r0.s3 / 2; - dst[dst_idx] = v3 == 0 ? 0 : (v0 * MAX_NUM + v3_half) / v3; - dst[dst_idx + 1] = v3 == 0 ? 0 : (v1 * MAX_NUM + v3_half) / v3; - dst[dst_idx + 2] = v3 == 0 ? 0 : (v2 * MAX_NUM + v3_half) / v3; - dst[dst_idx + 3] = v3; + const uchar r = (r0.s3 == 0) ? 0 : (r0.s0 * MAX_NUM + v3_half) / r0.s3; + const uchar g = (r0.s3 == 0) ? 0 : (r0.s1 * MAX_NUM + v3_half) / r0.s3; + const uchar b = (r0.s3 == 0) ? 0 : (r0.s2 * MAX_NUM + v3_half) / r0.s3; + + vstore4((uchar4)(r, g, b, r0.s3), 0, dst_ptr); + } +#ifdef INTEL_DEVICE + else if (2 == pixels_per_work_item) + { + const uchar8 r0 = vload8(0, src_ptr); + + const int2 v0 = convert_int2(r0.s04); + const int2 v1 = convert_int2(r0.s15); + const int2 v2 = convert_int2(r0.s26); + const int2 v3 = convert_int2(r0.s37); + const int2 v3_half = v3 / 2; + + const int2 ri = (v3 == 0) ? 0 : (v0 * MAX_NUM + v3_half) / v3; + const int2 gi = (v3 == 0) ? 0 : (v1 * MAX_NUM + v3_half) / v3; + const int2 bi = (v3 == 0) ? 0 : (v2 * MAX_NUM + v3_half) / v3; + + const uchar2 r = convert_uchar2(ri); + const uchar2 g = convert_uchar2(gi); + const uchar2 b = convert_uchar2(bi); + + vstore8((uchar8)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1), 0, dst_ptr); + } + else if (4 == pixels_per_work_item) + { + const uchar16 r0 = vload16(0, src_ptr); + + const int4 v0 = convert_int4(r0.s048c); + const int4 v1 = convert_int4(r0.s159d); + const int4 v2 = convert_int4(r0.s26ae); + const int4 v3 = convert_int4(r0.s37bf); + const int4 v3_half = v3 / 2; + + + const int4 ri = (v3 == 0) ? 0 : (v0 * MAX_NUM + v3_half) / v3; + const int4 gi = (v3 == 0) ? 0 : (v1 * MAX_NUM + v3_half) / v3; + const int4 bi = (v3 == 0) ? 
0 : (v2 * MAX_NUM + v3_half) / v3; + + const uchar4 r = convert_uchar4(ri); + const uchar4 g = convert_uchar4(gi); + const uchar4 b = convert_uchar4(bi); + + vstore16((uchar16)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1, r.s2, g.s2, b.s2, v3.s2, r.s3, g.s3, b.s3, v3.s3), 0, dst_ptr); + } +#endif //INTEL_DEVICE } } From d6a7e8f84fd5ac745af2589a573b011b82a69345 Mon Sep 17 00:00:00 2001 From: Vladimir Bystricky Date: Fri, 20 Dec 2013 12:33:39 +0400 Subject: [PATCH 26/28] Remove TBB ifdef form code --- modules/highgui/src/cap_intelperc.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/modules/highgui/src/cap_intelperc.cpp b/modules/highgui/src/cap_intelperc.cpp index 18b3b9d0c0..368f4fd2c5 100644 --- a/modules/highgui/src/cap_intelperc.cpp +++ b/modules/highgui/src/cap_intelperc.cpp @@ -2,10 +2,6 @@ #ifdef HAVE_INTELPERC -#if defined TBB_INTERFACE_VERSION && TBB_INTERFACE_VERSION < 5000 -# undef HAVE_TBB -#endif - #include "pxcsession.h" #include "pxcsmartptr.h" #include "pxccapture.h" From e8dd31aacd08c9d1754871068aa5f708246c7c96 Mon Sep 17 00:00:00 2001 From: krodyush Date: Fri, 20 Dec 2013 13:51:51 +0400 Subject: [PATCH 27/28] change code according reviewer suggesions --- modules/ocl/perf/perf_color.cpp | 93 ++++++++--------------------- modules/ocl/src/opencl/cvt_color.cl | 87 +++++++++++++-------------- 2 files changed, 67 insertions(+), 113 deletions(-) diff --git a/modules/ocl/perf/perf_color.cpp b/modules/ocl/perf/perf_color.cpp index 75e6820fcb..8433315189 100644 --- a/modules/ocl/perf/perf_color.cpp +++ b/modules/ocl/perf/perf_color.cpp @@ -57,39 +57,9 @@ CV_ENUM(ConversionTypes, CV_RGB2GRAY, CV_RGB2BGR, CV_RGB2YUV, CV_YUV2RGB, CV_RGB CV_HLS2RGB, CV_BGR5652BGR, CV_BGR2BGR565, CV_RGBA2mRGBA, CV_mRGBA2RGBA, CV_YUV2RGB_NV12) typedef tuple > cvtColorParams; -typedef TestBaseWithParam cvtColorU8Fixture; -typedef TestBaseWithParam cvtColorF32Fixture; -typedef TestBaseWithParam cvtColorU16Fixture; +typedef TestBaseWithParam cvtColorFixture; -#define RUN_CVT_PERF_TEST \ - cvtColorParams params = GetParam();\ - const Size srcSize = get<0>(params);\ - const tuple conversionParams = get<1>(params);\ - const int code = get<0>(conversionParams), scn = get<1>(conversionParams),\ - dcn = get<2>(conversionParams);\ -\ - Mat src(srcSize, CV_8UC(scn)), dst(srcSize, CV_8UC(scn));\ - declare.in(src, WARMUP_RNG).out(dst);\ -\ - if (RUN_OCL_IMPL)\ - {\ - ocl::oclMat oclSrc(src), oclDst(src.size(), dst.type());\ -\ - OCL_TEST_CYCLE() ocl::cvtColor(oclSrc, oclDst, code, dcn);\ - oclDst.download(dst);\ -\ - SANITY_CHECK(dst, 1);\ - }\ - else if (RUN_PLAIN_IMPL)\ - {\ - TEST_CYCLE() cv::cvtColor(src, dst, code, dcn);\ -\ - SANITY_CHECK(dst);\ - }\ - else\ - OCL_PERF_ELSE\ - -PERF_TEST_P(cvtColorU8Fixture, cvtColor, testing::Combine( +PERF_TEST_P(cvtColorFixture, cvtColor, testing::Combine( testing::Values(Size(1000, 1002), Size(2000, 2004), Size(4000, 4008)), testing::Values( make_tuple(ConversionTypes(CV_RGB2GRAY), 3, 1), @@ -111,41 +81,30 @@ PERF_TEST_P(cvtColorU8Fixture, cvtColor, testing::Combine( make_tuple(ConversionTypes(CV_YUV2RGB_NV12), 1, 3) ))) { - RUN_CVT_PERF_TEST -} + cvtColorParams params = GetParam(); + const Size srcSize = get<0>(params); + const tuple conversionParams = get<1>(params); + const int code = get<0>(conversionParams), scn = get<1>(conversionParams), + dcn = get<2>(conversionParams); -PERF_TEST_P(cvtColorF32Fixture, cvtColor, testing::Combine( - testing::Values(Size(1000, 1002), Size(2000, 2004), Size(4000, 4008)), - testing::Values( - 
make_tuple(ConversionTypes(CV_RGB2GRAY), 3, 1), - make_tuple(ConversionTypes(CV_RGB2BGR), 3, 3), - make_tuple(ConversionTypes(CV_RGB2YUV), 3, 3), - make_tuple(ConversionTypes(CV_YUV2RGB), 3, 3), - make_tuple(ConversionTypes(CV_RGB2YCrCb), 3, 3), - make_tuple(ConversionTypes(CV_YCrCb2RGB), 3, 3), - make_tuple(ConversionTypes(CV_RGB2XYZ), 3, 3), - make_tuple(ConversionTypes(CV_XYZ2RGB), 3, 3), - make_tuple(ConversionTypes(CV_RGB2HSV), 3, 3), - make_tuple(ConversionTypes(CV_HSV2RGB), 3, 3), - make_tuple(ConversionTypes(CV_RGB2HLS), 3, 3), - make_tuple(ConversionTypes(CV_HLS2RGB), 3, 3) - ))) -{ - RUN_CVT_PERF_TEST -} + Mat src(srcSize, CV_8UC(scn)), dst(srcSize, CV_8UC(scn)); + declare.in(src, WARMUP_RNG).out(dst); -PERF_TEST_P(cvtColorU16Fixture, cvtColor, testing::Combine( - testing::Values(Size(1000, 1002), Size(2000, 2004), Size(4000, 4008)), - testing::Values( - make_tuple(ConversionTypes(CV_RGB2GRAY), 3, 1), - make_tuple(ConversionTypes(CV_RGB2BGR), 3, 3), - make_tuple(ConversionTypes(CV_RGB2YUV), 3, 3), - make_tuple(ConversionTypes(CV_YUV2RGB), 3, 3), - make_tuple(ConversionTypes(CV_RGB2YCrCb), 3, 3), - make_tuple(ConversionTypes(CV_YCrCb2RGB), 3, 3), - make_tuple(ConversionTypes(CV_RGB2XYZ), 3, 3), - make_tuple(ConversionTypes(CV_XYZ2RGB), 3, 3) - ))) -{ - RUN_CVT_PERF_TEST + if (RUN_OCL_IMPL) + { + ocl::oclMat oclSrc(src), oclDst(src.size(), dst.type()); + + OCL_TEST_CYCLE() ocl::cvtColor(oclSrc, oclDst, code, dcn); + oclDst.download(dst); + + SANITY_CHECK(dst, 1); + } + else if (RUN_PLAIN_IMPL) + { + TEST_CYCLE() cv::cvtColor(src, dst, code, dcn); + + SANITY_CHECK(dst); + } + else + OCL_PERF_ELSE } diff --git a/modules/ocl/src/opencl/cvt_color.cl b/modules/ocl/src/opencl/cvt_color.cl index 2313af1527..5c236f0e05 100644 --- a/modules/ocl/src/opencl/cvt_color.cl +++ b/modules/ocl/src/opencl/cvt_color.cl @@ -133,12 +133,14 @@ __kernel void RGB2Gray(int cols, int rows, int src_step, int dst_step, int dst_idx = mad24(y, dst_step, dst_offset + x); #ifndef INTEL_DEVICE + #ifdef DEPTH_5 dst[dst_idx] = src[src_idx + bidx] * 0.114f + src[src_idx + 1] * 0.587f + src[src_idx + (bidx^2)] * 0.299f; #else dst[dst_idx] = (DATA_TYPE)CV_DESCALE((src[src_idx + bidx] * B2Y + src[src_idx + 1] * G2Y + src[src_idx + (bidx^2)] * R2Y), yuv_shift); #endif -#else + +#else //INTEL_DEVICE global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); @@ -148,7 +150,7 @@ __kernel void RGB2Gray(int cols, int rows, int src_step, int dst_step, __constant int * coeffs = c_RGB2GrayCoeffs_i; #endif - if (1 == pixels_per_work_item) +#if (1 == pixels_per_work_item) { #ifdef DEPTH_5 *dst_ptr = src_ptr[bidx] * coeffs[0] + src_ptr[1] * coeffs[1] + src_ptr[(bidx^2)] *coeffs[2]; @@ -156,7 +158,7 @@ __kernel void RGB2Gray(int cols, int rows, int src_step, int dst_step, *dst_ptr = (DATA_TYPE)CV_DESCALE((src_ptr[bidx] * coeffs[0] + src_ptr[1] * coeffs[1] + src_ptr[(bidx^2)] * coeffs[2]), yuv_shift); #endif } - else if (2 == pixels_per_work_item) +#elif (2 == pixels_per_work_item) { const VECTOR8 r0 = vload8(0, src_ptr); @@ -177,7 +179,7 @@ __kernel void RGB2Gray(int cols, int rows, int src_step, int dst_step, vstore2(Y, 0, dst_ptr); } - else if (4 == pixels_per_work_item) +#elif (4 == pixels_per_work_item) { #ifndef DEPTH_5 const VECTOR16 r0 = vload16(0, src_ptr); @@ -190,6 +192,7 @@ __kernel void RGB2Gray(int cols, int rows, int src_step, int dst_step, vstore4(SAT_CAST4(Y), 0, dst_ptr); #endif } +#endif //pixels_per_work_item #endif //INTEL_DEVICE } } @@ -244,7 
+247,7 @@ __kernel void RGB2YUV(int cols, int rows, int src_step, int dst_step, const int delta = HALF_MAX * (1 << yuv_shift); #endif - if (1 == pixels_per_work_item) +#if (1 == pixels_per_work_item) { const DATA_TYPE rgb[] = {src_ptr[0], src_ptr[1], src_ptr[2]}; @@ -262,8 +265,7 @@ __kernel void RGB2YUV(int cols, int rows, int src_step, int dst_step, dst_ptr[1] = SAT_CAST( U ); dst_ptr[2] = SAT_CAST( V ); } -#ifdef INTEL_DEVICE - else if (2 == pixels_per_work_item) +#elif (2 == pixels_per_work_item) { const VECTOR8 r0 = vload8(0, src_ptr); @@ -291,7 +293,7 @@ __kernel void RGB2YUV(int cols, int rows, int src_step, int dst_step, vstore8((VECTOR8)(Y.s0, U.s0, V.s0, 0, Y.s1, U.s1, V.s1, 0), 0, dst_ptr); } - else if (4 == pixels_per_work_item) +#elif (4 == pixels_per_work_item) { #ifndef DEPTH_5 const VECTOR16 r0 = vload16(0, src_ptr); @@ -311,7 +313,7 @@ __kernel void RGB2YUV(int cols, int rows, int src_step, int dst_step, vstore16((VECTOR16)(Y.s0, U.s0, V.s0, 0, Y.s1, U.s1, V.s1, 0, Y.s2, U.s2, V.s2, 0, Y.s3, U.s3, V.s3, 0), 0, dst_ptr); #endif } -#endif //INTEL_DEVICE +#endif //pixels_per_work_item } } @@ -340,7 +342,7 @@ __kernel void YUV2RGB(int cols, int rows, int src_step, int dst_step, __constant int * coeffs = c_YUV2RGBCoeffs_i; #endif - if (1 == pixels_per_work_item) +#if (1 == pixels_per_work_item) { const DATA_TYPE yuv[] = {src_ptr[0], src_ptr[1], src_ptr[2]}; @@ -361,8 +363,7 @@ __kernel void YUV2RGB(int cols, int rows, int src_step, int dst_step, dst_ptr[3] = MAX_NUM; #endif } -#ifdef INTEL_DEVICE - else if (2 == pixels_per_work_item) +#elif (2 == pixels_per_work_item) { const VECTOR8 r0 = vload8(0, src_ptr); @@ -394,7 +395,7 @@ __kernel void YUV2RGB(int cols, int rows, int src_step, int dst_step, vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0), 0, dst_ptr); #endif } - else if (4 == pixels_per_work_item) +#elif (4 == pixels_per_work_item) { #ifndef DEPTH_5 const VECTOR16 r0 = vload16(0, src_ptr); @@ -418,7 +419,7 @@ __kernel void YUV2RGB(int cols, int rows, int src_step, int dst_step, #endif #endif } -#endif //INTEL_DEVICE +#endif //pixels_per_work_item } } @@ -509,7 +510,7 @@ __kernel void RGB2YCrCb(int cols, int rows, int src_step, int dst_step, const int delta = HALF_MAX * (1 << yuv_shift); #endif - if (1 == pixels_per_work_item) +#if (1 == pixels_per_work_item) { const DATA_TYPE rgb[] = {src_ptr[0], src_ptr[1], src_ptr[2]}; @@ -527,8 +528,7 @@ __kernel void RGB2YCrCb(int cols, int rows, int src_step, int dst_step, dst_ptr[1] = SAT_CAST( Cr ); dst_ptr[2] = SAT_CAST( Cb ); } -#ifdef INTEL_DEVICE - else if (2 == pixels_per_work_item) +#elif (2 == pixels_per_work_item) { const VECTOR8 r0 = vload8(0, src_ptr); @@ -556,7 +556,7 @@ __kernel void RGB2YCrCb(int cols, int rows, int src_step, int dst_step, vstore8((VECTOR8)(Y.s0, Cr.s0, Cb.s0, 0, Y.s1, Cr.s1, Cb.s1, 0), 0, dst_ptr); } - else if (4 == pixels_per_work_item) +#elif (4 == pixels_per_work_item) { #ifndef DEPTH_5 const VECTOR16 r0 = vload16(0, src_ptr); @@ -575,7 +575,7 @@ __kernel void RGB2YCrCb(int cols, int rows, int src_step, int dst_step, vstore16((VECTOR16)(Y.s0, Cr.s0, Cb.s0, 0, Y.s1, Cr.s1, Cb.s1, 0, Y.s2, Cr.s2, Cb.s2, 0, Y.s3, Cr.s3, Cb.s3, 0), 0, dst_ptr); #endif } -#endif //INTEL_DEVICE +#endif //pixels_per_work_item } } @@ -604,7 +604,7 @@ __kernel void YCrCb2RGB(int cols, int rows, int src_step, int dst_step, __constant int * coeffs = c_YCrCb2RGBCoeffs_i; #endif - if (1 == pixels_per_work_item) +#if (1 == pixels_per_work_item) { const DATA_TYPE ycrcb[] = {src_ptr[0], src_ptr[1], src_ptr[2]}; 
@@ -625,8 +625,7 @@ __kernel void YCrCb2RGB(int cols, int rows, int src_step, int dst_step, dst_ptr[3] = MAX_NUM; #endif } -#ifdef INTEL_DEVICE - else if (2 == pixels_per_work_item) +#elif (2 == pixels_per_work_item) { const VECTOR8 r0 = vload8(0, src_ptr); @@ -658,7 +657,7 @@ __kernel void YCrCb2RGB(int cols, int rows, int src_step, int dst_step, vstore8((VECTOR8)(c0.s0, c1.s0, c2.s0, 0, c0.s1, c1.s1, c2.s1, 0), 0, dst_ptr); #endif } - else if (4 == pixels_per_work_item) +#elif (4 == pixels_per_work_item) { #ifndef DEPTH_5 const VECTOR16 r0 = vload16(0, src_ptr); @@ -682,7 +681,7 @@ __kernel void YCrCb2RGB(int cols, int rows, int src_step, int dst_step, #endif #endif } -#endif //INTEL_DEVICE +#endif //pixels_per_work_item } } @@ -704,7 +703,7 @@ __kernel void RGB2XYZ(int cols, int rows, int src_step, int dst_step, global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); - if (1 == pixels_per_work_item) +#if (1 == pixels_per_work_item) { DATA_TYPE R = src_ptr[0], G = src_ptr[1], B = src_ptr[2]; @@ -722,8 +721,7 @@ __kernel void RGB2XYZ(int cols, int rows, int src_step, int dst_step, dst_ptr[1] = SAT_CAST( Y ); dst_ptr[2] = SAT_CAST( Z ); } -#ifdef INTEL_DEVICE - else if (2 == pixels_per_work_item) +#elif (2 == pixels_per_work_item) { const VECTOR8 r0 = vload8(0, src_ptr); @@ -751,7 +749,7 @@ __kernel void RGB2XYZ(int cols, int rows, int src_step, int dst_step, vstore8((VECTOR8)(X.s0, Y.s0, Z.s0, 0, X.s1, Y.s1, Z.s1, 0), 0, dst_ptr); } - else if (4 == pixels_per_work_item) +#elif (4 == pixels_per_work_item) { #ifndef DEPTH_5 const VECTOR16 r0 = vload16(0, src_ptr); @@ -771,7 +769,7 @@ __kernel void RGB2XYZ(int cols, int rows, int src_step, int dst_step, vstore16((VECTOR16)(X.s0, Y.s0, Z.s0, 0, X.s1, Y.s1, Z.s1, 0, X.s2, Y.s2, Z.s2, 0, X.s3, Y.s3, Z.s3, 0), 0, dst_ptr); #endif } -#endif //INTEL_DEVICE +#endif //pixels_per_work_item } } @@ -791,7 +789,7 @@ __kernel void XYZ2RGB(int cols, int rows, int src_step, int dst_step, global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); - if (1 == pixels_per_work_item) +#if (1 == pixels_per_work_item) { const DATA_TYPE X = src_ptr[0], Y = src_ptr[1], Z = src_ptr[2]; @@ -812,8 +810,7 @@ __kernel void XYZ2RGB(int cols, int rows, int src_step, int dst_step, dst_ptr[3] = MAX_NUM; #endif } -#ifdef INTEL_DEVICE - else if (2 == pixels_per_work_item) +#elif (2 == pixels_per_work_item) { const VECTOR8 r0 = vload8(0, src_ptr); @@ -845,7 +842,7 @@ __kernel void XYZ2RGB(int cols, int rows, int src_step, int dst_step, vstore8((VECTOR8)(B.s0, G.s0, R.s0, 0, B.s1, G.s1, R.s1, 0), 0, dst_ptr); #endif } - else if (4 == pixels_per_work_item) +#elif (4 == pixels_per_work_item) { #ifndef DEPTH_5 const VECTOR16 r0 = vload16(0, src_ptr); @@ -869,7 +866,7 @@ __kernel void XYZ2RGB(int cols, int rows, int src_step, int dst_step, #endif #endif } -#endif //INTEL_DEVICE +#endif // pixels_per_work_item } } @@ -906,7 +903,7 @@ __kernel void RGB(int cols, int rows, int src_step, int dst_step, dst[dst_idx + 3] = src[src_idx + 3]; #endif #endif -#else +#else //INTEL_DEVICE global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); @@ -936,7 +933,7 @@ __kernel void RGB(int cols, int rows, int src_step, int dst_step, vstore4(r0, 0, dst_ptr); } #endif -#endif +#endif //INTEL_DEVICE } } @@ -1476,7 +1473,7 @@ __kernel void RGBA2mRGBA(int cols, int rows, int src_step, int 
dst_step, global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); - if (1 == pixels_per_work_item) +#if (1 == pixels_per_work_item) { const uchar4 r0 = vload4(0, src_ptr); @@ -1485,8 +1482,7 @@ __kernel void RGBA2mRGBA(int cols, int rows, int src_step, int dst_step, dst_ptr[2] = (r0.s2 * r0.s3 + HALF_MAX) / MAX_NUM; dst_ptr[3] = r0.s3; } -#ifdef INTEL_DEVICE - else if (2 == pixels_per_work_item) +#elif (2 == pixels_per_work_item) { const uchar8 r0 = vload8(0, src_ptr); @@ -1505,7 +1501,7 @@ __kernel void RGBA2mRGBA(int cols, int rows, int src_step, int dst_step, vstore8((uchar8)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1), 0, dst_ptr); } - else if (4 == pixels_per_work_item) +#elif (4 == pixels_per_work_item) { const uchar16 r0 = vload16(0, src_ptr); @@ -1524,7 +1520,7 @@ __kernel void RGBA2mRGBA(int cols, int rows, int src_step, int dst_step, vstore16((uchar16)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1, r.s2, g.s2, b.s2, v3.s2, r.s3, g.s3, b.s3, v3.s3), 0, dst_ptr); } -#endif //INTEL_DEVICE +#endif // pixels_per_work_item } } @@ -1544,7 +1540,7 @@ __kernel void mRGBA2RGBA(int cols, int rows, int src_step, int dst_step, global DATA_TYPE *src_ptr = (global DATA_TYPE *)(src + src_idx); global DATA_TYPE *dst_ptr = (global DATA_TYPE *)(dst + dst_idx); - if (1 == pixels_per_work_item) +#if (1 == pixels_per_work_item) { const uchar4 r0 = vload4(0, src_ptr); const uchar v3_half = r0.s3 / 2; @@ -1555,8 +1551,7 @@ __kernel void mRGBA2RGBA(int cols, int rows, int src_step, int dst_step, vstore4((uchar4)(r, g, b, r0.s3), 0, dst_ptr); } -#ifdef INTEL_DEVICE - else if (2 == pixels_per_work_item) +#elif (2 == pixels_per_work_item) { const uchar8 r0 = vload8(0, src_ptr); @@ -1576,7 +1571,7 @@ __kernel void mRGBA2RGBA(int cols, int rows, int src_step, int dst_step, vstore8((uchar8)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1), 0, dst_ptr); } - else if (4 == pixels_per_work_item) +#elif (4 == pixels_per_work_item) { const uchar16 r0 = vload16(0, src_ptr); @@ -1597,7 +1592,7 @@ __kernel void mRGBA2RGBA(int cols, int rows, int src_step, int dst_step, vstore16((uchar16)(r.s0, g.s0, b.s0, v3.s0, r.s1, g.s1, b.s1, v3.s1, r.s2, g.s2, b.s2, v3.s2, r.s3, g.s3, b.s3, v3.s3), 0, dst_ptr); } -#endif //INTEL_DEVICE +#endif // pixels_per_work_item } } From 9941c6710da481029f5dc7add24dfe319e014e02 Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Tue, 10 Dec 2013 11:22:29 +0400 Subject: [PATCH 28/28] NEON instruction set control unified for regular and cross-compiler builds. 
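With this change the same CMake options drive ARM FPU code generation in both regular and cross-compiler builds, instead of the toolchain file injecting -mfpu flags on its own. Illustrative command lines only (the build directory layout is an assumption, not part of this patch):

    # native build on an ARM board with a GCC toolchain, NEON enabled
    cmake -DENABLE_NEON=ON ..

    # VFPv3-D32 code generation without NEON
    cmake -DENABLE_VFPV3=ON ..

For Android builds the options are derived from ANDROID_ABI: an ABI string containing "NEON" (for example "armeabi-v7a with NEON") turns ENABLE_NEON on automatically.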
--- CMakeLists.txt | 11 +++++++++++ cmake/OpenCVCompilerOptions.cmake | 6 ++++++ .../crosscompilation/arm_crosscompile_with_cmake.rst | 4 ++-- platforms/linux/arm-gnueabi.toolchain.cmake | 11 ++++------- 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2a7c730bc0..85ea4d5c89 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -214,6 +214,8 @@ OCV_OPTION(ENABLE_SSSE3 "Enable SSSE3 instructions" OCV_OPTION(ENABLE_SSE41 "Enable SSE4.1 instructions" OFF IF ((CV_ICC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_SSE42 "Enable SSE4.2 instructions" OFF IF (CMAKE_COMPILER_IS_GNUCXX AND (X86 OR X86_64)) ) OCV_OPTION(ENABLE_AVX "Enable AVX instructions" OFF IF ((MSVC OR CMAKE_COMPILER_IS_GNUCXX) AND (X86 OR X86_64)) ) +OCV_OPTION(ENABLE_NEON "Enable NEON instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND ARM ) +OCV_OPTION(ENABLE_VFPV3 "Enable VFPv3-D32 instructions" OFF IF CMAKE_COMPILER_IS_GNUCXX AND ARM ) OCV_OPTION(ENABLE_NOISY_WARNINGS "Show all warnings even if they are too noisy" OFF ) OCV_OPTION(OPENCV_WARNINGS_ARE_ERRORS "Treat warnings as errors" OFF ) OCV_OPTION(ENABLE_WINRT_MODE "Build with Windows Runtime support" OFF IF WIN32 ) @@ -240,6 +242,15 @@ include(cmake/OpenCVVersion.cmake) # Save libs and executables in the same place set(EXECUTABLE_OUTPUT_PATH "${CMAKE_BINARY_DIR}/bin" CACHE PATH "Output directory for applications" ) +if (ANDROID) + if (ANDROID_ABI MATCHES "NEON") + set(ENABLE_NEON ON) + endif() + if (ANDROID_ABI MATCHES "VFPV3") + set(ENABLE_VFPV3 ON) + endif() +endif() + if(ANDROID OR WIN32) set(OPENCV_DOC_INSTALL_PATH doc) elseif(INSTALL_TO_MANGLED_PATHS) diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index 5033b36edb..a4b039280f 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -130,6 +130,12 @@ if(CMAKE_COMPILER_IS_GNUCXX) if(ENABLE_SSE2) add_extra_compiler_option(-msse2) endif() + if (ENABLE_NEON) + add_extra_compiler_option("-mfpu=neon") + endif() + if (ENABLE_VFPV3 AND NOT ENABLE_NEON) + add_extra_compiler_option("-mfpu=vfpv3") + endif() # SSE3 and further should be disabled under MingW because it generates compiler errors if(NOT MINGW) diff --git a/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst b/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst index 0b2253acea..87f6d9d4d6 100644 --- a/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst +++ b/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst @@ -106,8 +106,8 @@ Enable hardware optimizations ----------------------------- Depending on target platform architecture different instruction sets can be used. By default -compiler generates code for armv5l without VFPv3 and NEON extensions. Add ``-DUSE_VFPV3=ON`` -to cmake command line to enable code generation for VFPv3 and ``-DUSE_NEON=ON`` for using +compiler generates code for armv5l without VFPv3 and NEON extensions. Add ``-DENABLE_VFPV3=ON`` +to cmake command line to enable code generation for VFPv3 and ``-DENABLE_NEON=ON`` for using NEON SIMD extensions. TBB is supported on multi core ARM SoCs also. 
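To make the paragraph above concrete, a cross-compilation invocation might look like this (a sketch only: the directory names and relative paths are examples, not taken from this patch):

    cd ~/opencv/platforms/linux
    mkdir -p build_hardfp
    cd build_hardfp
    cmake -DENABLE_NEON=ON -DCMAKE_TOOLCHAIN_FILE=../arm-gnueabi.toolchain.cmake ../../..

The legacy USE_NEON and USE_VFPV3 toolchain variables are kept only to emit a deprecation warning and forward to the new options, as the toolchain change below shows.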
diff --git a/platforms/linux/arm-gnueabi.toolchain.cmake b/platforms/linux/arm-gnueabi.toolchain.cmake index c6b0469ad8..2c5b7406d8 100644 --- a/platforms/linux/arm-gnueabi.toolchain.cmake +++ b/platforms/linux/arm-gnueabi.toolchain.cmake @@ -28,14 +28,11 @@ set(CMAKE_MODULE_LINKER_FLAGS "-Wl,--fix-cortex-a8 -Wl,--no-undefined -Wl,--gc-s set(CMAKE_EXE_LINKER_FLAGS "-Wl,--fix-cortex-a8 -Wl,--no-undefined -Wl,--gc-sections -Wl,-z,noexecstack -Wl,-z,relro -Wl,-z,now ${CMAKE_EXE_LINKER_FLAGS}") if(USE_NEON) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon") + message(WARNING "You use obsolete variable USE_NEON to enable NEON instruction set. Use -DENABLE_NEON=ON instead." ) + set(ENABLE_NEON TRUE) elseif(USE_VFPV3) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=vfpv3") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=vfpv3") -else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=vfpv3-d16") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=vfpv3-d16") + message(WARNING "You use obsolete variable USE_VFPV3 to enable VFPV3 instruction set. Use -DENABLE_VFPV3=ON instead." ) + set(ENABLE_VFPV3 TRUE) endif() set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${ARM_LINUX_SYSROOT})