diff --git a/cmake/OpenCVFindLibsGUI.cmake b/cmake/OpenCVFindLibsGUI.cmake index 5bb6d57f52..70752c3fba 100644 --- a/cmake/OpenCVFindLibsGUI.cmake +++ b/cmake/OpenCVFindLibsGUI.cmake @@ -82,7 +82,7 @@ endif(WITH_OPENGL) if(APPLE) if(WITH_CARBON) set(HAVE_CARBON YES) - elseif(NOT IOS) + elseif(NOT IOS AND CMAKE_COMPILER_IS_CLANGCXX) set(HAVE_COCOA YES) endif() endif() diff --git a/cmake/OpenCVFindLibsVideo.cmake b/cmake/OpenCVFindLibsVideo.cmake index 93cce2b7ad..5520d05521 100644 --- a/cmake/OpenCVFindLibsVideo.cmake +++ b/cmake/OpenCVFindLibsVideo.cmake @@ -273,7 +273,7 @@ endif() if (NOT IOS) if(WITH_QUICKTIME) set(HAVE_QUICKTIME YES) - elseif(APPLE) + elseif(APPLE AND CMAKE_COMPILER_IS_CLANGCXX) set(HAVE_QTKIT YES) endif() endif() diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index aa9469c04c..b74678ca67 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -448,11 +448,13 @@ template struct OpNot T operator()( T a, T ) const { return ~a; } }; +#if (ARITHM_USE_IPP == 1) static inline void fixSteps(Size sz, size_t elemSize, size_t& step1, size_t& step2, size_t& step) { if( sz.height == 1 ) step1 = step2 = step = sz.width*elemSize; } +#endif static void add8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, diff --git a/modules/core/src/mathfuncs.cpp b/modules/core/src/mathfuncs.cpp index 65f78de085..bb33e952dd 100644 --- a/modules/core/src/mathfuncs.cpp +++ b/modules/core/src/mathfuncs.cpp @@ -46,7 +46,6 @@ namespace cv { -static const int MAX_BLOCK_SIZE = 1024; typedef void (*MathFunc)(const void* src, void* dst, int len); static const float atan2_p1 = 0.9997878412794807f*(float)(180/CV_PI); diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index ecc0f76cb8..6d590f9075 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -680,7 +680,8 @@ static bool ocl_countNonZero( InputArray _src, int & res ) int cv::countNonZero( InputArray _src ) { - CV_Assert( _src.channels() == 1 ); + int type = _src.type(), cn = CV_MAT_CN(type); + CV_Assert( cn == 1 ); #ifdef HAVE_OPENCL int res = -1; @@ -690,8 +691,33 @@ int cv::countNonZero( InputArray _src ) #endif Mat src = _src.getMat(); - CountNonZeroFunc func = getCountNonZeroTab(src.depth()); +#if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY + if (src.dims <= 2 || src.isContinuous()) + { + IppiSize roiSize = { src.cols, src.rows }; + Ipp32s count, srcstep = (Ipp32s)src.step; + IppStatus status = (IppStatus)-1; + + if (src.isContinuous()) + { + roiSize.width = (Ipp32s)src.total(); + roiSize.height = 1; + srcstep = (Ipp32s)src.total() * CV_ELEM_SIZE(type); + } + + int depth = CV_MAT_DEPTH(type); + if (depth == CV_8U) + status = ippiCountInRange_8u_C1R((const Ipp8u *)src.data, srcstep, roiSize, &count, 0, 0); + else if (depth == CV_32F) + status = ippiCountInRange_32f_C1R((const Ipp32f *)src.data, srcstep, roiSize, &count, 0, 0); + + if (status >= 0) + return (Ipp32s)src.total() - count; + } +#endif + + CountNonZeroFunc func = getCountNonZeroTab(src.depth()); CV_Assert( func != 0 ); const Mat* arrays[] = {&src, 0}; diff --git a/modules/core/test/test_countnonzero.cpp b/modules/core/test/test_countnonzero.cpp index 176d324c9e..0b8210322d 100644 --- a/modules/core/test/test_countnonzero.cpp +++ b/modules/core/test/test_countnonzero.cpp @@ -52,9 +52,6 @@ using namespace std; #define sign(a) a > 0 ? 1 : a == 0 ? 0 : -1 -const int FLOAT_TYPE [2] = {CV_32F, CV_64F}; -const int INT_TYPE [5] = {CV_8U, CV_8S, CV_16U, CV_16S, CV_32S}; - #define MAX_WIDTH 100 #define MAX_HEIGHT 100 diff --git a/modules/features2d/include/opencv2/features2d.hpp b/modules/features2d/include/opencv2/features2d.hpp index 190e8ac665..4129705033 100644 --- a/modules/features2d/include/opencv2/features2d.hpp +++ b/modules/features2d/include/opencv2/features2d.hpp @@ -405,8 +405,16 @@ public: protected: virtual void computeImpl( InputArray image, std::vector& keypoints, OutputArray descriptors ) const; void buildPattern(); - uchar meanIntensity( InputArray image, InputArray integral, const float kp_x, const float kp_y, - const unsigned int scale, const unsigned int rot, const unsigned int point ) const; + + template + imgType meanIntensity( InputArray image, InputArray integral, const float kp_x, const float kp_y, + const unsigned int scale, const unsigned int rot, const unsigned int point ) const; + + template + void computeDescriptors( InputArray image, std::vector& keypoints, OutputArray descriptors ) const; + + template + void extractDescriptor(srcMatType *pointsValue, void ** ptr) const; bool orientationNormalized; //true if the orientation is normalized, false otherwise bool scaleNormalized; //true if the scale is normalized, false otherwise diff --git a/modules/features2d/src/freak.cpp b/modules/features2d/src/freak.cpp index 8759efa2e5..00c0e35ae8 100644 --- a/modules/features2d/src/freak.cpp +++ b/modules/features2d/src/freak.cpp @@ -239,13 +239,129 @@ void FREAK::computeImpl( InputArray _image, std::vector& keypoints, Ou ((FREAK*)this)->buildPattern(); + // Convert to gray if not already + Mat grayImage = image; +// if( image.channels() > 1 ) +// cvtColor( image, grayImage, COLOR_BGR2GRAY ); + + // Use 32-bit integers if we won't overflow in the integral image + if ((image.depth() == CV_8U || image.depth() == CV_8S) && + (image.rows * image.cols) < 8388608 ) // 8388608 = 2 ^ (32 - 8(bit depth) - 1(sign bit)) + { + // Create the integral image appropriate for our type & usage + if (image.depth() == CV_8U) + computeDescriptors(grayImage, keypoints, _descriptors); + else if (image.depth() == CV_8S) + computeDescriptors(grayImage, keypoints, _descriptors); + else + CV_Error( Error::StsUnsupportedFormat, "" ); + } else { + // Create the integral image appropriate for our type & usage + if ( image.depth() == CV_8U ) + computeDescriptors(grayImage, keypoints, _descriptors); + else if ( image.depth() == CV_8S ) + computeDescriptors(grayImage, keypoints, _descriptors); + else if ( image.depth() == CV_16U ) + computeDescriptors(grayImage, keypoints, _descriptors); + else if ( image.depth() == CV_16S ) + computeDescriptors(grayImage, keypoints, _descriptors); + else + CV_Error( Error::StsUnsupportedFormat, "" ); + } +} + +template +void FREAK::extractDescriptor(srcMatType *pointsValue, void ** ptr) const +{ + std::bitset** ptrScalar = (std::bitset**) ptr; + + // extracting descriptor preserving the order of SSE version + int cnt = 0; + for( int n = 7; n < FREAK_NB_PAIRS; n += 128) + { + for( int m = 8; m--; ) + { + int nm = n-m; + for(int kk = nm+15*8; kk >= nm; kk-=8, ++cnt) + { + (*ptrScalar)->set(kk, pointsValue[descriptionPairs[cnt].i] >= pointsValue[descriptionPairs[cnt].j]); + } + } + } + --(*ptrScalar); +} + +#if CV_SSE2 +template <> +void FREAK::extractDescriptor(uchar *pointsValue, void ** ptr) const +{ + __m128i** ptrSSE = (__m128i**) ptr; + + // note that comparisons order is modified in each block (but first 128 comparisons remain globally the same-->does not affect the 128,384 bits segmanted matching strategy) + int cnt = 0; + for( int n = FREAK_NB_PAIRS/128; n-- ; ) + { + __m128i result128 = _mm_setzero_si128(); + for( int m = 128/16; m--; cnt += 16 ) + { + __m128i operand1 = _mm_set_epi8(pointsValue[descriptionPairs[cnt+0].i], + pointsValue[descriptionPairs[cnt+1].i], + pointsValue[descriptionPairs[cnt+2].i], + pointsValue[descriptionPairs[cnt+3].i], + pointsValue[descriptionPairs[cnt+4].i], + pointsValue[descriptionPairs[cnt+5].i], + pointsValue[descriptionPairs[cnt+6].i], + pointsValue[descriptionPairs[cnt+7].i], + pointsValue[descriptionPairs[cnt+8].i], + pointsValue[descriptionPairs[cnt+9].i], + pointsValue[descriptionPairs[cnt+10].i], + pointsValue[descriptionPairs[cnt+11].i], + pointsValue[descriptionPairs[cnt+12].i], + pointsValue[descriptionPairs[cnt+13].i], + pointsValue[descriptionPairs[cnt+14].i], + pointsValue[descriptionPairs[cnt+15].i]); + + __m128i operand2 = _mm_set_epi8(pointsValue[descriptionPairs[cnt+0].j], + pointsValue[descriptionPairs[cnt+1].j], + pointsValue[descriptionPairs[cnt+2].j], + pointsValue[descriptionPairs[cnt+3].j], + pointsValue[descriptionPairs[cnt+4].j], + pointsValue[descriptionPairs[cnt+5].j], + pointsValue[descriptionPairs[cnt+6].j], + pointsValue[descriptionPairs[cnt+7].j], + pointsValue[descriptionPairs[cnt+8].j], + pointsValue[descriptionPairs[cnt+9].j], + pointsValue[descriptionPairs[cnt+10].j], + pointsValue[descriptionPairs[cnt+11].j], + pointsValue[descriptionPairs[cnt+12].j], + pointsValue[descriptionPairs[cnt+13].j], + pointsValue[descriptionPairs[cnt+14].j], + pointsValue[descriptionPairs[cnt+15].j]); + + __m128i workReg = _mm_min_epu8(operand1, operand2); // emulated "not less than" for 8-bit UNSIGNED integers + workReg = _mm_cmpeq_epi8(workReg, operand2); // emulated "not less than" for 8-bit UNSIGNED integers + + workReg = _mm_and_si128(_mm_set1_epi16(short(0x8080 >> m)), workReg); // merge the last 16 bits with the 128bits std::vector until full + result128 = _mm_or_si128(result128, workReg); + } + (**ptrSSE) = result128; + ++(*ptrSSE); + } + (*ptrSSE) -= 8; +} +#endif + +template +void FREAK::computeDescriptors( InputArray _image, std::vector& keypoints, OutputArray _descriptors ) const { + + Mat image = _image.getMat(); Mat imgIntegral; - integral(image, imgIntegral); + integral(image, imgIntegral, DataType::type); std::vector kpScaleIdx(keypoints.size()); // used to save pattern scale index corresponding to each keypoints const std::vector::iterator ScaleIdxBegin = kpScaleIdx.begin(); // used in std::vector erase function const std::vector::iterator kpBegin = keypoints.begin(); // used in std::vector erase function const float sizeCst = static_cast(FREAK_NB_SCALES/(FREAK_LOG2* nOctaves)); - uchar pointsValue[FREAK_NB_POINTS]; + srcMatType pointsValue[FREAK_NB_POINTS]; int thetaIdx = 0; int direction0; int direction1; @@ -300,13 +416,10 @@ void FREAK::computeImpl( InputArray _image, std::vector& keypoints, Ou _descriptors.create((int)keypoints.size(), FREAK_NB_PAIRS/8, CV_8U); _descriptors.setTo(Scalar::all(0)); Mat descriptors = _descriptors.getMat(); -#if CV_SSE2 - __m128i* ptr= (__m128i*) (descriptors.data+(keypoints.size()-1)*descriptors.step[0]); -#else - std::bitset* ptr = (std::bitset*) (descriptors.data+(keypoints.size()-1)*descriptors.step[0]); -#endif - for( size_t k = keypoints.size(); k--; ) - { + + void *ptr = descriptors.data+(keypoints.size()-1)*descriptors.step[0]; + + for( size_t k = keypoints.size(); k--; ) { // estimate orientation (gradient) if( !orientationNormalized ) { @@ -316,9 +429,10 @@ void FREAK::computeImpl( InputArray _image, std::vector& keypoints, Ou else { // get the points intensity value in the un-rotated pattern - for( int i = FREAK_NB_POINTS; i--; ) - { - pointsValue[i] = meanIntensity(image, imgIntegral, keypoints[k].pt.x,keypoints[k].pt.y, kpScaleIdx[k], 0, i); + for( int i = FREAK_NB_POINTS; i--; ) { + pointsValue[i] = meanIntensity(image, imgIntegral, + keypoints[k].pt.x, keypoints[k].pt.y, + kpScaleIdx[k], 0, i); } direction0 = 0; direction1 = 0; @@ -339,80 +453,14 @@ void FREAK::computeImpl( InputArray _image, std::vector& keypoints, Ou thetaIdx -= FREAK_NB_ORIENTATION; } // extract descriptor at the computed orientation - for( int i = FREAK_NB_POINTS; i--; ) - { - pointsValue[i] = meanIntensity(image, imgIntegral, keypoints[k].pt.x,keypoints[k].pt.y, kpScaleIdx[k], thetaIdx, i); + for( int i = FREAK_NB_POINTS; i--; ) { + pointsValue[i] = meanIntensity(image, imgIntegral, + keypoints[k].pt.x, keypoints[k].pt.y, + kpScaleIdx[k], thetaIdx, i); } -#if CV_SSE2 - // note that comparisons order is modified in each block (but first 128 comparisons remain globally the same-->does not affect the 128,384 bits segmanted matching strategy) - int cnt = 0; - for( int n = FREAK_NB_PAIRS/128; n-- ; ) - { - __m128i result128 = _mm_setzero_si128(); - for( int m = 128/16; m--; cnt += 16 ) - { - __m128i operand1 = _mm_set_epi8( - pointsValue[descriptionPairs[cnt+0].i], - pointsValue[descriptionPairs[cnt+1].i], - pointsValue[descriptionPairs[cnt+2].i], - pointsValue[descriptionPairs[cnt+3].i], - pointsValue[descriptionPairs[cnt+4].i], - pointsValue[descriptionPairs[cnt+5].i], - pointsValue[descriptionPairs[cnt+6].i], - pointsValue[descriptionPairs[cnt+7].i], - pointsValue[descriptionPairs[cnt+8].i], - pointsValue[descriptionPairs[cnt+9].i], - pointsValue[descriptionPairs[cnt+10].i], - pointsValue[descriptionPairs[cnt+11].i], - pointsValue[descriptionPairs[cnt+12].i], - pointsValue[descriptionPairs[cnt+13].i], - pointsValue[descriptionPairs[cnt+14].i], - pointsValue[descriptionPairs[cnt+15].i]); - - __m128i operand2 = _mm_set_epi8( - pointsValue[descriptionPairs[cnt+0].j], - pointsValue[descriptionPairs[cnt+1].j], - pointsValue[descriptionPairs[cnt+2].j], - pointsValue[descriptionPairs[cnt+3].j], - pointsValue[descriptionPairs[cnt+4].j], - pointsValue[descriptionPairs[cnt+5].j], - pointsValue[descriptionPairs[cnt+6].j], - pointsValue[descriptionPairs[cnt+7].j], - pointsValue[descriptionPairs[cnt+8].j], - pointsValue[descriptionPairs[cnt+9].j], - pointsValue[descriptionPairs[cnt+10].j], - pointsValue[descriptionPairs[cnt+11].j], - pointsValue[descriptionPairs[cnt+12].j], - pointsValue[descriptionPairs[cnt+13].j], - pointsValue[descriptionPairs[cnt+14].j], - pointsValue[descriptionPairs[cnt+15].j]); - - __m128i workReg = _mm_min_epu8(operand1, operand2); // emulated "not less than" for 8-bit UNSIGNED integers - workReg = _mm_cmpeq_epi8(workReg, operand2); // emulated "not less than" for 8-bit UNSIGNED integers - - workReg = _mm_and_si128(_mm_set1_epi16(short(0x8080 >> m)), workReg); // merge the last 16 bits with the 128bits std::vector until full - result128 = _mm_or_si128(result128, workReg); - } - (*ptr) = result128; - ++ptr; - } - ptr -= 8; -#else - // extracting descriptor preserving the order of SSE version - int cnt = 0; - for( int n = 7; n < FREAK_NB_PAIRS; n += 128) - { - for( int m = 8; m--; ) - { - int nm = n-m; - for(int kk = nm+15*8; kk >= nm; kk-=8, ++cnt) - { - ptr->set(kk, pointsValue[descriptionPairs[cnt].i] >= pointsValue[descriptionPairs[cnt].j]); - } - } - } - --ptr; -#endif + + // Extract descriptor + extractDescriptor(pointsValue, &ptr); } } else // extract all possible comparisons for selection @@ -434,7 +482,9 @@ void FREAK::computeImpl( InputArray _image, std::vector& keypoints, Ou { //get the points intensity value in the un-rotated pattern for( int i = FREAK_NB_POINTS;i--; ) - pointsValue[i] = meanIntensity(image, imgIntegral, keypoints[k].pt.x,keypoints[k].pt.y, kpScaleIdx[k], 0, i); + pointsValue[i] = meanIntensity(image, imgIntegral, + keypoints[k].pt.x,keypoints[k].pt.y, + kpScaleIdx[k], 0, i); direction0 = 0; direction1 = 0; @@ -456,10 +506,10 @@ void FREAK::computeImpl( InputArray _image, std::vector& keypoints, Ou thetaIdx -= FREAK_NB_ORIENTATION; } // get the points intensity value in the rotated pattern - for( int i = FREAK_NB_POINTS; i--; ) - { - pointsValue[i] = meanIntensity(image, imgIntegral, keypoints[k].pt.x, - keypoints[k].pt.y, kpScaleIdx[k], thetaIdx, i); + for( int i = FREAK_NB_POINTS; i--; ) { + pointsValue[i] = meanIntensity(image, imgIntegral, + keypoints[k].pt.x, keypoints[k].pt.y, + kpScaleIdx[k], thetaIdx, i); } int cnt(0); @@ -478,13 +528,13 @@ void FREAK::computeImpl( InputArray _image, std::vector& keypoints, Ou } // simply take average on a square patch, not even gaussian approx -uchar FREAK::meanIntensity( InputArray _image, InputArray _integral, - const float kp_x, - const float kp_y, - const unsigned int scale, - const unsigned int rot, - const unsigned int point) const -{ +template +imgType FREAK::meanIntensity( InputArray _image, InputArray _integral, + const float kp_x, + const float kp_y, + const unsigned int scale, + const unsigned int rot, + const unsigned int point) const { Mat image = _image.getMat(), integral = _integral.getMat(); // get point position in image const PatternPoint& FreakPoint = patternLookup[scale*FREAK_NB_ORIENTATION*FREAK_NB_POINTS + rot*FREAK_NB_POINTS + point]; @@ -492,7 +542,6 @@ uchar FREAK::meanIntensity( InputArray _image, InputArray _integral, const float yf = FreakPoint.y+kp_y; const int x = int(xf); const int y = int(yf); - const int& imagecols = image.cols; // get the sigma: const float radius = FreakPoint.sigma; @@ -505,19 +554,15 @@ uchar FREAK::meanIntensity( InputArray _image, InputArray _integral, const int r_y = static_cast((yf-y)*1024); const int r_x_1 = (1024-r_x); const int r_y_1 = (1024-r_y); - uchar* ptr = image.data+x+y*imagecols; unsigned int ret_val; // linear interpolation: - ret_val = (r_x_1*r_y_1*int(*ptr)); - ptr++; - ret_val += (r_x*r_y_1*int(*ptr)); - ptr += imagecols; - ret_val += (r_x*r_y*int(*ptr)); - ptr--; - ret_val += (r_x_1*r_y*int(*ptr)); + ret_val = r_x_1*r_y_1*int(image.at(y , x )) + + r_x *r_y_1*int(image.at(y , x+1)) + + r_x_1*r_y *int(image.at(y+1, x )) + + r_x *r_y *int(image.at(y+1, x+1)); //return the rounded mean ret_val += 2 * 1024 * 1024; - return static_cast(ret_val / (4 * 1024 * 1024)); + return static_cast(ret_val / (4 * 1024 * 1024)); } // expected case: @@ -527,15 +572,15 @@ uchar FREAK::meanIntensity( InputArray _image, InputArray _integral, const int y_top = int(yf-radius+0.5); const int x_right = int(xf+radius+1.5);//integral image is 1px wider const int y_bottom = int(yf+radius+1.5);//integral image is 1px higher - int ret_val; + iiType ret_val; - ret_val = integral.at(y_bottom,x_right);//bottom right corner - ret_val -= integral.at(y_bottom,x_left); - ret_val += integral.at(y_top,x_left); - ret_val -= integral.at(y_top,x_right); + ret_val = integral.at(y_bottom,x_right);//bottom right corner + ret_val -= integral.at(y_bottom,x_left); + ret_val += integral.at(y_top,x_left); + ret_val -= integral.at(y_top,x_right); ret_val = ret_val/( (x_right-x_left)* (y_bottom-y_top) ); //~ std::cout<(ret_val); + return static_cast(ret_val); } // pair selection algorithm from a set of training images and corresponding keypoints diff --git a/modules/features2d/src/stardetector.cpp b/modules/features2d/src/stardetector.cpp index 1e00ee6047..0b09b8678b 100644 --- a/modules/features2d/src/stardetector.cpp +++ b/modules/features2d/src/stardetector.cpp @@ -44,20 +44,24 @@ namespace cv { -static void -computeIntegralImages( const Mat& matI, Mat& matS, Mat& matT, Mat& _FT ) +template static void +computeIntegralImages( const Mat& matI, Mat& matS, Mat& matT, Mat& _FT, + int iiType ) { - CV_Assert( matI.type() == CV_8U ); - int x, y, rows = matI.rows, cols = matI.cols; - matS.create(rows + 1, cols + 1, CV_32S); - matT.create(rows + 1, cols + 1, CV_32S); - _FT.create(rows + 1, cols + 1, CV_32S); + matS.create(rows + 1, cols + 1, iiType ); + matT.create(rows + 1, cols + 1, iiType ); + _FT.create(rows + 1, cols + 1, iiType ); + + const inMatType* I = matI.ptr(); + + outMatType *S = matS.ptr(); + outMatType *T = matT.ptr(); + outMatType *FT = _FT.ptr(); - const uchar* I = matI.ptr(); - int *S = matS.ptr(), *T = matT.ptr(), *FT = _FT.ptr(); - int istep = (int)matI.step, step = (int)(matS.step/sizeof(S[0])); + int istep = (int)(matI.step/matI.elemSize()); + int step = (int)(matS.step/matS.elemSize()); for( x = 0; x <= cols; x++ ) S[x] = T[x] = FT[x] = 0; @@ -95,14 +99,9 @@ computeIntegralImages( const Mat& matI, Mat& matS, Mat& matT, Mat& _FT ) } } -struct StarFeature -{ - int area; - int* p[8]; -}; - -static int -StarDetectorComputeResponses( const Mat& img, Mat& responses, Mat& sizes, int maxSize ) +template static int +StarDetectorComputeResponses( const Mat& img, Mat& responses, Mat& sizes, + int maxSize, int iiType ) { const int MAX_PATTERN = 17; static const int sizes0[] = {1, 2, 3, 4, 6, 8, 11, 12, 16, 22, 23, 32, 45, 46, 64, 90, 128, -1}; @@ -116,16 +115,21 @@ StarDetectorComputeResponses( const Mat& img, Mat& responses, Mat& sizes, int ma __m128 sizes1_4[MAX_PATTERN]; union { int i; float f; } absmask; absmask.i = 0x7fffffff; - volatile bool useSIMD = cv::checkHardwareSupport(CV_CPU_SSE2); + volatile bool useSIMD = cv::checkHardwareSupport(CV_CPU_SSE2) && iiType == CV_32S; #endif + + struct StarFeature + { + int area; + iiMatType* p[8]; + }; + StarFeature f[MAX_PATTERN]; Mat sum, tilted, flatTilted; int y, rows = img.rows, cols = img.cols; int border, npatterns=0, maxIdx=0; - CV_Assert( img.type() == CV_8UC1 ); - responses.create( img.size(), CV_32F ); sizes.create( img.size(), CV_16S ); @@ -139,7 +143,18 @@ StarDetectorComputeResponses( const Mat& img, Mat& responses, Mat& sizes, int ma npatterns += (pairs[npatterns-1][0] >= 0); maxIdx = pairs[npatterns-1][0]; - computeIntegralImages( img, sum, tilted, flatTilted ); + // Create the integral image appropriate for our type & usage + if ( img.type() == CV_8U ) + computeIntegralImages( img, sum, tilted, flatTilted, iiType ); + else if ( img.type() == CV_8S ) + computeIntegralImages( img, sum, tilted, flatTilted, iiType ); + else if ( img.type() == CV_16U ) + computeIntegralImages( img, sum, tilted, flatTilted, iiType ); + else if ( img.type() == CV_16S ) + computeIntegralImages( img, sum, tilted, flatTilted, iiType ); + else + CV_Error( Error::StsUnsupportedFormat, "" ); + int step = (int)(sum.step/sum.elemSize()); for(int i = 0; i <= maxIdx; i++ ) @@ -148,15 +163,15 @@ StarDetectorComputeResponses( const Mat& img, Mat& responses, Mat& sizes, int ma int ur_area = (2*ur_size + 1)*(2*ur_size + 1); int t_area = t_size*t_size + (t_size + 1)*(t_size + 1); - f[i].p[0] = sum.ptr() + (ur_size + 1)*step + ur_size + 1; - f[i].p[1] = sum.ptr() - ur_size*step + ur_size + 1; - f[i].p[2] = sum.ptr() + (ur_size + 1)*step - ur_size; - f[i].p[3] = sum.ptr() - ur_size*step - ur_size; + f[i].p[0] = sum.ptr() + (ur_size + 1)*step + ur_size + 1; + f[i].p[1] = sum.ptr() - ur_size*step + ur_size + 1; + f[i].p[2] = sum.ptr() + (ur_size + 1)*step - ur_size; + f[i].p[3] = sum.ptr() - ur_size*step - ur_size; - f[i].p[4] = tilted.ptr() + (t_size + 1)*step + 1; - f[i].p[5] = flatTilted.ptr() - t_size; - f[i].p[6] = flatTilted.ptr() + t_size + 1; - f[i].p[7] = tilted.ptr() - t_size*step + 1; + f[i].p[4] = tilted.ptr() + (t_size + 1)*step + 1; + f[i].p[5] = flatTilted.ptr() - t_size; + f[i].p[6] = flatTilted.ptr() + t_size + 1; + f[i].p[7] = tilted.ptr() - t_size*step + 1; f[i].area = ur_area + t_area; sizes1[i] = sizes0[i]; @@ -227,7 +242,7 @@ StarDetectorComputeResponses( const Mat& img, Mat& responses, Mat& sizes, int ma for(int i = 0; i <= maxIdx; i++ ) { - const int** p = (const int**)&f[i].p[0]; + const iiMatType** p = (const iiMatType**)&f[i].p[0]; __m128i r0 = _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(p[0]+ofs)), _mm_loadu_si128((const __m128i*)(p[1]+ofs))); __m128i r1 = _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(p[3]+ofs)), @@ -269,9 +284,9 @@ StarDetectorComputeResponses( const Mat& img, Mat& responses, Mat& sizes, int ma for(int i = 0; i <= maxIdx; i++ ) { - const int** p = (const int**)&f[i].p[0]; - vals[i] = p[0][ofs] - p[1][ofs] - p[2][ofs] + p[3][ofs] + - p[4][ofs] - p[5][ofs] - p[6][ofs] + p[7][ofs]; + const iiMatType** p = (const iiMatType**)&f[i].p[0]; + vals[i] = (int)(p[0][ofs] - p[1][ofs] - p[2][ofs] + p[3][ofs] + + p[4][ofs] - p[5][ofs] - p[6][ofs] + p[7][ofs]); } for(int i = 0; i < npatterns; i++ ) { @@ -429,7 +444,7 @@ StarDetector::StarDetector(int _maxSize, int _responseThreshold, void StarDetector::detectImpl( InputArray _image, std::vector& keypoints, InputArray _mask ) const { Mat image = _image.getMat(), mask = _mask.getMat(), grayImage = image; - if( image.type() != CV_8U ) cvtColor( image, grayImage, COLOR_BGR2GRAY ); + if( image.channels() > 1 ) cvtColor( image, grayImage, COLOR_BGR2GRAY ); (*this)(grayImage, keypoints); KeyPointsFilter::runByPixelsMask( keypoints, mask ); @@ -438,7 +453,15 @@ void StarDetector::detectImpl( InputArray _image, std::vector& keypoin void StarDetector::operator()(const Mat& img, std::vector& keypoints) const { Mat responses, sizes; - int border = StarDetectorComputeResponses( img, responses, sizes, maxSize ); + int border; + + // Use 32-bit integers if we won't overflow in the integral image + if ((img.depth() == CV_8U || img.depth() == CV_8S) && + (img.rows * img.cols) < 8388608 ) // 8388608 = 2 ^ (32 - 8(bit depth) - 1(sign bit)) + border = StarDetectorComputeResponses( img, responses, sizes, maxSize, CV_32S ); + else + border = StarDetectorComputeResponses( img, responses, sizes, maxSize, CV_64F ); + keypoints.clear(); if( border >= 0 ) StarDetectorSuppressNonmax( responses, sizes, keypoints, border, diff --git a/modules/flann/include/opencv2/flann/defines.h b/modules/flann/include/opencv2/flann/defines.h index 13833b3c0b..f0264f74e3 100644 --- a/modules/flann/include/opencv2/flann/defines.h +++ b/modules/flann/include/opencv2/flann/defines.h @@ -107,6 +107,7 @@ enum flann_centers_init_t FLANN_CENTERS_RANDOM = 0, FLANN_CENTERS_GONZALES = 1, FLANN_CENTERS_KMEANSPP = 2, + FLANN_CENTERS_GROUPWISE = 3, // deprecated constants, should use the FLANN_CENTERS_* ones instead CENTERS_RANDOM = 0, diff --git a/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h b/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h index 710382053c..88af4e706e 100644 --- a/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h +++ b/modules/flann/include/opencv2/flann/hierarchical_clustering_index.h @@ -257,6 +257,84 @@ private: } + /** + * Chooses the initial centers in a way inspired by Gonzales (by Pierre-Emmanuel Viel): + * select the first point of the list as a candidate, then parse the points list. If another + * point is further than current candidate from the other centers, test if it is a good center + * of a local aggregation. If it is, replace current candidate by this point. And so on... + * + * Used with KMeansIndex that computes centers coordinates by averaging positions of clusters points, + * this doesn't make a real difference with previous methods. But used with HierarchicalClusteringIndex + * class that pick centers among existing points instead of computing the barycenters, there is a real + * improvement. + * + * Params: + * k = number of centers + * vecs = the dataset of points + * indices = indices in the dataset + * Returns: + */ + void GroupWiseCenterChooser(int k, int* dsindices, int indices_length, int* centers, int& centers_length) + { + const float kSpeedUpFactor = 1.3f; + + int n = indices_length; + + DistanceType* closestDistSq = new DistanceType[n]; + + // Choose one random center and set the closestDistSq values + int index = rand_int(n); + assert(index >=0 && index < n); + centers[0] = dsindices[index]; + + for (int i = 0; i < n; i++) { + closestDistSq[i] = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols); + } + + + // Choose each center + int centerCount; + for (centerCount = 1; centerCount < k; centerCount++) { + + // Repeat several trials + double bestNewPot = -1; + int bestNewIndex = 0; + DistanceType furthest = 0; + for (index = 0; index < n; index++) { + + // We will test only the potential of the points further than current candidate + if( closestDistSq[index] > kSpeedUpFactor * (float)furthest ) { + + // Compute the new potential + double newPot = 0; + for (int i = 0; i < n; i++) { + newPot += std::min( distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols) + , closestDistSq[i] ); + } + + // Store the best result + if ((bestNewPot < 0)||(newPot <= bestNewPot)) { + bestNewPot = newPot; + bestNewIndex = index; + furthest = closestDistSq[index]; + } + } + } + + // Add the appropriate center + centers[centerCount] = dsindices[bestNewIndex]; + for (int i = 0; i < n; i++) { + closestDistSq[i] = std::min( distance(dataset[dsindices[i]], dataset[dsindices[bestNewIndex]], dataset.cols) + , closestDistSq[i] ); + } + } + + centers_length = centerCount; + + delete[] closestDistSq; + } + + public: @@ -290,6 +368,9 @@ public: else if (centers_init_==FLANN_CENTERS_KMEANSPP) { chooseCenters = &HierarchicalClusteringIndex::chooseCentersKMeanspp; } + else if (centers_init_==FLANN_CENTERS_GROUPWISE) { + chooseCenters = &HierarchicalClusteringIndex::GroupWiseCenterChooser; + } else { throw FLANNException("Unknown algorithm for choosing initial centers."); } diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index 287a188807..ce70ea5c6c 100644 --- a/modules/imgproc/src/color.cpp +++ b/modules/imgproc/src/color.cpp @@ -2716,6 +2716,8 @@ struct mRGBA2RGBA #ifdef HAVE_OPENCL +#define DIVUP(total, grain) (((total) + (grain) - 1) / (grain)) + static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) { bool ok = false; @@ -2729,6 +2731,17 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) if (depth != CV_8U && depth != CV_16U && depth != CV_32F) return false; + cv::String opts = format("-D depth=%d -D scn=%d ", depth, scn); + + ocl::Device dev = ocl::Device::getDefault(); + int pxPerWIy = 1; + if (dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU)) + { + pxPerWIy = 4; + } + globalsize[1] = DIVUP(globalsize[1], pxPerWIy); + opts += format("-D PIX_PER_WI_Y=%d ", pxPerWIy); + switch (code) { case COLOR_BGR2BGRA: case COLOR_RGB2BGRA: case COLOR_BGRA2BGR: @@ -2738,7 +2751,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) dcn = code == COLOR_BGR2BGRA || code == COLOR_RGB2BGRA || code == COLOR_BGRA2RGBA ? 4 : 3; bool reverse = !(code == COLOR_BGR2BGRA || code == COLOR_BGRA2BGR); k.create("RGB", ocl::imgproc::cvtcolor_oclsrc, - format("-D depth=%d -D scn=%d -D dcn=%d -D bidx=0 -D %s", depth, scn, dcn, + opts + format("-D dcn=%d -D bidx=0 -D %s", dcn, reverse ? "REVERSE" : "ORDER")); break; } @@ -2752,7 +2765,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) int greenbits = code == COLOR_BGR5652BGR || code == COLOR_BGR5652RGB || code == COLOR_BGR5652BGRA || code == COLOR_BGR5652RGBA ? 6 : 5; k.create("RGB5x52RGB", ocl::imgproc::cvtcolor_oclsrc, - format("-D depth=%d -D scn=2 -D dcn=%d -D bidx=%d -D greenbits=%d", depth, dcn, bidx, greenbits)); + opts + format("-D dcn=%d -D bidx=%d -D greenbits=%d", dcn, bidx, greenbits)); break; } case COLOR_BGR2BGR565: case COLOR_BGR2BGR555: case COLOR_RGB2BGR565: case COLOR_RGB2BGR555: @@ -2765,7 +2778,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) code == COLOR_BGRA2BGR565 || code == COLOR_RGBA2BGR565 ? 6 : 5; dcn = 2; k.create("RGB2RGB5x5", ocl::imgproc::cvtcolor_oclsrc, - format("-D depth=%d -D scn=%d -D dcn=2 -D bidx=%d -D greenbits=%d", depth, scn, bidx, greenbits)); + opts + format("-D dcn=2 -D bidx=%d -D greenbits=%d", bidx, greenbits)); break; } case COLOR_BGR5652GRAY: case COLOR_BGR5552GRAY: @@ -2774,7 +2787,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) dcn = 1; int greenbits = code == COLOR_BGR5652GRAY ? 6 : 5; k.create("BGR5x52Gray", ocl::imgproc::cvtcolor_oclsrc, - format("-D depth=%d -D scn=2 -D dcn=1 -D bidx=0 -D greenbits=%d", depth, greenbits)); + opts + format("-D dcn=1 -D bidx=0 -D greenbits=%d", greenbits)); break; } case COLOR_GRAY2BGR565: case COLOR_GRAY2BGR555: @@ -2783,7 +2796,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) dcn = 2; int greenbits = code == COLOR_GRAY2BGR565 ? 6 : 5; k.create("Gray2BGR5x5", ocl::imgproc::cvtcolor_oclsrc, - format("-D depth=%d -D scn=1 -D dcn=2 -D bidx=0 -D greenbits=%d", depth, greenbits)); + opts + format("-D dcn=2 -D bidx=0 -D greenbits=%d", greenbits)); break; } case COLOR_BGR2GRAY: case COLOR_BGRA2GRAY: @@ -2793,8 +2806,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) bidx = code == COLOR_BGR2GRAY || code == COLOR_BGRA2GRAY ? 0 : 2; dcn = 1; k.create("RGB2Gray", ocl::imgproc::cvtcolor_oclsrc, - format("-D depth=%d -D scn=%d -D dcn=1 -D bidx=%d -D STRIPE_SIZE=%d", - depth, scn, bidx, stripeSize)); + opts + format("-D dcn=1 -D bidx=%d -D STRIPE_SIZE=%d", + bidx, stripeSize)); globalsize[0] = (src.cols + stripeSize-1)/stripeSize; break; } @@ -2804,7 +2817,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) CV_Assert(scn == 1); dcn = code == COLOR_GRAY2BGRA ? 4 : 3; k.create("Gray2RGB", ocl::imgproc::cvtcolor_oclsrc, - format("-D depth=%d -D bidx=0 -D scn=1 -D dcn=%d", depth, dcn)); + opts + format("-D bidx=0 -D dcn=%d", dcn)); break; } case COLOR_BGR2YUV: @@ -2814,7 +2827,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) bidx = code == COLOR_RGB2YUV ? 0 : 2; dcn = 3; k.create("RGB2YUV", ocl::imgproc::cvtcolor_oclsrc, - format("-D depth=%d -D scn=%d -D dcn=3 -D bidx=%d", depth, scn, bidx)); + opts + format("-D dcn=3 -D bidx=%d", bidx)); break; } case COLOR_YUV2BGR: @@ -2824,7 +2837,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) CV_Assert(dcn == 3 || dcn == 4); bidx = code == COLOR_YUV2RGB ? 0 : 2; k.create("YUV2RGB", ocl::imgproc::cvtcolor_oclsrc, - format("-D depth=%d -D scn=3 -D dcn=%d -D bidx=%d", depth, dcn, bidx)); + opts + format("-D dcn=%d -D bidx=%d", dcn, bidx)); break; } case COLOR_YUV2RGB_NV12: case COLOR_YUV2BGR_NV12: @@ -2837,7 +2850,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) dstSz = Size(sz.width, sz.height * 2 / 3); k.create("YUV2RGB_NV12", ocl::imgproc::cvtcolor_oclsrc, - format("-D depth=0 -D scn=1 -D dcn=%d -D bidx=%d", dcn, bidx)); + opts + format("-D dcn=%d -D bidx=%d", dcn, bidx)); break; } case COLOR_BGR2YCrCb: @@ -2847,7 +2860,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) bidx = code == COLOR_BGR2YCrCb ? 0 : 2; dcn = 3; k.create("RGB2YCrCb", ocl::imgproc::cvtcolor_oclsrc, - format("-D depth=%d -D scn=%d -D dcn=3 -D bidx=%d", depth, scn, bidx)); + opts + format("-D dcn=3 -D bidx=%d", bidx)); break; } case COLOR_YCrCb2BGR: @@ -2858,7 +2871,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) CV_Assert(scn == 3 && (dcn == 3 || dcn == 4)); bidx = code == COLOR_YCrCb2BGR ? 0 : 2; k.create("YCrCb2RGB", ocl::imgproc::cvtcolor_oclsrc, - format("-D depth=%d -D scn=%d -D dcn=%d -D bidx=%d", depth, scn, dcn, bidx)); + opts + format("-D dcn=%d -D bidx=%d", dcn, bidx)); break; } case COLOR_BGR2XYZ: case COLOR_RGB2XYZ: @@ -2904,7 +2917,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) dst = _dst.getUMat(); k.create("RGB2XYZ", ocl::imgproc::cvtcolor_oclsrc, - format("-D depth=%d -D scn=%d -D dcn=3 -D bidx=%d", depth, scn, bidx)); + opts + format("-D dcn=3 -D bidx=%d", bidx)); if (k.empty()) return false; k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(c)); @@ -2955,7 +2968,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) dst = _dst.getUMat(); k.create("XYZ2RGB", ocl::imgproc::cvtcolor_oclsrc, - format("-D depth=%d -D scn=3 -D dcn=%d -D bidx=%d", depth, dcn, bidx)); + opts + format("-D dcn=%d -D bidx=%d", dcn, bidx)); if (k.empty()) return false; k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(c)); @@ -3010,8 +3023,9 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) _dst.create(dstSz, CV_8UC3); dst = _dst.getUMat(); - k.create("RGB2HSV", ocl::imgproc::cvtcolor_oclsrc, format("-D depth=%d -D hrange=%d -D bidx=%d -D dcn=3 -D scn=%d", - depth, hrange, bidx, scn)); + k.create("RGB2HSV", ocl::imgproc::cvtcolor_oclsrc, + opts + format("-D hrange=%d -D bidx=%d -D dcn=3", + hrange, bidx)); if (k.empty()) return false; @@ -3023,7 +3037,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) } else k.create(kernelName.c_str(), ocl::imgproc::cvtcolor_oclsrc, - format("-D depth=%d -D hscale=%ff -D bidx=%d -D scn=%d -D dcn=3", depth, hrange*(1.f/360.f), bidx, scn)); + opts + format("-D hscale=%ff -D bidx=%d -D dcn=3", + hrange*(1.f/360.f), bidx)); break; } case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL: @@ -3041,8 +3056,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) String kernelName = String(is_hsv ? "HSV" : "HLS") + "2RGB"; k.create(kernelName.c_str(), ocl::imgproc::cvtcolor_oclsrc, - format("-D depth=%d -D dcn=%d -D scn=3 -D bidx=%d -D hrange=%d -D hscale=%ff", - depth, dcn, bidx, hrange, 6.f/hrange)); + opts + format("-D dcn=%d -D bidx=%d -D hrange=%d -D hscale=%ff", + dcn, bidx, hrange, 6.f/hrange)); break; } case COLOR_RGBA2mRGBA: case COLOR_mRGBA2RGBA: @@ -3051,7 +3066,7 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) dcn = 4; k.create(code == COLOR_RGBA2mRGBA ? "RGBA2mRGBA" : "mRGBA2RGBA", ocl::imgproc::cvtcolor_oclsrc, - format("-D depth=%d -D dcn=4 -D scn=4 -D bidx=3", depth)); + opts + "-D dcn=4 -D bidx=3"); break; } case CV_BGR2Lab: case CV_RGB2Lab: case CV_LBGR2Lab: case CV_LRGB2Lab: @@ -3063,8 +3078,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) dcn = 3; k.create("BGR2Lab", ocl::imgproc::cvtcolor_oclsrc, - format("-D depth=%d -D dcn=3 -D scn=%d -D bidx=%d%s", - depth, scn, bidx, srgb ? " -D SRGB" : "")); + opts + format("-D dcn=3 -D bidx=%d%s", + bidx, srgb ? " -D SRGB" : "")); if (k.empty()) return false; @@ -3165,8 +3180,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) bool srgb = code == CV_Lab2BGR || code == CV_Lab2RGB; k.create("Lab2BGR", ocl::imgproc::cvtcolor_oclsrc, - format("-D depth=%d -D dcn=%d -D scn=3 -D bidx=%d%s", - depth, dcn, bidx, srgb ? " -D SRGB" : "")); + opts + format("-D dcn=%d -D bidx=%d%s", + dcn, bidx, srgb ? " -D SRGB" : "")); if (k.empty()) return false; diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp index 04dc7e2824..8e8484d6b7 100644 --- a/modules/imgproc/src/histogram.cpp +++ b/modules/imgproc/src/histogram.cpp @@ -1175,6 +1175,48 @@ calcHist_8u( std::vector& _ptrs, const std::vector& _deltas, } } +#if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY + +class IPPCalcHistInvoker : + public ParallelLoopBody +{ +public: + IPPCalcHistInvoker(const Mat & _src, Mat & _hist, AutoBuffer & _levels, Ipp32s _histSize, Ipp32s _low, Ipp32s _high, bool * _ok) : + ParallelLoopBody(), src(&_src), hist(&_hist), levels(&_levels), histSize(_histSize), low(_low), high(_high), ok(_ok) + { + *ok = true; + } + + virtual void operator() (const Range & range) const + { + Mat phist(hist->size(), hist->type(), Scalar::all(0)); + + IppStatus status = ippiHistogramEven_8u_C1R( + src->data + src->step * range.start, (int)src->step, ippiSize(src->cols, range.end - range.start), + (Ipp32s *)phist.data, (Ipp32s *)*levels, histSize, low, high); + + if (status < 0) + { + *ok = false; + return; + } + + for (int i = 0; i < histSize; ++i) + CV_XADD((int *)(hist->data + i * hist->step), *(int *)(phist.data + i * phist.step)); + } + +private: + const Mat * src; + Mat * hist; + AutoBuffer * levels; + Ipp32s histSize, low, high; + bool * ok; + + const IPPCalcHistInvoker & operator = (const IPPCalcHistInvoker & ); +}; + +#endif + } void cv::calcHist( const Mat* images, int nimages, const int* channels, @@ -1190,6 +1232,32 @@ void cv::calcHist( const Mat* images, int nimages, const int* channels, Mat hist = _hist.getMat(), ihist = hist; ihist.flags = (ihist.flags & ~CV_MAT_TYPE_MASK)|CV_32S; +#if defined HAVE_IPP && !defined HAVE_IPP_ICV_ONLY + if (nimages == 1 && images[0].type() == CV_8UC1 && dims == 1 && channels && + channels[0] == 0 && mask.empty() && images[0].dims <= 2 && + !accumulate && uniform) + { + ihist.setTo(Scalar::all(0)); + AutoBuffer levels(histSize[0] + 1); + + bool ok = true; + const Mat & src = images[0]; + int nstripes = std::min(8, src.total() / (1 << 16)); +#ifdef HAVE_CONCURRENCY + nstripes = 1; +#endif + IPPCalcHistInvoker invoker(src, ihist, levels, histSize[0] + 1, (Ipp32s)ranges[0][0], (Ipp32s)ranges[0][1], &ok); + Range range(0, src.rows); + parallel_for_(range, invoker, nstripes); + + if (ok) + { + ihist.convertTo(hist, CV_32F); + return; + } + } +#endif + if( !accumulate || histdata != hist.data ) hist = Scalar(0.); else @@ -1477,7 +1545,7 @@ void cv::calcHist( InputArrayOfArrays images, const std::vector& channels, CV_OCL_RUN(images.total() == 1 && channels.size() == 1 && images.channels(0) == 1 && channels[0] == 0 && images.isUMatVector() && mask.empty() && !accumulate && histSize.size() == 1 && histSize[0] == BINS && ranges.size() == 2 && - ranges[0] == 0 && ranges[1] == 256, + ranges[0] == 0 && ranges[1] == BINS, ocl_calcHist(images, hist)) int i, dims = (int)histSize.size(), rsz = (int)ranges.size(), csz = (int)channels.size(); diff --git a/modules/imgproc/src/moments.cpp b/modules/imgproc/src/moments.cpp index a8cfc96132..f3fb6962da 100644 --- a/modules/imgproc/src/moments.cpp +++ b/modules/imgproc/src/moments.cpp @@ -466,6 +466,61 @@ cv::Moments cv::moments( InputArray _src, bool binary ) if( cn > 1 ) CV_Error( CV_StsBadArg, "Invalid image type (must be single-channel)" ); +#if (IPP_VERSION_X100 >= 801) + if (!binary) + { + IppiSize roi = {mat.cols, mat.rows}; + IppiMomentState_64f *moment; + // ippiMomentInitAlloc_64f, ippiMomentFree_64f are deprecated in 8.1, but there are not another way + // to initialize IppiMomentState_64f. When GetStateSize and Init functions will appear we have to + // change our code. + if (0 <= ippiMomentInitAlloc_64f(&moment, ippAlgHintAccurate)) + { + IppStatus sts = (IppStatus)(-1); + if (depth == CV_8U) + sts = ippiMoments64f_8u_C1R((const Ipp8u *)mat.data, (int)mat.step, roi, moment); + else if( depth == CV_16U ) + sts = ippiMoments64f_16u_C1R((const Ipp16u *)mat.data, (int)mat.step, roi, moment); + else if( depth == CV_32F ) + sts = ippiMoments64f_32f_C1R((const Ipp32f *)mat.data, (int)mat.step, roi, moment); + if (0 <= sts) + { + IppiPoint point = {0, 0}; + ippiGetSpatialMoment_64f(moment, 0, 0, 0, point, &m.m00); + ippiGetSpatialMoment_64f(moment, 1, 0, 0, point, &m.m10); + ippiGetSpatialMoment_64f(moment, 0, 1, 0, point, &m.m01); + + ippiGetSpatialMoment_64f(moment, 2, 0, 0, point, &m.m20); + ippiGetSpatialMoment_64f(moment, 1, 1, 0, point, &m.m11); + ippiGetSpatialMoment_64f(moment, 0, 2, 0, point, &m.m02); + + ippiGetSpatialMoment_64f(moment, 3, 0, 0, point, &m.m30); + ippiGetSpatialMoment_64f(moment, 2, 1, 0, point, &m.m21); + ippiGetSpatialMoment_64f(moment, 1, 2, 0, point, &m.m12); + ippiGetSpatialMoment_64f(moment, 0, 3, 0, point, &m.m03); + ippiGetCentralMoment_64f(moment, 2, 0, 0, &m.mu20); + ippiGetCentralMoment_64f(moment, 1, 1, 0, &m.mu11); + ippiGetCentralMoment_64f(moment, 0, 2, 0, &m.mu02); + ippiGetCentralMoment_64f(moment, 3, 0, 0, &m.mu30); + ippiGetCentralMoment_64f(moment, 2, 1, 0, &m.mu21); + ippiGetCentralMoment_64f(moment, 1, 2, 0, &m.mu12); + ippiGetCentralMoment_64f(moment, 0, 3, 0, &m.mu03); + ippiGetNormalizedCentralMoment_64f(moment, 2, 0, 0, &m.nu20); + ippiGetNormalizedCentralMoment_64f(moment, 1, 1, 0, &m.nu11); + ippiGetNormalizedCentralMoment_64f(moment, 0, 2, 0, &m.nu02); + ippiGetNormalizedCentralMoment_64f(moment, 3, 0, 0, &m.nu30); + ippiGetNormalizedCentralMoment_64f(moment, 2, 1, 0, &m.nu21); + ippiGetNormalizedCentralMoment_64f(moment, 1, 2, 0, &m.nu12); + ippiGetNormalizedCentralMoment_64f(moment, 0, 3, 0, &m.nu03); + + ippiMomentFree_64f(moment); + return m; + } + ippiMomentFree_64f(moment); + } + } +#endif + if( binary || depth == CV_8U ) func = momentsInTile; else if( depth == CV_16U ) diff --git a/modules/imgproc/src/opencl/cvtcolor.cl b/modules/imgproc/src/opencl/cvtcolor.cl index 115bfbd7ae..0034395458 100644 --- a/modules/imgproc/src/opencl/cvtcolor.cl +++ b/modules/imgproc/src/opencl/cvtcolor.cl @@ -99,64 +99,81 @@ enum #define hrange 0 #endif +#if bidx == 0 +#define R_COMP z +#define G_COMP y +#define B_COMP x +#elif bidx == 2 +#define R_COMP x +#define G_COMP y +#define B_COMP z +#elif bidx == 3 +// The only kernel that uses bidx == 3 doesn't use these macros. +// But we still need to make the compiler happy. +#define R_COMP w +#define G_COMP w +#define B_COMP w +#endif + +#define __CAT(x, y) x##y +#define CAT(x, y) __CAT(x, y) + +#define DATA_TYPE_4 CAT(DATA_TYPE, 4) + ///////////////////////////////////// RGB <-> GRAY ////////////////////////////////////// __kernel void RGB2Gray(__global const uchar* srcptr, int srcstep, int srcoffset, __global uchar* dstptr, int dststep, int dstoffset, int rows, int cols) { -#if 1 - const int x = get_global_id(0); - const int y = get_global_id(1); - - if (y < rows && x < cols) - { - __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); - __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); -#ifdef DEPTH_5 - dst[0] = src[bidx] * 0.114f + src[1] * 0.587f + src[(bidx^2)] * 0.299f; -#else - dst[0] = (DATA_TYPE)CV_DESCALE((src[bidx] * B2Y + src[1] * G2Y + src[(bidx^2)] * R2Y), yuv_shift); -#endif - } -#else - const int x_min = get_global_id(0)*STRIPE_SIZE; - const int x_max = min(x_min + STRIPE_SIZE, cols); - const int y = get_global_id(1); + int x = get_global_id(0); + int y = get_global_id(1) * PIX_PER_WI_Y; - if( y < rows ) + if (x < cols) { - __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + - mad24(y, srcstep, srcoffset)) + x_min*scn; - __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset)); - int x; - for( x = x_min; x < x_max; x++, src += scn ) + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + { + if (y < rows) + { + __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); + __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); + DATA_TYPE_4 src_pix = vload4(0, src); #ifdef DEPTH_5 - dst[x] = src[bidx] * 0.114f + src[1] * 0.587f + src[(bidx^2)] * 0.299f; + dst[0] = src_pix.B_COMP * 0.114f + src_pix.G_COMP * 0.587f + src_pix.R_COMP * 0.299f; #else - dst[x] = (DATA_TYPE)(mad24(src[bidx], B2Y, mad24(src[1], G2Y, - mad24(src[(bidx^2)], R2Y, 1 << (yuv_shift-1)))) >> yuv_shift); + dst[0] = (DATA_TYPE)CV_DESCALE((src_pix.B_COMP * B2Y + src_pix.G_COMP * G2Y + src_pix.R_COMP * R2Y), yuv_shift); #endif + } + ++y; + } } -#endif } __kernel void Gray2RGB(__global const uchar* srcptr, int srcstep, int srcoffset, __global uchar* dstptr, int dststep, int dstoffset, int rows, int cols) { - const int x = get_global_id(0); - const int y = get_global_id(1); + int x = get_global_id(0); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (y < rows && x < cols) + if (x < cols) { - __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); - __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); - DATA_TYPE val = src[0]; - dst[0] = dst[1] = dst[2] = val; + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + { + if (y < rows) + { + __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); + __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); + DATA_TYPE val = src[0]; + dst[0] = dst[1] = dst[2] = val; #if dcn == 4 - dst[3] = MAX_NUM; + dst[3] = MAX_NUM; #endif + } + ++y; + } } } @@ -170,30 +187,39 @@ __kernel void RGB2YUV(__global const uchar* srcptr, int srcstep, int srcoffset, int rows, int cols) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (y < rows && x < cols) + if (x < cols) { - __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); - __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); - DATA_TYPE b=src[bidx], g=src[1], r=src[bidx^2]; + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + { + if (y < rows) + { + __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); + __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); + DATA_TYPE_4 src_pix = vload4(0, src); + DATA_TYPE b=src_pix.B_COMP, g=src_pix.G_COMP, r=src_pix.R_COMP; #ifdef DEPTH_5 - __constant float * coeffs = c_RGB2YUVCoeffs_f; - const DATA_TYPE Y = b * coeffs[0] + g * coeffs[1] + r * coeffs[2]; - const DATA_TYPE U = (b - Y) * coeffs[3] + HALF_MAX; - const DATA_TYPE V = (r - Y) * coeffs[4] + HALF_MAX; + __constant float * coeffs = c_RGB2YUVCoeffs_f; + const DATA_TYPE Y = b * coeffs[0] + g * coeffs[1] + r * coeffs[2]; + const DATA_TYPE U = (b - Y) * coeffs[3] + HALF_MAX; + const DATA_TYPE V = (r - Y) * coeffs[4] + HALF_MAX; #else - __constant int * coeffs = c_RGB2YUVCoeffs_i; - const int delta = HALF_MAX * (1 << yuv_shift); - const int Y = CV_DESCALE(b * coeffs[0] + g * coeffs[1] + r * coeffs[2], yuv_shift); - const int U = CV_DESCALE((b - Y) * coeffs[3] + delta, yuv_shift); - const int V = CV_DESCALE((r - Y) * coeffs[4] + delta, yuv_shift); + __constant int * coeffs = c_RGB2YUVCoeffs_i; + const int delta = HALF_MAX * (1 << yuv_shift); + const int Y = CV_DESCALE(b * coeffs[0] + g * coeffs[1] + r * coeffs[2], yuv_shift); + const int U = CV_DESCALE((b - Y) * coeffs[3] + delta, yuv_shift); + const int V = CV_DESCALE((r - Y) * coeffs[4] + delta, yuv_shift); #endif - dst[0] = SAT_CAST( Y ); - dst[1] = SAT_CAST( U ); - dst[2] = SAT_CAST( V ); + dst[0] = SAT_CAST( Y ); + dst[1] = SAT_CAST( U ); + dst[2] = SAT_CAST( V ); + } + ++y; + } } } @@ -205,32 +231,41 @@ __kernel void YUV2RGB(__global const uchar* srcptr, int srcstep, int srcoffset, int rows, int cols) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (y < rows && x < cols) + if (x < cols) { - __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); - __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); - DATA_TYPE Y = src[0], U = src[1], V = src[2]; + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + { + if (y < rows) + { + __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); + __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); + DATA_TYPE_4 src_pix = vload4(0, src); + DATA_TYPE Y = src_pix.x, U = src_pix.y, V = src_pix.z; #ifdef DEPTH_5 - __constant float * coeffs = c_YUV2RGBCoeffs_f; - const float r = Y + (V - HALF_MAX) * coeffs[3]; - const float g = Y + (V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1]; - const float b = Y + (U - HALF_MAX) * coeffs[0]; + __constant float * coeffs = c_YUV2RGBCoeffs_f; + const float r = Y + (V - HALF_MAX) * coeffs[3]; + const float g = Y + (V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1]; + const float b = Y + (U - HALF_MAX) * coeffs[0]; #else - __constant int * coeffs = c_YUV2RGBCoeffs_i; - const int r = Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift); - const int g = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift); - const int b = Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift); + __constant int * coeffs = c_YUV2RGBCoeffs_i; + const int r = Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift); + const int g = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift); + const int b = Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift); #endif - dst[bidx] = SAT_CAST( b ); - dst[1] = SAT_CAST( g ); - dst[bidx^2] = SAT_CAST( r ); + dst[bidx] = SAT_CAST( b ); + dst[1] = SAT_CAST( g ); + dst[bidx^2] = SAT_CAST( r ); #if dcn == 4 - dst[3] = MAX_NUM; + dst[3] = MAX_NUM; #endif + } + ++y; + } } } @@ -246,58 +281,66 @@ __kernel void YUV2RGB_NV12(__global const uchar* srcptr, int srcstep, int srcoff int rows, int cols) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (y < rows / 2 && x < cols / 2 ) + if (x < cols / 2) { - __global const uchar* ysrc = srcptr + mad24(y << 1, srcstep, (x << 1) + srcoffset); - __global const uchar* usrc = srcptr + mad24(rows + y, srcstep, (x << 1) + srcoffset); - __global uchar* dst1 = dstptr + mad24(y << 1, dststep, x * (dcn<<1) + dstoffset); - __global uchar* dst2 = dstptr + mad24((y << 1) + 1, dststep, x * (dcn<<1) + dstoffset); - - int Y1 = ysrc[0]; - int Y2 = ysrc[1]; - int Y3 = ysrc[srcstep]; - int Y4 = ysrc[srcstep + 1]; - - int U = usrc[0] - 128; - int V = usrc[1] - 128; - - int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * V; - int guv = (1 << (ITUR_BT_601_SHIFT - 1)) - ITUR_BT_601_CVG * V - ITUR_BT_601_CUG * U; - int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * U; - - Y1 = max(0, Y1 - 16) * ITUR_BT_601_CY; - dst1[2 - bidx] = convert_uchar_sat((Y1 + ruv) >> ITUR_BT_601_SHIFT); - dst1[1] = convert_uchar_sat((Y1 + guv) >> ITUR_BT_601_SHIFT); - dst1[bidx] = convert_uchar_sat((Y1 + buv) >> ITUR_BT_601_SHIFT); + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + { + if (y < rows / 2 ) + { + __global const uchar* ysrc = srcptr + mad24(y << 1, srcstep, (x << 1) + srcoffset); + __global const uchar* usrc = srcptr + mad24(rows + y, srcstep, (x << 1) + srcoffset); + __global uchar* dst1 = dstptr + mad24(y << 1, dststep, x * (dcn<<1) + dstoffset); + __global uchar* dst2 = dstptr + mad24((y << 1) + 1, dststep, x * (dcn<<1) + dstoffset); + + int Y1 = ysrc[0]; + int Y2 = ysrc[1]; + int Y3 = ysrc[srcstep]; + int Y4 = ysrc[srcstep + 1]; + + int U = usrc[0] - 128; + int V = usrc[1] - 128; + + int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * V; + int guv = (1 << (ITUR_BT_601_SHIFT - 1)) - ITUR_BT_601_CVG * V - ITUR_BT_601_CUG * U; + int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * U; + + Y1 = max(0, Y1 - 16) * ITUR_BT_601_CY; + dst1[2 - bidx] = convert_uchar_sat((Y1 + ruv) >> ITUR_BT_601_SHIFT); + dst1[1] = convert_uchar_sat((Y1 + guv) >> ITUR_BT_601_SHIFT); + dst1[bidx] = convert_uchar_sat((Y1 + buv) >> ITUR_BT_601_SHIFT); #if dcn == 4 - dst1[3] = 255; + dst1[3] = 255; #endif - Y2 = max(0, Y2 - 16) * ITUR_BT_601_CY; - dst1[dcn + 2 - bidx] = convert_uchar_sat((Y2 + ruv) >> ITUR_BT_601_SHIFT); - dst1[dcn + 1] = convert_uchar_sat((Y2 + guv) >> ITUR_BT_601_SHIFT); - dst1[dcn + bidx] = convert_uchar_sat((Y2 + buv) >> ITUR_BT_601_SHIFT); + Y2 = max(0, Y2 - 16) * ITUR_BT_601_CY; + dst1[dcn + 2 - bidx] = convert_uchar_sat((Y2 + ruv) >> ITUR_BT_601_SHIFT); + dst1[dcn + 1] = convert_uchar_sat((Y2 + guv) >> ITUR_BT_601_SHIFT); + dst1[dcn + bidx] = convert_uchar_sat((Y2 + buv) >> ITUR_BT_601_SHIFT); #if dcn == 4 - dst1[7] = 255; + dst1[7] = 255; #endif - Y3 = max(0, Y3 - 16) * ITUR_BT_601_CY; - dst2[2 - bidx] = convert_uchar_sat((Y3 + ruv) >> ITUR_BT_601_SHIFT); - dst2[1] = convert_uchar_sat((Y3 + guv) >> ITUR_BT_601_SHIFT); - dst2[bidx] = convert_uchar_sat((Y3 + buv) >> ITUR_BT_601_SHIFT); + Y3 = max(0, Y3 - 16) * ITUR_BT_601_CY; + dst2[2 - bidx] = convert_uchar_sat((Y3 + ruv) >> ITUR_BT_601_SHIFT); + dst2[1] = convert_uchar_sat((Y3 + guv) >> ITUR_BT_601_SHIFT); + dst2[bidx] = convert_uchar_sat((Y3 + buv) >> ITUR_BT_601_SHIFT); #if dcn == 4 - dst2[3] = 255; + dst2[3] = 255; #endif - Y4 = max(0, Y4 - 16) * ITUR_BT_601_CY; - dst2[dcn + 2 - bidx] = convert_uchar_sat((Y4 + ruv) >> ITUR_BT_601_SHIFT); - dst2[dcn + 1] = convert_uchar_sat((Y4 + guv) >> ITUR_BT_601_SHIFT); - dst2[dcn + bidx] = convert_uchar_sat((Y4 + buv) >> ITUR_BT_601_SHIFT); + Y4 = max(0, Y4 - 16) * ITUR_BT_601_CY; + dst2[dcn + 2 - bidx] = convert_uchar_sat((Y4 + ruv) >> ITUR_BT_601_SHIFT); + dst2[dcn + 1] = convert_uchar_sat((Y4 + guv) >> ITUR_BT_601_SHIFT); + dst2[dcn + bidx] = convert_uchar_sat((Y4 + buv) >> ITUR_BT_601_SHIFT); #if dcn == 4 - dst2[7] = 255; + dst2[7] = 255; #endif + } + ++y; + } } } @@ -311,30 +354,39 @@ __kernel void RGB2YCrCb(__global const uchar* srcptr, int srcstep, int srcoffset int rows, int cols) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (y < rows && x < cols) + if (x < cols) { - __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); - __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); - DATA_TYPE b=src[bidx], g=src[1], r=src[bidx^2]; + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + { + if (y < rows) + { + __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); + __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); + DATA_TYPE_4 src_pix = vload4(0, src); + DATA_TYPE b=src_pix.B_COMP, g=src_pix.G_COMP, r=src_pix.R_COMP; #ifdef DEPTH_5 - __constant float * coeffs = c_RGB2YCrCbCoeffs_f; - DATA_TYPE Y = b * coeffs[2] + g * coeffs[1] + r * coeffs[0]; - DATA_TYPE Cr = (r - Y) * coeffs[3] + HALF_MAX; - DATA_TYPE Cb = (b - Y) * coeffs[4] + HALF_MAX; + __constant float * coeffs = c_RGB2YCrCbCoeffs_f; + DATA_TYPE Y = b * coeffs[2] + g * coeffs[1] + r * coeffs[0]; + DATA_TYPE Cr = (r - Y) * coeffs[3] + HALF_MAX; + DATA_TYPE Cb = (b - Y) * coeffs[4] + HALF_MAX; #else - __constant int * coeffs = c_RGB2YCrCbCoeffs_i; - int delta = HALF_MAX * (1 << yuv_shift); - int Y = CV_DESCALE(b * coeffs[2] + g * coeffs[1] + r * coeffs[0], yuv_shift); - int Cr = CV_DESCALE((r - Y) * coeffs[3] + delta, yuv_shift); - int Cb = CV_DESCALE((b - Y) * coeffs[4] + delta, yuv_shift); + __constant int * coeffs = c_RGB2YCrCbCoeffs_i; + int delta = HALF_MAX * (1 << yuv_shift); + int Y = CV_DESCALE(b * coeffs[2] + g * coeffs[1] + r * coeffs[0], yuv_shift); + int Cr = CV_DESCALE((r - Y) * coeffs[3] + delta, yuv_shift); + int Cb = CV_DESCALE((b - Y) * coeffs[4] + delta, yuv_shift); #endif - dst[0] = SAT_CAST( Y ); - dst[1] = SAT_CAST( Cr ); - dst[2] = SAT_CAST( Cb ); + dst[0] = SAT_CAST( Y ); + dst[1] = SAT_CAST( Cr ); + dst[2] = SAT_CAST( Cb ); + } + ++y; + } } } @@ -346,35 +398,44 @@ __kernel void YCrCb2RGB(__global const uchar* src, int src_step, int src_offset, int rows, int cols) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (y < rows && x < cols) + if (x < cols) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - __global const DATA_TYPE * srcptr = (__global const DATA_TYPE*)(src + src_idx); - __global DATA_TYPE * dstptr = (__global DATA_TYPE*)(dst + dst_idx); + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + { + if (y < rows) + { + int src_idx = mad24(y, src_step, src_offset + x * scnbytes); + int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); + __global const DATA_TYPE * srcptr = (__global const DATA_TYPE*)(src + src_idx); + __global DATA_TYPE * dstptr = (__global DATA_TYPE*)(dst + dst_idx); - DATA_TYPE y = srcptr[0], cr = srcptr[1], cb = srcptr[2]; + DATA_TYPE_4 src_pix = vload4(0, srcptr); + DATA_TYPE y = src_pix.x, cr = src_pix.y, cb = src_pix.z; #ifdef DEPTH_5 - __constant float * coeff = c_YCrCb2RGBCoeffs_f; - float r = y + coeff[0] * (cr - HALF_MAX); - float g = y + coeff[1] * (cr - HALF_MAX) + coeff[2] * (cb - HALF_MAX); - float b = y + coeff[3] * (cb - HALF_MAX); + __constant float * coeff = c_YCrCb2RGBCoeffs_f; + float r = y + coeff[0] * (cr - HALF_MAX); + float g = y + coeff[1] * (cr - HALF_MAX) + coeff[2] * (cb - HALF_MAX); + float b = y + coeff[3] * (cb - HALF_MAX); #else - __constant int * coeff = c_YCrCb2RGBCoeffs_i; - int r = y + CV_DESCALE(coeff[0] * (cr - HALF_MAX), yuv_shift); - int g = y + CV_DESCALE(coeff[1] * (cr - HALF_MAX) + coeff[2] * (cb - HALF_MAX), yuv_shift); - int b = y + CV_DESCALE(coeff[3] * (cb - HALF_MAX), yuv_shift); + __constant int * coeff = c_YCrCb2RGBCoeffs_i; + int r = y + CV_DESCALE(coeff[0] * (cr - HALF_MAX), yuv_shift); + int g = y + CV_DESCALE(coeff[1] * (cr - HALF_MAX) + coeff[2] * (cb - HALF_MAX), yuv_shift); + int b = y + CV_DESCALE(coeff[3] * (cb - HALF_MAX), yuv_shift); #endif - dstptr[(bidx^2)] = SAT_CAST(r); - dstptr[1] = SAT_CAST(g); - dstptr[bidx] = SAT_CAST(b); + dstptr[(bidx^2)] = SAT_CAST(r); + dstptr[1] = SAT_CAST(g); + dstptr[bidx] = SAT_CAST(b); #if dcn == 4 - dstptr[3] = MAX_NUM; + dstptr[3] = MAX_NUM; #endif + } + ++y; + } } } @@ -385,30 +446,39 @@ __kernel void RGB2XYZ(__global const uchar * srcptr, int src_step, int src_offse int rows, int cols, __constant COEFF_TYPE * coeffs) { int dx = get_global_id(0); - int dy = get_global_id(1); + int dy = get_global_id(1) * PIX_PER_WI_Y; - if (dy < rows && dx < cols) + if (dx < cols) { - int src_idx = mad24(dy, src_step, src_offset + dx * scnbytes); - int dst_idx = mad24(dy, dst_step, dst_offset + dx * dcnbytes); + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + { + if (dy < rows) + { + int src_idx = mad24(dy, src_step, src_offset + dx * scnbytes); + int dst_idx = mad24(dy, dst_step, dst_offset + dx * dcnbytes); - __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx); - __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx); + __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx); + __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx); - DATA_TYPE r = src[0], g = src[1], b = src[2]; + DATA_TYPE_4 src_pix = vload4(0, src); + DATA_TYPE r = src_pix.x, g = src_pix.y, b = src_pix.z; #ifdef DEPTH_5 - float x = r * coeffs[0] + g * coeffs[1] + b * coeffs[2]; - float y = r * coeffs[3] + g * coeffs[4] + b * coeffs[5]; - float z = r * coeffs[6] + g * coeffs[7] + b * coeffs[8]; + float x = r * coeffs[0] + g * coeffs[1] + b * coeffs[2]; + float y = r * coeffs[3] + g * coeffs[4] + b * coeffs[5]; + float z = r * coeffs[6] + g * coeffs[7] + b * coeffs[8]; #else - int x = CV_DESCALE(r * coeffs[0] + g * coeffs[1] + b * coeffs[2], xyz_shift); - int y = CV_DESCALE(r * coeffs[3] + g * coeffs[4] + b * coeffs[5], xyz_shift); - int z = CV_DESCALE(r * coeffs[6] + g * coeffs[7] + b * coeffs[8], xyz_shift); + int x = CV_DESCALE(r * coeffs[0] + g * coeffs[1] + b * coeffs[2], xyz_shift); + int y = CV_DESCALE(r * coeffs[3] + g * coeffs[4] + b * coeffs[5], xyz_shift); + int z = CV_DESCALE(r * coeffs[6] + g * coeffs[7] + b * coeffs[8], xyz_shift); #endif - dst[0] = SAT_CAST(x); - dst[1] = SAT_CAST(y); - dst[2] = SAT_CAST(z); + dst[0] = SAT_CAST(x); + dst[1] = SAT_CAST(y); + dst[2] = SAT_CAST(z); + } + ++dy; + } } } @@ -417,33 +487,42 @@ __kernel void XYZ2RGB(__global const uchar * srcptr, int src_step, int src_offse int rows, int cols, __constant COEFF_TYPE * coeffs) { int dx = get_global_id(0); - int dy = get_global_id(1); + int dy = get_global_id(1) * PIX_PER_WI_Y; - if (dy < rows && dx < cols) + if (dx < cols) { - int src_idx = mad24(dy, src_step, src_offset + dx * scnbytes); - int dst_idx = mad24(dy, dst_step, dst_offset + dx * dcnbytes); + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + { + if (dy < rows) + { + int src_idx = mad24(dy, src_step, src_offset + dx * scnbytes); + int dst_idx = mad24(dy, dst_step, dst_offset + dx * dcnbytes); - __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx); - __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx); + __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx); + __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx); - DATA_TYPE x = src[0], y = src[1], z = src[2]; + DATA_TYPE_4 src_pix = vload4(0, src); + DATA_TYPE x = src_pix.x, y = src_pix.y, z = src_pix.z; #ifdef DEPTH_5 - float b = x * coeffs[0] + y * coeffs[1] + z * coeffs[2]; - float g = x * coeffs[3] + y * coeffs[4] + z * coeffs[5]; - float r = x * coeffs[6] + y * coeffs[7] + z * coeffs[8]; + float b = x * coeffs[0] + y * coeffs[1] + z * coeffs[2]; + float g = x * coeffs[3] + y * coeffs[4] + z * coeffs[5]; + float r = x * coeffs[6] + y * coeffs[7] + z * coeffs[8]; #else - int b = CV_DESCALE(x * coeffs[0] + y * coeffs[1] + z * coeffs[2], xyz_shift); - int g = CV_DESCALE(x * coeffs[3] + y * coeffs[4] + z * coeffs[5], xyz_shift); - int r = CV_DESCALE(x * coeffs[6] + y * coeffs[7] + z * coeffs[8], xyz_shift); + int b = CV_DESCALE(x * coeffs[0] + y * coeffs[1] + z * coeffs[2], xyz_shift); + int g = CV_DESCALE(x * coeffs[3] + y * coeffs[4] + z * coeffs[5], xyz_shift); + int r = CV_DESCALE(x * coeffs[6] + y * coeffs[7] + z * coeffs[8], xyz_shift); #endif - dst[0] = SAT_CAST(b); - dst[1] = SAT_CAST(g); - dst[2] = SAT_CAST(r); + dst[0] = SAT_CAST(b); + dst[1] = SAT_CAST(g); + dst[2] = SAT_CAST(r); #if dcn == 4 - dst[3] = MAX_NUM; + dst[3] = MAX_NUM; #endif + } + ++dy; + } } } @@ -454,33 +533,42 @@ __kernel void RGB(__global const uchar* srcptr, int src_step, int src_offset, int rows, int cols) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (y < rows && x < cols) + if (x < cols) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + { + if (y < rows) + { + int src_idx = mad24(y, src_step, src_offset + x * scnbytes); + int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx); - __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx); + __global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_idx); + __global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_idx); + DATA_TYPE_4 src_pix = vload4(0, src); #ifdef REVERSE - dst[0] = src[2]; - dst[1] = src[1]; - dst[2] = src[0]; + dst[0] = src_pix.z; + dst[1] = src_pix.y; + dst[2] = src_pix.x; #else - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; + dst[0] = src_pix.x; + dst[1] = src_pix.y; + dst[2] = src_pix.z; #endif #if dcn == 4 #if scn == 3 - dst[3] = MAX_NUM; + dst[3] = MAX_NUM; #else - dst[3] = src[3]; + dst[3] = src[3]; #endif #endif + } + ++y; + } } } @@ -491,31 +579,39 @@ __kernel void RGB5x52RGB(__global const uchar* src, int src_step, int src_offset int rows, int cols) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (y < rows && x < cols) + if (x < cols) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - ushort t = *((__global const ushort*)(src + src_idx)); + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + { + if (y < rows) + { + int src_idx = mad24(y, src_step, src_offset + x * scnbytes); + int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); + ushort t = *((__global const ushort*)(src + src_idx)); #if greenbits == 6 - dst[dst_idx + bidx] = (uchar)(t << 3); - dst[dst_idx + 1] = (uchar)((t >> 3) & ~3); - dst[dst_idx + (bidx^2)] = (uchar)((t >> 8) & ~7); + dst[dst_idx + bidx] = (uchar)(t << 3); + dst[dst_idx + 1] = (uchar)((t >> 3) & ~3); + dst[dst_idx + (bidx^2)] = (uchar)((t >> 8) & ~7); #else - dst[dst_idx + bidx] = (uchar)(t << 3); - dst[dst_idx + 1] = (uchar)((t >> 2) & ~7); - dst[dst_idx + (bidx^2)] = (uchar)((t >> 7) & ~7); + dst[dst_idx + bidx] = (uchar)(t << 3); + dst[dst_idx + 1] = (uchar)((t >> 2) & ~7); + dst[dst_idx + (bidx^2)] = (uchar)((t >> 7) & ~7); #endif #if dcn == 4 #if greenbits == 6 - dst[dst_idx + 3] = 255; + dst[dst_idx + 3] = 255; #else - dst[dst_idx + 3] = t & 0x8000 ? 255 : 0; + dst[dst_idx + 3] = t & 0x8000 ? 255 : 0; #endif #endif + } + ++y; + } } } @@ -524,21 +620,30 @@ __kernel void RGB2RGB5x5(__global const uchar* src, int src_step, int src_offset int rows, int cols) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (y < rows && x < cols) + if (x < cols) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + { + if (y < rows) + { + int src_idx = mad24(y, src_step, src_offset + x * scnbytes); + int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); + uchar4 src_pix = vload4(0, src + src_idx); #if greenbits == 6 - *((__global ushort*)(dst + dst_idx)) = (ushort)((src[src_idx + bidx] >> 3)|((src[src_idx + 1]&~3) << 3)|((src[src_idx + (bidx^2)]&~7) << 8)); + *((__global ushort*)(dst + dst_idx)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~3) << 3)|((src_pix.R_COMP&~7) << 8)); #elif scn == 3 - *((__global ushort*)(dst + dst_idx)) = (ushort)((src[src_idx + bidx] >> 3)|((src[src_idx + 1]&~7) << 2)|((src[src_idx + (bidx^2)]&~7) << 7)); + *((__global ushort*)(dst + dst_idx)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|((src_pix.R_COMP&~7) << 7)); #else - *((__global ushort*)(dst + dst_idx)) = (ushort)((src[src_idx + bidx] >> 3)|((src[src_idx + 1]&~7) << 2)| - ((src[src_idx + (bidx^2)]&~7) << 7)|(src[src_idx + 3] ? 0x8000 : 0)); + *((__global ushort*)(dst + dst_idx)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)| + ((src_pix.R_COMP&~7) << 7)|(src_pix.w ? 0x8000 : 0)); #endif + } + ++y; + } } } @@ -549,23 +654,31 @@ __kernel void BGR5x52Gray(__global const uchar* src, int src_step, int src_offse int rows, int cols) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (y < rows && x < cols) + if (x < cols) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x); - int t = *((__global const ushort*)(src + src_idx)); + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + { + if (y < rows) + { + int src_idx = mad24(y, src_step, src_offset + x * scnbytes); + int dst_idx = mad24(y, dst_step, dst_offset + x); + int t = *((__global const ushort*)(src + src_idx)); #if greenbits == 6 - dst[dst_idx] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y + - ((t >> 3) & 0xfc)*G2Y + - ((t >> 8) & 0xf8)*R2Y, yuv_shift); + dst[dst_idx] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y + + ((t >> 3) & 0xfc)*G2Y + + ((t >> 8) & 0xf8)*R2Y, yuv_shift); #else - dst[dst_idx] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y + - ((t >> 2) & 0xf8)*G2Y + - ((t >> 7) & 0xf8)*R2Y, yuv_shift); + dst[dst_idx] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y + + ((t >> 2) & 0xf8)*G2Y + + ((t >> 7) & 0xf8)*R2Y, yuv_shift); #endif + } + ++y; + } } } @@ -574,20 +687,28 @@ __kernel void Gray2BGR5x5(__global const uchar* src, int src_step, int src_offse int rows, int cols) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (y < rows && x < cols) + if (x < cols) { - int src_idx = mad24(y, src_step, src_offset + x); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - int t = src[src_idx]; + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + { + if (y < rows) + { + int src_idx = mad24(y, src_step, src_offset + x); + int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); + int t = src[src_idx]; #if greenbits == 6 - *((__global ushort*)(dst + dst_idx)) = (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8)); + *((__global ushort*)(dst + dst_idx)) = (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8)); #else - t >>= 3; - *((__global ushort*)(dst + dst_idx)) = (ushort)(t|(t << 5)|(t << 10)); + t >>= 3; + *((__global ushort*)(dst + dst_idx)) = (ushort)(t|(t << 5)|(t << 10)); #endif + } + ++y; + } } } @@ -608,36 +729,45 @@ __kernel void RGB2HSV(__global const uchar* src, int src_step, int src_offset, __constant int * sdiv_table, __constant int * hdiv_table) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (y < rows && x < cols) + if (x < cols) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - - int b = src[src_idx + bidx], g = src[src_idx + 1], r = src[src_idx + (bidx^2)]; - int h, s, v = b; - int vmin = b, diff; - int vr, vg; - - v = max( v, g ); - v = max( v, r ); - vmin = min( vmin, g ); - vmin = min( vmin, r ); - - diff = v - vmin; - vr = v == r ? -1 : 0; - vg = v == g ? -1 : 0; - - s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift; - h = (vr & (g - b)) + - (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff)))); - h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift; - h += h < 0 ? hrange : 0; - - dst[dst_idx] = convert_uchar_sat_rte(h); - dst[dst_idx + 1] = (uchar)s; - dst[dst_idx + 2] = (uchar)v; + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + { + if (y < rows) + { + int src_idx = mad24(y, src_step, src_offset + x * scnbytes); + int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); + uchar4 src_pix = vload4(0, src + src_idx); + + int b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP; + int h, s, v = b; + int vmin = b, diff; + int vr, vg; + + v = max( v, g ); + v = max( v, r ); + vmin = min( vmin, g ); + vmin = min( vmin, r ); + + diff = v - vmin; + vr = v == r ? -1 : 0; + vg = v == g ? -1 : 0; + + s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift; + h = (vr & (g - b)) + + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff)))); + h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift; + h += h < 0 ? hrange : 0; + + dst[dst_idx] = convert_uchar_sat_rte(h); + dst[dst_idx + 1] = (uchar)s; + dst[dst_idx + 2] = (uchar)v; + } + ++y; + } } } @@ -646,51 +776,60 @@ __kernel void HSV2RGB(__global const uchar* src, int src_step, int src_offset, int rows, int cols) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (y < rows && x < cols) + if (x < cols) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - - float h = src[src_idx], s = src[src_idx + 1]*(1/255.f), v = src[src_idx + 2]*(1/255.f); - float b, g, r; - - if (s != 0) + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { - float tab[4]; - int sector; - h *= hscale; - if( h < 0 ) - do h += 6; while( h < 0 ); - else if( h >= 6 ) - do h -= 6; while( h >= 6 ); - sector = convert_int_sat_rtn(h); - h -= sector; - if( (unsigned)sector >= 6u ) + if (y < rows) { - sector = 0; - h = 0.f; - } - - tab[0] = v; - tab[1] = v*(1.f - s); - tab[2] = v*(1.f - s*h); - tab[3] = v*(1.f - s*(1.f - h)); - - b = tab[sector_data[sector][0]]; - g = tab[sector_data[sector][1]]; - r = tab[sector_data[sector][2]]; - } - else - b = g = r = v; - - dst[dst_idx + bidx] = convert_uchar_sat_rte(b*255.f); - dst[dst_idx + 1] = convert_uchar_sat_rte(g*255.f); - dst[dst_idx + (bidx^2)] = convert_uchar_sat_rte(r*255.f); + int src_idx = mad24(y, src_step, src_offset + x * scnbytes); + int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); + uchar4 src_pix = vload4(0, src + src_idx); + + float h = src_pix.x, s = src_pix.y*(1/255.f), v = src_pix.z*(1/255.f); + float b, g, r; + + if (s != 0) + { + float tab[4]; + int sector; + h *= hscale; + if( h < 0 ) + do h += 6; while( h < 0 ); + else if( h >= 6 ) + do h -= 6; while( h >= 6 ); + sector = convert_int_sat_rtn(h); + h -= sector; + if( (unsigned)sector >= 6u ) + { + sector = 0; + h = 0.f; + } + + tab[0] = v; + tab[1] = v*(1.f - s); + tab[2] = v*(1.f - s*h); + tab[3] = v*(1.f - s*(1.f - h)); + + b = tab[sector_data[sector][0]]; + g = tab[sector_data[sector][1]]; + r = tab[sector_data[sector][2]]; + } + else + b = g = r = v; + + dst[dst_idx + bidx] = convert_uchar_sat_rte(b*255.f); + dst[dst_idx + 1] = convert_uchar_sat_rte(g*255.f); + dst[dst_idx + (bidx^2)] = convert_uchar_sat_rte(r*255.f); #if dcn == 4 - dst[dst_idx + 3] = MAX_NUM; + dst[dst_idx + 3] = MAX_NUM; #endif + } + ++y; + } } } @@ -701,42 +840,51 @@ __kernel void RGB2HSV(__global const uchar* srcptr, int src_step, int src_offset int rows, int cols) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (y < rows && x < cols) + if (x < cols) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - - __global const float * src = (__global const float *)(srcptr + src_idx); - __global float * dst = (__global float *)(dstptr + dst_idx); - - float b = src[bidx], g = src[1], r = src[bidx^2]; - float h, s, v; - - float vmin, diff; - - v = vmin = r; - if( v < g ) v = g; - if( v < b ) v = b; - if( vmin > g ) vmin = g; - if( vmin > b ) vmin = b; - - diff = v - vmin; - s = diff/(float)(fabs(v) + FLT_EPSILON); - diff = (float)(60.f/(diff + FLT_EPSILON)); - if( v == r ) - h = (g - b)*diff; - else if( v == g ) - h = (b - r)*diff + 120.f; - else - h = (r - g)*diff + 240.f; - - if( h < 0 ) h += 360.f; - - dst[0] = h*hscale; - dst[1] = s; - dst[2] = v; + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + { + if (y < rows) + { + int src_idx = mad24(y, src_step, src_offset + x * scnbytes); + int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); + + __global const float * src = (__global const float *)(srcptr + src_idx); + __global float * dst = (__global float *)(dstptr + dst_idx); + float4 src_pix = vload4(0, src); + + float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP; + float h, s, v; + + float vmin, diff; + + v = vmin = r; + if( v < g ) v = g; + if( v < b ) v = b; + if( vmin > g ) vmin = g; + if( vmin > b ) vmin = b; + + diff = v - vmin; + s = diff/(float)(fabs(v) + FLT_EPSILON); + diff = (float)(60.f/(diff + FLT_EPSILON)); + if( v == r ) + h = (g - b)*diff; + else if( v == g ) + h = (b - r)*diff + 120.f; + else + h = (r - g)*diff + 240.f; + + if( h < 0 ) h += 360.f; + + dst[0] = h*hscale; + dst[1] = s; + dst[2] = v; + } + ++y; + } } } @@ -745,54 +893,63 @@ __kernel void HSV2RGB(__global const uchar* srcptr, int src_step, int src_offset int rows, int cols) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (y < rows && x < cols) + if (x < cols) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - - __global const float * src = (__global const float *)(srcptr + src_idx); - __global float * dst = (__global float *)(dstptr + dst_idx); - - float h = src[0], s = src[1], v = src[2]; - float b, g, r; - - if (s != 0) + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { - float tab[4]; - int sector; - h *= hscale; - if(h < 0) - do h += 6; while (h < 0); - else if (h >= 6) - do h -= 6; while (h >= 6); - sector = convert_int_sat_rtn(h); - h -= sector; - if ((unsigned)sector >= 6u) + if (y < rows) { - sector = 0; - h = 0.f; - } - - tab[0] = v; - tab[1] = v*(1.f - s); - tab[2] = v*(1.f - s*h); - tab[3] = v*(1.f - s*(1.f - h)); - - b = tab[sector_data[sector][0]]; - g = tab[sector_data[sector][1]]; - r = tab[sector_data[sector][2]]; - } - else - b = g = r = v; - - dst[bidx] = b; - dst[1] = g; - dst[bidx^2] = r; + int src_idx = mad24(y, src_step, src_offset + x * scnbytes); + int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); + + __global const float * src = (__global const float *)(srcptr + src_idx); + __global float * dst = (__global float *)(dstptr + dst_idx); + float4 src_pix = vload4(0, src); + + float h = src_pix.x, s = src_pix.y, v = src_pix.z; + float b, g, r; + + if (s != 0) + { + float tab[4]; + int sector; + h *= hscale; + if(h < 0) + do h += 6; while (h < 0); + else if (h >= 6) + do h -= 6; while (h >= 6); + sector = convert_int_sat_rtn(h); + h -= sector; + if ((unsigned)sector >= 6u) + { + sector = 0; + h = 0.f; + } + + tab[0] = v; + tab[1] = v*(1.f - s); + tab[2] = v*(1.f - s*h); + tab[3] = v*(1.f - s*(1.f - h)); + + b = tab[sector_data[sector][0]]; + g = tab[sector_data[sector][1]]; + r = tab[sector_data[sector][2]]; + } + else + b = g = r = v; + + dst[bidx] = b; + dst[1] = g; + dst[bidx^2] = r; #if dcn == 4 - dst[3] = MAX_NUM; + dst[3] = MAX_NUM; #endif + } + ++y; + } } } @@ -807,44 +964,53 @@ __kernel void RGB2HLS(__global const uchar* src, int src_step, int src_offset, int rows, int cols) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (y < rows && x < cols) + if (x < cols) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - - float b = src[src_idx + bidx]*(1/255.f), g = src[src_idx + 1]*(1/255.f), r = src[src_idx + (bidx^2)]*(1/255.f); - float h = 0.f, s = 0.f, l; - float vmin, vmax, diff; - - vmax = vmin = r; - if (vmax < g) vmax = g; - if (vmax < b) vmax = b; - if (vmin > g) vmin = g; - if (vmin > b) vmin = b; - - diff = vmax - vmin; - l = (vmax + vmin)*0.5f; - - if (diff > FLT_EPSILON) + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { - s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin); - diff = 60.f/diff; - - if( vmax == r ) - h = (g - b)*diff; - else if( vmax == g ) - h = (b - r)*diff + 120.f; - else - h = (r - g)*diff + 240.f; - - if( h < 0.f ) h += 360.f; + if (y < rows) + { + int src_idx = mad24(y, src_step, src_offset + x * scnbytes); + int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); + uchar4 src_pix = vload4(0, src + src_idx); + + float b = src_pix.B_COMP*(1/255.f), g = src_pix.G_COMP*(1/255.f), r = src_pix.R_COMP*(1/255.f); + float h = 0.f, s = 0.f, l; + float vmin, vmax, diff; + + vmax = vmin = r; + if (vmax < g) vmax = g; + if (vmax < b) vmax = b; + if (vmin > g) vmin = g; + if (vmin > b) vmin = b; + + diff = vmax - vmin; + l = (vmax + vmin)*0.5f; + + if (diff > FLT_EPSILON) + { + s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin); + diff = 60.f/diff; + + if( vmax == r ) + h = (g - b)*diff; + else if( vmax == g ) + h = (b - r)*diff + 120.f; + else + h = (r - g)*diff + 240.f; + + if( h < 0.f ) h += 360.f; + } + + dst[dst_idx] = convert_uchar_sat_rte(h*hscale); + dst[dst_idx + 1] = convert_uchar_sat_rte(l*255.f); + dst[dst_idx + 2] = convert_uchar_sat_rte(s*255.f); + } + ++y; } - - dst[dst_idx] = convert_uchar_sat_rte(h*hscale); - dst[dst_idx + 1] = convert_uchar_sat_rte(l*255.f); - dst[dst_idx + 2] = convert_uchar_sat_rte(s*255.f); } } @@ -853,50 +1019,59 @@ __kernel void HLS2RGB(__global const uchar* src, int src_step, int src_offset, int rows, int cols) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (y < rows && x < cols) + if (x < cols) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - - float h = src[src_idx], l = src[src_idx + 1]*(1.f/255.f), s = src[src_idx + 2]*(1.f/255.f); - float b, g, r; - - if (s != 0) + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { - float tab[4]; - - float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s; - float p1 = 2*l - p2; - - h *= hscale; - if( h < 0 ) - do h += 6; while( h < 0 ); - else if( h >= 6 ) - do h -= 6; while( h >= 6 ); - - int sector = convert_int_sat_rtn(h); - h -= sector; - - tab[0] = p2; - tab[1] = p1; - tab[2] = p1 + (p2 - p1)*(1-h); - tab[3] = p1 + (p2 - p1)*h; - - b = tab[sector_data[sector][0]]; - g = tab[sector_data[sector][1]]; - r = tab[sector_data[sector][2]]; - } - else - b = g = r = l; - - dst[dst_idx + bidx] = convert_uchar_sat_rte(b*255.f); - dst[dst_idx + 1] = convert_uchar_sat_rte(g*255.f); - dst[dst_idx + (bidx^2)] = convert_uchar_sat_rte(r*255.f); + if (y < rows) + { + int src_idx = mad24(y, src_step, src_offset + x * scnbytes); + int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); + uchar4 src_pix = vload4(0, src + src_idx); + + float h = src_pix.x, l = src_pix.y*(1.f/255.f), s = src_pix.z*(1.f/255.f); + float b, g, r; + + if (s != 0) + { + float tab[4]; + + float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s; + float p1 = 2*l - p2; + + h *= hscale; + if( h < 0 ) + do h += 6; while( h < 0 ); + else if( h >= 6 ) + do h -= 6; while( h >= 6 ); + + int sector = convert_int_sat_rtn(h); + h -= sector; + + tab[0] = p2; + tab[1] = p1; + tab[2] = p1 + (p2 - p1)*(1-h); + tab[3] = p1 + (p2 - p1)*h; + + b = tab[sector_data[sector][0]]; + g = tab[sector_data[sector][1]]; + r = tab[sector_data[sector][2]]; + } + else + b = g = r = l; + + dst[dst_idx + bidx] = convert_uchar_sat_rte(b*255.f); + dst[dst_idx + 1] = convert_uchar_sat_rte(g*255.f); + dst[dst_idx + (bidx^2)] = convert_uchar_sat_rte(r*255.f); #if dcn == 4 - dst[dst_idx + 3] = MAX_NUM; + dst[dst_idx + 3] = MAX_NUM; #endif + } + ++y; + } } } @@ -907,47 +1082,56 @@ __kernel void RGB2HLS(__global const uchar* srcptr, int src_step, int src_offset int rows, int cols) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (y < rows && x < cols) + if (x < cols) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - - __global const float * src = (__global const float *)(srcptr + src_idx); - __global float * dst = (__global float *)(dstptr + dst_idx); - - float b = src[bidx], g = src[1], r = src[bidx^2]; - float h = 0.f, s = 0.f, l; - float vmin, vmax, diff; - - vmax = vmin = r; - if (vmax < g) vmax = g; - if (vmax < b) vmax = b; - if (vmin > g) vmin = g; - if (vmin > b) vmin = b; - - diff = vmax - vmin; - l = (vmax + vmin)*0.5f; - - if (diff > FLT_EPSILON) + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { - s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin); - diff = 60.f/diff; - - if( vmax == r ) - h = (g - b)*diff; - else if( vmax == g ) - h = (b - r)*diff + 120.f; - else - h = (r - g)*diff + 240.f; - - if( h < 0.f ) h += 360.f; + if (y < rows) + { + int src_idx = mad24(y, src_step, src_offset + x * scnbytes); + int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); + + __global const float * src = (__global const float *)(srcptr + src_idx); + __global float * dst = (__global float *)(dstptr + dst_idx); + float4 src_pix = vload4(0, src); + + float b = src_pix.B_COMP, g = src_pix.G_COMP, r = src_pix.R_COMP; + float h = 0.f, s = 0.f, l; + float vmin, vmax, diff; + + vmax = vmin = r; + if (vmax < g) vmax = g; + if (vmax < b) vmax = b; + if (vmin > g) vmin = g; + if (vmin > b) vmin = b; + + diff = vmax - vmin; + l = (vmax + vmin)*0.5f; + + if (diff > FLT_EPSILON) + { + s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin); + diff = 60.f/diff; + + if( vmax == r ) + h = (g - b)*diff; + else if( vmax == g ) + h = (b - r)*diff + 120.f; + else + h = (r - g)*diff + 240.f; + + if( h < 0.f ) h += 360.f; + } + + dst[0] = h*hscale; + dst[1] = l; + dst[2] = s; + } + ++y; } - - dst[0] = h*hscale; - dst[1] = l; - dst[2] = s; } } @@ -956,54 +1140,63 @@ __kernel void HLS2RGB(__global const uchar* srcptr, int src_step, int src_offset int rows, int cols) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (y < rows && x < cols) + if (x < cols) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - - __global const float * src = (__global const float *)(srcptr + src_idx); - __global float * dst = (__global float *)(dstptr + dst_idx); - - float h = src[0], l = src[1], s = src[2]; - float b, g, r; - - if (s != 0) + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) { - float tab[4]; - int sector; - - float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s; - float p1 = 2*l - p2; - - h *= hscale; - if( h < 0 ) - do h += 6; while( h < 0 ); - else if( h >= 6 ) - do h -= 6; while( h >= 6 ); - - sector = convert_int_sat_rtn(h); - h -= sector; - - tab[0] = p2; - tab[1] = p1; - tab[2] = p1 + (p2 - p1)*(1-h); - tab[3] = p1 + (p2 - p1)*h; - - b = tab[sector_data[sector][0]]; - g = tab[sector_data[sector][1]]; - r = tab[sector_data[sector][2]]; - } - else - b = g = r = l; - - dst[bidx] = b; - dst[1] = g; - dst[bidx^2] = r; + if (y < rows) + { + int src_idx = mad24(y, src_step, src_offset + x * scnbytes); + int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); + + __global const float * src = (__global const float *)(srcptr + src_idx); + __global float * dst = (__global float *)(dstptr + dst_idx); + float4 src_pix = vload4(0, src); + + float h = src_pix.x, l = src_pix.y, s = src_pix.z; + float b, g, r; + + if (s != 0) + { + float tab[4]; + int sector; + + float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s; + float p1 = 2*l - p2; + + h *= hscale; + if( h < 0 ) + do h += 6; while( h < 0 ); + else if( h >= 6 ) + do h -= 6; while( h >= 6 ); + + sector = convert_int_sat_rtn(h); + h -= sector; + + tab[0] = p2; + tab[1] = p1; + tab[2] = p1 + (p2 - p1)*(1-h); + tab[3] = p1 + (p2 - p1)*h; + + b = tab[sector_data[sector][0]]; + g = tab[sector_data[sector][1]]; + r = tab[sector_data[sector][2]]; + } + else + b = g = r = l; + + dst[bidx] = b; + dst[1] = g; + dst[bidx^2] = r; #if dcn == 4 - dst[3] = MAX_NUM; + dst[3] = MAX_NUM; #endif + } + ++y; + } } } @@ -1018,21 +1211,29 @@ __kernel void RGBA2mRGBA(__global const uchar* src, int src_step, int src_offset int rows, int cols) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (y < rows && x < cols) + if (x < cols) { - x <<= 2; - int src_idx = mad24(y, src_step, src_offset + x); - int dst_idx = mad24(y, dst_step, dst_offset + x); + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + { + if (y < rows) + { + int src_idx = mad24(y, src_step, src_offset + (x << 2)); + int dst_idx = mad24(y, dst_step, dst_offset + (x << 2)); + uchar4 src_pix = vload4(0, src + src_idx); - uchar v0 = src[src_idx], v1 = src[src_idx + 1]; - uchar v2 = src[src_idx + 2], v3 = src[src_idx + 3]; + uchar v0 = src_pix.x, v1 = src_pix.y; + uchar v2 = src_pix.z, v3 = src_pix.w; - dst[dst_idx] = (v0 * v3 + HALF_MAX) / MAX_NUM; - dst[dst_idx + 1] = (v1 * v3 + HALF_MAX) / MAX_NUM; - dst[dst_idx + 2] = (v2 * v3 + HALF_MAX) / MAX_NUM; - dst[dst_idx + 3] = v3; + dst[dst_idx] = (v0 * v3 + HALF_MAX) / MAX_NUM; + dst[dst_idx + 1] = (v1 * v3 + HALF_MAX) / MAX_NUM; + dst[dst_idx + 2] = (v2 * v3 + HALF_MAX) / MAX_NUM; + dst[dst_idx + 3] = v3; + } + ++y; + } } } @@ -1041,22 +1242,30 @@ __kernel void mRGBA2RGBA(__global const uchar* src, int src_step, int src_offset int rows, int cols) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (y < rows && x < cols) + if (x < cols) { - x <<= 2; - int src_idx = mad24(y, src_step, src_offset + x); - int dst_idx = mad24(y, dst_step, dst_offset + x); - - uchar v0 = src[src_idx], v1 = src[src_idx + 1]; - uchar v2 = src[src_idx + 2], v3 = src[src_idx + 3]; - uchar v3_half = v3 / 2; - - dst[dst_idx] = v3 == 0 ? 0 : (v0 * MAX_NUM + v3_half) / v3; - dst[dst_idx + 1] = v3 == 0 ? 0 : (v1 * MAX_NUM + v3_half) / v3; - dst[dst_idx + 2] = v3 == 0 ? 0 : (v2 * MAX_NUM + v3_half) / v3; - dst[dst_idx + 3] = v3; + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + { + if (y < rows) + { + int src_idx = mad24(y, src_step, src_offset + (x << 2)); + int dst_idx = mad24(y, dst_step, dst_offset + (x << 2)); + uchar4 src_pix = vload4(0, src + src_idx); + + uchar v0 = src_pix.x, v1 = src_pix.y; + uchar v2 = src_pix.z, v3 = src_pix.w; + uchar v3_half = v3 / 2; + + dst[dst_idx] = v3 == 0 ? 0 : (v0 * MAX_NUM + v3_half) / v3; + dst[dst_idx + 1] = v3 == 0 ? 0 : (v1 * MAX_NUM + v3_half) / v3; + dst[dst_idx + 2] = v3 == 0 ? 0 : (v2 * MAX_NUM + v3_half) / v3; + dst[dst_idx + 3] = v3; + } + ++y; + } } } @@ -1086,32 +1295,41 @@ __kernel void BGR2Lab(__global const uchar * src, int src_step, int src_offset, __constant int * coeffs, int Lscale, int Lshift) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (y < rows && x < cols) + if (x < cols) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + { + if (y < rows) + { + int src_idx = mad24(y, src_step, src_offset + x * scnbytes); + int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - src += src_idx; - dst += dst_idx; + __global const uchar* src_ptr = src + src_idx; + __global uchar* dst_ptr = dst + dst_idx; + uchar4 src_pix = vload4(0, src_ptr); - int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], - C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], - C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], + C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], + C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; - int R = gammaTab[src[0]], G = gammaTab[src[1]], B = gammaTab[src[2]]; - int fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)]; - int fY = LabCbrtTab_b[CV_DESCALE(R*C3 + G*C4 + B*C5, lab_shift)]; - int fZ = LabCbrtTab_b[CV_DESCALE(R*C6 + G*C7 + B*C8, lab_shift)]; + int R = gammaTab[src_pix.x], G = gammaTab[src_pix.y], B = gammaTab[src_pix.z]; + int fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)]; + int fY = LabCbrtTab_b[CV_DESCALE(R*C3 + G*C4 + B*C5, lab_shift)]; + int fZ = LabCbrtTab_b[CV_DESCALE(R*C6 + G*C7 + B*C8, lab_shift)]; - int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 ); - int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 ); - int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 ); + int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 ); + int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 ); + int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 ); - dst[0] = SAT_CAST(L); - dst[1] = SAT_CAST(a); - dst[2] = SAT_CAST(b); + dst_ptr[0] = SAT_CAST(L); + dst_ptr[1] = SAT_CAST(a); + dst_ptr[2] = SAT_CAST(b); + } + ++y; + } } } @@ -1125,45 +1343,54 @@ __kernel void BGR2Lab(__global const uchar * srcptr, int src_step, int src_offse __constant float * coeffs, float _1_3, float _a) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (y < rows && x < cols) + if (x < cols) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + { + if (y < rows) + { + int src_idx = mad24(y, src_step, src_offset + x * scnbytes); + int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - __global const float * src = (__global const float *)(srcptr + src_idx); - __global float * dst = (__global float *)(dstptr + dst_idx); + __global const float * src = (__global const float *)(srcptr + src_idx); + __global float * dst = (__global float *)(dstptr + dst_idx); + float4 src_pix = vload4(0, src); - float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], - C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], - C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; + float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], + C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5], + C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; - float R = clamp(src[0], 0.0f, 1.0f); - float G = clamp(src[1], 0.0f, 1.0f); - float B = clamp(src[2], 0.0f, 1.0f); + float R = clamp(src_pix.x, 0.0f, 1.0f); + float G = clamp(src_pix.y, 0.0f, 1.0f); + float B = clamp(src_pix.z, 0.0f, 1.0f); #ifdef SRGB - R = splineInterpolate(R * GammaTabScale, gammaTab, GAMMA_TAB_SIZE); - G = splineInterpolate(G * GammaTabScale, gammaTab, GAMMA_TAB_SIZE); - B = splineInterpolate(B * GammaTabScale, gammaTab, GAMMA_TAB_SIZE); + R = splineInterpolate(R * GammaTabScale, gammaTab, GAMMA_TAB_SIZE); + G = splineInterpolate(G * GammaTabScale, gammaTab, GAMMA_TAB_SIZE); + B = splineInterpolate(B * GammaTabScale, gammaTab, GAMMA_TAB_SIZE); #endif - float X = R*C0 + G*C1 + B*C2; - float Y = R*C3 + G*C4 + B*C5; - float Z = R*C6 + G*C7 + B*C8; + float X = R*C0 + G*C1 + B*C2; + float Y = R*C3 + G*C4 + B*C5; + float Z = R*C6 + G*C7 + B*C8; - float FX = X > 0.008856f ? pow(X, _1_3) : (7.787f * X + _a); - float FY = Y > 0.008856f ? pow(Y, _1_3) : (7.787f * Y + _a); - float FZ = Z > 0.008856f ? pow(Z, _1_3) : (7.787f * Z + _a); + float FX = X > 0.008856f ? pow(X, _1_3) : (7.787f * X + _a); + float FY = Y > 0.008856f ? pow(Y, _1_3) : (7.787f * Y + _a); + float FZ = Z > 0.008856f ? pow(Z, _1_3) : (7.787f * Z + _a); - float L = Y > 0.008856f ? (116.f * FY - 16.f) : (903.3f * Y); - float a = 500.f * (FX - FY); - float b = 200.f * (FY - FZ); + float L = Y > 0.008856f ? (116.f * FY - 16.f) : (903.3f * Y); + float a = 500.f * (FX - FY); + float b = 200.f * (FY - FZ); - dst[0] = L; - dst[1] = a; - dst[2] = b; + dst[0] = L; + dst[1] = a; + dst[2] = b; + } + ++y; + } } } @@ -1225,33 +1452,42 @@ __kernel void Lab2BGR(__global const uchar * src, int src_step, int src_offset, __constant float * coeffs, float lThresh, float fThresh) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (y < rows && x < cols) + if (x < cols) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + { + if (y < rows) + { + int src_idx = mad24(y, src_step, src_offset + x * scnbytes); + int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - src += src_idx; - dst += dst_idx; + __global const uchar* src_ptr = src + src_idx; + __global uchar* dst_ptr = dst + dst_idx; + uchar4 src_pix = vload4(0, src_ptr); - float srcbuf[3], dstbuf[3]; - srcbuf[0] = src[0]*(100.f/255.f); - srcbuf[1] = convert_float(src[1] - 128); - srcbuf[2] = convert_float(src[2] - 128); + float srcbuf[3], dstbuf[3]; + srcbuf[0] = src_pix.x*(100.f/255.f); + srcbuf[1] = convert_float(src_pix.y - 128); + srcbuf[2] = convert_float(src_pix.z - 128); - Lab2BGR_f(&srcbuf[0], &dstbuf[0], + Lab2BGR_f(&srcbuf[0], &dstbuf[0], #ifdef SRGB - gammaTab, + gammaTab, #endif - coeffs, lThresh, fThresh); + coeffs, lThresh, fThresh); - dst[0] = SAT_CAST(dstbuf[0] * 255.0f); - dst[1] = SAT_CAST(dstbuf[1] * 255.0f); - dst[2] = SAT_CAST(dstbuf[2] * 255.0f); + dst_ptr[0] = SAT_CAST(dstbuf[0] * 255.0f); + dst_ptr[1] = SAT_CAST(dstbuf[1] * 255.0f); + dst_ptr[2] = SAT_CAST(dstbuf[2] * 255.0f); #if dcn == 4 - dst[3] = MAX_NUM; + dst_ptr[3] = MAX_NUM; #endif + } + ++y; + } } } @@ -1265,29 +1501,38 @@ __kernel void Lab2BGR(__global const uchar * srcptr, int src_step, int src_offse __constant float * coeffs, float lThresh, float fThresh) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * PIX_PER_WI_Y; - if (y < rows && x < cols) + if (x < cols) { - int src_idx = mad24(y, src_step, src_offset + x * scnbytes); - int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); + #pragma unroll + for (int cy = 0; cy < PIX_PER_WI_Y; ++cy) + { + if (y < rows) + { + int src_idx = mad24(y, src_step, src_offset + x * scnbytes); + int dst_idx = mad24(y, dst_step, dst_offset + x * dcnbytes); - __global const float * src = (__global const float *)(srcptr + src_idx); - __global float * dst = (__global float *)(dstptr + dst_idx); + __global const float * src = (__global const float *)(srcptr + src_idx); + __global float * dst = (__global float *)(dstptr + dst_idx); + float4 src_pix = vload4(0, src); - float srcbuf[3], dstbuf[3]; - srcbuf[0] = src[0], srcbuf[1] = src[1], srcbuf[2] = src[2]; + float srcbuf[3], dstbuf[3]; + srcbuf[0] = src_pix.x, srcbuf[1] = src_pix.y, srcbuf[2] = src_pix.z; - Lab2BGR_f(&srcbuf[0], &dstbuf[0], + Lab2BGR_f(&srcbuf[0], &dstbuf[0], #ifdef SRGB - gammaTab, + gammaTab, #endif - coeffs, lThresh, fThresh); + coeffs, lThresh, fThresh); - dst[0] = dstbuf[0], dst[1] = dstbuf[1], dst[2] = dstbuf[2]; + dst[0] = dstbuf[0], dst[1] = dstbuf[1], dst[2] = dstbuf[2]; #if dcn == 4 - dst[3] = MAX_NUM; + dst[3] = MAX_NUM; #endif + } + ++y; + } } } diff --git a/modules/imgproc/src/samplers.cpp b/modules/imgproc/src/samplers.cpp index d6cc8a56fe..197a7ced65 100644 --- a/modules/imgproc/src/samplers.cpp +++ b/modules/imgproc/src/samplers.cpp @@ -172,7 +172,7 @@ void getRectSubPix_Cn_(const _Tp* src, size_t src_step, Size src_size, dst[j+1] = cast_op(s1); } - for( j = 0; j < win_size.width; j++ ) + for( ; j < win_size.width; j++ ) { _WTp s0 = src[j]*a11 + src[j+cn]*a12 + src[j+src_step]*a21 + src[j+src_step+cn]*a22; dst[j] = cast_op(s0); diff --git a/modules/imgproc/src/sumpixels.cpp b/modules/imgproc/src/sumpixels.cpp old mode 100644 new mode 100755 index c32813fabd..6802916bab --- a/modules/imgproc/src/sumpixels.cpp +++ b/modules/imgproc/src/sumpixels.cpp @@ -219,6 +219,8 @@ static void integral_##suffix( T* src, size_t srcstep, ST* sum, size_t sumstep, DEF_INTEGRAL_FUNC(8u32s, uchar, int, double) DEF_INTEGRAL_FUNC(8u32f64f, uchar, float, double) DEF_INTEGRAL_FUNC(8u64f64f, uchar, double, double) +DEF_INTEGRAL_FUNC(16u64f64f, ushort, double, double) +DEF_INTEGRAL_FUNC(16s64f64f, short, double, double) DEF_INTEGRAL_FUNC(32f32f64f, float, float, double) DEF_INTEGRAL_FUNC(32f64f64f, float, double, double) DEF_INTEGRAL_FUNC(64f64f64f, double, double, double) @@ -411,6 +413,10 @@ void cv::integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, Output func = (IntegralFunc)integral_8u32f32f; else if( depth == CV_8U && sdepth == CV_64F && sqdepth == CV_64F ) func = (IntegralFunc)integral_8u64f64f; + else if( depth == CV_16U && sdepth == CV_64F && sqdepth == CV_64F ) + func = (IntegralFunc)integral_16u64f64f; + else if( depth == CV_16S && sdepth == CV_64F && sqdepth == CV_64F ) + func = (IntegralFunc)integral_16s64f64f; else if( depth == CV_32F && sdepth == CV_32F && sqdepth == CV_64F ) func = (IntegralFunc)integral_32f32f64f; else if( depth == CV_32F && sdepth == CV_32F && sqdepth == CV_32F ) diff --git a/modules/imgproc/src/templmatch.cpp b/modules/imgproc/src/templmatch.cpp index ca132dd0f8..d3fb92ebb3 100644 --- a/modules/imgproc/src/templmatch.cpp +++ b/modules/imgproc/src/templmatch.cpp @@ -341,10 +341,93 @@ static bool ocl_matchTemplate( InputArray _img, InputArray _templ, OutputArray _ #endif +#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7) + +typedef IppStatus (CV_STDCALL * ippimatchTemplate)(const void*, int, IppiSize, const void*, int, IppiSize, Ipp32f* , int , IppEnum , Ipp8u*); + +static bool ipp_crossCorr(const Mat& src, const Mat& tpl, Mat& dst) +{ + if (src.channels()!= 1) + return false; + + IppStatus status; + + IppiSize srcRoiSize = {src.cols,src.rows}; + IppiSize tplRoiSize = {tpl.cols,tpl.rows}; + + Ipp8u *pBuffer; + int bufSize=0; + + int depth = src.depth(); + + ippimatchTemplate ippFunc = + depth==CV_8U ? (ippimatchTemplate)ippiCrossCorrNorm_8u32f_C1R: + depth==CV_32F? (ippimatchTemplate)ippiCrossCorrNorm_32f_C1R: 0; + + if (ippFunc==0) + return false; + + IppEnum funCfg = (IppEnum)(ippAlgAuto | ippiNormNone | ippiROIValid); + + status = ippiCrossCorrNormGetBufferSize(srcRoiSize, tplRoiSize, funCfg, &bufSize); + if ( status < 0 ) + return false; + + pBuffer = ippsMalloc_8u( bufSize ); + + status = ippFunc(src.data, (int)src.step, srcRoiSize, tpl.data, (int)tpl.step, tplRoiSize, (Ipp32f*)dst.data, (int)dst.step, funCfg, pBuffer); + + ippsFree( pBuffer ); + return status >= 0; +} + +static bool ipp_sqrDistance(const Mat& src, const Mat& tpl, Mat& dst) +{ + if (src.channels()!= 1) + return false; + + IppStatus status; + + IppiSize srcRoiSize = {src.cols,src.rows}; + IppiSize tplRoiSize = {tpl.cols,tpl.rows}; + + Ipp8u *pBuffer; + int bufSize=0; + + int depth = src.depth(); + + ippimatchTemplate ippFunc = + depth==CV_8U ? (ippimatchTemplate)ippiSqrDistanceNorm_8u32f_C1R: + depth==CV_32F? (ippimatchTemplate)ippiSqrDistanceNorm_32f_C1R: 0; + + if (ippFunc==0) + return false; + + IppEnum funCfg = (IppEnum)(ippAlgAuto | ippiNormNone | ippiROIValid); + + status = ippiSqrDistanceNormGetBufferSize(srcRoiSize, tplRoiSize, funCfg, &bufSize); + if ( status < 0 ) + return false; + + pBuffer = ippsMalloc_8u( bufSize ); + + status = ippFunc(src.data, (int)src.step, srcRoiSize, tpl.data, (int)tpl.step, tplRoiSize, (Ipp32f*)dst.data, (int)dst.step, funCfg, pBuffer); + + ippsFree( pBuffer ); + return status >= 0; +} + +#endif + void crossCorr( const Mat& img, const Mat& _templ, Mat& corr, Size corrsize, int ctype, Point anchor, double delta, int borderType ) { +#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7) + if (ipp_crossCorr(img, _templ, corr)) + return; +#endif + const double blockScale = 4.5; const int minBlockSize = 256; std::vector buf; @@ -560,6 +643,11 @@ void cv::matchTemplate( InputArray _img, InputArray _templ, OutputArray _result, return; #endif +#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7) + if (method == CV_TM_SQDIFF && ipp_sqrDistance(img, templ, result)) + return; +#endif + int cn = img.channels(); crossCorr( img, templ, result, result.size(), result.type(), Point(0,0), 0, 0); diff --git a/modules/nonfree/src/sift.cpp b/modules/nonfree/src/sift.cpp index 259e934ede..2112971e94 100644 --- a/modules/nonfree/src/sift.cpp +++ b/modules/nonfree/src/sift.cpp @@ -111,21 +111,6 @@ namespace cv /******************************* Defs and macros *****************************/ -// default number of sampled intervals per octave -static const int SIFT_INTVLS = 3; - -// default sigma for initial gaussian smoothing -static const float SIFT_SIGMA = 1.6f; - -// default threshold on keypoint contrast |D(x)| -static const float SIFT_CONTR_THR = 0.04f; - -// default threshold on keypoint ratio of principle curvatures -static const float SIFT_CURV_THR = 10.f; - -// double image size before pyramid construction? -static const bool SIFT_IMG_DBL = true; - // default width of descriptor histogram array static const int SIFT_DESCR_WIDTH = 4; diff --git a/modules/video/src/simpleflow.cpp b/modules/video/src/simpleflow.cpp index 66f4c41bdc..20fc6b5431 100644 --- a/modules/video/src/simpleflow.cpp +++ b/modules/video/src/simpleflow.cpp @@ -66,21 +66,6 @@ inline static float dist(const Vec2f& p1, const Vec2f& p2) { (p1[1] - p2[1]) * (p1[1] - p2[1]); } -inline static float dist(const Point2f& p1, const Point2f& p2) { - return (p1.x - p2.x) * (p1.x - p2.x) + - (p1.y - p2.y) * (p1.y - p2.y); -} - -inline static float dist(float x1, float y1, float x2, float y2) { - return (x1 - x2) * (x1 - x2) + - (y1 - y2) * (y1 - y2); -} - -inline static int dist(int x1, int y1, int x2, int y2) { - return (x1 - x2) * (x1 - x2) + - (y1 - y2) * (y1 - y2); -} - template inline static T min(T t1, T t2, T t3) { return (t1 <= t2 && t1 <= t3) ? t1 : min(t2, t3); diff --git a/samples/cpp/CMakeLists.txt b/samples/cpp/CMakeLists.txt index 6ef95a8d39..5b92b5e502 100644 --- a/samples/cpp/CMakeLists.txt +++ b/samples/cpp/CMakeLists.txt @@ -93,6 +93,10 @@ if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND) ocv_list_filterout(cpp_samples "viz") + if(NOT HAVE_IPP_A) + ocv_list_filterout(cpp_samples "/ippasync/") + endif() + foreach(sample_filename ${cpp_samples}) get_filename_component(sample ${sample_filename} NAME_WE) OPENCV_DEFINE_CPP_EXAMPLE(${sample} ${sample_filename})