diff --git a/modules/imgproc/CMakeLists.txt b/modules/imgproc/CMakeLists.txt index 6232aa5fab..0c7b3268df 100644 --- a/modules/imgproc/CMakeLists.txt +++ b/modules/imgproc/CMakeLists.txt @@ -1,6 +1,12 @@ set(the_description "Image Processing") ocv_add_dispatched_file(accum SSE4_1 AVX AVX2) +ocv_add_dispatched_file(bilateral_filter SSE2 AVX2) +ocv_add_dispatched_file(box_filter SSE2 SSE4_1 AVX2) +ocv_add_dispatched_file(filter SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_hsv SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_rgb SSE2 SSE4_1 AVX2) ocv_add_dispatched_file(color_yuv SSE2 SSE4_1 AVX2) +ocv_add_dispatched_file(median_blur SSE2 SSE4_1 AVX2) +ocv_add_dispatched_file(morph SSE2 SSE4_1 AVX2) +ocv_add_dispatched_file(smooth SSE2 SSE4_1 AVX2) ocv_define_module(imgproc opencv_core WRAP java python js) diff --git a/modules/imgproc/src/bilateral_filter.dispatch.cpp b/modules/imgproc/src/bilateral_filter.dispatch.cpp new file mode 100644 index 0000000000..a27ebb18f5 --- /dev/null +++ b/modules/imgproc/src/bilateral_filter.dispatch.cpp @@ -0,0 +1,427 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, 2018, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +#include + +#include "opencv2/core/hal/intrin.hpp" +#include "opencl_kernels_imgproc.hpp" + +#include "bilateral_filter.simd.hpp" +#include "bilateral_filter.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content + +/****************************************************************************************\ + Bilateral Filtering +\****************************************************************************************/ + +namespace cv { + +#ifdef HAVE_OPENCL + +static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d, + double sigma_color, double sigma_space, + int borderType) +{ + CV_INSTRUMENT_REGION(); +#ifdef __ANDROID__ + if (ocl::Device::getDefault().isNVidia()) + return false; +#endif + + int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + int i, j, maxk, radius; + + if (depth != CV_8U || cn > 4) + return false; + + if (sigma_color <= 0) + sigma_color = 1; + if (sigma_space <= 0) + sigma_space = 1; + + double gauss_color_coeff = -0.5 / (sigma_color * sigma_color); + double gauss_space_coeff = -0.5 / (sigma_space * sigma_space); + + if ( d <= 0 ) + radius = cvRound(sigma_space * 1.5); + else + radius = d / 2; + radius = MAX(radius, 1); + d = radius * 2 + 1; + + UMat src = _src.getUMat(), dst = _dst.getUMat(), temp; + if (src.u == dst.u) + return false; + + copyMakeBorder(src, temp, radius, radius, radius, radius, borderType); + std::vector _space_weight(d * d); + std::vector _space_ofs(d * d); + float * const space_weight = &_space_weight[0]; + int * const space_ofs = &_space_ofs[0]; + + // initialize space-related bilateral filter coefficients + for( i = -radius, maxk = 0; i <= radius; i++ ) + for( j = -radius; j <= radius; j++ ) + { + double r = std::sqrt((double)i * i + (double)j * j); + if ( r > radius ) + continue; + space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff); + space_ofs[maxk++] = (int)(i * temp.step + j * cn); + } + + char cvt[3][40]; + String cnstr = cn > 1 ? format("%d", cn) : ""; + String kernelName("bilateral"); + size_t sizeDiv = 1; + if ((ocl::Device::getDefault().isIntel()) && + (ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU)) + { + //Intel GPU + if (dst.cols % 4 == 0 && cn == 1) // For single channel x4 sized images. + { + kernelName = "bilateral_float4"; + sizeDiv = 4; + } + } + ocl::Kernel k(kernelName.c_str(), ocl::imgproc::bilateral_oclsrc, + format("-D radius=%d -D maxk=%d -D cn=%d -D int_t=%s -D uint_t=uint%s -D convert_int_t=%s" + " -D uchar_t=%s -D float_t=%s -D convert_float_t=%s -D convert_uchar_t=%s -D gauss_color_coeff=(float)%f", + radius, maxk, cn, ocl::typeToStr(CV_32SC(cn)), cnstr.c_str(), + ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), + ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)), + ocl::convertTypeStr(CV_32S, CV_32F, cn, cvt[1]), + ocl::convertTypeStr(CV_32F, CV_8U, cn, cvt[2]), gauss_color_coeff)); + if (k.empty()) + return false; + + Mat mspace_weight(1, d * d, CV_32FC1, space_weight); + Mat mspace_ofs(1, d * d, CV_32SC1, space_ofs); + UMat ucolor_weight, uspace_weight, uspace_ofs; + + mspace_weight.copyTo(uspace_weight); + mspace_ofs.copyTo(uspace_ofs); + + k.args(ocl::KernelArg::ReadOnlyNoSize(temp), ocl::KernelArg::WriteOnly(dst), + ocl::KernelArg::PtrReadOnly(uspace_weight), + ocl::KernelArg::PtrReadOnly(uspace_ofs)); + + size_t globalsize[2] = { (size_t)dst.cols / sizeDiv, (size_t)dst.rows }; + return k.run(2, globalsize, NULL, false); +} +#endif + + +static void +bilateralFilter_8u( const Mat& src, Mat& dst, int d, + double sigma_color, double sigma_space, + int borderType ) +{ + CV_INSTRUMENT_REGION(); + + int cn = src.channels(); + int i, j, maxk, radius; + + CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) && src.data != dst.data ); + + if( sigma_color <= 0 ) + sigma_color = 1; + if( sigma_space <= 0 ) + sigma_space = 1; + + double gauss_color_coeff = -0.5/(sigma_color*sigma_color); + double gauss_space_coeff = -0.5/(sigma_space*sigma_space); + + if( d <= 0 ) + radius = cvRound(sigma_space*1.5); + else + radius = d/2; + radius = MAX(radius, 1); + d = radius*2 + 1; + + Mat temp; + copyMakeBorder( src, temp, radius, radius, radius, radius, borderType ); + + std::vector _color_weight(cn*256); + std::vector _space_weight(d*d); + std::vector _space_ofs(d*d); + float* color_weight = &_color_weight[0]; + float* space_weight = &_space_weight[0]; + int* space_ofs = &_space_ofs[0]; + + // initialize color-related bilateral filter coefficients + + for( i = 0; i < 256*cn; i++ ) + color_weight[i] = (float)std::exp(i*i*gauss_color_coeff); + + // initialize space-related bilateral filter coefficients + for( i = -radius, maxk = 0; i <= radius; i++ ) + { + j = -radius; + + for( ; j <= radius; j++ ) + { + double r = std::sqrt((double)i*i + (double)j*j); + if( r > radius ) + continue; + space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff); + space_ofs[maxk++] = (int)(i*temp.step + j*cn); + } + } + + CV_CPU_DISPATCH(bilateralFilterInvoker_8u, (dst, temp, radius, maxk, space_ofs, space_weight, color_weight), + CV_CPU_DISPATCH_MODES_ALL); +} + + +static void +bilateralFilter_32f( const Mat& src, Mat& dst, int d, + double sigma_color, double sigma_space, + int borderType ) +{ + CV_INSTRUMENT_REGION(); + + int cn = src.channels(); + int i, j, maxk, radius; + double minValSrc=-1, maxValSrc=1; + const int kExpNumBinsPerChannel = 1 << 12; + int kExpNumBins = 0; + float lastExpVal = 1.f; + float len, scale_index; + + CV_Assert( (src.type() == CV_32FC1 || src.type() == CV_32FC3) && src.data != dst.data ); + + if( sigma_color <= 0 ) + sigma_color = 1; + if( sigma_space <= 0 ) + sigma_space = 1; + + double gauss_color_coeff = -0.5/(sigma_color*sigma_color); + double gauss_space_coeff = -0.5/(sigma_space*sigma_space); + + if( d <= 0 ) + radius = cvRound(sigma_space*1.5); + else + radius = d/2; + radius = MAX(radius, 1); + d = radius*2 + 1; + // compute the min/max range for the input image (even if multichannel) + + minMaxLoc( src.reshape(1), &minValSrc, &maxValSrc ); + if(std::abs(minValSrc - maxValSrc) < FLT_EPSILON) + { + src.copyTo(dst); + return; + } + + // temporary copy of the image with borders for easy processing + Mat temp; + copyMakeBorder( src, temp, radius, radius, radius, radius, borderType ); + + // allocate lookup tables + std::vector _space_weight(d*d); + std::vector _space_ofs(d*d); + float* space_weight = &_space_weight[0]; + int* space_ofs = &_space_ofs[0]; + + // assign a length which is slightly more than needed + len = (float)(maxValSrc - minValSrc) * cn; + kExpNumBins = kExpNumBinsPerChannel * cn; + std::vector _expLUT(kExpNumBins+2); + float* expLUT = &_expLUT[0]; + + scale_index = kExpNumBins/len; + + // initialize the exp LUT + for( i = 0; i < kExpNumBins+2; i++ ) + { + if( lastExpVal > 0.f ) + { + double val = i / scale_index; + expLUT[i] = (float)std::exp(val * val * gauss_color_coeff); + lastExpVal = expLUT[i]; + } + else + expLUT[i] = 0.f; + } + + // initialize space-related bilateral filter coefficients + for( i = -radius, maxk = 0; i <= radius; i++ ) + for( j = -radius; j <= radius; j++ ) + { + double r = std::sqrt((double)i*i + (double)j*j); + if( r > radius || ( i == 0 && j == 0 ) ) + continue; + space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff); + space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn); + } + + // parallel_for usage + CV_CPU_DISPATCH(bilateralFilterInvoker_32f, (cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT), + CV_CPU_DISPATCH_MODES_ALL); +} + +#ifdef HAVE_IPP +#define IPP_BILATERAL_PARALLEL 1 + +#ifdef HAVE_IPP_IW +class ipp_bilateralFilterParallel: public ParallelLoopBody +{ +public: + ipp_bilateralFilterParallel(::ipp::IwiImage &_src, ::ipp::IwiImage &_dst, int _radius, Ipp32f _valSquareSigma, Ipp32f _posSquareSigma, ::ipp::IwiBorderType _borderType, bool *_ok): + src(_src), dst(_dst) + { + pOk = _ok; + + radius = _radius; + valSquareSigma = _valSquareSigma; + posSquareSigma = _posSquareSigma; + borderType = _borderType; + + *pOk = true; + } + ~ipp_bilateralFilterParallel() {} + + virtual void operator() (const Range& range) const CV_OVERRIDE + { + if(*pOk == false) + return; + + try + { + ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, dst.m_size.width, range.end - range.start); + CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBilateral, src, dst, radius, valSquareSigma, posSquareSigma, ::ipp::IwDefault(), borderType, tile); + } + catch(const ::ipp::IwException &) + { + *pOk = false; + return; + } + } +private: + ::ipp::IwiImage &src; + ::ipp::IwiImage &dst; + + int radius; + Ipp32f valSquareSigma; + Ipp32f posSquareSigma; + ::ipp::IwiBorderType borderType; + + bool *pOk; + const ipp_bilateralFilterParallel& operator= (const ipp_bilateralFilterParallel&); +}; +#endif + +static bool ipp_bilateralFilter(Mat &src, Mat &dst, int d, double sigmaColor, double sigmaSpace, int borderType) +{ +#ifdef HAVE_IPP_IW + CV_INSTRUMENT_REGION_IPP(); + + int radius = IPP_MAX(((d <= 0)?cvRound(sigmaSpace*1.5):d/2), 1); + Ipp32f valSquareSigma = (Ipp32f)((sigmaColor <= 0)?1:sigmaColor*sigmaColor); + Ipp32f posSquareSigma = (Ipp32f)((sigmaSpace <= 0)?1:sigmaSpace*sigmaSpace); + + // Acquire data and begin processing + try + { + ::ipp::IwiImage iwSrc = ippiGetImage(src); + ::ipp::IwiImage iwDst = ippiGetImage(dst); + ::ipp::IwiBorderSize borderSize(radius); + ::ipp::IwiBorderType ippBorder(ippiGetBorder(iwSrc, borderType, borderSize)); + if(!ippBorder) + return false; + + const int threads = ippiSuggestThreadsNum(iwDst, 2); + if(IPP_BILATERAL_PARALLEL && threads > 1) { + bool ok = true; + Range range(0, (int)iwDst.m_size.height); + ipp_bilateralFilterParallel invoker(iwSrc, iwDst, radius, valSquareSigma, posSquareSigma, ippBorder, &ok); + if(!ok) + return false; + + parallel_for_(range, invoker, threads*4); + + if(!ok) + return false; + } else { + CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBilateral, iwSrc, iwDst, radius, valSquareSigma, posSquareSigma, ::ipp::IwDefault(), ippBorder); + } + } + catch (const ::ipp::IwException &) + { + return false; + } + return true; +#else + CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(d); CV_UNUSED(sigmaColor); CV_UNUSED(sigmaSpace); CV_UNUSED(borderType); + return false; +#endif +} +#endif + +void bilateralFilter( InputArray _src, OutputArray _dst, int d, + double sigmaColor, double sigmaSpace, + int borderType ) +{ + CV_INSTRUMENT_REGION(); + + _dst.create( _src.size(), _src.type() ); + + CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(), + ocl_bilateralFilter_8u(_src, _dst, d, sigmaColor, sigmaSpace, borderType)) + + Mat src = _src.getMat(), dst = _dst.getMat(); + + CV_IPP_RUN_FAST(ipp_bilateralFilter(src, dst, d, sigmaColor, sigmaSpace, borderType)); + + if( src.depth() == CV_8U ) + bilateralFilter_8u( src, dst, d, sigmaColor, sigmaSpace, borderType ); + else if( src.depth() == CV_32F ) + bilateralFilter_32f( src, dst, d, sigmaColor, sigmaSpace, borderType ); + else + CV_Error( CV_StsUnsupportedFormat, + "Bilateral filtering is only implemented for 8u and 32f images" ); +} + +} // namespace diff --git a/modules/imgproc/src/bilateral_filter.cpp b/modules/imgproc/src/bilateral_filter.simd.hpp similarity index 83% rename from modules/imgproc/src/bilateral_filter.cpp rename to modules/imgproc/src/bilateral_filter.simd.hpp index e9181f2182..65abcd4e40 100644 --- a/modules/imgproc/src/bilateral_filter.cpp +++ b/modules/imgproc/src/bilateral_filter.simd.hpp @@ -43,18 +43,25 @@ #include "precomp.hpp" -#include - #include "opencv2/core/hal/intrin.hpp" -#include "opencl_kernels_imgproc.hpp" /****************************************************************************************\ Bilateral Filtering \****************************************************************************************/ -namespace cv -{ +namespace cv { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +// forward declarations +void bilateralFilterInvoker_8u( + Mat& dst, const Mat& temp, int radius, int maxk, + int* space_ofs, float *space_weight, float *color_weight); +void bilateralFilterInvoker_32f( + int cn, int radius, int maxk, int *space_ofs, + const Mat& temp, Mat& dst, float scale_index, float *space_weight, float *expLUT); +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +namespace { class BilateralFilter_8u_Invoker : public ParallelLoopBody { @@ -68,6 +75,8 @@ public: virtual void operator() (const Range& range) const CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int i, j, cn = dest->channels(), k; Size size = dest->size(); @@ -536,161 +545,20 @@ private: float *space_weight, *color_weight; }; -#ifdef HAVE_OPENCL - -static bool ocl_bilateralFilter_8u(InputArray _src, OutputArray _dst, int d, - double sigma_color, double sigma_space, - int borderType) -{ -#ifdef __ANDROID__ - if (ocl::Device::getDefault().isNVidia()) - return false; -#endif - - int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - int i, j, maxk, radius; - - if (depth != CV_8U || cn > 4) - return false; - - if (sigma_color <= 0) - sigma_color = 1; - if (sigma_space <= 0) - sigma_space = 1; - - double gauss_color_coeff = -0.5 / (sigma_color * sigma_color); - double gauss_space_coeff = -0.5 / (sigma_space * sigma_space); - - if ( d <= 0 ) - radius = cvRound(sigma_space * 1.5); - else - radius = d / 2; - radius = MAX(radius, 1); - d = radius * 2 + 1; - - UMat src = _src.getUMat(), dst = _dst.getUMat(), temp; - if (src.u == dst.u) - return false; +} // namespace anon - copyMakeBorder(src, temp, radius, radius, radius, radius, borderType); - std::vector _space_weight(d * d); - std::vector _space_ofs(d * d); - float * const space_weight = &_space_weight[0]; - int * const space_ofs = &_space_ofs[0]; - - // initialize space-related bilateral filter coefficients - for( i = -radius, maxk = 0; i <= radius; i++ ) - for( j = -radius; j <= radius; j++ ) - { - double r = std::sqrt((double)i * i + (double)j * j); - if ( r > radius ) - continue; - space_weight[maxk] = (float)std::exp(r * r * gauss_space_coeff); - space_ofs[maxk++] = (int)(i * temp.step + j * cn); - } - - char cvt[3][40]; - String cnstr = cn > 1 ? format("%d", cn) : ""; - String kernelName("bilateral"); - size_t sizeDiv = 1; - if ((ocl::Device::getDefault().isIntel()) && - (ocl::Device::getDefault().type() == ocl::Device::TYPE_GPU)) - { - //Intel GPU - if (dst.cols % 4 == 0 && cn == 1) // For single channel x4 sized images. - { - kernelName = "bilateral_float4"; - sizeDiv = 4; - } - } - ocl::Kernel k(kernelName.c_str(), ocl::imgproc::bilateral_oclsrc, - format("-D radius=%d -D maxk=%d -D cn=%d -D int_t=%s -D uint_t=uint%s -D convert_int_t=%s" - " -D uchar_t=%s -D float_t=%s -D convert_float_t=%s -D convert_uchar_t=%s -D gauss_color_coeff=(float)%f", - radius, maxk, cn, ocl::typeToStr(CV_32SC(cn)), cnstr.c_str(), - ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), - ocl::typeToStr(type), ocl::typeToStr(CV_32FC(cn)), - ocl::convertTypeStr(CV_32S, CV_32F, cn, cvt[1]), - ocl::convertTypeStr(CV_32F, CV_8U, cn, cvt[2]), gauss_color_coeff)); - if (k.empty()) - return false; - - Mat mspace_weight(1, d * d, CV_32FC1, space_weight); - Mat mspace_ofs(1, d * d, CV_32SC1, space_ofs); - UMat ucolor_weight, uspace_weight, uspace_ofs; - - mspace_weight.copyTo(uspace_weight); - mspace_ofs.copyTo(uspace_ofs); - - k.args(ocl::KernelArg::ReadOnlyNoSize(temp), ocl::KernelArg::WriteOnly(dst), - ocl::KernelArg::PtrReadOnly(uspace_weight), - ocl::KernelArg::PtrReadOnly(uspace_ofs)); - - size_t globalsize[2] = { (size_t)dst.cols / sizeDiv, (size_t)dst.rows }; - return k.run(2, globalsize, NULL, false); -} - -#endif -static void -bilateralFilter_8u( const Mat& src, Mat& dst, int d, - double sigma_color, double sigma_space, - int borderType ) +void bilateralFilterInvoker_8u( + Mat& dst, const Mat& temp, int radius, int maxk, + int* space_ofs, float *space_weight, float *color_weight) { - int cn = src.channels(); - int i, j, maxk, radius; - Size size = src.size(); - - CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) && src.data != dst.data ); - - if( sigma_color <= 0 ) - sigma_color = 1; - if( sigma_space <= 0 ) - sigma_space = 1; - - double gauss_color_coeff = -0.5/(sigma_color*sigma_color); - double gauss_space_coeff = -0.5/(sigma_space*sigma_space); - - if( d <= 0 ) - radius = cvRound(sigma_space*1.5); - else - radius = d/2; - radius = MAX(radius, 1); - d = radius*2 + 1; - - Mat temp; - copyMakeBorder( src, temp, radius, radius, radius, radius, borderType ); - - std::vector _color_weight(cn*256); - std::vector _space_weight(d*d); - std::vector _space_ofs(d*d); - float* color_weight = &_color_weight[0]; - float* space_weight = &_space_weight[0]; - int* space_ofs = &_space_ofs[0]; - - // initialize color-related bilateral filter coefficients - - for( i = 0; i < 256*cn; i++ ) - color_weight[i] = (float)std::exp(i*i*gauss_color_coeff); - - // initialize space-related bilateral filter coefficients - for( i = -radius, maxk = 0; i <= radius; i++ ) - { - j = -radius; - - for( ; j <= radius; j++ ) - { - double r = std::sqrt((double)i*i + (double)j*j); - if( r > radius ) - continue; - space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff); - space_ofs[maxk++] = (int)(i*temp.step + j*cn); - } - } - + CV_INSTRUMENT_REGION(); BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight); - parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16)); + parallel_for_(Range(0, dst.rows), body, dst.total()/(double)(1<<16)); } +namespace { + class BilateralFilter_32f_Invoker : public ParallelLoopBody { @@ -705,6 +573,8 @@ public: virtual void operator() (const Range& range) const CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int i, j, k; Size size = dest->size(); @@ -1153,216 +1023,18 @@ private: float scale_index, *space_weight, *expLUT; }; +} // namespace anon -static void -bilateralFilter_32f( const Mat& src, Mat& dst, int d, - double sigma_color, double sigma_space, - int borderType ) +void bilateralFilterInvoker_32f( + int cn, int radius, int maxk, int *space_ofs, + const Mat& temp, Mat& dst, float scale_index, float *space_weight, float *expLUT) { - int cn = src.channels(); - int i, j, maxk, radius; - double minValSrc=-1, maxValSrc=1; - const int kExpNumBinsPerChannel = 1 << 12; - int kExpNumBins = 0; - float lastExpVal = 1.f; - float len, scale_index; - Size size = src.size(); - - CV_Assert( (src.type() == CV_32FC1 || src.type() == CV_32FC3) && src.data != dst.data ); - - if( sigma_color <= 0 ) - sigma_color = 1; - if( sigma_space <= 0 ) - sigma_space = 1; - - double gauss_color_coeff = -0.5/(sigma_color*sigma_color); - double gauss_space_coeff = -0.5/(sigma_space*sigma_space); - - if( d <= 0 ) - radius = cvRound(sigma_space*1.5); - else - radius = d/2; - radius = MAX(radius, 1); - d = radius*2 + 1; - // compute the min/max range for the input image (even if multichannel) - - minMaxLoc( src.reshape(1), &minValSrc, &maxValSrc ); - if(std::abs(minValSrc - maxValSrc) < FLT_EPSILON) - { - src.copyTo(dst); - return; - } - - // temporary copy of the image with borders for easy processing - Mat temp; - copyMakeBorder( src, temp, radius, radius, radius, radius, borderType ); - - // allocate lookup tables - std::vector _space_weight(d*d); - std::vector _space_ofs(d*d); - float* space_weight = &_space_weight[0]; - int* space_ofs = &_space_ofs[0]; - - // assign a length which is slightly more than needed - len = (float)(maxValSrc - minValSrc) * cn; - kExpNumBins = kExpNumBinsPerChannel * cn; - std::vector _expLUT(kExpNumBins+2); - float* expLUT = &_expLUT[0]; - - scale_index = kExpNumBins/len; - - // initialize the exp LUT - for( i = 0; i < kExpNumBins+2; i++ ) - { - if( lastExpVal > 0.f ) - { - double val = i / scale_index; - expLUT[i] = (float)std::exp(val * val * gauss_color_coeff); - lastExpVal = expLUT[i]; - } - else - expLUT[i] = 0.f; - } - - // initialize space-related bilateral filter coefficients - for( i = -radius, maxk = 0; i <= radius; i++ ) - for( j = -radius; j <= radius; j++ ) - { - double r = std::sqrt((double)i*i + (double)j*j); - if( r > radius || ( i == 0 && j == 0 ) ) - continue; - space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff); - space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn); - } - - // parallel_for usage + CV_INSTRUMENT_REGION(); BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT); - parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16)); + parallel_for_(Range(0, dst.rows), body, dst.total()/(double)(1<<16)); } -#ifdef HAVE_IPP -#define IPP_BILATERAL_PARALLEL 1 - -#ifdef HAVE_IPP_IW -class ipp_bilateralFilterParallel: public ParallelLoopBody -{ -public: - ipp_bilateralFilterParallel(::ipp::IwiImage &_src, ::ipp::IwiImage &_dst, int _radius, Ipp32f _valSquareSigma, Ipp32f _posSquareSigma, ::ipp::IwiBorderType _borderType, bool *_ok): - src(_src), dst(_dst) - { - pOk = _ok; - - radius = _radius; - valSquareSigma = _valSquareSigma; - posSquareSigma = _posSquareSigma; - borderType = _borderType; - - *pOk = true; - } - ~ipp_bilateralFilterParallel() {} - - virtual void operator() (const Range& range) const CV_OVERRIDE - { - if(*pOk == false) - return; - - try - { - ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, dst.m_size.width, range.end - range.start); - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBilateral, src, dst, radius, valSquareSigma, posSquareSigma, ::ipp::IwDefault(), borderType, tile); - } - catch(const ::ipp::IwException &) - { - *pOk = false; - return; - } - } -private: - ::ipp::IwiImage &src; - ::ipp::IwiImage &dst; - - int radius; - Ipp32f valSquareSigma; - Ipp32f posSquareSigma; - ::ipp::IwiBorderType borderType; - - bool *pOk; - const ipp_bilateralFilterParallel& operator= (const ipp_bilateralFilterParallel&); -}; #endif - -static bool ipp_bilateralFilter(Mat &src, Mat &dst, int d, double sigmaColor, double sigmaSpace, int borderType) -{ -#ifdef HAVE_IPP_IW - CV_INSTRUMENT_REGION_IPP(); - - int radius = IPP_MAX(((d <= 0)?cvRound(sigmaSpace*1.5):d/2), 1); - Ipp32f valSquareSigma = (Ipp32f)((sigmaColor <= 0)?1:sigmaColor*sigmaColor); - Ipp32f posSquareSigma = (Ipp32f)((sigmaSpace <= 0)?1:sigmaSpace*sigmaSpace); - - // Acquire data and begin processing - try - { - ::ipp::IwiImage iwSrc = ippiGetImage(src); - ::ipp::IwiImage iwDst = ippiGetImage(dst); - ::ipp::IwiBorderSize borderSize(radius); - ::ipp::IwiBorderType ippBorder(ippiGetBorder(iwSrc, borderType, borderSize)); - if(!ippBorder) - return false; - - const int threads = ippiSuggestThreadsNum(iwDst, 2); - if(IPP_BILATERAL_PARALLEL && threads > 1) { - bool ok = true; - Range range(0, (int)iwDst.m_size.height); - ipp_bilateralFilterParallel invoker(iwSrc, iwDst, radius, valSquareSigma, posSquareSigma, ippBorder, &ok); - if(!ok) - return false; - - parallel_for_(range, invoker, threads*4); - - if(!ok) - return false; - } else { - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBilateral, iwSrc, iwDst, radius, valSquareSigma, posSquareSigma, ::ipp::IwDefault(), ippBorder); - } - } - catch (const ::ipp::IwException &) - { - return false; - } - return true; -#else - CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(d); CV_UNUSED(sigmaColor); CV_UNUSED(sigmaSpace); CV_UNUSED(borderType); - return false; -#endif -} -#endif - -} - -void cv::bilateralFilter( InputArray _src, OutputArray _dst, int d, - double sigmaColor, double sigmaSpace, - int borderType ) -{ - CV_INSTRUMENT_REGION(); - - _dst.create( _src.size(), _src.type() ); - - CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(), - ocl_bilateralFilter_8u(_src, _dst, d, sigmaColor, sigmaSpace, borderType)) - - Mat src = _src.getMat(), dst = _dst.getMat(); - - CV_IPP_RUN_FAST(ipp_bilateralFilter(src, dst, d, sigmaColor, sigmaSpace, borderType)); - - if( src.depth() == CV_8U ) - bilateralFilter_8u( src, dst, d, sigmaColor, sigmaSpace, borderType ); - else if( src.depth() == CV_32F ) - bilateralFilter_32f( src, dst, d, sigmaColor, sigmaSpace, borderType ); - else - CV_Error( CV_StsUnsupportedFormat, - "Bilateral filtering is only implemented for 8u and 32f images" ); -} - -/* End of file. */ +CV_CPU_OPTIMIZATION_NAMESPACE_END +} // namespace diff --git a/modules/imgproc/src/box_filter.dispatch.cpp b/modules/imgproc/src/box_filter.dispatch.cpp new file mode 100644 index 0000000000..154ccfd09e --- /dev/null +++ b/modules/imgproc/src/box_filter.dispatch.cpp @@ -0,0 +1,557 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, 2018, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +#include + +#include "opencv2/core/hal/intrin.hpp" +#include "opencl_kernels_imgproc.hpp" + +#include "opencv2/core/openvx/ovx_defs.hpp" + +#include "box_filter.simd.hpp" +#include "box_filter.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content + + +namespace cv { + +#ifdef HAVE_OPENCL + +static bool ocl_boxFilter3x3_8UC1( InputArray _src, OutputArray _dst, int ddepth, + Size ksize, Point anchor, int borderType, bool normalize ) +{ + const ocl::Device & dev = ocl::Device::getDefault(); + int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + + if (ddepth < 0) + ddepth = sdepth; + + if (anchor.x < 0) + anchor.x = ksize.width / 2; + if (anchor.y < 0) + anchor.y = ksize.height / 2; + + if ( !(dev.isIntel() && (type == CV_8UC1) && + (_src.offset() == 0) && (_src.step() % 4 == 0) && + (_src.cols() % 16 == 0) && (_src.rows() % 2 == 0) && + (anchor.x == 1) && (anchor.y == 1) && + (ksize.width == 3) && (ksize.height == 3)) ) + return false; + + float alpha = 1.0f / (ksize.height * ksize.width); + Size size = _src.size(); + size_t globalsize[2] = { 0, 0 }; + size_t localsize[2] = { 0, 0 }; + const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; + + globalsize[0] = size.width / 16; + globalsize[1] = size.height / 2; + + char build_opts[1024]; + sprintf(build_opts, "-D %s %s", borderMap[borderType], normalize ? "-D NORMALIZE" : ""); + + ocl::Kernel kernel("boxFilter3x3_8UC1_cols16_rows2", cv::ocl::imgproc::boxFilter3x3_oclsrc, build_opts); + if (kernel.empty()) + return false; + + UMat src = _src.getUMat(); + _dst.create(size, CV_MAKETYPE(ddepth, cn)); + if (!(_dst.offset() == 0 && _dst.step() % 4 == 0)) + return false; + UMat dst = _dst.getUMat(); + + int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src)); + idxArg = kernel.set(idxArg, (int)src.step); + idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst)); + idxArg = kernel.set(idxArg, (int)dst.step); + idxArg = kernel.set(idxArg, (int)dst.rows); + idxArg = kernel.set(idxArg, (int)dst.cols); + if (normalize) + idxArg = kernel.set(idxArg, (float)alpha); + + return kernel.run(2, globalsize, (localsize[0] == 0) ? NULL : localsize, false); +} + +static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth, + Size ksize, Point anchor, int borderType, bool normalize, bool sqr = false ) +{ + const ocl::Device & dev = ocl::Device::getDefault(); + int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz = CV_ELEM_SIZE(type); + bool doubleSupport = dev.doubleFPConfig() > 0; + + if (ddepth < 0) + ddepth = sdepth; + + if (cn > 4 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) || + _src.offset() % esz != 0 || _src.step() % esz != 0) + return false; + + if (anchor.x < 0) + anchor.x = ksize.width / 2; + if (anchor.y < 0) + anchor.y = ksize.height / 2; + + int computeUnits = ocl::Device::getDefault().maxComputeUnits(); + float alpha = 1.0f / (ksize.height * ksize.width); + Size size = _src.size(), wholeSize; + bool isolated = (borderType & BORDER_ISOLATED) != 0; + borderType &= ~BORDER_ISOLATED; + int wdepth = std::max(CV_32F, std::max(ddepth, sdepth)), + wtype = CV_MAKE_TYPE(wdepth, cn), dtype = CV_MAKE_TYPE(ddepth, cn); + + const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; + size_t globalsize[2] = { (size_t)size.width, (size_t)size.height }; + size_t localsize_general[2] = { 0, 1 }, * localsize = NULL; + + UMat src = _src.getUMat(); + if (!isolated) + { + Point ofs; + src.locateROI(wholeSize, ofs); + } + + int h = isolated ? size.height : wholeSize.height; + int w = isolated ? size.width : wholeSize.width; + + size_t maxWorkItemSizes[32]; + ocl::Device::getDefault().maxWorkItemSizes(maxWorkItemSizes); + int tryWorkItems = (int)maxWorkItemSizes[0]; + + ocl::Kernel kernel; + + if (dev.isIntel() && !(dev.type() & ocl::Device::TYPE_CPU) && + ((ksize.width < 5 && ksize.height < 5 && esz <= 4) || + (ksize.width == 5 && ksize.height == 5 && cn == 1))) + { + if (w < ksize.width || h < ksize.height) + return false; + + // Figure out what vector size to use for loading the pixels. + int pxLoadNumPixels = cn != 1 || size.width % 4 ? 1 : 4; + int pxLoadVecSize = cn * pxLoadNumPixels; + + // Figure out how many pixels per work item to compute in X and Y + // directions. Too many and we run out of registers. + int pxPerWorkItemX = 1, pxPerWorkItemY = 1; + if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4) + { + pxPerWorkItemX = size.width % 8 ? size.width % 4 ? size.width % 2 ? 1 : 2 : 4 : 8; + pxPerWorkItemY = size.height % 2 ? 1 : 2; + } + else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4)) + { + pxPerWorkItemX = size.width % 2 ? 1 : 2; + pxPerWorkItemY = size.height % 2 ? 1 : 2; + } + globalsize[0] = size.width / pxPerWorkItemX; + globalsize[1] = size.height / pxPerWorkItemY; + + // Need some padding in the private array for pixels + int privDataWidth = roundUp(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels); + + // Make the global size a nice round number so the runtime can pick + // from reasonable choices for the workgroup size + const int wgRound = 256; + globalsize[0] = roundUp(globalsize[0], wgRound); + + char build_options[1024], cvt[2][40]; + sprintf(build_options, "-D cn=%d " + "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d " + "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d " + "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s " + "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d " + "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s " + "-D convertToWT=%s -D convertToDstT=%s%s%s -D PX_LOAD_FLOAT_VEC_CONV=convert_%s -D OP_BOX_FILTER", + cn, anchor.x, anchor.y, ksize.width, ksize.height, + pxLoadVecSize, pxLoadNumPixels, + pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType], + isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", + privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1, + ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype), + ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth), + ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), + ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), + normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "", + ocl::typeToStr(CV_MAKE_TYPE(wdepth, pxLoadVecSize)) //PX_LOAD_FLOAT_VEC_CONV + ); + + + if (!kernel.create("filterSmall", cv::ocl::imgproc::filterSmall_oclsrc, build_options)) + return false; + } + else + { + localsize = localsize_general; + for ( ; ; ) + { + int BLOCK_SIZE_X = tryWorkItems, BLOCK_SIZE_Y = std::min(ksize.height * 10, size.height); + + while (BLOCK_SIZE_X > 32 && BLOCK_SIZE_X >= ksize.width * 2 && BLOCK_SIZE_X > size.width * 2) + BLOCK_SIZE_X /= 2; + while (BLOCK_SIZE_Y < BLOCK_SIZE_X / 8 && BLOCK_SIZE_Y * computeUnits * 32 < size.height) + BLOCK_SIZE_Y *= 2; + + if (ksize.width > BLOCK_SIZE_X || w < ksize.width || h < ksize.height) + return false; + + char cvt[2][50]; + String opts = format("-D LOCAL_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D ST=%s -D DT=%s -D WT=%s -D convertToDT=%s -D convertToWT=%s" + " -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s%s%s%s%s" + " -D ST1=%s -D DT1=%s -D cn=%d", + BLOCK_SIZE_X, BLOCK_SIZE_Y, ocl::typeToStr(type), ocl::typeToStr(CV_MAKE_TYPE(ddepth, cn)), + ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), + ocl::convertTypeStr(wdepth, ddepth, cn, cvt[0]), + ocl::convertTypeStr(sdepth, wdepth, cn, cvt[1]), + anchor.x, anchor.y, ksize.width, ksize.height, borderMap[borderType], + isolated ? " -D BORDER_ISOLATED" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "", + normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "", + ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn); + + localsize[0] = BLOCK_SIZE_X; + globalsize[0] = divUp(size.width, BLOCK_SIZE_X - (ksize.width - 1)) * BLOCK_SIZE_X; + globalsize[1] = divUp(size.height, BLOCK_SIZE_Y); + + kernel.create("boxFilter", cv::ocl::imgproc::boxFilter_oclsrc, opts); + if (kernel.empty()) + return false; + + size_t kernelWorkGroupSize = kernel.workGroupSize(); + if (localsize[0] <= kernelWorkGroupSize) + break; + if (BLOCK_SIZE_X < (int)kernelWorkGroupSize) + return false; + + tryWorkItems = (int)kernelWorkGroupSize; + } + } + + _dst.create(size, CV_MAKETYPE(ddepth, cn)); + UMat dst = _dst.getUMat(); + + int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src)); + idxArg = kernel.set(idxArg, (int)src.step); + int srcOffsetX = (int)((src.offset % src.step) / src.elemSize()); + int srcOffsetY = (int)(src.offset / src.step); + int srcEndX = isolated ? srcOffsetX + size.width : wholeSize.width; + int srcEndY = isolated ? srcOffsetY + size.height : wholeSize.height; + idxArg = kernel.set(idxArg, srcOffsetX); + idxArg = kernel.set(idxArg, srcOffsetY); + idxArg = kernel.set(idxArg, srcEndX); + idxArg = kernel.set(idxArg, srcEndY); + idxArg = kernel.set(idxArg, ocl::KernelArg::WriteOnly(dst)); + if (normalize) + idxArg = kernel.set(idxArg, (float)alpha); + + return kernel.run(2, globalsize, localsize, false); +} + +#endif + +Ptr getRowSumFilter(int srcType, int sumType, int ksize, int anchor) +{ + CV_INSTRUMENT_REGION(); + + CV_CPU_DISPATCH(getRowSumFilter, (srcType, sumType, ksize, anchor), + CV_CPU_DISPATCH_MODES_ALL); +} + + +Ptr getColumnSumFilter(int sumType, int dstType, int ksize, int anchor, double scale) +{ + CV_INSTRUMENT_REGION(); + + CV_CPU_DISPATCH(getColumnSumFilter, (sumType, dstType, ksize, anchor, scale), + CV_CPU_DISPATCH_MODES_ALL); +} + + +Ptr createBoxFilter(int srcType, int dstType, Size ksize, + Point anchor, bool normalize, int borderType) +{ + CV_INSTRUMENT_REGION(); + + CV_CPU_DISPATCH(createBoxFilter, (srcType, dstType, ksize, anchor, normalize, borderType), + CV_CPU_DISPATCH_MODES_ALL); +} + +#ifdef HAVE_OPENVX + namespace ovx { + template <> inline bool skipSmallImages(int w, int h) { return w*h < 640 * 480; } + } + static bool openvx_boxfilter(InputArray _src, OutputArray _dst, int ddepth, + Size ksize, Point anchor, + bool normalize, int borderType) + { + if (ddepth < 0) + ddepth = CV_8UC1; + if (_src.type() != CV_8UC1 || ddepth != CV_8U || !normalize || + _src.cols() < 3 || _src.rows() < 3 || + ksize.width != 3 || ksize.height != 3 || + (anchor.x >= 0 && anchor.x != 1) || + (anchor.y >= 0 && anchor.y != 1) || + ovx::skipSmallImages(_src.cols(), _src.rows())) + return false; + + Mat src = _src.getMat(); + + if ((borderType & BORDER_ISOLATED) == 0 && src.isSubmatrix()) + return false; //Process isolated borders only + vx_enum border; + switch (borderType & ~BORDER_ISOLATED) + { + case BORDER_CONSTANT: + border = VX_BORDER_CONSTANT; + break; + case BORDER_REPLICATE: + border = VX_BORDER_REPLICATE; + break; + default: + return false; + } + + _dst.create(src.size(), CV_8UC1); + Mat dst = _dst.getMat(); + + try + { + ivx::Context ctx = ovx::getOpenVXContext(); + + Mat a; + if (dst.data != src.data) + a = src; + else + src.copyTo(a); + + ivx::Image + ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, + ivx::Image::createAddressing(a.cols, a.rows, 1, (vx_int32)(a.step)), a.data), + ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, + ivx::Image::createAddressing(dst.cols, dst.rows, 1, (vx_int32)(dst.step)), dst.data); + + //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments + //since OpenVX standard says nothing about thread-safety for now + ivx::border_t prevBorder = ctx.immediateBorder(); + ctx.setImmediateBorder(border, (vx_uint8)(0)); + ivx::IVX_CHECK_STATUS(vxuBox3x3(ctx, ia, ib)); + ctx.setImmediateBorder(prevBorder); + } + catch (const ivx::RuntimeError & e) + { + VX_DbgThrow(e.what()); + } + catch (const ivx::WrapperError & e) + { + VX_DbgThrow(e.what()); + } + + return true; + } +#endif + +#if defined(HAVE_IPP) +static bool ipp_boxfilter(Mat &src, Mat &dst, Size ksize, Point anchor, bool normalize, int borderType) +{ +#ifdef HAVE_IPP_IW + CV_INSTRUMENT_REGION_IPP(); + +#if IPP_VERSION_X100 < 201801 + // Problem with SSE42 optimization for 16s and some 8u modes + if(ipp::getIppTopFeatures() == ippCPUID_SSE42 && (((src.depth() == CV_16S || src.depth() == CV_16U) && (src.channels() == 3 || src.channels() == 4)) || (src.depth() == CV_8U && src.channels() == 3 && (ksize.width > 5 || ksize.height > 5)))) + return false; + + // Other optimizations has some degradations too + if((((src.depth() == CV_16S || src.depth() == CV_16U) && (src.channels() == 4)) || (src.depth() == CV_8U && src.channels() == 1 && (ksize.width > 5 || ksize.height > 5)))) + return false; +#endif + + if(!normalize) + return false; + + if(!ippiCheckAnchor(anchor, ksize)) + return false; + + try + { + ::ipp::IwiImage iwSrc = ippiGetImage(src); + ::ipp::IwiImage iwDst = ippiGetImage(dst); + ::ipp::IwiSize iwKSize = ippiGetSize(ksize); + ::ipp::IwiBorderSize borderSize(iwKSize); + ::ipp::IwiBorderType ippBorder(ippiGetBorder(iwSrc, borderType, borderSize)); + if(!ippBorder) + return false; + + CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBox, iwSrc, iwDst, iwKSize, ::ipp::IwDefault(), ippBorder); + } + catch (const ::ipp::IwException &) + { + return false; + } + + return true; +#else + CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(ksize); CV_UNUSED(anchor); CV_UNUSED(normalize); CV_UNUSED(borderType); + return false; +#endif +} +#endif + + +void boxFilter(InputArray _src, OutputArray _dst, int ddepth, + Size ksize, Point anchor, + bool normalize, int borderType) +{ + CV_INSTRUMENT_REGION(); + + CV_OCL_RUN(_dst.isUMat() && + (borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || + borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101), + ocl_boxFilter3x3_8UC1(_src, _dst, ddepth, ksize, anchor, borderType, normalize)) + + CV_OCL_RUN(_dst.isUMat(), ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize)) + + Mat src = _src.getMat(); + int stype = src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype); + if( ddepth < 0 ) + ddepth = sdepth; + _dst.create( src.size(), CV_MAKETYPE(ddepth, cn) ); + Mat dst = _dst.getMat(); + if( borderType != BORDER_CONSTANT && normalize && (borderType & BORDER_ISOLATED) != 0 ) + { + if( src.rows == 1 ) + ksize.height = 1; + if( src.cols == 1 ) + ksize.width = 1; + } + + Point ofs; + Size wsz(src.cols, src.rows); + if(!(borderType&BORDER_ISOLATED)) + src.locateROI( wsz, ofs ); + + CALL_HAL(boxFilter, cv_hal_boxFilter, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, ddepth, cn, + ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height, + anchor.x, anchor.y, normalize, borderType&~BORDER_ISOLATED); + + CV_OVX_RUN(true, + openvx_boxfilter(src, dst, ddepth, ksize, anchor, normalize, borderType)) + + CV_IPP_RUN_FAST(ipp_boxfilter(src, dst, ksize, anchor, normalize, borderType)); + + borderType = (borderType&~BORDER_ISOLATED); + + Ptr f = createBoxFilter( src.type(), dst.type(), + ksize, anchor, normalize, borderType ); + + f->apply( src, dst, wsz, ofs ); +} + + +void blur(InputArray src, OutputArray dst, + Size ksize, Point anchor, int borderType) +{ + CV_INSTRUMENT_REGION(); + + boxFilter( src, dst, -1, ksize, anchor, true, borderType ); +} + + +/****************************************************************************************\ + Squared Box Filter +\****************************************************************************************/ + +static Ptr getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor) +{ + CV_INSTRUMENT_REGION(); + + CV_CPU_DISPATCH(getSqrRowSumFilter, (srcType, sumType, ksize, anchor), + CV_CPU_DISPATCH_MODES_ALL); +} + +void sqrBoxFilter(InputArray _src, OutputArray _dst, int ddepth, + Size ksize, Point anchor, + bool normalize, int borderType) +{ + CV_INSTRUMENT_REGION(); + + int srcType = _src.type(), sdepth = CV_MAT_DEPTH(srcType), cn = CV_MAT_CN(srcType); + Size size = _src.size(); + + if( ddepth < 0 ) + ddepth = sdepth < CV_32F ? CV_32F : CV_64F; + + if( borderType != BORDER_CONSTANT && normalize ) + { + if( size.height == 1 ) + ksize.height = 1; + if( size.width == 1 ) + ksize.width = 1; + } + + CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2, + ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize, true)) + + int sumDepth = CV_64F; + if( sdepth == CV_8U ) + sumDepth = CV_32S; + int sumType = CV_MAKETYPE( sumDepth, cn ), dstType = CV_MAKETYPE(ddepth, cn); + + Mat src = _src.getMat(); + _dst.create( size, dstType ); + Mat dst = _dst.getMat(); + + Ptr rowFilter = getSqrRowSumFilter(srcType, sumType, ksize.width, anchor.x ); + Ptr columnFilter = getColumnSumFilter(sumType, + dstType, ksize.height, anchor.y, + normalize ? 1./(ksize.width*ksize.height) : 1); + + Ptr f = makePtr(Ptr(), rowFilter, columnFilter, + srcType, dstType, sumType, borderType ); + Point ofs; + Size wsz(src.cols, src.rows); + src.locateROI( wsz, ofs ); + + f->apply( src, dst, wsz, ofs ); +} + +} // namespace diff --git a/modules/imgproc/src/box_filter.cpp b/modules/imgproc/src/box_filter.simd.hpp similarity index 69% rename from modules/imgproc/src/box_filter.cpp rename to modules/imgproc/src/box_filter.simd.hpp index 14f266258f..4eadee8ec5 100644 --- a/modules/imgproc/src/box_filter.cpp +++ b/modules/imgproc/src/box_filter.simd.hpp @@ -42,21 +42,25 @@ //M*/ #include "precomp.hpp" - -#include - #include "opencv2/core/hal/intrin.hpp" -#include "opencl_kernels_imgproc.hpp" -#include "opencv2/core/openvx/ovx_defs.hpp" +namespace cv { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +// forward declarations +Ptr getRowSumFilter(int srcType, int sumType, int ksize, int anchor); +Ptr getColumnSumFilter(int sumType, int dstType, int ksize, int anchor, double scale); +Ptr createBoxFilter(int srcType, int dstType, Size ksize, + Point anchor, bool normalize, int borderType); -namespace cv -{ +Ptr getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor); + +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY /****************************************************************************************\ Box Filter \****************************************************************************************/ +namespace { template struct RowSum : public BaseRowFilter @@ -70,6 +74,8 @@ struct RowSum : virtual void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + const T* S = (const T*)src; ST* D = (ST*)dst; int i = 0, k, ksz_cn = ksize*cn; @@ -183,6 +189,8 @@ struct ColumnSum : virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int i; ST* SUM; bool haveScale = scale != 1; @@ -281,6 +289,8 @@ struct ColumnSum : virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int* SUM; bool haveScale = scale != 1; double _scale = scale; @@ -408,9 +418,6 @@ struct ColumnSum : } dst += dststep; } -#if CV_SIMD - vx_cleanup(); -#endif } double scale; @@ -452,6 +459,8 @@ public BaseColumnFilter virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + const int ds = divScale; const int dd = divDelta; ushort* SUM; @@ -586,9 +595,6 @@ public BaseColumnFilter } dst += dststep; } -#if CV_SIMD - vx_cleanup(); -#endif } double scale; @@ -616,6 +622,8 @@ struct ColumnSum : virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int i; int* SUM; bool haveScale = scale != 1; @@ -739,9 +747,6 @@ struct ColumnSum : } dst += dststep; } -#if CV_SIMD - vx_cleanup(); -#endif } double scale; @@ -767,6 +772,8 @@ struct ColumnSum : virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int* SUM; bool haveScale = scale != 1; double _scale = scale; @@ -888,9 +895,6 @@ struct ColumnSum : } dst += dststep; } -#if CV_SIMD - vx_cleanup(); -#endif } double scale; @@ -915,6 +919,8 @@ struct ColumnSum : virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int* SUM; bool haveScale = scale != 1; double _scale = scale; @@ -1022,9 +1028,6 @@ struct ColumnSum : } dst += dststep; } -#if CV_SIMD - vx_cleanup(); -#endif } double scale; @@ -1050,6 +1053,8 @@ struct ColumnSum : virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int* SUM; bool haveScale = scale != 1; double _scale = scale; @@ -1154,9 +1159,6 @@ struct ColumnSum : } dst += dststep; } -#if CV_SIMD - vx_cleanup(); -#endif } double scale; @@ -1164,243 +1166,13 @@ struct ColumnSum : std::vector sum; }; -#ifdef HAVE_OPENCL +} // namespace anon -static bool ocl_boxFilter3x3_8UC1( InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, int borderType, bool normalize ) -{ - const ocl::Device & dev = ocl::Device::getDefault(); - int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - - if (ddepth < 0) - ddepth = sdepth; - - if (anchor.x < 0) - anchor.x = ksize.width / 2; - if (anchor.y < 0) - anchor.y = ksize.height / 2; - - if ( !(dev.isIntel() && (type == CV_8UC1) && - (_src.offset() == 0) && (_src.step() % 4 == 0) && - (_src.cols() % 16 == 0) && (_src.rows() % 2 == 0) && - (anchor.x == 1) && (anchor.y == 1) && - (ksize.width == 3) && (ksize.height == 3)) ) - return false; - - float alpha = 1.0f / (ksize.height * ksize.width); - Size size = _src.size(); - size_t globalsize[2] = { 0, 0 }; - size_t localsize[2] = { 0, 0 }; - const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; - - globalsize[0] = size.width / 16; - globalsize[1] = size.height / 2; - - char build_opts[1024]; - sprintf(build_opts, "-D %s %s", borderMap[borderType], normalize ? "-D NORMALIZE" : ""); - - ocl::Kernel kernel("boxFilter3x3_8UC1_cols16_rows2", cv::ocl::imgproc::boxFilter3x3_oclsrc, build_opts); - if (kernel.empty()) - return false; - - UMat src = _src.getUMat(); - _dst.create(size, CV_MAKETYPE(ddepth, cn)); - if (!(_dst.offset() == 0 && _dst.step() % 4 == 0)) - return false; - UMat dst = _dst.getUMat(); - - int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src)); - idxArg = kernel.set(idxArg, (int)src.step); - idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst)); - idxArg = kernel.set(idxArg, (int)dst.step); - idxArg = kernel.set(idxArg, (int)dst.rows); - idxArg = kernel.set(idxArg, (int)dst.cols); - if (normalize) - idxArg = kernel.set(idxArg, (float)alpha); - - return kernel.run(2, globalsize, (localsize[0] == 0) ? NULL : localsize, false); -} -static bool ocl_boxFilter( InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, int borderType, bool normalize, bool sqr = false ) +Ptr getRowSumFilter(int srcType, int sumType, int ksize, int anchor) { - const ocl::Device & dev = ocl::Device::getDefault(); - int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz = CV_ELEM_SIZE(type); - bool doubleSupport = dev.doubleFPConfig() > 0; - - if (ddepth < 0) - ddepth = sdepth; - - if (cn > 4 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) || - _src.offset() % esz != 0 || _src.step() % esz != 0) - return false; - - if (anchor.x < 0) - anchor.x = ksize.width / 2; - if (anchor.y < 0) - anchor.y = ksize.height / 2; - - int computeUnits = ocl::Device::getDefault().maxComputeUnits(); - float alpha = 1.0f / (ksize.height * ksize.width); - Size size = _src.size(), wholeSize; - bool isolated = (borderType & BORDER_ISOLATED) != 0; - borderType &= ~BORDER_ISOLATED; - int wdepth = std::max(CV_32F, std::max(ddepth, sdepth)), - wtype = CV_MAKE_TYPE(wdepth, cn), dtype = CV_MAKE_TYPE(ddepth, cn); - - const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; - size_t globalsize[2] = { (size_t)size.width, (size_t)size.height }; - size_t localsize_general[2] = { 0, 1 }, * localsize = NULL; - - UMat src = _src.getUMat(); - if (!isolated) - { - Point ofs; - src.locateROI(wholeSize, ofs); - } - - int h = isolated ? size.height : wholeSize.height; - int w = isolated ? size.width : wholeSize.width; - - size_t maxWorkItemSizes[32]; - ocl::Device::getDefault().maxWorkItemSizes(maxWorkItemSizes); - int tryWorkItems = (int)maxWorkItemSizes[0]; - - ocl::Kernel kernel; - - if (dev.isIntel() && !(dev.type() & ocl::Device::TYPE_CPU) && - ((ksize.width < 5 && ksize.height < 5 && esz <= 4) || - (ksize.width == 5 && ksize.height == 5 && cn == 1))) - { - if (w < ksize.width || h < ksize.height) - return false; - - // Figure out what vector size to use for loading the pixels. - int pxLoadNumPixels = cn != 1 || size.width % 4 ? 1 : 4; - int pxLoadVecSize = cn * pxLoadNumPixels; - - // Figure out how many pixels per work item to compute in X and Y - // directions. Too many and we run out of registers. - int pxPerWorkItemX = 1, pxPerWorkItemY = 1; - if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4) - { - pxPerWorkItemX = size.width % 8 ? size.width % 4 ? size.width % 2 ? 1 : 2 : 4 : 8; - pxPerWorkItemY = size.height % 2 ? 1 : 2; - } - else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4)) - { - pxPerWorkItemX = size.width % 2 ? 1 : 2; - pxPerWorkItemY = size.height % 2 ? 1 : 2; - } - globalsize[0] = size.width / pxPerWorkItemX; - globalsize[1] = size.height / pxPerWorkItemY; - - // Need some padding in the private array for pixels - int privDataWidth = roundUp(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels); - - // Make the global size a nice round number so the runtime can pick - // from reasonable choices for the workgroup size - const int wgRound = 256; - globalsize[0] = roundUp(globalsize[0], wgRound); - - char build_options[1024], cvt[2][40]; - sprintf(build_options, "-D cn=%d " - "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d " - "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d " - "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s " - "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d " - "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s " - "-D convertToWT=%s -D convertToDstT=%s%s%s -D PX_LOAD_FLOAT_VEC_CONV=convert_%s -D OP_BOX_FILTER", - cn, anchor.x, anchor.y, ksize.width, ksize.height, - pxLoadVecSize, pxLoadNumPixels, - pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType], - isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", - privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1, - ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype), - ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth), - ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), - ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), - normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "", - ocl::typeToStr(CV_MAKE_TYPE(wdepth, pxLoadVecSize)) //PX_LOAD_FLOAT_VEC_CONV - ); - - - if (!kernel.create("filterSmall", cv::ocl::imgproc::filterSmall_oclsrc, build_options)) - return false; - } - else - { - localsize = localsize_general; - for ( ; ; ) - { - int BLOCK_SIZE_X = tryWorkItems, BLOCK_SIZE_Y = std::min(ksize.height * 10, size.height); - - while (BLOCK_SIZE_X > 32 && BLOCK_SIZE_X >= ksize.width * 2 && BLOCK_SIZE_X > size.width * 2) - BLOCK_SIZE_X /= 2; - while (BLOCK_SIZE_Y < BLOCK_SIZE_X / 8 && BLOCK_SIZE_Y * computeUnits * 32 < size.height) - BLOCK_SIZE_Y *= 2; - - if (ksize.width > BLOCK_SIZE_X || w < ksize.width || h < ksize.height) - return false; - - char cvt[2][50]; - String opts = format("-D LOCAL_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D ST=%s -D DT=%s -D WT=%s -D convertToDT=%s -D convertToWT=%s" - " -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s%s%s%s%s" - " -D ST1=%s -D DT1=%s -D cn=%d", - BLOCK_SIZE_X, BLOCK_SIZE_Y, ocl::typeToStr(type), ocl::typeToStr(CV_MAKE_TYPE(ddepth, cn)), - ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), - ocl::convertTypeStr(wdepth, ddepth, cn, cvt[0]), - ocl::convertTypeStr(sdepth, wdepth, cn, cvt[1]), - anchor.x, anchor.y, ksize.width, ksize.height, borderMap[borderType], - isolated ? " -D BORDER_ISOLATED" : "", doubleSupport ? " -D DOUBLE_SUPPORT" : "", - normalize ? " -D NORMALIZE" : "", sqr ? " -D SQR" : "", - ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn); - - localsize[0] = BLOCK_SIZE_X; - globalsize[0] = divUp(size.width, BLOCK_SIZE_X - (ksize.width - 1)) * BLOCK_SIZE_X; - globalsize[1] = divUp(size.height, BLOCK_SIZE_Y); - - kernel.create("boxFilter", cv::ocl::imgproc::boxFilter_oclsrc, opts); - if (kernel.empty()) - return false; - - size_t kernelWorkGroupSize = kernel.workGroupSize(); - if (localsize[0] <= kernelWorkGroupSize) - break; - if (BLOCK_SIZE_X < (int)kernelWorkGroupSize) - return false; - - tryWorkItems = (int)kernelWorkGroupSize; - } - } - - _dst.create(size, CV_MAKETYPE(ddepth, cn)); - UMat dst = _dst.getUMat(); - - int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src)); - idxArg = kernel.set(idxArg, (int)src.step); - int srcOffsetX = (int)((src.offset % src.step) / src.elemSize()); - int srcOffsetY = (int)(src.offset / src.step); - int srcEndX = isolated ? srcOffsetX + size.width : wholeSize.width; - int srcEndY = isolated ? srcOffsetY + size.height : wholeSize.height; - idxArg = kernel.set(idxArg, srcOffsetX); - idxArg = kernel.set(idxArg, srcOffsetY); - idxArg = kernel.set(idxArg, srcEndX); - idxArg = kernel.set(idxArg, srcEndY); - idxArg = kernel.set(idxArg, ocl::KernelArg::WriteOnly(dst)); - if (normalize) - idxArg = kernel.set(idxArg, (float)alpha); - - return kernel.run(2, globalsize, localsize, false); -} - -#endif - -} - + CV_INSTRUMENT_REGION(); -cv::Ptr cv::getRowSumFilter(int srcType, int sumType, int ksize, int anchor) -{ int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType); CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) ); @@ -1434,9 +1206,10 @@ cv::Ptr cv::getRowSumFilter(int srcType, int sumType, int ksi } -cv::Ptr cv::getColumnSumFilter(int sumType, int dstType, int ksize, - int anchor, double scale) +Ptr getColumnSumFilter(int sumType, int dstType, int ksize, int anchor, double scale) { + CV_INSTRUMENT_REGION(); + int sdepth = CV_MAT_DEPTH(sumType), ddepth = CV_MAT_DEPTH(dstType); CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(dstType) ); @@ -1474,9 +1247,11 @@ cv::Ptr cv::getColumnSumFilter(int sumType, int dstType, i } -cv::Ptr cv::createBoxFilter( int srcType, int dstType, Size ksize, - Point anchor, bool normalize, int borderType ) +Ptr createBoxFilter(int srcType, int dstType, Size ksize, + Point anchor, bool normalize, int borderType) { + CV_INSTRUMENT_REGION(); + int sdepth = CV_MAT_DEPTH(srcType); int cn = CV_MAT_CN(srcType), sumType = CV_64F; if( sdepth == CV_8U && CV_MAT_DEPTH(dstType) == CV_8U && @@ -1496,199 +1271,12 @@ cv::Ptr cv::createBoxFilter( int srcType, int dstType, Size ks srcType, dstType, sumType, borderType ); } -#ifdef HAVE_OPENVX -namespace cv -{ - namespace ovx { - template <> inline bool skipSmallImages(int w, int h) { return w*h < 640 * 480; } - } - static bool openvx_boxfilter(InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, - bool normalize, int borderType) - { - if (ddepth < 0) - ddepth = CV_8UC1; - if (_src.type() != CV_8UC1 || ddepth != CV_8U || !normalize || - _src.cols() < 3 || _src.rows() < 3 || - ksize.width != 3 || ksize.height != 3 || - (anchor.x >= 0 && anchor.x != 1) || - (anchor.y >= 0 && anchor.y != 1) || - ovx::skipSmallImages(_src.cols(), _src.rows())) - return false; - - Mat src = _src.getMat(); - - if ((borderType & BORDER_ISOLATED) == 0 && src.isSubmatrix()) - return false; //Process isolated borders only - vx_enum border; - switch (borderType & ~BORDER_ISOLATED) - { - case BORDER_CONSTANT: - border = VX_BORDER_CONSTANT; - break; - case BORDER_REPLICATE: - border = VX_BORDER_REPLICATE; - break; - default: - return false; - } - - _dst.create(src.size(), CV_8UC1); - Mat dst = _dst.getMat(); - - try - { - ivx::Context ctx = ovx::getOpenVXContext(); - - Mat a; - if (dst.data != src.data) - a = src; - else - src.copyTo(a); - - ivx::Image - ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, - ivx::Image::createAddressing(a.cols, a.rows, 1, (vx_int32)(a.step)), a.data), - ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, - ivx::Image::createAddressing(dst.cols, dst.rows, 1, (vx_int32)(dst.step)), dst.data); - - //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments - //since OpenVX standard says nothing about thread-safety for now - ivx::border_t prevBorder = ctx.immediateBorder(); - ctx.setImmediateBorder(border, (vx_uint8)(0)); - ivx::IVX_CHECK_STATUS(vxuBox3x3(ctx, ia, ib)); - ctx.setImmediateBorder(prevBorder); - } - catch (const ivx::RuntimeError & e) - { - VX_DbgThrow(e.what()); - } - catch (const ivx::WrapperError & e) - { - VX_DbgThrow(e.what()); - } - - return true; - } -} -#endif - -#if defined(HAVE_IPP) -namespace cv -{ -static bool ipp_boxfilter(Mat &src, Mat &dst, Size ksize, Point anchor, bool normalize, int borderType) -{ -#ifdef HAVE_IPP_IW - CV_INSTRUMENT_REGION_IPP(); - -#if IPP_VERSION_X100 < 201801 - // Problem with SSE42 optimization for 16s and some 8u modes - if(ipp::getIppTopFeatures() == ippCPUID_SSE42 && (((src.depth() == CV_16S || src.depth() == CV_16U) && (src.channels() == 3 || src.channels() == 4)) || (src.depth() == CV_8U && src.channels() == 3 && (ksize.width > 5 || ksize.height > 5)))) - return false; - - // Other optimizations has some degradations too - if((((src.depth() == CV_16S || src.depth() == CV_16U) && (src.channels() == 4)) || (src.depth() == CV_8U && src.channels() == 1 && (ksize.width > 5 || ksize.height > 5)))) - return false; -#endif - - if(!normalize) - return false; - - if(!ippiCheckAnchor(anchor, ksize)) - return false; - - try - { - ::ipp::IwiImage iwSrc = ippiGetImage(src); - ::ipp::IwiImage iwDst = ippiGetImage(dst); - ::ipp::IwiSize iwKSize = ippiGetSize(ksize); - ::ipp::IwiBorderSize borderSize(iwKSize); - ::ipp::IwiBorderType ippBorder(ippiGetBorder(iwSrc, borderType, borderSize)); - if(!ippBorder) - return false; - - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterBox, iwSrc, iwDst, iwKSize, ::ipp::IwDefault(), ippBorder); - } - catch (const ::ipp::IwException &) - { - return false; - } - - return true; -#else - CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(ksize); CV_UNUSED(anchor); CV_UNUSED(normalize); CV_UNUSED(borderType); - return false; -#endif -} -} -#endif - - -void cv::boxFilter( InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, - bool normalize, int borderType ) -{ - CV_INSTRUMENT_REGION(); - - CV_OCL_RUN(_dst.isUMat() && - (borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || - borderType == BORDER_REFLECT || borderType == BORDER_REFLECT_101), - ocl_boxFilter3x3_8UC1(_src, _dst, ddepth, ksize, anchor, borderType, normalize)) - - CV_OCL_RUN(_dst.isUMat(), ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize)) - - Mat src = _src.getMat(); - int stype = src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype); - if( ddepth < 0 ) - ddepth = sdepth; - _dst.create( src.size(), CV_MAKETYPE(ddepth, cn) ); - Mat dst = _dst.getMat(); - if( borderType != BORDER_CONSTANT && normalize && (borderType & BORDER_ISOLATED) != 0 ) - { - if( src.rows == 1 ) - ksize.height = 1; - if( src.cols == 1 ) - ksize.width = 1; - } - - Point ofs; - Size wsz(src.cols, src.rows); - if(!(borderType&BORDER_ISOLATED)) - src.locateROI( wsz, ofs ); - - CALL_HAL(boxFilter, cv_hal_boxFilter, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, ddepth, cn, - ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height, - anchor.x, anchor.y, normalize, borderType&~BORDER_ISOLATED); - - CV_OVX_RUN(true, - openvx_boxfilter(src, dst, ddepth, ksize, anchor, normalize, borderType)) - - CV_IPP_RUN_FAST(ipp_boxfilter(src, dst, ksize, anchor, normalize, borderType)); - - borderType = (borderType&~BORDER_ISOLATED); - - Ptr f = createBoxFilter( src.type(), dst.type(), - ksize, anchor, normalize, borderType ); - - f->apply( src, dst, wsz, ofs ); -} - - -void cv::blur( InputArray src, OutputArray dst, - Size ksize, Point anchor, int borderType ) -{ - CV_INSTRUMENT_REGION(); - - boxFilter( src, dst, -1, ksize, anchor, true, borderType ); -} /****************************************************************************************\ Squared Box Filter \****************************************************************************************/ - -namespace cv -{ +namespace { template struct SqrRowSum : @@ -1703,6 +1291,8 @@ struct SqrRowSum : virtual void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + const T* S = (const T*)src; ST* D = (ST*)dst; int i = 0, k, ksz_cn = ksize*cn; @@ -1727,7 +1317,9 @@ struct SqrRowSum : } }; -static Ptr getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor) +} // namespace anon + +Ptr getSqrRowSumFilter(int srcType, int sumType, int ksize, int anchor) { int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(sumType); CV_Assert( CV_MAT_CN(sumType) == CV_MAT_CN(srcType) ); @@ -1753,52 +1345,6 @@ static Ptr getSqrRowSumFilter(int srcType, int sumType, int ksize srcType, sumType)); } -} - -void cv::sqrBoxFilter( InputArray _src, OutputArray _dst, int ddepth, - Size ksize, Point anchor, - bool normalize, int borderType ) -{ - CV_INSTRUMENT_REGION(); - - int srcType = _src.type(), sdepth = CV_MAT_DEPTH(srcType), cn = CV_MAT_CN(srcType); - Size size = _src.size(); - - if( ddepth < 0 ) - ddepth = sdepth < CV_32F ? CV_32F : CV_64F; - - if( borderType != BORDER_CONSTANT && normalize ) - { - if( size.height == 1 ) - ksize.height = 1; - if( size.width == 1 ) - ksize.width = 1; - } - - CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2, - ocl_boxFilter(_src, _dst, ddepth, ksize, anchor, borderType, normalize, true)) - - int sumDepth = CV_64F; - if( sdepth == CV_8U ) - sumDepth = CV_32S; - int sumType = CV_MAKETYPE( sumDepth, cn ), dstType = CV_MAKETYPE(ddepth, cn); - - Mat src = _src.getMat(); - _dst.create( size, dstType ); - Mat dst = _dst.getMat(); - - Ptr rowFilter = getSqrRowSumFilter(srcType, sumType, ksize.width, anchor.x ); - Ptr columnFilter = getColumnSumFilter(sumType, - dstType, ksize.height, anchor.y, - normalize ? 1./(ksize.width*ksize.height) : 1); - - Ptr f = makePtr(Ptr(), rowFilter, columnFilter, - srcType, dstType, sumType, borderType ); - Point ofs; - Size wsz(src.cols, src.rows); - src.locateROI( wsz, ofs ); - - f->apply( src, dst, wsz, ofs ); -} - -/* End of file. */ +#endif +CV_CPU_OPTIMIZATION_NAMESPACE_END +} // namespace diff --git a/modules/imgproc/src/filter.avx2.cpp b/modules/imgproc/src/filter.avx2.cpp deleted file mode 100644 index e9ced20e36..0000000000 --- a/modules/imgproc/src/filter.avx2.cpp +++ /dev/null @@ -1,197 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#include "precomp.hpp" -#include "filter.hpp" - -namespace cv -{ - -int RowVec_32f_AVX(const float* src0, const float* _kx, float* dst, int width, int cn, int _ksize) -{ - int i = 0, k; - for (; i <= width - 8; i += 8) - { - const float* src = src0 + i; - __m256 f, x0; - __m256 s0 = _mm256_set1_ps(0.0f); - for (k = 0; k < _ksize; k++, src += cn) - { - f = _mm256_set1_ps(_kx[k]); - x0 = _mm256_loadu_ps(src); -#if CV_FMA3 - s0 = _mm256_fmadd_ps(x0, f, s0); -#else - s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); -#endif - } - _mm256_storeu_ps(dst + i, s0); - } - _mm256_zeroupper(); - return i; -} - -int SymmColumnVec_32f_Symm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2) -{ - int i = 0, k; - const float *S, *S2; - const __m128 d4 = _mm_set1_ps(delta); - const __m256 d8 = _mm256_set1_ps(delta); - - for( ; i <= width - 16; i += 16 ) - { - __m256 f = _mm256_set1_ps(ky[0]); - __m256 s0, s1; - __m256 x0; - S = src[0] + i; - s0 = _mm256_loadu_ps(S); -#if CV_FMA3 - s0 = _mm256_fmadd_ps(s0, f, d8); -#else - s0 = _mm256_add_ps(_mm256_mul_ps(s0, f), d8); -#endif - s1 = _mm256_loadu_ps(S+8); -#if CV_FMA3 - s1 = _mm256_fmadd_ps(s1, f, d8); -#else - s1 = _mm256_add_ps(_mm256_mul_ps(s1, f), d8); -#endif - - for( k = 1; k <= ksize2; k++ ) - { - S = src[k] + i; - S2 = src[-k] + i; - f = _mm256_set1_ps(ky[k]); - x0 = _mm256_add_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2)); -#if CV_FMA3 - s0 = _mm256_fmadd_ps(x0, f, s0); -#else - s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); -#endif - x0 = _mm256_add_ps(_mm256_loadu_ps(S+8), _mm256_loadu_ps(S2+8)); -#if CV_FMA3 - s1 = _mm256_fmadd_ps(x0, f, s1); -#else - s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f)); -#endif - } - - _mm256_storeu_ps(dst + i, s0); - _mm256_storeu_ps(dst + i + 8, s1); - } - - for( ; i <= width - 4; i += 4 ) - { - __m128 f = _mm_set1_ps(ky[0]); - __m128 x0, s0 = _mm_load_ps(src[0] + i); - s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4); - - for( k = 1; k <= ksize2; k++ ) - { - f = _mm_set1_ps(ky[k]); - x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i)); - s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); - } - - _mm_storeu_ps(dst + i, s0); - } - - _mm256_zeroupper(); - return i; -} - -int SymmColumnVec_32f_Unsymm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2) -{ - int i = 0, k; - const float *S2; - const __m128 d4 = _mm_set1_ps(delta); - const __m256 d8 = _mm256_set1_ps(delta); - - for (; i <= width - 16; i += 16) - { - __m256 f, s0 = d8, s1 = d8; - __m256 x0; - - for (k = 1; k <= ksize2; k++) - { - const float *S = src[k] + i; - S2 = src[-k] + i; - f = _mm256_set1_ps(ky[k]); - x0 = _mm256_sub_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2)); -#if CV_FMA3 - s0 = _mm256_fmadd_ps(x0, f, s0); -#else - s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); -#endif - x0 = _mm256_sub_ps(_mm256_loadu_ps(S + 8), _mm256_loadu_ps(S2 + 8)); -#if CV_FMA3 - s1 = _mm256_fmadd_ps(x0, f, s1); -#else - s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f)); -#endif - } - - _mm256_storeu_ps(dst + i, s0); - _mm256_storeu_ps(dst + i + 8, s1); - } - - for (; i <= width - 4; i += 4) - { - __m128 f, x0, s0 = d4; - - for (k = 1; k <= ksize2; k++) - { - f = _mm_set1_ps(ky[k]); - x0 = _mm_sub_ps(_mm_load_ps(src[k] + i), _mm_load_ps(src[-k] + i)); - s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); - } - - _mm_storeu_ps(dst + i, s0); - } - - _mm256_zeroupper(); - return i; -} - -} - -/* End of file. */ diff --git a/modules/imgproc/src/filter.dispatch.cpp b/modules/imgproc/src/filter.dispatch.cpp new file mode 100644 index 0000000000..b6f5331028 --- /dev/null +++ b/modules/imgproc/src/filter.dispatch.cpp @@ -0,0 +1,1432 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" +#include "opencv2/core/opencl/ocl_defs.hpp" +#include "opencl_kernels_imgproc.hpp" +#include "hal_replacement.hpp" +#include "opencv2/core/hal/intrin.hpp" +#include "filter.hpp" + +#include "filter.simd.hpp" +#include "filter.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content + + +/****************************************************************************************\ + Base Image Filter +\****************************************************************************************/ + +namespace cv { + +BaseRowFilter::BaseRowFilter() { ksize = anchor = -1; } +BaseRowFilter::~BaseRowFilter() {} + +BaseColumnFilter::BaseColumnFilter() { ksize = anchor = -1; } +BaseColumnFilter::~BaseColumnFilter() {} +void BaseColumnFilter::reset() {} + +BaseFilter::BaseFilter() { ksize = Size(-1,-1); anchor = Point(-1,-1); } +BaseFilter::~BaseFilter() {} +void BaseFilter::reset() {} + +FilterEngine::FilterEngine() + : srcType(-1), dstType(-1), bufType(-1), maxWidth(0), wholeSize(-1, -1), dx1(0), dx2(0), + rowBorderType(BORDER_REPLICATE), columnBorderType(BORDER_REPLICATE), + borderElemSize(0), bufStep(0), startY(0), startY0(0), endY(0), rowCount(0), dstY(0) +{ +} + + +FilterEngine::FilterEngine( const Ptr& _filter2D, + const Ptr& _rowFilter, + const Ptr& _columnFilter, + int _srcType, int _dstType, int _bufType, + int _rowBorderType, int _columnBorderType, + const Scalar& _borderValue ) + : srcType(-1), dstType(-1), bufType(-1), maxWidth(0), wholeSize(-1, -1), dx1(0), dx2(0), + rowBorderType(BORDER_REPLICATE), columnBorderType(BORDER_REPLICATE), + borderElemSize(0), bufStep(0), startY(0), startY0(0), endY(0), rowCount(0), dstY(0) +{ + init(_filter2D, _rowFilter, _columnFilter, _srcType, _dstType, _bufType, + _rowBorderType, _columnBorderType, _borderValue); +} + +FilterEngine::~FilterEngine() +{ +} + + +void FilterEngine::init( const Ptr& _filter2D, + const Ptr& _rowFilter, + const Ptr& _columnFilter, + int _srcType, int _dstType, int _bufType, + int _rowBorderType, int _columnBorderType, + const Scalar& _borderValue ) +{ + _srcType = CV_MAT_TYPE(_srcType); + _bufType = CV_MAT_TYPE(_bufType); + _dstType = CV_MAT_TYPE(_dstType); + + srcType = _srcType; + int srcElemSize = (int)getElemSize(srcType); + dstType = _dstType; + bufType = _bufType; + + filter2D = _filter2D; + rowFilter = _rowFilter; + columnFilter = _columnFilter; + + if( _columnBorderType < 0 ) + _columnBorderType = _rowBorderType; + + rowBorderType = _rowBorderType; + columnBorderType = _columnBorderType; + + CV_Assert( columnBorderType != BORDER_WRAP ); + + if( isSeparable() ) + { + CV_Assert( rowFilter && columnFilter ); + ksize = Size(rowFilter->ksize, columnFilter->ksize); + anchor = Point(rowFilter->anchor, columnFilter->anchor); + } + else + { + CV_Assert( bufType == srcType ); + ksize = filter2D->ksize; + anchor = filter2D->anchor; + } + + CV_Assert( 0 <= anchor.x && anchor.x < ksize.width && + 0 <= anchor.y && anchor.y < ksize.height ); + + borderElemSize = srcElemSize/(CV_MAT_DEPTH(srcType) >= CV_32S ? sizeof(int) : 1); + int borderLength = std::max(ksize.width - 1, 1); + borderTab.resize(borderLength*borderElemSize); + + maxWidth = bufStep = 0; + constBorderRow.clear(); + + if( rowBorderType == BORDER_CONSTANT || columnBorderType == BORDER_CONSTANT ) + { + constBorderValue.resize(srcElemSize*borderLength); + int srcType1 = CV_MAKETYPE(CV_MAT_DEPTH(srcType), MIN(CV_MAT_CN(srcType), 4)); + scalarToRawData(_borderValue, &constBorderValue[0], srcType1, + borderLength*CV_MAT_CN(srcType)); + } + + wholeSize = Size(-1,-1); +} + +#define VEC_ALIGN CV_MALLOC_ALIGN + +int FilterEngine::start(const Size& _wholeSize, const Size& sz, const Point& ofs) +{ + CV_INSTRUMENT_REGION(); + + CV_CPU_DISPATCH(FilterEngine__start, (*this, _wholeSize, sz, ofs), + CV_CPU_DISPATCH_MODES_ALL); +} + + +int FilterEngine::start(const Mat& src, const Size &wsz, const Point &ofs) +{ + start( wsz, src.size(), ofs); + return startY - ofs.y; +} + +int FilterEngine::remainingInputRows() const +{ + return endY - startY - rowCount; +} + +int FilterEngine::remainingOutputRows() const +{ + return roi.height - dstY; +} + +int FilterEngine::proceed(const uchar* src, int srcstep, int count, + uchar* dst, int dststep) +{ + CV_INSTRUMENT_REGION(); + + CV_Assert( wholeSize.width > 0 && wholeSize.height > 0 ); + + CV_CPU_DISPATCH(FilterEngine__proceed, (*this, src, srcstep, count, dst, dststep), + CV_CPU_DISPATCH_MODES_ALL); +} + +void FilterEngine::apply(const Mat& src, Mat& dst, const Size& wsz, const Point& ofs) +{ + CV_INSTRUMENT_REGION(); + + CV_CheckTypeEQ(src.type(), srcType, ""); + CV_CheckTypeEQ(dst.type(), dstType, ""); + + CV_CPU_DISPATCH(FilterEngine__apply, (*this, src, dst, wsz, ofs), + CV_CPU_DISPATCH_MODES_ALL); +} + +/****************************************************************************************\ +* Separable linear filter * +\****************************************************************************************/ + +int getKernelType(InputArray filter_kernel, Point anchor) +{ + Mat _kernel = filter_kernel.getMat(); + CV_Assert( _kernel.channels() == 1 ); + int i, sz = _kernel.rows*_kernel.cols; + + Mat kernel; + _kernel.convertTo(kernel, CV_64F); + + const double* coeffs = kernel.ptr(); + double sum = 0; + int type = KERNEL_SMOOTH + KERNEL_INTEGER; + if( (_kernel.rows == 1 || _kernel.cols == 1) && + anchor.x*2 + 1 == _kernel.cols && + anchor.y*2 + 1 == _kernel.rows ) + type |= (KERNEL_SYMMETRICAL + KERNEL_ASYMMETRICAL); + + for( i = 0; i < sz; i++ ) + { + double a = coeffs[i], b = coeffs[sz - i - 1]; + if( a != b ) + type &= ~KERNEL_SYMMETRICAL; + if( a != -b ) + type &= ~KERNEL_ASYMMETRICAL; + if( a < 0 ) + type &= ~KERNEL_SMOOTH; + if( a != saturate_cast(a) ) + type &= ~KERNEL_INTEGER; + sum += a; + } + + if( fabs(sum - 1) > FLT_EPSILON*(fabs(sum) + 1) ) + type &= ~KERNEL_SMOOTH; + return type; +} + + +Ptr getLinearRowFilter( + int srcType, int bufType, + InputArray _kernel, int anchor, + int symmetryType) +{ + CV_INSTRUMENT_REGION(); + + Mat kernelMat = _kernel.getMat(); + CV_CPU_DISPATCH(getLinearRowFilter, (srcType, bufType, kernelMat, anchor, symmetryType), + CV_CPU_DISPATCH_MODES_ALL); +} + + +Ptr getLinearColumnFilter( + int bufType, int dstType, + InputArray kernel, int anchor, + int symmetryType, double delta, + int bits) +{ + CV_INSTRUMENT_REGION(); + + Mat kernelMat = kernel.getMat(); + CV_CPU_DISPATCH(getLinearColumnFilter, (bufType, dstType, kernelMat, anchor, symmetryType, delta, bits), + CV_CPU_DISPATCH_MODES_ALL); +} + + +Ptr createSeparableLinearFilter( + int _srcType, int _dstType, + InputArray __rowKernel, InputArray __columnKernel, + Point _anchor, double _delta, + int _rowBorderType, int _columnBorderType, + const Scalar& _borderValue) +{ + Mat _rowKernel = __rowKernel.getMat(), _columnKernel = __columnKernel.getMat(); + _srcType = CV_MAT_TYPE(_srcType); + _dstType = CV_MAT_TYPE(_dstType); + int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType); + int cn = CV_MAT_CN(_srcType); + CV_Assert( cn == CV_MAT_CN(_dstType) ); + int rsize = _rowKernel.rows + _rowKernel.cols - 1; + int csize = _columnKernel.rows + _columnKernel.cols - 1; + if( _anchor.x < 0 ) + _anchor.x = rsize/2; + if( _anchor.y < 0 ) + _anchor.y = csize/2; + int rtype = getKernelType(_rowKernel, + _rowKernel.rows == 1 ? Point(_anchor.x, 0) : Point(0, _anchor.x)); + int ctype = getKernelType(_columnKernel, + _columnKernel.rows == 1 ? Point(_anchor.y, 0) : Point(0, _anchor.y)); + Mat rowKernel, columnKernel; + + int bdepth = std::max(CV_32F,std::max(sdepth, ddepth)); + int bits = 0; + + if( sdepth == CV_8U && + ((rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && + ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && + ddepth == CV_8U) || + ((rtype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) && + (ctype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) && + (rtype & ctype & KERNEL_INTEGER) && + ddepth == CV_16S)) ) + { + bdepth = CV_32S; + bits = ddepth == CV_8U ? 8 : 0; + _rowKernel.convertTo( rowKernel, CV_32S, 1 << bits ); + _columnKernel.convertTo( columnKernel, CV_32S, 1 << bits ); + bits *= 2; + _delta *= (1 << bits); + } + else + { + if( _rowKernel.type() != bdepth ) + _rowKernel.convertTo( rowKernel, bdepth ); + else + rowKernel = _rowKernel; + if( _columnKernel.type() != bdepth ) + _columnKernel.convertTo( columnKernel, bdepth ); + else + columnKernel = _columnKernel; + } + + int _bufType = CV_MAKETYPE(bdepth, cn); + Ptr _rowFilter = getLinearRowFilter( + _srcType, _bufType, rowKernel, _anchor.x, rtype); + Ptr _columnFilter = getLinearColumnFilter( + _bufType, _dstType, columnKernel, _anchor.y, ctype, _delta, bits ); + + return Ptr( new FilterEngine(Ptr(), _rowFilter, _columnFilter, + _srcType, _dstType, _bufType, _rowBorderType, _columnBorderType, _borderValue )); +} + + +/****************************************************************************************\ +* Non-separable linear filter * +\****************************************************************************************/ + +void preprocess2DKernel( const Mat& kernel, std::vector& coords, std::vector& coeffs ) +{ + int i, j, k, nz = countNonZero(kernel), ktype = kernel.type(); + if(nz == 0) + nz = 1; + CV_Assert( ktype == CV_8U || ktype == CV_32S || ktype == CV_32F || ktype == CV_64F ); + coords.resize(nz); + coeffs.resize(nz*getElemSize(ktype)); + uchar* _coeffs = &coeffs[0]; + + for( i = k = 0; i < kernel.rows; i++ ) + { + const uchar* krow = kernel.ptr(i); + for( j = 0; j < kernel.cols; j++ ) + { + if( ktype == CV_8U ) + { + uchar val = krow[j]; + if( val == 0 ) + continue; + coords[k] = Point(j,i); + _coeffs[k++] = val; + } + else if( ktype == CV_32S ) + { + int val = ((const int*)krow)[j]; + if( val == 0 ) + continue; + coords[k] = Point(j,i); + ((int*)_coeffs)[k++] = val; + } + else if( ktype == CV_32F ) + { + float val = ((const float*)krow)[j]; + if( val == 0 ) + continue; + coords[k] = Point(j,i); + ((float*)_coeffs)[k++] = val; + } + else + { + double val = ((const double*)krow)[j]; + if( val == 0 ) + continue; + coords[k] = Point(j,i); + ((double*)_coeffs)[k++] = val; + } + } + } +} + + +template struct Filter2D : public BaseFilter +{ + typedef typename CastOp::type1 KT; + typedef typename CastOp::rtype DT; + + Filter2D( const Mat& _kernel, Point _anchor, + double _delta, const CastOp& _castOp=CastOp(), + const VecOp& _vecOp=VecOp() ) + { + anchor = _anchor; + ksize = _kernel.size(); + delta = saturate_cast(_delta); + castOp0 = _castOp; + vecOp = _vecOp; + CV_Assert( _kernel.type() == DataType::type ); + preprocess2DKernel( _kernel, coords, coeffs ); + ptrs.resize( coords.size() ); + } + + void operator()(const uchar** src, uchar* dst, int dststep, int count, int width, int cn) CV_OVERRIDE + { + KT _delta = delta; + const Point* pt = &coords[0]; + const KT* kf = (const KT*)&coeffs[0]; + const ST** kp = (const ST**)&ptrs[0]; + int i, k, nz = (int)coords.size(); + CastOp castOp = castOp0; + + width *= cn; + for( ; count > 0; count--, dst += dststep, src++ ) + { + DT* D = (DT*)dst; + + for( k = 0; k < nz; k++ ) + kp[k] = (const ST*)src[pt[k].y] + pt[k].x*cn; + + i = vecOp((const uchar**)kp, dst, width); + #if CV_ENABLE_UNROLLED + for( ; i <= width - 4; i += 4 ) + { + KT s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta; + + for( k = 0; k < nz; k++ ) + { + const ST* sptr = kp[k] + i; + KT f = kf[k]; + s0 += f*sptr[0]; + s1 += f*sptr[1]; + s2 += f*sptr[2]; + s3 += f*sptr[3]; + } + + D[i] = castOp(s0); D[i+1] = castOp(s1); + D[i+2] = castOp(s2); D[i+3] = castOp(s3); + } + #endif + for( ; i < width; i++ ) + { + KT s0 = _delta; + for( k = 0; k < nz; k++ ) + s0 += kf[k]*kp[k][i]; + D[i] = castOp(s0); + } + } + } + + std::vector coords; + std::vector coeffs; + std::vector ptrs; + KT delta; + CastOp castOp0; + VecOp vecOp; +}; + +#ifdef HAVE_OPENCL + +#define DIVUP(total, grain) (((total) + (grain) - 1) / (grain)) +#define ROUNDUP(sz, n) ((sz) + (n) - 1 - (((sz) + (n) - 1) % (n))) + +// prepare kernel: transpose and make double rows (+align). Returns size of aligned row +// Samples: +// a b c +// Input: d e f +// g h i +// Output, last two zeros is the alignment: +// a d g a d g 0 0 +// b e h b e h 0 0 +// c f i c f i 0 0 +template +static int _prepareKernelFilter2D(std::vector & data, const Mat & kernel) +{ + Mat _kernel; kernel.convertTo(_kernel, DataDepth::value); + int size_y_aligned = ROUNDUP(kernel.rows * 2, 4); + data.clear(); data.resize(size_y_aligned * kernel.cols, 0); + for (int x = 0; x < kernel.cols; x++) + { + for (int y = 0; y < kernel.rows; y++) + { + data[x * size_y_aligned + y] = _kernel.at(y, x); + data[x * size_y_aligned + y + kernel.rows] = _kernel.at(y, x); + } + } + return size_y_aligned; +} + +static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth, + InputArray _kernel, Point anchor, + double delta, int borderType ) +{ + int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + ddepth = ddepth < 0 ? sdepth : ddepth; + int dtype = CV_MAKE_TYPE(ddepth, cn), wdepth = std::max(std::max(sdepth, ddepth), CV_32F), + wtype = CV_MAKE_TYPE(wdepth, cn); + if (cn > 4) + return false; + + Size ksize = _kernel.size(); + if (anchor.x < 0) + anchor.x = ksize.width / 2; + if (anchor.y < 0) + anchor.y = ksize.height / 2; + + bool isolated = (borderType & BORDER_ISOLATED) != 0; + borderType &= ~BORDER_ISOLATED; + const cv::ocl::Device &device = cv::ocl::Device::getDefault(); + bool doubleSupport = device.doubleFPConfig() > 0; + if (wdepth == CV_64F && !doubleSupport) + return false; + + const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", + "BORDER_WRAP", "BORDER_REFLECT_101" }; + + cv::Mat kernelMat = _kernel.getMat(); + cv::Size sz = _src.size(), wholeSize; + size_t globalsize[2] = { (size_t)sz.width, (size_t)sz.height }; + size_t localsize_general[2] = {0, 1}; + size_t* localsize = NULL; + + ocl::Kernel k; + UMat src = _src.getUMat(); + if (!isolated) + { + Point ofs; + src.locateROI(wholeSize, ofs); + } + + size_t tryWorkItems = device.maxWorkGroupSize(); + if (device.isIntel() && 128 < tryWorkItems) + tryWorkItems = 128; + char cvt[2][40]; + + // For smaller filter kernels, there is a special kernel that is more + // efficient than the general one. + UMat kernalDataUMat; + if (device.isIntel() && (device.type() & ocl::Device::TYPE_GPU) && + ((ksize.width < 5 && ksize.height < 5) || + (ksize.width == 5 && ksize.height == 5 && cn == 1))) + { + kernelMat = kernelMat.reshape(0, 1); + String kerStr = ocl::kernelToStr(kernelMat, CV_32F); + int h = isolated ? sz.height : wholeSize.height; + int w = isolated ? sz.width : wholeSize.width; + + if (w < ksize.width || h < ksize.height) + return false; + + // Figure out what vector size to use for loading the pixels. + int pxLoadNumPixels = cn != 1 || sz.width % 4 ? 1 : 4; + int pxLoadVecSize = cn * pxLoadNumPixels; + + // Figure out how many pixels per work item to compute in X and Y + // directions. Too many and we run out of registers. + int pxPerWorkItemX = 1; + int pxPerWorkItemY = 1; + if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4) + { + pxPerWorkItemX = sz.width % 8 ? sz.width % 4 ? sz.width % 2 ? 1 : 2 : 4 : 8; + pxPerWorkItemY = sz.height % 2 ? 1 : 2; + } + else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4)) + { + pxPerWorkItemX = sz.width % 2 ? 1 : 2; + pxPerWorkItemY = sz.height % 2 ? 1 : 2; + } + globalsize[0] = sz.width / pxPerWorkItemX; + globalsize[1] = sz.height / pxPerWorkItemY; + + // Need some padding in the private array for pixels + int privDataWidth = ROUNDUP(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels); + + // Make the global size a nice round number so the runtime can pick + // from reasonable choices for the workgroup size + const int wgRound = 256; + globalsize[0] = ROUNDUP(globalsize[0], wgRound); + + char build_options[1024]; + sprintf(build_options, "-D cn=%d " + "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d " + "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d " + "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s " + "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d " + "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s " + "-D convertToWT=%s -D convertToDstT=%s %s", + cn, anchor.x, anchor.y, ksize.width, ksize.height, + pxLoadVecSize, pxLoadNumPixels, + pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType], + isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", + privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1, + ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype), + ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth), + ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), + ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), kerStr.c_str()); + + if (!k.create("filter2DSmall", cv::ocl::imgproc::filter2DSmall_oclsrc, build_options)) + return false; + } + else + { + localsize = localsize_general; + std::vector kernelMatDataFloat; + int kernel_size_y2_aligned = _prepareKernelFilter2D(kernelMatDataFloat, kernelMat); + String kerStr = ocl::kernelToStr(kernelMatDataFloat, CV_32F); + + for ( ; ; ) + { + size_t BLOCK_SIZE = tryWorkItems; + while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)sz.width * 2) + BLOCK_SIZE /= 2; + + if ((size_t)ksize.width > BLOCK_SIZE) + return false; + + int requiredTop = anchor.y; + int requiredLeft = (int)BLOCK_SIZE; // not this: anchor.x; + int requiredBottom = ksize.height - 1 - anchor.y; + int requiredRight = (int)BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x; + int h = isolated ? sz.height : wholeSize.height; + int w = isolated ? sz.width : wholeSize.width; + bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight; + + if ((w < ksize.width) || (h < ksize.height)) + return false; + + String opts = format("-D LOCAL_SIZE=%d -D cn=%d " + "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d " + "-D KERNEL_SIZE_Y2_ALIGNED=%d -D %s -D %s -D %s%s%s " + "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s " + "-D convertToWT=%s -D convertToDstT=%s", + (int)BLOCK_SIZE, cn, anchor.x, anchor.y, + ksize.width, ksize.height, kernel_size_y2_aligned, borderMap[borderType], + extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", + isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", + doubleSupport ? " -D DOUBLE_SUPPORT" : "", kerStr.c_str(), + ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype), + ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth), + ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), + ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1])); + + localsize[0] = BLOCK_SIZE; + globalsize[0] = DIVUP(sz.width, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE; + globalsize[1] = sz.height; + + if (!k.create("filter2D", cv::ocl::imgproc::filter2D_oclsrc, opts)) + return false; + + size_t kernelWorkGroupSize = k.workGroupSize(); + if (localsize[0] <= kernelWorkGroupSize) + break; + if (BLOCK_SIZE < kernelWorkGroupSize) + return false; + tryWorkItems = kernelWorkGroupSize; + } + } + + _dst.create(sz, dtype); + UMat dst = _dst.getUMat(); + + int srcOffsetX = (int)((src.offset % src.step) / src.elemSize()); + int srcOffsetY = (int)(src.offset / src.step); + int srcEndX = (isolated ? (srcOffsetX + sz.width) : wholeSize.width); + int srcEndY = (isolated ? (srcOffsetY + sz.height) : wholeSize.height); + + k.args(ocl::KernelArg::PtrReadOnly(src), (int)src.step, srcOffsetX, srcOffsetY, + srcEndX, srcEndY, ocl::KernelArg::WriteOnly(dst), (float)delta); + + return k.run(2, globalsize, localsize, false); +} + +const int shift_bits = 8; + +static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX, int anchor, + int borderType, int ddepth, bool fast8uc1, bool int_arithm) +{ + int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type); + bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + Size bufSize = buf.size(); + int buf_type = buf.type(), bdepth = CV_MAT_DEPTH(buf_type); + + if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) + return false; + +#ifdef __ANDROID__ + size_t localsize[2] = {16, 10}; +#else + size_t localsize[2] = {16, 16}; +#endif + + size_t globalsize[2] = {DIVUP(bufSize.width, localsize[0]) * localsize[0], DIVUP(bufSize.height, localsize[1]) * localsize[1]}; + if (fast8uc1) + globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0]; + + int radiusX = anchor, radiusY = (buf.rows - src.rows) >> 1; + + bool isolated = (borderType & BORDER_ISOLATED) != 0; + const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101" }, + * const btype = borderMap[borderType & ~BORDER_ISOLATED]; + + bool extra_extrapolation = src.rows < (int)((-radiusY + globalsize[1]) >> 1) + 1; + extra_extrapolation |= src.rows < radiusY; + extra_extrapolation |= src.cols < (int)((-radiusX + globalsize[0] + 8 * localsize[0] + 3) >> 1) + 1; + extra_extrapolation |= src.cols < radiusX; + + char cvt[40]; + cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s" + " -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s%s%s", + radiusX, (int)localsize[0], (int)localsize[1], cn, btype, + extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", + isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", + ocl::typeToStr(type), ocl::typeToStr(buf_type), + ocl::convertTypeStr(sdepth, bdepth, cn, cvt), + ocl::typeToStr(sdepth), ocl::typeToStr(bdepth), + doubleSupport ? " -D DOUBLE_SUPPORT" : "", + int_arithm ? " -D INTEGER_ARITHMETIC" : ""); + build_options += ocl::kernelToStr(kernelX, bdepth); + + Size srcWholeSize; Point srcOffset; + src.locateROI(srcWholeSize, srcOffset); + + String kernelName("row_filter"); + if (fast8uc1) + kernelName += "_C1_D0"; + + ocl::Kernel k(kernelName.c_str(), cv::ocl::imgproc::filterSepRow_oclsrc, + build_options); + if (k.empty()) + return false; + + if (fast8uc1) + k.args(ocl::KernelArg::PtrReadOnly(src), (int)(src.step / src.elemSize()), srcOffset.x, + srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height, + ocl::KernelArg::PtrWriteOnly(buf), (int)(buf.step / buf.elemSize()), + buf.cols, buf.rows, radiusY); + else + k.args(ocl::KernelArg::PtrReadOnly(src), (int)src.step, srcOffset.x, + srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height, + ocl::KernelArg::PtrWriteOnly(buf), (int)buf.step, buf.cols, buf.rows, radiusY); + + return k.run(2, globalsize, localsize, false); +} + +static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor, bool int_arithm) +{ + bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + if (dst.depth() == CV_64F && !doubleSupport) + return false; + +#ifdef __ANDROID__ + size_t localsize[2] = { 16, 10 }; +#else + size_t localsize[2] = { 16, 16 }; +#endif + size_t globalsize[2] = { 0, 0 }; + + int dtype = dst.type(), cn = CV_MAT_CN(dtype), ddepth = CV_MAT_DEPTH(dtype); + Size sz = dst.size(); + int buf_type = buf.type(), bdepth = CV_MAT_DEPTH(buf_type); + + globalsize[1] = DIVUP(sz.height, localsize[1]) * localsize[1]; + globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0]; + + char cvt[40]; + cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d" + " -D srcT=%s -D dstT=%s -D convertToDstT=%s" + " -D srcT1=%s -D dstT1=%s -D SHIFT_BITS=%d%s%s", + anchor, (int)localsize[0], (int)localsize[1], cn, + ocl::typeToStr(buf_type), ocl::typeToStr(dtype), + ocl::convertTypeStr(bdepth, ddepth, cn, cvt), + ocl::typeToStr(bdepth), ocl::typeToStr(ddepth), + 2*shift_bits, doubleSupport ? " -D DOUBLE_SUPPORT" : "", + int_arithm ? " -D INTEGER_ARITHMETIC" : ""); + build_options += ocl::kernelToStr(kernelY, bdepth); + + ocl::Kernel k("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc, + build_options); + if (k.empty()) + return false; + + k.args(ocl::KernelArg::ReadOnly(buf), ocl::KernelArg::WriteOnly(dst), + static_cast(delta)); + + return k.run(2, globalsize, localsize, false); +} + +const int optimizedSepFilterLocalWidth = 16; +const int optimizedSepFilterLocalHeight = 8; + +static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, + Mat row_kernel, Mat col_kernel, + double delta, int borderType, int ddepth, int bdepth, bool int_arithm) +{ + Size size = _src.size(), wholeSize; + Point origin; + int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), + esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), bdepth), + dtype = CV_MAKE_TYPE(ddepth, cn); + size_t src_step = _src.step(), src_offset = _src.offset(); + bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + + if (esz == 0 || src_step == 0 + || (src_offset % src_step) % esz != 0 + || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) + || !(borderType == BORDER_CONSTANT + || borderType == BORDER_REPLICATE + || borderType == BORDER_REFLECT + || borderType == BORDER_WRAP + || borderType == BORDER_REFLECT_101)) + return false; + + size_t lt2[2] = { optimizedSepFilterLocalWidth, optimizedSepFilterLocalHeight }; + size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1]}; + + char cvt[2][40]; + const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", + "BORDER_REFLECT_101" }; + + String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s" + " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s" + " -D %s -D srcT1=%s -D dstT1=%s -D WT1=%s -D CN=%d -D SHIFT_BITS=%d%s", + (int)lt2[0], (int)lt2[1], row_kernel.cols / 2, col_kernel.cols / 2, + ocl::kernelToStr(row_kernel, wdepth, "KERNEL_MATRIX_X").c_str(), + ocl::kernelToStr(col_kernel, wdepth, "KERNEL_MATRIX_Y").c_str(), + ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), + ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype), + ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType], + ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), ocl::typeToStr(wdepth), + cn, 2*shift_bits, int_arithm ? " -D INTEGER_ARITHMETIC" : ""); + + ocl::Kernel k("sep_filter", ocl::imgproc::filterSep_singlePass_oclsrc, opts); + if (k.empty()) + return false; + + UMat src = _src.getUMat(); + _dst.create(size, dtype); + UMat dst = _dst.getUMat(); + + int src_offset_x = static_cast((src_offset % src_step) / esz); + int src_offset_y = static_cast(src_offset / src_step); + + src.locateROI(wholeSize, origin); + + k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, src_offset_x, src_offset_y, + wholeSize.height, wholeSize.width, ocl::KernelArg::WriteOnly(dst), + static_cast(delta)); + + return k.run(2, gt2, lt2, false); +} + +bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, + InputArray _kernelX, InputArray _kernelY, Point anchor, + double delta, int borderType ) +{ + const ocl::Device & d = ocl::Device::getDefault(); + Size imgSize = _src.size(); + + int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + if (cn > 4) + return false; + + Mat kernelX = _kernelX.getMat().reshape(1, 1); + if (kernelX.cols % 2 != 1) + return false; + Mat kernelY = _kernelY.getMat().reshape(1, 1); + if (kernelY.cols % 2 != 1) + return false; + + if (ddepth < 0) + ddepth = sdepth; + + if (anchor.x < 0) + anchor.x = kernelX.cols >> 1; + if (anchor.y < 0) + anchor.y = kernelY.cols >> 1; + + int rtype = getKernelType(kernelX, + kernelX.rows == 1 ? Point(anchor.x, 0) : Point(0, anchor.x)); + int ctype = getKernelType(kernelY, + kernelY.rows == 1 ? Point(anchor.y, 0) : Point(0, anchor.y)); + + int bdepth = CV_32F; + bool int_arithm = false; + if( sdepth == CV_8U && ddepth == CV_8U && + rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && + ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL) + { + if (ocl::Device::getDefault().isIntel()) + { + for (int i=0; i(0, i) = (float) cvRound(kernelX.at(0, i) * (1 << shift_bits)); + if (kernelX.data != kernelY.data) + for (int i=0; i(0, i) = (float) cvRound(kernelY.at(0, i) * (1 << shift_bits)); + } else + { + bdepth = CV_32S; + kernelX.convertTo( kernelX, bdepth, 1 << shift_bits ); + kernelY.convertTo( kernelY, bdepth, 1 << shift_bits ); + } + int_arithm = true; + } + + CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 && + imgSize.width > optimizedSepFilterLocalWidth + anchor.x && + imgSize.height > optimizedSepFilterLocalHeight + anchor.y && + (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && + anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) && + OCL_PERFORMANCE_CHECK(d.isIntel()), // TODO FIXIT + ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta, + borderType & ~BORDER_ISOLATED, ddepth, bdepth, int_arithm), true) + + UMat src = _src.getUMat(); + Size srcWholeSize; Point srcOffset; + src.locateROI(srcWholeSize, srcOffset); + + bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 && + src.cols % 4 == 0 && src.step % 4 == 0; + + Size srcSize = src.size(); + Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1); + UMat buf(bufSize, CV_MAKETYPE(bdepth, cn)); + if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, int_arithm)) + return false; + + _dst.create(srcSize, CV_MAKETYPE(ddepth, cn)); + UMat dst = _dst.getUMat(); + + return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, int_arithm); +} + +#endif + +Ptr getLinearFilter( + int srcType, int dstType, + InputArray filter_kernel, Point anchor, + double delta, int bits) +{ + CV_INSTRUMENT_REGION(); + + Mat kernelMat = filter_kernel.getMat(); + CV_CPU_DISPATCH(getLinearFilter, (srcType, dstType, kernelMat, anchor, delta, bits), + CV_CPU_DISPATCH_MODES_ALL); +} + + +Ptr createLinearFilter( + int _srcType, int _dstType, + InputArray filter_kernel, + Point _anchor, double _delta, + int _rowBorderType, int _columnBorderType, + const Scalar& _borderValue) +{ + Mat _kernel = filter_kernel.getMat(); + _srcType = CV_MAT_TYPE(_srcType); + _dstType = CV_MAT_TYPE(_dstType); + int cn = CV_MAT_CN(_srcType); + CV_Assert( cn == CV_MAT_CN(_dstType) ); + + Mat kernel = _kernel; + int bits = 0; + + /*int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType); + int ktype = _kernel.depth() == CV_32S ? KERNEL_INTEGER : getKernelType(_kernel, _anchor); + if( sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S) && + _kernel.rows*_kernel.cols <= (1 << 10) ) + { + bits = (ktype & KERNEL_INTEGER) ? 0 : 11; + _kernel.convertTo(kernel, CV_32S, 1 << bits); + }*/ + + Ptr _filter2D = getLinearFilter(_srcType, _dstType, + kernel, _anchor, _delta, bits); + + return makePtr(_filter2D, Ptr(), + Ptr(), _srcType, _dstType, _srcType, + _rowBorderType, _columnBorderType, _borderValue ); +} + + +//================================================================ +// HAL interface +//================================================================ + +static bool replacementFilter2D(int stype, int dtype, int kernel_type, + uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int full_width, int full_height, + int offset_x, int offset_y, + uchar * kernel_data, size_t kernel_step, + int kernel_width, int kernel_height, + int anchor_x, int anchor_y, + double delta, int borderType, bool isSubmatrix) +{ + cvhalFilter2D* ctx; + int res = cv_hal_filterInit(&ctx, kernel_data, kernel_step, kernel_type, kernel_width, kernel_height, width, height, + stype, dtype, borderType, delta, anchor_x, anchor_y, isSubmatrix, src_data == dst_data); + if (res != CV_HAL_ERROR_OK) + return false; + res = cv_hal_filter(ctx, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); + bool success = (res == CV_HAL_ERROR_OK); + res = cv_hal_filterFree(ctx); + if (res != CV_HAL_ERROR_OK) + return false; + return success; +} + +#ifdef HAVE_IPP +static bool ippFilter2D(int stype, int dtype, int kernel_type, + uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int full_width, int full_height, + int offset_x, int offset_y, + uchar * kernel_data, size_t kernel_step, + int kernel_width, int kernel_height, + int anchor_x, int anchor_y, + double delta, int borderType, + bool isSubmatrix) +{ +#ifdef HAVE_IPP_IW + CV_INSTRUMENT_REGION_IPP(); + + ::ipp::IwiSize iwSize(width, height); + ::ipp::IwiSize kernelSize(kernel_width, kernel_height); + IppDataType type = ippiGetDataType(CV_MAT_DEPTH(stype)); + int channels = CV_MAT_CN(stype); + + CV_UNUSED(isSubmatrix); + +#if IPP_VERSION_X100 >= 201700 && IPP_VERSION_X100 <= 201702 // IPP bug with 1x1 kernel + if(kernel_width == 1 && kernel_height == 1) + return false; +#endif + +#if IPP_DISABLE_FILTER2D_BIG_MASK + // Too big difference compared to OpenCV FFT-based convolution + if(kernel_type == CV_32FC1 && (type == ipp16s || type == ipp16u) && (kernel_width > 7 || kernel_height > 7)) + return false; + + // Poor optimization for big kernels + if(kernel_width > 7 || kernel_height > 7) + return false; +#endif + + if(src_data == dst_data) + return false; + + if(stype != dtype) + return false; + + if(kernel_type != CV_16SC1 && kernel_type != CV_32FC1) + return false; + + // TODO: Implement offset for 8u, 16u + if(std::fabs(delta) >= DBL_EPSILON) + return false; + + if(!ippiCheckAnchor(anchor_x, anchor_y, kernel_width, kernel_height)) + return false; + + try + { + ::ipp::IwiBorderSize iwBorderSize; + ::ipp::IwiBorderType iwBorderType; + ::ipp::IwiImage iwKernel(ippiSize(kernel_width, kernel_height), ippiGetDataType(CV_MAT_DEPTH(kernel_type)), CV_MAT_CN(kernel_type), 0, (void*)kernel_data, kernel_step); + ::ipp::IwiImage iwSrc(iwSize, type, channels, ::ipp::IwiBorderSize(offset_x, offset_y, full_width-offset_x-width, full_height-offset_y-height), (void*)src_data, src_step); + ::ipp::IwiImage iwDst(iwSize, type, channels, ::ipp::IwiBorderSize(offset_x, offset_y, full_width-offset_x-width, full_height-offset_y-height), (void*)dst_data, dst_step); + + iwBorderSize = ::ipp::iwiSizeToBorderSize(kernelSize); + iwBorderType = ippiGetBorder(iwSrc, borderType, iwBorderSize); + if(!iwBorderType) + return false; + + CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilter, iwSrc, iwDst, iwKernel, ::ipp::IwiFilterParams(1, 0, ippAlgHintNone, ippRndFinancial), iwBorderType); + } + catch(const ::ipp::IwException& ex) + { + CV_UNUSED(ex); + return false; + } + + return true; +#else + CV_UNUSED(stype); CV_UNUSED(dtype); CV_UNUSED(kernel_type); CV_UNUSED(src_data); CV_UNUSED(src_step); + CV_UNUSED(dst_data); CV_UNUSED(dst_step); CV_UNUSED(width); CV_UNUSED(height); CV_UNUSED(full_width); + CV_UNUSED(full_height); CV_UNUSED(offset_x); CV_UNUSED(offset_y); CV_UNUSED(kernel_data); CV_UNUSED(kernel_step); + CV_UNUSED(kernel_width); CV_UNUSED(kernel_height); CV_UNUSED(anchor_x); CV_UNUSED(anchor_y); CV_UNUSED(delta); + CV_UNUSED(borderType); CV_UNUSED(isSubmatrix); + return false; +#endif +} +#endif + +static bool dftFilter2D(int stype, int dtype, int kernel_type, + uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int full_width, int full_height, + int offset_x, int offset_y, + uchar * kernel_data, size_t kernel_step, + int kernel_width, int kernel_height, + int anchor_x, int anchor_y, + double delta, int borderType) +{ + { + int sdepth = CV_MAT_DEPTH(stype); + int ddepth = CV_MAT_DEPTH(dtype); + int dft_filter_size = checkHardwareSupport(CV_CPU_SSE3) && ((sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) || (sdepth == CV_32F && ddepth == CV_32F)) ? 130 : 50; + if (kernel_width * kernel_height < dft_filter_size) + return false; + } + + Point anchor = Point(anchor_x, anchor_y); + Mat kernel = Mat(Size(kernel_width, kernel_height), kernel_type, kernel_data, kernel_step); + + Mat src(Size(full_width-offset_x, full_height-offset_y), stype, src_data, src_step); + Mat dst(Size(full_width, full_height), dtype, dst_data, dst_step); + Mat temp; + int src_channels = CV_MAT_CN(stype); + int dst_channels = CV_MAT_CN(dtype); + int ddepth = CV_MAT_DEPTH(dtype); + // crossCorr doesn't accept non-zero delta with multiple channels + if (src_channels != 1 && delta != 0) { + // The semantics of filter2D require that the delta be applied + // as floating-point math. So wee need an intermediate Mat + // with a float datatype. If the dest is already floats, + // we just use that. + int corrDepth = ddepth; + if ((ddepth == CV_32F || ddepth == CV_64F) && src_data != dst_data) { + temp = Mat(Size(full_width, full_height), dtype, dst_data, dst_step); + } else { + corrDepth = ddepth == CV_64F ? CV_64F : CV_32F; + temp.create(Size(full_width, full_height), CV_MAKETYPE(corrDepth, dst_channels)); + } + crossCorr(src, kernel, temp, src.size(), + CV_MAKETYPE(corrDepth, src_channels), + anchor, 0, borderType); + add(temp, delta, temp); + if (temp.data != dst_data) { + temp.convertTo(dst, dst.type()); + } + } else { + if (src_data != dst_data) + temp = Mat(Size(full_width, full_height), dtype, dst_data, dst_step); + else + temp.create(Size(full_width, full_height), dtype); + crossCorr(src, kernel, temp, src.size(), + CV_MAKETYPE(ddepth, src_channels), + anchor, delta, borderType); + if (temp.data != dst_data) + temp.copyTo(dst); + } + return true; +} + +static void ocvFilter2D(int stype, int dtype, int kernel_type, + uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int full_width, int full_height, + int offset_x, int offset_y, + uchar * kernel_data, size_t kernel_step, + int kernel_width, int kernel_height, + int anchor_x, int anchor_y, + double delta, int borderType) +{ + int borderTypeValue = borderType & ~BORDER_ISOLATED; + Mat kernel = Mat(Size(kernel_width, kernel_height), kernel_type, kernel_data, kernel_step); + Ptr f = createLinearFilter(stype, dtype, kernel, Point(anchor_x, anchor_y), delta, + borderTypeValue); + Mat src(Size(width, height), stype, src_data, src_step); + Mat dst(Size(width, height), dtype, dst_data, dst_step); + f->apply(src, dst, Size(full_width, full_height), Point(offset_x, offset_y)); +} + +static bool replacementSepFilter(int stype, int dtype, int ktype, + uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, + int width, int height, int full_width, int full_height, + int offset_x, int offset_y, + uchar * kernelx_data, int kernelx_len, + uchar * kernely_data, int kernely_len, + int anchor_x, int anchor_y, double delta, int borderType) +{ + cvhalFilter2D *ctx; + int res = cv_hal_sepFilterInit(&ctx, stype, dtype, ktype, + kernelx_data, kernelx_len, + kernely_data, kernely_len, + anchor_x, anchor_y, delta, borderType); + if (res != CV_HAL_ERROR_OK) + return false; + res = cv_hal_sepFilter(ctx, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); + bool success = (res == CV_HAL_ERROR_OK); + res = cv_hal_sepFilterFree(ctx); + if (res != CV_HAL_ERROR_OK) + return false; + return success; +} + +static void ocvSepFilter(int stype, int dtype, int ktype, + uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, + int width, int height, int full_width, int full_height, + int offset_x, int offset_y, + uchar * kernelx_data, int kernelx_len, + uchar * kernely_data, int kernely_len, + int anchor_x, int anchor_y, double delta, int borderType) +{ + Mat kernelX(Size(kernelx_len, 1), ktype, kernelx_data); + Mat kernelY(Size(kernely_len, 1), ktype, kernely_data); + Ptr f = createSeparableLinearFilter(stype, dtype, kernelX, kernelY, + Point(anchor_x, anchor_y), + delta, borderType & ~BORDER_ISOLATED); + Mat src(Size(width, height), stype, src_data, src_step); + Mat dst(Size(width, height), dtype, dst_data, dst_step); + f->apply(src, dst, Size(full_width, full_height), Point(offset_x, offset_y)); +}; + +//=================================================================== +// HAL functions +//=================================================================== + +namespace hal { + + +CV_DEPRECATED Ptr Filter2D::create(uchar * , size_t , int , + int , int , + int , int , + int , int , + int , double , + int , int , + bool , bool ) { return Ptr(); } + +CV_DEPRECATED Ptr SepFilter2D::create(int , int , int , + uchar * , int , + uchar * , int , + int , int , + double , int ) { return Ptr(); } + + +void filter2D(int stype, int dtype, int kernel_type, + uchar * src_data, size_t src_step, + uchar * dst_data, size_t dst_step, + int width, int height, + int full_width, int full_height, + int offset_x, int offset_y, + uchar * kernel_data, size_t kernel_step, + int kernel_width, int kernel_height, + int anchor_x, int anchor_y, + double delta, int borderType, + bool isSubmatrix) +{ + bool res; + res = replacementFilter2D(stype, dtype, kernel_type, + src_data, src_step, + dst_data, dst_step, + width, height, + full_width, full_height, + offset_x, offset_y, + kernel_data, kernel_step, + kernel_width, kernel_height, + anchor_x, anchor_y, + delta, borderType, isSubmatrix); + if (res) + return; + + CV_IPP_RUN_FAST(ippFilter2D(stype, dtype, kernel_type, + src_data, src_step, + dst_data, dst_step, + width, height, + full_width, full_height, + offset_x, offset_y, + kernel_data, kernel_step, + kernel_width, kernel_height, + anchor_x, anchor_y, + delta, borderType, isSubmatrix)) + + res = dftFilter2D(stype, dtype, kernel_type, + src_data, src_step, + dst_data, dst_step, + full_width, full_height, + offset_x, offset_y, + kernel_data, kernel_step, + kernel_width, kernel_height, + anchor_x, anchor_y, + delta, borderType); + if (res) + return; + ocvFilter2D(stype, dtype, kernel_type, + src_data, src_step, + dst_data, dst_step, + width, height, + full_width, full_height, + offset_x, offset_y, + kernel_data, kernel_step, + kernel_width, kernel_height, + anchor_x, anchor_y, + delta, borderType); +} + +//--------------------------------------------------------------- + +void sepFilter2D(int stype, int dtype, int ktype, + uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, + int width, int height, int full_width, int full_height, + int offset_x, int offset_y, + uchar * kernelx_data, int kernelx_len, + uchar * kernely_data, int kernely_len, + int anchor_x, int anchor_y, double delta, int borderType) +{ + + bool res = replacementSepFilter(stype, dtype, ktype, + src_data, src_step, dst_data, dst_step, + width, height, full_width, full_height, + offset_x, offset_y, + kernelx_data, kernelx_len, + kernely_data, kernely_len, + anchor_x, anchor_y, delta, borderType); + if (res) + return; + ocvSepFilter(stype, dtype, ktype, + src_data, src_step, dst_data, dst_step, + width, height, full_width, full_height, + offset_x, offset_y, + kernelx_data, kernelx_len, + kernely_data, kernely_len, + anchor_x, anchor_y, delta, borderType); +} + +} // namespace cv::hal:: + +//================================================================ +// Main interface +//================================================================ + +void filter2D(InputArray _src, OutputArray _dst, int ddepth, + InputArray _kernel, Point anchor0, + double delta, int borderType) +{ + CV_INSTRUMENT_REGION(); + + CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2, + ocl_filter2D(_src, _dst, ddepth, _kernel, anchor0, delta, borderType)) + + Mat src = _src.getMat(), kernel = _kernel.getMat(); + + if( ddepth < 0 ) + ddepth = src.depth(); + + _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) ); + Mat dst = _dst.getMat(); + Point anchor = normalizeAnchor(anchor0, kernel.size()); + + Point ofs; + Size wsz(src.cols, src.rows); + if( (borderType & BORDER_ISOLATED) == 0 ) + src.locateROI( wsz, ofs ); + + hal::filter2D(src.type(), dst.type(), kernel.type(), + src.data, src.step, dst.data, dst.step, + dst.cols, dst.rows, wsz.width, wsz.height, ofs.x, ofs.y, + kernel.data, kernel.step, kernel.cols, kernel.rows, + anchor.x, anchor.y, + delta, borderType, src.isSubmatrix()); +} + +void sepFilter2D(InputArray _src, OutputArray _dst, int ddepth, + InputArray _kernelX, InputArray _kernelY, Point anchor, + double delta, int borderType) +{ + CV_INSTRUMENT_REGION(); + + CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && (size_t)_src.rows() > _kernelY.total() && (size_t)_src.cols() > _kernelX.total(), + ocl_sepFilter2D(_src, _dst, ddepth, _kernelX, _kernelY, anchor, delta, borderType)) + + Mat src = _src.getMat(), kernelX = _kernelX.getMat(), kernelY = _kernelY.getMat(); + + if( ddepth < 0 ) + ddepth = src.depth(); + + _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) ); + Mat dst = _dst.getMat(); + + Point ofs; + Size wsz(src.cols, src.rows); + if( (borderType & BORDER_ISOLATED) == 0 ) + src.locateROI( wsz, ofs ); + + CV_Assert( kernelX.type() == kernelY.type() && + (kernelX.cols == 1 || kernelX.rows == 1) && + (kernelY.cols == 1 || kernelY.rows == 1) ); + + Mat contKernelX = kernelX.isContinuous() ? kernelX : kernelX.clone(); + Mat contKernelY = kernelY.isContinuous() ? kernelY : kernelY.clone(); + + hal::sepFilter2D(src.type(), dst.type(), kernelX.type(), + src.data, src.step, dst.data, dst.step, + dst.cols, dst.rows, wsz.width, wsz.height, ofs.x, ofs.y, + contKernelX.data, kernelX.cols + kernelX.rows - 1, + contKernelY.data, kernelY.cols + kernelY.rows - 1, + anchor.x, anchor.y, delta, borderType & ~BORDER_ISOLATED); +} + +} // namespace + +CV_IMPL void +cvFilter2D( const CvArr* srcarr, CvArr* dstarr, const CvMat* _kernel, CvPoint anchor ) +{ + cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr); + cv::Mat kernel = cv::cvarrToMat(_kernel); + + CV_Assert( src.size() == dst.size() && src.channels() == dst.channels() ); + + cv::filter2D( src, dst, dst.depth(), kernel, anchor, 0, cv::BORDER_REPLICATE ); +} + +/* End of file. */ diff --git a/modules/imgproc/src/filter.hpp b/modules/imgproc/src/filter.hpp index 93f3f177e6..7b792d1935 100644 --- a/modules/imgproc/src/filter.hpp +++ b/modules/imgproc/src/filter.hpp @@ -45,17 +45,13 @@ namespace cv { -#if CV_TRY_AVX2 - int RowVec_32f_AVX(const float* src0, const float* _kx, float* dst, int width, int cn, int _ksize); - int SymmColumnVec_32f_Symm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2); - int SymmColumnVec_32f_Unsymm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2); -#endif - #ifdef HAVE_OPENCL bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, InputArray _kernelX, InputArray _kernelY, Point anchor, double delta, int borderType ); #endif + + void preprocess2DKernel(const Mat& kernel, std::vector& coords, std::vector& coeffs); } #endif diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.simd.hpp similarity index 68% rename from modules/imgproc/src/filter.cpp rename to modules/imgproc/src/filter.simd.hpp index 43200218dc..f09cd1ec1d 100644 --- a/modules/imgproc/src/filter.cpp +++ b/modules/imgproc/src/filter.simd.hpp @@ -41,160 +41,85 @@ //M*/ #include "precomp.hpp" -#include "opencv2/core/opencl/ocl_defs.hpp" -#include "opencl_kernels_imgproc.hpp" -#include "hal_replacement.hpp" #include "opencv2/core/hal/intrin.hpp" #include "filter.hpp" - -/****************************************************************************************\ - Base Image Filter -\****************************************************************************************/ - +#if defined(CV_CPU_BASELINE_MODE) #if IPP_VERSION_X100 >= 710 #define USE_IPP_SEP_FILTERS 1 #else #undef USE_IPP_SEP_FILTERS #endif +#endif -namespace cv -{ - -BaseRowFilter::BaseRowFilter() { ksize = anchor = -1; } -BaseRowFilter::~BaseRowFilter() {} - -BaseColumnFilter::BaseColumnFilter() { ksize = anchor = -1; } -BaseColumnFilter::~BaseColumnFilter() {} -void BaseColumnFilter::reset() {} - -BaseFilter::BaseFilter() { ksize = Size(-1,-1); anchor = Point(-1,-1); } -BaseFilter::~BaseFilter() {} -void BaseFilter::reset() {} - -FilterEngine::FilterEngine() - : srcType(-1), dstType(-1), bufType(-1), maxWidth(0), wholeSize(-1, -1), dx1(0), dx2(0), - rowBorderType(BORDER_REPLICATE), columnBorderType(BORDER_REPLICATE), - borderElemSize(0), bufStep(0), startY(0), startY0(0), endY(0), rowCount(0), dstY(0) -{ -} - - -FilterEngine::FilterEngine( const Ptr& _filter2D, - const Ptr& _rowFilter, - const Ptr& _columnFilter, - int _srcType, int _dstType, int _bufType, - int _rowBorderType, int _columnBorderType, - const Scalar& _borderValue ) - : srcType(-1), dstType(-1), bufType(-1), maxWidth(0), wholeSize(-1, -1), dx1(0), dx2(0), - rowBorderType(BORDER_REPLICATE), columnBorderType(BORDER_REPLICATE), - borderElemSize(0), bufStep(0), startY(0), startY0(0), endY(0), rowCount(0), dstY(0) -{ - init(_filter2D, _rowFilter, _columnFilter, _srcType, _dstType, _bufType, - _rowBorderType, _columnBorderType, _borderValue); -} - -FilterEngine::~FilterEngine() -{ -} - - -void FilterEngine::init( const Ptr& _filter2D, - const Ptr& _rowFilter, - const Ptr& _columnFilter, - int _srcType, int _dstType, int _bufType, - int _rowBorderType, int _columnBorderType, - const Scalar& _borderValue ) -{ - _srcType = CV_MAT_TYPE(_srcType); - _bufType = CV_MAT_TYPE(_bufType); - _dstType = CV_MAT_TYPE(_dstType); - - srcType = _srcType; - int srcElemSize = (int)getElemSize(srcType); - dstType = _dstType; - bufType = _bufType; - - filter2D = _filter2D; - rowFilter = _rowFilter; - columnFilter = _columnFilter; - - if( _columnBorderType < 0 ) - _columnBorderType = _rowBorderType; - - rowBorderType = _rowBorderType; - columnBorderType = _columnBorderType; - CV_Assert( columnBorderType != BORDER_WRAP ); +/****************************************************************************************\ + Base Image Filter +\****************************************************************************************/ - if( isSeparable() ) - { - CV_Assert( rowFilter && columnFilter ); - ksize = Size(rowFilter->ksize, columnFilter->ksize); - anchor = Point(rowFilter->anchor, columnFilter->anchor); - } - else - { - CV_Assert( bufType == srcType ); - ksize = filter2D->ksize; - anchor = filter2D->anchor; - } +namespace cv { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +// forward declarations +int FilterEngine__start(FilterEngine& this_, const Size &_wholeSize, const Size &sz, const Point &ofs); +int FilterEngine__proceed(FilterEngine& this_, const uchar* src, int srcstep, int count, + uchar* dst, int dststep); +void FilterEngine__apply(FilterEngine& this_, const Mat& src, Mat& dst, const Size& wsz, const Point& ofs); - CV_Assert( 0 <= anchor.x && anchor.x < ksize.width && - 0 <= anchor.y && anchor.y < ksize.height ); +Ptr getLinearRowFilter( + int srcType, int bufType, + const Mat& kernel, int anchor, + int symmetryType); - borderElemSize = srcElemSize/(CV_MAT_DEPTH(srcType) >= CV_32S ? sizeof(int) : 1); - int borderLength = std::max(ksize.width - 1, 1); - borderTab.resize(borderLength*borderElemSize); +Ptr getLinearColumnFilter( + int bufType, int dstType, + const Mat& kernel, int anchor, + int symmetryType, double delta, + int bits); - maxWidth = bufStep = 0; - constBorderRow.clear(); +Ptr getLinearFilter( + int srcType, int dstType, + const Mat& filter_kernel, Point anchor, + double delta, int bits); - if( rowBorderType == BORDER_CONSTANT || columnBorderType == BORDER_CONSTANT ) - { - constBorderValue.resize(srcElemSize*borderLength); - int srcType1 = CV_MAKETYPE(CV_MAT_DEPTH(srcType), MIN(CV_MAT_CN(srcType), 4)); - scalarToRawData(_borderValue, &constBorderValue[0], srcType1, - borderLength*CV_MAT_CN(srcType)); - } - wholeSize = Size(-1,-1); -} +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY #define VEC_ALIGN CV_MALLOC_ALIGN -int FilterEngine::start(const Size &_wholeSize, const Size &sz, const Point &ofs) +int FilterEngine__start(FilterEngine& this_, const Size &_wholeSize, const Size &sz, const Point &ofs) { + CV_INSTRUMENT_REGION(); + int i, j; - wholeSize = _wholeSize; - roi = Rect(ofs, sz); - CV_Assert( roi.x >= 0 && roi.y >= 0 && roi.width >= 0 && roi.height >= 0 && - roi.x + roi.width <= wholeSize.width && - roi.y + roi.height <= wholeSize.height ); + this_.wholeSize = _wholeSize; + this_.roi = Rect(ofs, sz); + CV_Assert( this_.roi.x >= 0 && this_.roi.y >= 0 && this_.roi.width >= 0 && this_.roi.height >= 0 && + this_.roi.x + this_.roi.width <= this_.wholeSize.width && + this_.roi.y + this_.roi.height <= this_.wholeSize.height ); - int esz = (int)getElemSize(srcType); - int bufElemSize = (int)getElemSize(bufType); - const uchar* constVal = !constBorderValue.empty() ? &constBorderValue[0] : 0; + int esz = (int)getElemSize(this_.srcType); + int bufElemSize = (int)getElemSize(this_.bufType); + const uchar* constVal = !this_.constBorderValue.empty() ? &this_.constBorderValue[0] : 0; - int _maxBufRows = std::max(ksize.height + 3, - std::max(anchor.y, - ksize.height-anchor.y-1)*2+1); + int _maxBufRows = std::max(this_.ksize.height + 3, + std::max(this_.anchor.y, + this_.ksize.height-this_.anchor.y-1)*2+1); - if( maxWidth < roi.width || _maxBufRows != (int)rows.size() ) + if (this_.maxWidth < this_.roi.width || _maxBufRows != (int)this_.rows.size() ) { - rows.resize(_maxBufRows); - maxWidth = std::max(maxWidth, roi.width); - int cn = CV_MAT_CN(srcType); - srcRow.resize(esz*(maxWidth + ksize.width - 1)); - if( columnBorderType == BORDER_CONSTANT ) + this_.rows.resize(_maxBufRows); + this_.maxWidth = std::max(this_.maxWidth, this_.roi.width); + int cn = CV_MAT_CN(this_.srcType); + this_.srcRow.resize(esz*(this_.maxWidth + this_.ksize.width - 1)); + if (this_.columnBorderType == BORDER_CONSTANT) { CV_Assert(constVal != NULL); - constBorderRow.resize(getElemSize(bufType)*(maxWidth + ksize.width - 1 + VEC_ALIGN)); - uchar *dst = alignPtr(&constBorderRow[0], VEC_ALIGN), *tdst; - int n = (int)constBorderValue.size(), N; - N = (maxWidth + ksize.width - 1)*esz; - tdst = isSeparable() ? &srcRow[0] : dst; + this_.constBorderRow.resize(getElemSize(this_.bufType)*(this_.maxWidth + this_.ksize.width - 1 + VEC_ALIGN)); + uchar *dst = alignPtr(&this_.constBorderRow[0], VEC_ALIGN); + int n = (int)this_.constBorderValue.size(); + int N = (this_.maxWidth + this_.ksize.width - 1)*esz; + uchar *tdst = this_.isSeparable() ? &this_.srcRow[0] : dst; for( i = 0; i < N; i += n ) { @@ -203,126 +128,113 @@ int FilterEngine::start(const Size &_wholeSize, const Size &sz, const Point &ofs tdst[i+j] = constVal[j]; } - if( isSeparable() ) - (*rowFilter)(&srcRow[0], dst, maxWidth, cn); + if (this_.isSeparable()) + (*this_.rowFilter)(&this_.srcRow[0], dst, this_.maxWidth, cn); } - int maxBufStep = bufElemSize*(int)alignSize(maxWidth + - (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN); - ringBuf.resize(maxBufStep*rows.size()+VEC_ALIGN); + int maxBufStep = bufElemSize*(int)alignSize(this_.maxWidth + + (!this_.isSeparable() ? this_.ksize.width - 1 : 0), VEC_ALIGN); + this_.ringBuf.resize(maxBufStep*this_.rows.size()+VEC_ALIGN); } // adjust bufstep so that the used part of the ring buffer stays compact in memory - bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN); + this_.bufStep = bufElemSize*(int)alignSize(this_.roi.width + (!this_.isSeparable() ? this_.ksize.width - 1 : 0), VEC_ALIGN); - dx1 = std::max(anchor.x - roi.x, 0); - dx2 = std::max(ksize.width - anchor.x - 1 + roi.x + roi.width - wholeSize.width, 0); + this_.dx1 = std::max(this_.anchor.x - this_.roi.x, 0); + this_.dx2 = std::max(this_.ksize.width - this_.anchor.x - 1 + this_.roi.x + this_.roi.width - this_.wholeSize.width, 0); // recompute border tables - if( dx1 > 0 || dx2 > 0 ) + if (this_.dx1 > 0 || this_.dx2 > 0) { - if( rowBorderType == BORDER_CONSTANT ) + if (this_.rowBorderType == BORDER_CONSTANT ) { CV_Assert(constVal != NULL); - int nr = isSeparable() ? 1 : (int)rows.size(); + int nr = this_.isSeparable() ? 1 : (int)this_.rows.size(); for( i = 0; i < nr; i++ ) { - uchar* dst = isSeparable() ? &srcRow[0] : alignPtr(&ringBuf[0],VEC_ALIGN) + bufStep*i; - memcpy( dst, constVal, dx1*esz ); - memcpy( dst + (roi.width + ksize.width - 1 - dx2)*esz, constVal, dx2*esz ); + uchar* dst = this_.isSeparable() ? &this_.srcRow[0] : alignPtr(&this_.ringBuf[0], VEC_ALIGN) + this_.bufStep*i; + memcpy(dst, constVal, this_.dx1*esz); + memcpy(dst + (this_.roi.width + this_.ksize.width - 1 - this_.dx2)*esz, constVal, this_.dx2*esz); } } else { - int xofs1 = std::min(roi.x, anchor.x) - roi.x; + int xofs1 = std::min(this_.roi.x, this_.anchor.x) - this_.roi.x; - int btab_esz = borderElemSize, wholeWidth = wholeSize.width; - int* btab = (int*)&borderTab[0]; + int btab_esz = this_.borderElemSize, wholeWidth = this_.wholeSize.width; + int* btab = (int*)&this_.borderTab[0]; - for( i = 0; i < dx1; i++ ) + for( i = 0; i < this_.dx1; i++ ) { - int p0 = (borderInterpolate(i-dx1, wholeWidth, rowBorderType) + xofs1)*btab_esz; + int p0 = (borderInterpolate(i-this_.dx1, wholeWidth, this_.rowBorderType) + xofs1)*btab_esz; for( j = 0; j < btab_esz; j++ ) btab[i*btab_esz + j] = p0 + j; } - for( i = 0; i < dx2; i++ ) + for( i = 0; i < this_.dx2; i++ ) { - int p0 = (borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1)*btab_esz; + int p0 = (borderInterpolate(wholeWidth + i, wholeWidth, this_.rowBorderType) + xofs1)*btab_esz; for( j = 0; j < btab_esz; j++ ) - btab[(i + dx1)*btab_esz + j] = p0 + j; + btab[(i + this_.dx1)*btab_esz + j] = p0 + j; } } } - rowCount = dstY = 0; - startY = startY0 = std::max(roi.y - anchor.y, 0); - endY = std::min(roi.y + roi.height + ksize.height - anchor.y - 1, wholeSize.height); - if( columnFilter ) - columnFilter->reset(); - if( filter2D ) - filter2D->reset(); - - return startY; -} + this_.rowCount = this_.dstY = 0; + this_.startY = this_.startY0 = std::max(this_.roi.y - this_.anchor.y, 0); + this_.endY = std::min(this_.roi.y + this_.roi.height + this_.ksize.height - this_.anchor.y - 1, this_.wholeSize.height); + if (this_.columnFilter) + this_.columnFilter->reset(); + if (this_.filter2D) + this_.filter2D->reset(); -int FilterEngine::start(const Mat& src, const Size &wsz, const Point &ofs) -{ - start( wsz, src.size(), ofs); - return startY - ofs.y; + return this_.startY; } -int FilterEngine::remainingInputRows() const -{ - return endY - startY - rowCount; -} -int FilterEngine::remainingOutputRows() const +int FilterEngine__proceed(FilterEngine& this_, const uchar* src, int srcstep, int count, + uchar* dst, int dststep) { - return roi.height - dstY; -} + CV_INSTRUMENT_REGION(); -int FilterEngine::proceed( const uchar* src, int srcstep, int count, - uchar* dst, int dststep ) -{ - CV_Assert( wholeSize.width > 0 && wholeSize.height > 0 ); - - const int *btab = &borderTab[0]; - int esz = (int)getElemSize(srcType), btab_esz = borderElemSize; - uchar** brows = &rows[0]; - int bufRows = (int)rows.size(); - int cn = CV_MAT_CN(bufType); - int width = roi.width, kwidth = ksize.width; - int kheight = ksize.height, ay = anchor.y; - int _dx1 = dx1, _dx2 = dx2; - int width1 = roi.width + kwidth - 1; - int xofs1 = std::min(roi.x, anchor.x); - bool isSep = isSeparable(); - bool makeBorder = (_dx1 > 0 || _dx2 > 0) && rowBorderType != BORDER_CONSTANT; + CV_DbgAssert(this_.wholeSize.width > 0 && this_.wholeSize.height > 0 ); + + const int *btab = &this_.borderTab[0]; + int esz = (int)getElemSize(this_.srcType), btab_esz = this_.borderElemSize; + uchar** brows = &this_.rows[0]; + int bufRows = (int)this_.rows.size(); + int cn = CV_MAT_CN(this_.bufType); + int width = this_.roi.width, kwidth = this_.ksize.width; + int kheight = this_.ksize.height, ay = this_.anchor.y; + int _dx1 = this_.dx1, _dx2 = this_.dx2; + int width1 = this_.roi.width + kwidth - 1; + int xofs1 = std::min(this_.roi.x, this_.anchor.x); + bool isSep = this_.isSeparable(); + bool makeBorder = (_dx1 > 0 || _dx2 > 0) && this_.rowBorderType != BORDER_CONSTANT; int dy = 0, i = 0; src -= xofs1*esz; - count = std::min(count, remainingInputRows()); + count = std::min(count, this_.remainingInputRows()); - CV_Assert( src && dst && count > 0 ); + CV_Assert(src && dst && count > 0); for(;; dst += dststep*i, dy += i) { - int dcount = bufRows - ay - startY - rowCount + roi.y; + int dcount = bufRows - ay - this_.startY - this_.rowCount + this_.roi.y; dcount = dcount > 0 ? dcount : bufRows - kheight + 1; dcount = std::min(dcount, count); count -= dcount; for( ; dcount-- > 0; src += srcstep ) { - int bi = (startY - startY0 + rowCount) % bufRows; - uchar* brow = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep; - uchar* row = isSep ? &srcRow[0] : brow; + int bi = (this_.startY - this_.startY0 + this_.rowCount) % bufRows; + uchar* brow = alignPtr(&this_.ringBuf[0], VEC_ALIGN) + bi*this_.bufStep; + uchar* row = isSep ? &this_.srcRow[0] : brow; - if( ++rowCount > bufRows ) + if (++this_.rowCount > bufRows) { - --rowCount; - ++startY; + --this_.rowCount; + ++this_.startY; } memcpy( row + _dx1*esz, src, (width1 - _dx2 - _dx1)*esz ); @@ -349,99 +261,55 @@ int FilterEngine::proceed( const uchar* src, int srcstep, int count, } if( isSep ) - (*rowFilter)(row, brow, width, CV_MAT_CN(srcType)); + (*this_.rowFilter)(row, brow, width, CV_MAT_CN(this_.srcType)); } - int max_i = std::min(bufRows, roi.height - (dstY + dy) + (kheight - 1)); + int max_i = std::min(bufRows, this_.roi.height - (this_.dstY + dy) + (kheight - 1)); for( i = 0; i < max_i; i++ ) { - int srcY = borderInterpolate(dstY + dy + i + roi.y - ay, - wholeSize.height, columnBorderType); + int srcY = borderInterpolate(this_.dstY + dy + i + this_.roi.y - ay, + this_.wholeSize.height, this_.columnBorderType); if( srcY < 0 ) // can happen only with constant border type - brows[i] = alignPtr(&constBorderRow[0], VEC_ALIGN); + brows[i] = alignPtr(&this_.constBorderRow[0], VEC_ALIGN); else { - CV_Assert( srcY >= startY ); - if( srcY >= startY + rowCount ) + CV_Assert(srcY >= this_.startY); + if( srcY >= this_.startY + this_.rowCount) break; - int bi = (srcY - startY0) % bufRows; - brows[i] = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep; + int bi = (srcY - this_.startY0) % bufRows; + brows[i] = alignPtr(&this_.ringBuf[0], VEC_ALIGN) + bi*this_.bufStep; } } if( i < kheight ) break; i -= kheight - 1; - if( isSeparable() ) - (*columnFilter)((const uchar**)brows, dst, dststep, i, roi.width*cn); + if (isSep) + (*this_.columnFilter)((const uchar**)brows, dst, dststep, i, this_.roi.width*cn); else - (*filter2D)((const uchar**)brows, dst, dststep, i, roi.width, cn); + (*this_.filter2D)((const uchar**)brows, dst, dststep, i, this_.roi.width, cn); } - dstY += dy; - CV_Assert( dstY <= roi.height ); + this_.dstY += dy; + CV_Assert(this_.dstY <= this_.roi.height); return dy; } -void FilterEngine::apply(const Mat& src, Mat& dst, const Size & wsz, const Point & ofs) +void FilterEngine__apply(FilterEngine& this_, const Mat& src, Mat& dst, const Size& wsz, const Point& ofs) { CV_INSTRUMENT_REGION(); - CV_Assert( src.type() == srcType && dst.type() == dstType ); + CV_DbgAssert(src.type() == this_.srcType && dst.type() == this_.dstType); - int y = start(src, wsz, ofs); - proceed(src.ptr() + y*src.step, + FilterEngine__start(this_, wsz, src.size(), ofs); + int y = this_.startY - ofs.y; + FilterEngine__proceed(this_, + src.ptr() + y*src.step, (int)src.step, - endY - startY, + this_.endY - this_.startY, dst.ptr(), (int)dst.step ); } -} - -/****************************************************************************************\ -* Separable linear filter * -\****************************************************************************************/ - -int cv::getKernelType(InputArray filter_kernel, Point anchor) -{ - Mat _kernel = filter_kernel.getMat(); - CV_Assert( _kernel.channels() == 1 ); - int i, sz = _kernel.rows*_kernel.cols; - - Mat kernel; - _kernel.convertTo(kernel, CV_64F); - - const double* coeffs = kernel.ptr(); - double sum = 0; - int type = KERNEL_SMOOTH + KERNEL_INTEGER; - if( (_kernel.rows == 1 || _kernel.cols == 1) && - anchor.x*2 + 1 == _kernel.cols && - anchor.y*2 + 1 == _kernel.rows ) - type |= (KERNEL_SYMMETRICAL + KERNEL_ASYMMETRICAL); - - for( i = 0; i < sz; i++ ) - { - double a = coeffs[i], b = coeffs[sz - i - 1]; - if( a != b ) - type &= ~KERNEL_SYMMETRICAL; - if( a != -b ) - type &= ~KERNEL_ASYMMETRICAL; - if( a < 0 ) - type &= ~KERNEL_SMOOTH; - if( a != saturate_cast(a) ) - type &= ~KERNEL_INTEGER; - sum += a; - } - - if( fabs(sum - 1) > FLT_EPSILON*(fabs(sum) + 1) ) - type &= ~KERNEL_SMOOTH; - return type; -} - - -namespace cv -{ - struct RowNoVec { RowNoVec() {} @@ -503,6 +371,8 @@ struct RowVec_8u32s int operator()(const uchar* _src, uchar* _dst, int width, int cn) const { + CV_INSTRUMENT_REGION(); + int i = 0, k, _ksize = kernel.rows + kernel.cols - 1; int* dst = (int*)_dst; const int* _kx = kernel.ptr(); @@ -587,7 +457,6 @@ struct RowVec_8u32s i += v_uint32::nlanes; } } - vx_cleanup(); return i; } @@ -618,6 +487,8 @@ struct SymmRowSmallVec_8u32s int operator()(const uchar* src, uchar* _dst, int width, int cn) const { + CV_INSTRUMENT_REGION(); + int i = 0, j, k, _ksize = kernel.rows + kernel.cols - 1; int* dst = (int*)_dst; bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; @@ -1083,8 +954,6 @@ struct SymmRowSmallVec_8u32s } } } - - vx_cleanup(); return i; } @@ -1107,6 +976,8 @@ struct SymmColumnVec_32s8u int operator()(const uchar** _src, uchar* dst, int width) const { + CV_INSTRUMENT_REGION(); + int _ksize = kernel.rows + kernel.cols - 1; if( _ksize == 1 ) return 0; @@ -1237,8 +1108,6 @@ struct SymmColumnVec_32s8u i += v_int32x4::nlanes; } } - - vx_cleanup(); return i; } @@ -1261,6 +1130,8 @@ struct SymmColumnSmallVec_32s16s int operator()(const uchar** _src, uchar* _dst, int width) const { + CV_INSTRUMENT_REGION(); + int ksize2 = (kernel.rows + kernel.cols - 1)/2; const float* ky = kernel.ptr() + ksize2; int i = 0; @@ -1420,8 +1291,6 @@ struct SymmColumnSmallVec_32s16s } } } - - vx_cleanup(); return i; } @@ -1443,6 +1312,8 @@ struct RowVec_16s32f int operator()(const uchar* _src, uchar* _dst, int width, int cn) const { + CV_INSTRUMENT_REGION(); + int i = 0, k, _ksize = kernel.rows + kernel.cols - 1; float* dst = (float*)_dst; const float* _kx = kernel.ptr(); @@ -1495,7 +1366,6 @@ struct RowVec_16s32f v_store(dst + i, s0); i += v_float32::nlanes; } - vx_cleanup(); return i; } @@ -1516,6 +1386,8 @@ struct SymmColumnVec_32f16s int operator()(const uchar** _src, uchar* _dst, int width) const { + CV_INSTRUMENT_REGION(); + int _ksize = kernel.rows + kernel.cols - 1; if( _ksize == 1 ) return 0; @@ -1620,7 +1492,6 @@ struct SymmColumnVec_32f16s } } - vx_cleanup(); return i; } @@ -1636,7 +1507,6 @@ struct RowVec_32f { RowVec_32f() { - haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; #if defined USE_IPP_SEP_FILTERS bufsz = -1; #endif @@ -1645,7 +1515,6 @@ struct RowVec_32f RowVec_32f( const Mat& _kernel ) { kernel = _kernel; - haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; #if defined USE_IPP_SEP_FILTERS bufsz = -1; #endif @@ -1653,6 +1522,8 @@ struct RowVec_32f int operator()(const uchar* _src, uchar* _dst, int width, int cn) const { + CV_INSTRUMENT_REGION(); + #if defined USE_IPP_SEP_FILTERS CV_IPP_CHECK() { @@ -1670,9 +1541,24 @@ struct RowVec_32f int i = 0, k; width *= cn; -#if CV_TRY_AVX2 - if (haveAVX2) - return RowVec_32f_AVX(src0, _kx, dst, width, cn, _ksize); +#if CV_AVX + for (; i <= width - 8; i += 8) + { + const float* src = src0 + i; + __m256 f, x0; + __m256 s0 = _mm256_set1_ps(0.0f); + for (k = 0; k < _ksize; k++, src += cn) + { + f = _mm256_set1_ps(_kx[k]); + x0 = _mm256_loadu_ps(src); +#if CV_FMA3 + s0 = _mm256_fmadd_ps(x0, f, s0); +#else + s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); +#endif + } + _mm256_storeu_ps(dst + i, s0); + } #endif v_float32 k0 = vx_setall_f32(_kx[0]); for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) @@ -1722,12 +1608,10 @@ struct RowVec_32f v_store(dst + i, s0); i += v_float32::nlanes; } - vx_cleanup(); return i; } Mat kernel; - bool haveAVX2; #if defined USE_IPP_SEP_FILTERS private: mutable int bufsz; @@ -1782,6 +1666,8 @@ struct SymmRowSmallVec_32f int operator()(const uchar* _src, uchar* _dst, int width, int cn) const { + CV_INSTRUMENT_REGION(); + int i = 0, _ksize = kernel.rows + kernel.cols - 1; if( _ksize == 1 ) return 0; @@ -1868,8 +1754,6 @@ struct SymmRowSmallVec_32f v_store(dst + i, v_muladd(vx_load(src + 2*cn) - vx_load(src - 2*cn), k2, (vx_load(src + cn) - vx_load(src - cn)) * k1)); } } - - vx_cleanup(); return i; } @@ -1882,7 +1766,6 @@ struct SymmColumnVec_32f { SymmColumnVec_32f() { symmetryType=0; - haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; delta = 0; } SymmColumnVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta) @@ -1890,12 +1773,13 @@ struct SymmColumnVec_32f symmetryType = _symmetryType; kernel = _kernel; delta = (float)_delta; - haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); } int operator()(const uchar** _src, uchar* _dst, int width) const { + CV_INSTRUMENT_REGION(); + int ksize2 = (kernel.rows + kernel.cols - 1)/2; const float* ky = kernel.ptr() + ksize2; int i = 0, k; @@ -1906,9 +1790,53 @@ struct SymmColumnVec_32f if( symmetrical ) { -#if CV_TRY_AVX2 - if (haveAVX2) - return SymmColumnVec_32f_Symm_AVX(src, ky, dst, delta, width, ksize2); +#if CV_AVX + { + const float *S, *S2; + const __m256 d8 = _mm256_set1_ps(delta); + + for( ; i <= width - 16; i += 16 ) + { + __m256 f = _mm256_set1_ps(ky[0]); + __m256 s0, s1; + __m256 x0; + S = src[0] + i; + s0 = _mm256_loadu_ps(S); +#if CV_FMA3 + s0 = _mm256_fmadd_ps(s0, f, d8); +#else + s0 = _mm256_add_ps(_mm256_mul_ps(s0, f), d8); +#endif + s1 = _mm256_loadu_ps(S+8); +#if CV_FMA3 + s1 = _mm256_fmadd_ps(s1, f, d8); +#else + s1 = _mm256_add_ps(_mm256_mul_ps(s1, f), d8); +#endif + + for( k = 1; k <= ksize2; k++ ) + { + S = src[k] + i; + S2 = src[-k] + i; + f = _mm256_set1_ps(ky[k]); + x0 = _mm256_add_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2)); +#if CV_FMA3 + s0 = _mm256_fmadd_ps(x0, f, s0); +#else + s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); +#endif + x0 = _mm256_add_ps(_mm256_loadu_ps(S+8), _mm256_loadu_ps(S2+8)); +#if CV_FMA3 + s1 = _mm256_fmadd_ps(x0, f, s1); +#else + s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f)); +#endif + } + + _mm256_storeu_ps(dst + i, s0); + _mm256_storeu_ps(dst + i + 8, s1); + } + } #endif const v_float32 d4 = vx_setall_f32(delta); const v_float32 k0 = vx_setall_f32(ky[0]); @@ -1956,11 +1884,41 @@ struct SymmColumnVec_32f } else { -#if CV_TRY_AVX2 - if (haveAVX2) - return SymmColumnVec_32f_Unsymm_AVX(src, ky, dst, delta, width, ksize2); -#endif CV_DbgAssert(ksize2 > 0); +#if CV_AVX + { + const float *S2; + const __m256 d8 = _mm256_set1_ps(delta); + + for (; i <= width - 16; i += 16) + { + __m256 f, s0 = d8, s1 = d8; + __m256 x0; + + for (k = 1; k <= ksize2; k++) + { + const float *S = src[k] + i; + S2 = src[-k] + i; + f = _mm256_set1_ps(ky[k]); + x0 = _mm256_sub_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2)); +#if CV_FMA3 + s0 = _mm256_fmadd_ps(x0, f, s0); +#else + s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); +#endif + x0 = _mm256_sub_ps(_mm256_loadu_ps(S + 8), _mm256_loadu_ps(S2 + 8)); +#if CV_FMA3 + s1 = _mm256_fmadd_ps(x0, f, s1); +#else + s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f)); +#endif + } + + _mm256_storeu_ps(dst + i, s0); + _mm256_storeu_ps(dst + i + 8, s1); + } + } +#endif const v_float32 d4 = vx_setall_f32(delta); const v_float32 k1 = vx_setall_f32(ky[1]); for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes ) @@ -2005,15 +1963,12 @@ struct SymmColumnVec_32f i += v_float32::nlanes; } } - - vx_cleanup(); return i; } int symmetryType; float delta; Mat kernel; - bool haveAVX2; }; @@ -2030,6 +1985,8 @@ struct SymmColumnSmallVec_32f int operator()(const uchar** _src, uchar* _dst, int width) const { + CV_INSTRUMENT_REGION(); + int ksize2 = (kernel.rows + kernel.cols - 1)/2; const float* ky = kernel.ptr() + ksize2; int i = 0; @@ -2085,8 +2042,6 @@ struct SymmColumnSmallVec_32f v_store(dst + i, v_muladd(vx_load(S2 + i) - vx_load(S0 + i), k1, d4)); } } - - vx_cleanup(); return i; } @@ -2115,6 +2070,8 @@ struct FilterVec_8u int operator()(const uchar** src, uchar* dst, int width) const { + CV_INSTRUMENT_REGION(); + CV_DbgAssert(_nz > 0); const float* kf = (const float*)&coeffs[0]; int i = 0, k, nz = _nz; @@ -2175,8 +2132,6 @@ struct FilterVec_8u *(int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0(); i += v_int32x4::nlanes; } - - vx_cleanup(); return i; } @@ -2201,6 +2156,8 @@ struct FilterVec_8u16s int operator()(const uchar** src, uchar* _dst, int width) const { + CV_INSTRUMENT_REGION(); + CV_DbgAssert(_nz > 0); const float* kf = (const float*)&coeffs[0]; short* dst = (short*)_dst; @@ -2251,8 +2208,6 @@ struct FilterVec_8u16s v_pack_store(dst + i, v_round(s0)); i += v_int32::nlanes; } - - vx_cleanup(); return i; } @@ -2275,6 +2230,8 @@ struct FilterVec_32f int operator()(const uchar** _src, uchar* _dst, int width) const { + CV_INSTRUMENT_REGION(); + const float* kf = (const float*)&coeffs[0]; const float** src = (const float**)_src; float* dst = (float*)_dst; @@ -2323,8 +2280,6 @@ struct FilterVec_32f v_store(dst + i, s0); i += v_float32::nlanes; } - - vx_cleanup(); return i; } @@ -2369,6 +2324,8 @@ template struct RowFilter : public BaseRo void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int _ksize = ksize; const DT* kx = kernel.ptr
(); const ST* S; @@ -2427,6 +2384,8 @@ template struct SymmRowSmallFilter : void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int ksize2 = this->ksize/2, ksize2n = ksize2*cn; const DT* kx = this->kernel.template ptr
() + ksize2; bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0; @@ -2566,6 +2525,8 @@ template struct ColumnFilter : public BaseColumnFilte void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + const ST* ky = kernel.template ptr(); ST _delta = delta; int _ksize = ksize; @@ -2629,6 +2590,8 @@ template struct SymmColumnFilter : public ColumnFilte void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int ksize2 = this->ksize/2; const ST* ky = this->kernel.template ptr() + ksize2; int i, k; @@ -2735,6 +2698,8 @@ struct SymmColumnSmallFilter : public SymmColumnFilter void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE { + CV_INSTRUMENT_REGION(); + int ksize2 = this->ksize/2; const ST* ky = this->kernel.template ptr() + ksize2; int i; @@ -2904,13 +2869,14 @@ template struct FixedPtCastEx int SHIFT, DELTA; }; -} -cv::Ptr cv::getLinearRowFilter( int srcType, int bufType, - InputArray _kernel, int anchor, - int symmetryType ) +Ptr getLinearRowFilter( + int srcType, int bufType, + const Mat& kernel, int anchor, + int symmetryType) { - Mat kernel = _kernel.getMat(); + CV_INSTRUMENT_REGION(); + int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(bufType); int cn = CV_MAT_CN(srcType); CV_Assert( cn == CV_MAT_CN(bufType) && @@ -2958,12 +2924,14 @@ cv::Ptr cv::getLinearRowFilter( int srcType, int bufType, } -cv::Ptr cv::getLinearColumnFilter( int bufType, int dstType, - InputArray _kernel, int anchor, - int symmetryType, double delta, - int bits ) +Ptr getLinearColumnFilter( + int bufType, int dstType, + const Mat& kernel, int anchor, + int symmetryType, double delta, + int bits) { - Mat kernel = _kernel.getMat(); + CV_INSTRUMENT_REGION(); + int sdepth = CV_MAT_DEPTH(bufType), ddepth = CV_MAT_DEPTH(dstType); int cn = CV_MAT_CN(dstType); CV_Assert( cn == CV_MAT_CN(bufType) && @@ -3053,131 +3021,6 @@ cv::Ptr cv::getLinearColumnFilter( int bufType, int dstTyp } -cv::Ptr cv::createSeparableLinearFilter( - int _srcType, int _dstType, - InputArray __rowKernel, InputArray __columnKernel, - Point _anchor, double _delta, - int _rowBorderType, int _columnBorderType, - const Scalar& _borderValue ) -{ - Mat _rowKernel = __rowKernel.getMat(), _columnKernel = __columnKernel.getMat(); - _srcType = CV_MAT_TYPE(_srcType); - _dstType = CV_MAT_TYPE(_dstType); - int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType); - int cn = CV_MAT_CN(_srcType); - CV_Assert( cn == CV_MAT_CN(_dstType) ); - int rsize = _rowKernel.rows + _rowKernel.cols - 1; - int csize = _columnKernel.rows + _columnKernel.cols - 1; - if( _anchor.x < 0 ) - _anchor.x = rsize/2; - if( _anchor.y < 0 ) - _anchor.y = csize/2; - int rtype = getKernelType(_rowKernel, - _rowKernel.rows == 1 ? Point(_anchor.x, 0) : Point(0, _anchor.x)); - int ctype = getKernelType(_columnKernel, - _columnKernel.rows == 1 ? Point(_anchor.y, 0) : Point(0, _anchor.y)); - Mat rowKernel, columnKernel; - - int bdepth = std::max(CV_32F,std::max(sdepth, ddepth)); - int bits = 0; - - if( sdepth == CV_8U && - ((rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && - ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && - ddepth == CV_8U) || - ((rtype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) && - (ctype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) && - (rtype & ctype & KERNEL_INTEGER) && - ddepth == CV_16S)) ) - { - bdepth = CV_32S; - bits = ddepth == CV_8U ? 8 : 0; - _rowKernel.convertTo( rowKernel, CV_32S, 1 << bits ); - _columnKernel.convertTo( columnKernel, CV_32S, 1 << bits ); - bits *= 2; - _delta *= (1 << bits); - } - else - { - if( _rowKernel.type() != bdepth ) - _rowKernel.convertTo( rowKernel, bdepth ); - else - rowKernel = _rowKernel; - if( _columnKernel.type() != bdepth ) - _columnKernel.convertTo( columnKernel, bdepth ); - else - columnKernel = _columnKernel; - } - - int _bufType = CV_MAKETYPE(bdepth, cn); - Ptr _rowFilter = getLinearRowFilter( - _srcType, _bufType, rowKernel, _anchor.x, rtype); - Ptr _columnFilter = getLinearColumnFilter( - _bufType, _dstType, columnKernel, _anchor.y, ctype, _delta, bits ); - - return Ptr( new FilterEngine(Ptr(), _rowFilter, _columnFilter, - _srcType, _dstType, _bufType, _rowBorderType, _columnBorderType, _borderValue )); -} - - -/****************************************************************************************\ -* Non-separable linear filter * -\****************************************************************************************/ - -namespace cv -{ - -void preprocess2DKernel( const Mat& kernel, std::vector& coords, std::vector& coeffs ) -{ - int i, j, k, nz = countNonZero(kernel), ktype = kernel.type(); - if(nz == 0) - nz = 1; - CV_Assert( ktype == CV_8U || ktype == CV_32S || ktype == CV_32F || ktype == CV_64F ); - coords.resize(nz); - coeffs.resize(nz*getElemSize(ktype)); - uchar* _coeffs = &coeffs[0]; - - for( i = k = 0; i < kernel.rows; i++ ) - { - const uchar* krow = kernel.ptr(i); - for( j = 0; j < kernel.cols; j++ ) - { - if( ktype == CV_8U ) - { - uchar val = krow[j]; - if( val == 0 ) - continue; - coords[k] = Point(j,i); - _coeffs[k++] = val; - } - else if( ktype == CV_32S ) - { - int val = ((const int*)krow)[j]; - if( val == 0 ) - continue; - coords[k] = Point(j,i); - ((int*)_coeffs)[k++] = val; - } - else if( ktype == CV_32F ) - { - float val = ((const float*)krow)[j]; - if( val == 0 ) - continue; - coords[k] = Point(j,i); - ((float*)_coeffs)[k++] = val; - } - else - { - double val = ((const double*)krow)[j]; - if( val == 0 ) - continue; - coords[k] = Point(j,i); - ((double*)_coeffs)[k++] = val; - } - } - } -} - template struct Filter2D : public BaseFilter { @@ -3253,489 +3096,14 @@ template struct Filter2D : public BaseFi VecOp vecOp; }; -#ifdef HAVE_OPENCL - -#define DIVUP(total, grain) (((total) + (grain) - 1) / (grain)) -#define ROUNDUP(sz, n) ((sz) + (n) - 1 - (((sz) + (n) - 1) % (n))) - -// prepare kernel: transpose and make double rows (+align). Returns size of aligned row -// Samples: -// a b c -// Input: d e f -// g h i -// Output, last two zeros is the alignment: -// a d g a d g 0 0 -// b e h b e h 0 0 -// c f i c f i 0 0 -template -static int _prepareKernelFilter2D(std::vector & data, const Mat & kernel) -{ - Mat _kernel; kernel.convertTo(_kernel, DataDepth::value); - int size_y_aligned = ROUNDUP(kernel.rows * 2, 4); - data.clear(); data.resize(size_y_aligned * kernel.cols, 0); - for (int x = 0; x < kernel.cols; x++) - { - for (int y = 0; y < kernel.rows; y++) - { - data[x * size_y_aligned + y] = _kernel.at(y, x); - data[x * size_y_aligned + y + kernel.rows] = _kernel.at(y, x); - } - } - return size_y_aligned; -} - -static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth, - InputArray _kernel, Point anchor, - double delta, int borderType ) -{ - int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - ddepth = ddepth < 0 ? sdepth : ddepth; - int dtype = CV_MAKE_TYPE(ddepth, cn), wdepth = std::max(std::max(sdepth, ddepth), CV_32F), - wtype = CV_MAKE_TYPE(wdepth, cn); - if (cn > 4) - return false; - - Size ksize = _kernel.size(); - if (anchor.x < 0) - anchor.x = ksize.width / 2; - if (anchor.y < 0) - anchor.y = ksize.height / 2; - - bool isolated = (borderType & BORDER_ISOLATED) != 0; - borderType &= ~BORDER_ISOLATED; - const cv::ocl::Device &device = cv::ocl::Device::getDefault(); - bool doubleSupport = device.doubleFPConfig() > 0; - if (wdepth == CV_64F && !doubleSupport) - return false; - - const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", - "BORDER_WRAP", "BORDER_REFLECT_101" }; - - cv::Mat kernelMat = _kernel.getMat(); - cv::Size sz = _src.size(), wholeSize; - size_t globalsize[2] = { (size_t)sz.width, (size_t)sz.height }; - size_t localsize_general[2] = {0, 1}; - size_t* localsize = NULL; - - ocl::Kernel k; - UMat src = _src.getUMat(); - if (!isolated) - { - Point ofs; - src.locateROI(wholeSize, ofs); - } - - size_t tryWorkItems = device.maxWorkGroupSize(); - if (device.isIntel() && 128 < tryWorkItems) - tryWorkItems = 128; - char cvt[2][40]; - - // For smaller filter kernels, there is a special kernel that is more - // efficient than the general one. - UMat kernalDataUMat; - if (device.isIntel() && (device.type() & ocl::Device::TYPE_GPU) && - ((ksize.width < 5 && ksize.height < 5) || - (ksize.width == 5 && ksize.height == 5 && cn == 1))) - { - kernelMat = kernelMat.reshape(0, 1); - String kerStr = ocl::kernelToStr(kernelMat, CV_32F); - int h = isolated ? sz.height : wholeSize.height; - int w = isolated ? sz.width : wholeSize.width; - - if (w < ksize.width || h < ksize.height) - return false; - - // Figure out what vector size to use for loading the pixels. - int pxLoadNumPixels = cn != 1 || sz.width % 4 ? 1 : 4; - int pxLoadVecSize = cn * pxLoadNumPixels; - - // Figure out how many pixels per work item to compute in X and Y - // directions. Too many and we run out of registers. - int pxPerWorkItemX = 1; - int pxPerWorkItemY = 1; - if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4) - { - pxPerWorkItemX = sz.width % 8 ? sz.width % 4 ? sz.width % 2 ? 1 : 2 : 4 : 8; - pxPerWorkItemY = sz.height % 2 ? 1 : 2; - } - else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4)) - { - pxPerWorkItemX = sz.width % 2 ? 1 : 2; - pxPerWorkItemY = sz.height % 2 ? 1 : 2; - } - globalsize[0] = sz.width / pxPerWorkItemX; - globalsize[1] = sz.height / pxPerWorkItemY; - - // Need some padding in the private array for pixels - int privDataWidth = ROUNDUP(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels); - - // Make the global size a nice round number so the runtime can pick - // from reasonable choices for the workgroup size - const int wgRound = 256; - globalsize[0] = ROUNDUP(globalsize[0], wgRound); - - char build_options[1024]; - sprintf(build_options, "-D cn=%d " - "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d " - "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d " - "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s " - "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d " - "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s " - "-D convertToWT=%s -D convertToDstT=%s %s", - cn, anchor.x, anchor.y, ksize.width, ksize.height, - pxLoadVecSize, pxLoadNumPixels, - pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType], - isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", - privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1, - ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype), - ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth), - ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), - ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), kerStr.c_str()); - - if (!k.create("filter2DSmall", cv::ocl::imgproc::filter2DSmall_oclsrc, build_options)) - return false; - } - else - { - localsize = localsize_general; - std::vector kernelMatDataFloat; - int kernel_size_y2_aligned = _prepareKernelFilter2D(kernelMatDataFloat, kernelMat); - String kerStr = ocl::kernelToStr(kernelMatDataFloat, CV_32F); - - for ( ; ; ) - { - size_t BLOCK_SIZE = tryWorkItems; - while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)sz.width * 2) - BLOCK_SIZE /= 2; - - if ((size_t)ksize.width > BLOCK_SIZE) - return false; - - int requiredTop = anchor.y; - int requiredLeft = (int)BLOCK_SIZE; // not this: anchor.x; - int requiredBottom = ksize.height - 1 - anchor.y; - int requiredRight = (int)BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x; - int h = isolated ? sz.height : wholeSize.height; - int w = isolated ? sz.width : wholeSize.width; - bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight; - - if ((w < ksize.width) || (h < ksize.height)) - return false; - - String opts = format("-D LOCAL_SIZE=%d -D cn=%d " - "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d " - "-D KERNEL_SIZE_Y2_ALIGNED=%d -D %s -D %s -D %s%s%s " - "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s " - "-D convertToWT=%s -D convertToDstT=%s", - (int)BLOCK_SIZE, cn, anchor.x, anchor.y, - ksize.width, ksize.height, kernel_size_y2_aligned, borderMap[borderType], - extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", - isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", - doubleSupport ? " -D DOUBLE_SUPPORT" : "", kerStr.c_str(), - ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype), - ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth), - ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), - ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1])); - - localsize[0] = BLOCK_SIZE; - globalsize[0] = DIVUP(sz.width, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE; - globalsize[1] = sz.height; - - if (!k.create("filter2D", cv::ocl::imgproc::filter2D_oclsrc, opts)) - return false; - - size_t kernelWorkGroupSize = k.workGroupSize(); - if (localsize[0] <= kernelWorkGroupSize) - break; - if (BLOCK_SIZE < kernelWorkGroupSize) - return false; - tryWorkItems = kernelWorkGroupSize; - } - } - - _dst.create(sz, dtype); - UMat dst = _dst.getUMat(); - - int srcOffsetX = (int)((src.offset % src.step) / src.elemSize()); - int srcOffsetY = (int)(src.offset / src.step); - int srcEndX = (isolated ? (srcOffsetX + sz.width) : wholeSize.width); - int srcEndY = (isolated ? (srcOffsetY + sz.height) : wholeSize.height); - - k.args(ocl::KernelArg::PtrReadOnly(src), (int)src.step, srcOffsetX, srcOffsetY, - srcEndX, srcEndY, ocl::KernelArg::WriteOnly(dst), (float)delta); - - return k.run(2, globalsize, localsize, false); -} - -const int shift_bits = 8; - -static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX, int anchor, - int borderType, int ddepth, bool fast8uc1, bool int_arithm) -{ - int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type); - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; - Size bufSize = buf.size(); - int buf_type = buf.type(), bdepth = CV_MAT_DEPTH(buf_type); - - if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) - return false; - -#ifdef __ANDROID__ - size_t localsize[2] = {16, 10}; -#else - size_t localsize[2] = {16, 16}; -#endif - - size_t globalsize[2] = {DIVUP(bufSize.width, localsize[0]) * localsize[0], DIVUP(bufSize.height, localsize[1]) * localsize[1]}; - if (fast8uc1) - globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0]; - - int radiusX = anchor, radiusY = (buf.rows - src.rows) >> 1; - - bool isolated = (borderType & BORDER_ISOLATED) != 0; - const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101" }, - * const btype = borderMap[borderType & ~BORDER_ISOLATED]; - - bool extra_extrapolation = src.rows < (int)((-radiusY + globalsize[1]) >> 1) + 1; - extra_extrapolation |= src.rows < radiusY; - extra_extrapolation |= src.cols < (int)((-radiusX + globalsize[0] + 8 * localsize[0] + 3) >> 1) + 1; - extra_extrapolation |= src.cols < radiusX; - - char cvt[40]; - cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s" - " -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s%s%s", - radiusX, (int)localsize[0], (int)localsize[1], cn, btype, - extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", - isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", - ocl::typeToStr(type), ocl::typeToStr(buf_type), - ocl::convertTypeStr(sdepth, bdepth, cn, cvt), - ocl::typeToStr(sdepth), ocl::typeToStr(bdepth), - doubleSupport ? " -D DOUBLE_SUPPORT" : "", - int_arithm ? " -D INTEGER_ARITHMETIC" : ""); - build_options += ocl::kernelToStr(kernelX, bdepth); - - Size srcWholeSize; Point srcOffset; - src.locateROI(srcWholeSize, srcOffset); - - String kernelName("row_filter"); - if (fast8uc1) - kernelName += "_C1_D0"; - - ocl::Kernel k(kernelName.c_str(), cv::ocl::imgproc::filterSepRow_oclsrc, - build_options); - if (k.empty()) - return false; - - if (fast8uc1) - k.args(ocl::KernelArg::PtrReadOnly(src), (int)(src.step / src.elemSize()), srcOffset.x, - srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height, - ocl::KernelArg::PtrWriteOnly(buf), (int)(buf.step / buf.elemSize()), - buf.cols, buf.rows, radiusY); - else - k.args(ocl::KernelArg::PtrReadOnly(src), (int)src.step, srcOffset.x, - srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height, - ocl::KernelArg::PtrWriteOnly(buf), (int)buf.step, buf.cols, buf.rows, radiusY); - - return k.run(2, globalsize, localsize, false); -} -static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor, bool int_arithm) +Ptr getLinearFilter( + int srcType, int dstType, + const Mat& _kernel, Point anchor, + double delta, int bits) { - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; - if (dst.depth() == CV_64F && !doubleSupport) - return false; - -#ifdef __ANDROID__ - size_t localsize[2] = { 16, 10 }; -#else - size_t localsize[2] = { 16, 16 }; -#endif - size_t globalsize[2] = { 0, 0 }; - - int dtype = dst.type(), cn = CV_MAT_CN(dtype), ddepth = CV_MAT_DEPTH(dtype); - Size sz = dst.size(); - int buf_type = buf.type(), bdepth = CV_MAT_DEPTH(buf_type); - - globalsize[1] = DIVUP(sz.height, localsize[1]) * localsize[1]; - globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0]; - - char cvt[40]; - cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d" - " -D srcT=%s -D dstT=%s -D convertToDstT=%s" - " -D srcT1=%s -D dstT1=%s -D SHIFT_BITS=%d%s%s", - anchor, (int)localsize[0], (int)localsize[1], cn, - ocl::typeToStr(buf_type), ocl::typeToStr(dtype), - ocl::convertTypeStr(bdepth, ddepth, cn, cvt), - ocl::typeToStr(bdepth), ocl::typeToStr(ddepth), - 2*shift_bits, doubleSupport ? " -D DOUBLE_SUPPORT" : "", - int_arithm ? " -D INTEGER_ARITHMETIC" : ""); - build_options += ocl::kernelToStr(kernelY, bdepth); - - ocl::Kernel k("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc, - build_options); - if (k.empty()) - return false; - - k.args(ocl::KernelArg::ReadOnly(buf), ocl::KernelArg::WriteOnly(dst), - static_cast(delta)); - - return k.run(2, globalsize, localsize, false); -} - -const int optimizedSepFilterLocalWidth = 16; -const int optimizedSepFilterLocalHeight = 8; - -static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, - Mat row_kernel, Mat col_kernel, - double delta, int borderType, int ddepth, int bdepth, bool int_arithm) -{ - Size size = _src.size(), wholeSize; - Point origin; - int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), - esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), bdepth), - dtype = CV_MAKE_TYPE(ddepth, cn); - size_t src_step = _src.step(), src_offset = _src.offset(); - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; - - if (esz == 0 || src_step == 0 - || (src_offset % src_step) % esz != 0 - || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) - || !(borderType == BORDER_CONSTANT - || borderType == BORDER_REPLICATE - || borderType == BORDER_REFLECT - || borderType == BORDER_WRAP - || borderType == BORDER_REFLECT_101)) - return false; - - size_t lt2[2] = { optimizedSepFilterLocalWidth, optimizedSepFilterLocalHeight }; - size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1]}; - - char cvt[2][40]; - const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", - "BORDER_REFLECT_101" }; - - String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s" - " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s" - " -D %s -D srcT1=%s -D dstT1=%s -D WT1=%s -D CN=%d -D SHIFT_BITS=%d%s", - (int)lt2[0], (int)lt2[1], row_kernel.cols / 2, col_kernel.cols / 2, - ocl::kernelToStr(row_kernel, wdepth, "KERNEL_MATRIX_X").c_str(), - ocl::kernelToStr(col_kernel, wdepth, "KERNEL_MATRIX_Y").c_str(), - ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), - ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype), - ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType], - ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), ocl::typeToStr(wdepth), - cn, 2*shift_bits, int_arithm ? " -D INTEGER_ARITHMETIC" : ""); - - ocl::Kernel k("sep_filter", ocl::imgproc::filterSep_singlePass_oclsrc, opts); - if (k.empty()) - return false; - - UMat src = _src.getUMat(); - _dst.create(size, dtype); - UMat dst = _dst.getUMat(); - - int src_offset_x = static_cast((src_offset % src_step) / esz); - int src_offset_y = static_cast(src_offset / src_step); - - src.locateROI(wholeSize, origin); - - k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, src_offset_x, src_offset_y, - wholeSize.height, wholeSize.width, ocl::KernelArg::WriteOnly(dst), - static_cast(delta)); - - return k.run(2, gt2, lt2, false); -} - -bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, - InputArray _kernelX, InputArray _kernelY, Point anchor, - double delta, int borderType ) -{ - const ocl::Device & d = ocl::Device::getDefault(); - Size imgSize = _src.size(); - - int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - if (cn > 4) - return false; - - Mat kernelX = _kernelX.getMat().reshape(1, 1); - if (kernelX.cols % 2 != 1) - return false; - Mat kernelY = _kernelY.getMat().reshape(1, 1); - if (kernelY.cols % 2 != 1) - return false; - - if (ddepth < 0) - ddepth = sdepth; - - if (anchor.x < 0) - anchor.x = kernelX.cols >> 1; - if (anchor.y < 0) - anchor.y = kernelY.cols >> 1; - - int rtype = getKernelType(kernelX, - kernelX.rows == 1 ? Point(anchor.x, 0) : Point(0, anchor.x)); - int ctype = getKernelType(kernelY, - kernelY.rows == 1 ? Point(anchor.y, 0) : Point(0, anchor.y)); - - int bdepth = CV_32F; - bool int_arithm = false; - if( sdepth == CV_8U && ddepth == CV_8U && - rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && - ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL) - { - if (ocl::Device::getDefault().isIntel()) - { - for (int i=0; i(0, i) = (float) cvRound(kernelX.at(0, i) * (1 << shift_bits)); - if (kernelX.data != kernelY.data) - for (int i=0; i(0, i) = (float) cvRound(kernelY.at(0, i) * (1 << shift_bits)); - } else - { - bdepth = CV_32S; - kernelX.convertTo( kernelX, bdepth, 1 << shift_bits ); - kernelY.convertTo( kernelY, bdepth, 1 << shift_bits ); - } - int_arithm = true; - } - - CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 && - imgSize.width > optimizedSepFilterLocalWidth + anchor.x && - imgSize.height > optimizedSepFilterLocalHeight + anchor.y && - (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && - anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) && - OCL_PERFORMANCE_CHECK(d.isIntel()), // TODO FIXIT - ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta, - borderType & ~BORDER_ISOLATED, ddepth, bdepth, int_arithm), true) - - UMat src = _src.getUMat(); - Size srcWholeSize; Point srcOffset; - src.locateROI(srcWholeSize, srcOffset); - - bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 && - src.cols % 4 == 0 && src.step % 4 == 0; - - Size srcSize = src.size(); - Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1); - UMat buf(bufSize, CV_MAKETYPE(bdepth, cn)); - if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, int_arithm)) - return false; - - _dst.create(srcSize, CV_MAKETYPE(ddepth, cn)); - UMat dst = _dst.getUMat(); - - return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, int_arithm); -} - -#endif - -} + CV_INSTRUMENT_REGION(); -cv::Ptr cv::getLinearFilter(int srcType, int dstType, - InputArray filter_kernel, Point anchor, - double delta, int bits) -{ - Mat _kernel = filter_kernel.getMat(); int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType); int cn = CV_MAT_CN(srcType), kdepth = _kernel.depth(); CV_Assert( cn == CV_MAT_CN(dstType) && ddepth >= sdepth ); @@ -3806,476 +3174,6 @@ cv::Ptr cv::getLinearFilter(int srcType, int dstType, srcType, dstType)); } - -cv::Ptr cv::createLinearFilter( int _srcType, int _dstType, - InputArray filter_kernel, - Point _anchor, double _delta, - int _rowBorderType, int _columnBorderType, - const Scalar& _borderValue ) -{ - Mat _kernel = filter_kernel.getMat(); - _srcType = CV_MAT_TYPE(_srcType); - _dstType = CV_MAT_TYPE(_dstType); - int cn = CV_MAT_CN(_srcType); - CV_Assert( cn == CV_MAT_CN(_dstType) ); - - Mat kernel = _kernel; - int bits = 0; - - /*int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType); - int ktype = _kernel.depth() == CV_32S ? KERNEL_INTEGER : getKernelType(_kernel, _anchor); - if( sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S) && - _kernel.rows*_kernel.cols <= (1 << 10) ) - { - bits = (ktype & KERNEL_INTEGER) ? 0 : 11; - _kernel.convertTo(kernel, CV_32S, 1 << bits); - }*/ - - Ptr _filter2D = getLinearFilter(_srcType, _dstType, - kernel, _anchor, _delta, bits); - - return makePtr(_filter2D, Ptr(), - Ptr(), _srcType, _dstType, _srcType, - _rowBorderType, _columnBorderType, _borderValue ); -} - - -//================================================================ -// HAL interface -//================================================================ - -using namespace cv; - -static bool replacementFilter2D(int stype, int dtype, int kernel_type, - uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int width, int height, - int full_width, int full_height, - int offset_x, int offset_y, - uchar * kernel_data, size_t kernel_step, - int kernel_width, int kernel_height, - int anchor_x, int anchor_y, - double delta, int borderType, bool isSubmatrix) -{ - cvhalFilter2D* ctx; - int res = cv_hal_filterInit(&ctx, kernel_data, kernel_step, kernel_type, kernel_width, kernel_height, width, height, - stype, dtype, borderType, delta, anchor_x, anchor_y, isSubmatrix, src_data == dst_data); - if (res != CV_HAL_ERROR_OK) - return false; - res = cv_hal_filter(ctx, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); - bool success = (res == CV_HAL_ERROR_OK); - res = cv_hal_filterFree(ctx); - if (res != CV_HAL_ERROR_OK) - return false; - return success; -} - -#ifdef HAVE_IPP -static bool ippFilter2D(int stype, int dtype, int kernel_type, - uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int width, int height, - int full_width, int full_height, - int offset_x, int offset_y, - uchar * kernel_data, size_t kernel_step, - int kernel_width, int kernel_height, - int anchor_x, int anchor_y, - double delta, int borderType, - bool isSubmatrix) -{ -#ifdef HAVE_IPP_IW - CV_INSTRUMENT_REGION_IPP(); - - ::ipp::IwiSize iwSize(width, height); - ::ipp::IwiSize kernelSize(kernel_width, kernel_height); - IppDataType type = ippiGetDataType(CV_MAT_DEPTH(stype)); - int channels = CV_MAT_CN(stype); - - CV_UNUSED(isSubmatrix); - -#if IPP_VERSION_X100 >= 201700 && IPP_VERSION_X100 <= 201702 // IPP bug with 1x1 kernel - if(kernel_width == 1 && kernel_height == 1) - return false; -#endif - -#if IPP_DISABLE_FILTER2D_BIG_MASK - // Too big difference compared to OpenCV FFT-based convolution - if(kernel_type == CV_32FC1 && (type == ipp16s || type == ipp16u) && (kernel_width > 7 || kernel_height > 7)) - return false; - - // Poor optimization for big kernels - if(kernel_width > 7 || kernel_height > 7) - return false; -#endif - - if(src_data == dst_data) - return false; - - if(stype != dtype) - return false; - - if(kernel_type != CV_16SC1 && kernel_type != CV_32FC1) - return false; - - // TODO: Implement offset for 8u, 16u - if(std::fabs(delta) >= DBL_EPSILON) - return false; - - if(!ippiCheckAnchor(anchor_x, anchor_y, kernel_width, kernel_height)) - return false; - - try - { - ::ipp::IwiBorderSize iwBorderSize; - ::ipp::IwiBorderType iwBorderType; - ::ipp::IwiImage iwKernel(ippiSize(kernel_width, kernel_height), ippiGetDataType(CV_MAT_DEPTH(kernel_type)), CV_MAT_CN(kernel_type), 0, (void*)kernel_data, kernel_step); - ::ipp::IwiImage iwSrc(iwSize, type, channels, ::ipp::IwiBorderSize(offset_x, offset_y, full_width-offset_x-width, full_height-offset_y-height), (void*)src_data, src_step); - ::ipp::IwiImage iwDst(iwSize, type, channels, ::ipp::IwiBorderSize(offset_x, offset_y, full_width-offset_x-width, full_height-offset_y-height), (void*)dst_data, dst_step); - - iwBorderSize = ::ipp::iwiSizeToBorderSize(kernelSize); - iwBorderType = ippiGetBorder(iwSrc, borderType, iwBorderSize); - if(!iwBorderType) - return false; - - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilter, iwSrc, iwDst, iwKernel, ::ipp::IwiFilterParams(1, 0, ippAlgHintNone, ippRndFinancial), iwBorderType); - } - catch(const ::ipp::IwException& ex) - { - CV_UNUSED(ex); - return false; - } - - return true; -#else - CV_UNUSED(stype); CV_UNUSED(dtype); CV_UNUSED(kernel_type); CV_UNUSED(src_data); CV_UNUSED(src_step); - CV_UNUSED(dst_data); CV_UNUSED(dst_step); CV_UNUSED(width); CV_UNUSED(height); CV_UNUSED(full_width); - CV_UNUSED(full_height); CV_UNUSED(offset_x); CV_UNUSED(offset_y); CV_UNUSED(kernel_data); CV_UNUSED(kernel_step); - CV_UNUSED(kernel_width); CV_UNUSED(kernel_height); CV_UNUSED(anchor_x); CV_UNUSED(anchor_y); CV_UNUSED(delta); - CV_UNUSED(borderType); CV_UNUSED(isSubmatrix); - return false; #endif -} -#endif - -static bool dftFilter2D(int stype, int dtype, int kernel_type, - uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int full_width, int full_height, - int offset_x, int offset_y, - uchar * kernel_data, size_t kernel_step, - int kernel_width, int kernel_height, - int anchor_x, int anchor_y, - double delta, int borderType) -{ - { - int sdepth = CV_MAT_DEPTH(stype); - int ddepth = CV_MAT_DEPTH(dtype); - int dft_filter_size = checkHardwareSupport(CV_CPU_SSE3) && ((sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) || (sdepth == CV_32F && ddepth == CV_32F)) ? 130 : 50; - if (kernel_width * kernel_height < dft_filter_size) - return false; - } - - Point anchor = Point(anchor_x, anchor_y); - Mat kernel = Mat(Size(kernel_width, kernel_height), kernel_type, kernel_data, kernel_step); - - Mat src(Size(full_width-offset_x, full_height-offset_y), stype, src_data, src_step); - Mat dst(Size(full_width, full_height), dtype, dst_data, dst_step); - Mat temp; - int src_channels = CV_MAT_CN(stype); - int dst_channels = CV_MAT_CN(dtype); - int ddepth = CV_MAT_DEPTH(dtype); - // crossCorr doesn't accept non-zero delta with multiple channels - if (src_channels != 1 && delta != 0) { - // The semantics of filter2D require that the delta be applied - // as floating-point math. So wee need an intermediate Mat - // with a float datatype. If the dest is already floats, - // we just use that. - int corrDepth = ddepth; - if ((ddepth == CV_32F || ddepth == CV_64F) && src_data != dst_data) { - temp = Mat(Size(full_width, full_height), dtype, dst_data, dst_step); - } else { - corrDepth = ddepth == CV_64F ? CV_64F : CV_32F; - temp.create(Size(full_width, full_height), CV_MAKETYPE(corrDepth, dst_channels)); - } - crossCorr(src, kernel, temp, src.size(), - CV_MAKETYPE(corrDepth, src_channels), - anchor, 0, borderType); - add(temp, delta, temp); - if (temp.data != dst_data) { - temp.convertTo(dst, dst.type()); - } - } else { - if (src_data != dst_data) - temp = Mat(Size(full_width, full_height), dtype, dst_data, dst_step); - else - temp.create(Size(full_width, full_height), dtype); - crossCorr(src, kernel, temp, src.size(), - CV_MAKETYPE(ddepth, src_channels), - anchor, delta, borderType); - if (temp.data != dst_data) - temp.copyTo(dst); - } - return true; -} - -static void ocvFilter2D(int stype, int dtype, int kernel_type, - uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int width, int height, - int full_width, int full_height, - int offset_x, int offset_y, - uchar * kernel_data, size_t kernel_step, - int kernel_width, int kernel_height, - int anchor_x, int anchor_y, - double delta, int borderType) -{ - int borderTypeValue = borderType & ~BORDER_ISOLATED; - Mat kernel = Mat(Size(kernel_width, kernel_height), kernel_type, kernel_data, kernel_step); - Ptr f = createLinearFilter(stype, dtype, kernel, Point(anchor_x, anchor_y), delta, - borderTypeValue); - Mat src(Size(width, height), stype, src_data, src_step); - Mat dst(Size(width, height), dtype, dst_data, dst_step); - f->apply(src, dst, Size(full_width, full_height), Point(offset_x, offset_y)); -} - -static bool replacementSepFilter(int stype, int dtype, int ktype, - uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, - int width, int height, int full_width, int full_height, - int offset_x, int offset_y, - uchar * kernelx_data, int kernelx_len, - uchar * kernely_data, int kernely_len, - int anchor_x, int anchor_y, double delta, int borderType) -{ - cvhalFilter2D *ctx; - int res = cv_hal_sepFilterInit(&ctx, stype, dtype, ktype, - kernelx_data, kernelx_len, - kernely_data, kernely_len, - anchor_x, anchor_y, delta, borderType); - if (res != CV_HAL_ERROR_OK) - return false; - res = cv_hal_sepFilter(ctx, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y); - bool success = (res == CV_HAL_ERROR_OK); - res = cv_hal_sepFilterFree(ctx); - if (res != CV_HAL_ERROR_OK) - return false; - return success; -} - -static void ocvSepFilter(int stype, int dtype, int ktype, - uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, - int width, int height, int full_width, int full_height, - int offset_x, int offset_y, - uchar * kernelx_data, int kernelx_len, - uchar * kernely_data, int kernely_len, - int anchor_x, int anchor_y, double delta, int borderType) -{ - Mat kernelX(Size(kernelx_len, 1), ktype, kernelx_data); - Mat kernelY(Size(kernely_len, 1), ktype, kernely_data); - Ptr f = createSeparableLinearFilter(stype, dtype, kernelX, kernelY, - Point(anchor_x, anchor_y), - delta, borderType & ~BORDER_ISOLATED); - Mat src(Size(width, height), stype, src_data, src_step); - Mat dst(Size(width, height), dtype, dst_data, dst_step); - f->apply(src, dst, Size(full_width, full_height), Point(offset_x, offset_y)); -}; - -//=================================================================== -// HAL functions -//=================================================================== - -namespace cv { -namespace hal { - - -CV_DEPRECATED Ptr Filter2D::create(uchar * , size_t , int , - int , int , - int , int , - int , int , - int , double , - int , int , - bool , bool ) { return Ptr(); } - -CV_DEPRECATED Ptr SepFilter2D::create(int , int , int , - uchar * , int , - uchar * , int , - int , int , - double , int ) { return Ptr(); } - - -void filter2D(int stype, int dtype, int kernel_type, - uchar * src_data, size_t src_step, - uchar * dst_data, size_t dst_step, - int width, int height, - int full_width, int full_height, - int offset_x, int offset_y, - uchar * kernel_data, size_t kernel_step, - int kernel_width, int kernel_height, - int anchor_x, int anchor_y, - double delta, int borderType, - bool isSubmatrix) -{ - bool res; - res = replacementFilter2D(stype, dtype, kernel_type, - src_data, src_step, - dst_data, dst_step, - width, height, - full_width, full_height, - offset_x, offset_y, - kernel_data, kernel_step, - kernel_width, kernel_height, - anchor_x, anchor_y, - delta, borderType, isSubmatrix); - if (res) - return; - - CV_IPP_RUN_FAST(ippFilter2D(stype, dtype, kernel_type, - src_data, src_step, - dst_data, dst_step, - width, height, - full_width, full_height, - offset_x, offset_y, - kernel_data, kernel_step, - kernel_width, kernel_height, - anchor_x, anchor_y, - delta, borderType, isSubmatrix)) - - res = dftFilter2D(stype, dtype, kernel_type, - src_data, src_step, - dst_data, dst_step, - full_width, full_height, - offset_x, offset_y, - kernel_data, kernel_step, - kernel_width, kernel_height, - anchor_x, anchor_y, - delta, borderType); - if (res) - return; - ocvFilter2D(stype, dtype, kernel_type, - src_data, src_step, - dst_data, dst_step, - width, height, - full_width, full_height, - offset_x, offset_y, - kernel_data, kernel_step, - kernel_width, kernel_height, - anchor_x, anchor_y, - delta, borderType); -} - -//--------------------------------------------------------------- - -void sepFilter2D(int stype, int dtype, int ktype, - uchar* src_data, size_t src_step, uchar* dst_data, size_t dst_step, - int width, int height, int full_width, int full_height, - int offset_x, int offset_y, - uchar * kernelx_data, int kernelx_len, - uchar * kernely_data, int kernely_len, - int anchor_x, int anchor_y, double delta, int borderType) -{ - - bool res = replacementSepFilter(stype, dtype, ktype, - src_data, src_step, dst_data, dst_step, - width, height, full_width, full_height, - offset_x, offset_y, - kernelx_data, kernelx_len, - kernely_data, kernely_len, - anchor_x, anchor_y, delta, borderType); - if (res) - return; - ocvSepFilter(stype, dtype, ktype, - src_data, src_step, dst_data, dst_step, - width, height, full_width, full_height, - offset_x, offset_y, - kernelx_data, kernelx_len, - kernely_data, kernely_len, - anchor_x, anchor_y, delta, borderType); -} - -} // cv::hal:: -} // cv:: - -//================================================================ -// Main interface -//================================================================ - -void cv::filter2D( InputArray _src, OutputArray _dst, int ddepth, - InputArray _kernel, Point anchor0, - double delta, int borderType ) -{ - CV_INSTRUMENT_REGION(); - - CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2, - ocl_filter2D(_src, _dst, ddepth, _kernel, anchor0, delta, borderType)) - - Mat src = _src.getMat(), kernel = _kernel.getMat(); - - if( ddepth < 0 ) - ddepth = src.depth(); - - _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) ); - Mat dst = _dst.getMat(); - Point anchor = normalizeAnchor(anchor0, kernel.size()); - - Point ofs; - Size wsz(src.cols, src.rows); - if( (borderType & BORDER_ISOLATED) == 0 ) - src.locateROI( wsz, ofs ); - - hal::filter2D(src.type(), dst.type(), kernel.type(), - src.data, src.step, dst.data, dst.step, - dst.cols, dst.rows, wsz.width, wsz.height, ofs.x, ofs.y, - kernel.data, kernel.step, kernel.cols, kernel.rows, - anchor.x, anchor.y, - delta, borderType, src.isSubmatrix()); -} - -void cv::sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, - InputArray _kernelX, InputArray _kernelY, Point anchor, - double delta, int borderType ) -{ - CV_INSTRUMENT_REGION(); - - CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && (size_t)_src.rows() > _kernelY.total() && (size_t)_src.cols() > _kernelX.total(), - ocl_sepFilter2D(_src, _dst, ddepth, _kernelX, _kernelY, anchor, delta, borderType)) - - Mat src = _src.getMat(), kernelX = _kernelX.getMat(), kernelY = _kernelY.getMat(); - - if( ddepth < 0 ) - ddepth = src.depth(); - - _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) ); - Mat dst = _dst.getMat(); - - Point ofs; - Size wsz(src.cols, src.rows); - if( (borderType & BORDER_ISOLATED) == 0 ) - src.locateROI( wsz, ofs ); - - CV_Assert( kernelX.type() == kernelY.type() && - (kernelX.cols == 1 || kernelX.rows == 1) && - (kernelY.cols == 1 || kernelY.rows == 1) ); - - Mat contKernelX = kernelX.isContinuous() ? kernelX : kernelX.clone(); - Mat contKernelY = kernelY.isContinuous() ? kernelY : kernelY.clone(); - - hal::sepFilter2D(src.type(), dst.type(), kernelX.type(), - src.data, src.step, dst.data, dst.step, - dst.cols, dst.rows, wsz.width, wsz.height, ofs.x, ofs.y, - contKernelX.data, kernelX.cols + kernelX.rows - 1, - contKernelY.data, kernelY.cols + kernelY.rows - 1, - anchor.x, anchor.y, delta, borderType & ~BORDER_ISOLATED); -} - - -CV_IMPL void -cvFilter2D( const CvArr* srcarr, CvArr* dstarr, const CvMat* _kernel, CvPoint anchor ) -{ - cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr); - cv::Mat kernel = cv::cvarrToMat(_kernel); - - CV_Assert( src.size() == dst.size() && src.channels() == dst.channels() ); - - cv::filter2D( src, dst, dst.depth(), kernel, anchor, 0, cv::BORDER_REPLICATE ); -} - -/* End of file. */ +CV_CPU_OPTIMIZATION_NAMESPACE_END +} // namespace diff --git a/modules/imgproc/src/fixedpoint.inl.hpp b/modules/imgproc/src/fixedpoint.inl.hpp index a1a75a29e1..40b1c3faa1 100644 --- a/modules/imgproc/src/fixedpoint.inl.hpp +++ b/modules/imgproc/src/fixedpoint.inl.hpp @@ -9,10 +9,7 @@ #ifndef _CV_FIXEDPOINT_HPP_ #define _CV_FIXEDPOINT_HPP_ -#include "opencv2/core/softfloat.hpp" - -namespace -{ +namespace { class fixedpoint64 { diff --git a/modules/imgproc/src/median_blur.dispatch.cpp b/modules/imgproc/src/median_blur.dispatch.cpp new file mode 100644 index 0000000000..d993fbad5b --- /dev/null +++ b/modules/imgproc/src/median_blur.dispatch.cpp @@ -0,0 +1,317 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, 2018, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +#include + +#include "opencv2/core/hal/intrin.hpp" +#include "opencl_kernels_imgproc.hpp" + +#include "opencv2/core/openvx/ovx_defs.hpp" + +#include "median_blur.simd.hpp" +#include "median_blur.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content + +namespace cv { + +#ifdef HAVE_OPENCL + +#define DIVUP(total, grain) ((total + grain - 1) / (grain)) + +static bool ocl_medianFilter(InputArray _src, OutputArray _dst, int m) +{ + size_t localsize[2] = { 16, 16 }; + size_t globalsize[2]; + int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + + if ( !((depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F) && cn <= 4 && (m == 3 || m == 5)) ) + return false; + + Size imgSize = _src.size(); + bool useOptimized = (1 == cn) && + (size_t)imgSize.width >= localsize[0] * 8 && + (size_t)imgSize.height >= localsize[1] * 8 && + imgSize.width % 4 == 0 && + imgSize.height % 4 == 0 && + (ocl::Device::getDefault().isIntel()); + + cv::String kname = format( useOptimized ? "medianFilter%d_u" : "medianFilter%d", m) ; + cv::String kdefs = useOptimized ? + format("-D T=%s -D T1=%s -D T4=%s%d -D cn=%d -D USE_4OPT", ocl::typeToStr(type), + ocl::typeToStr(depth), ocl::typeToStr(depth), cn*4, cn) + : + format("-D T=%s -D T1=%s -D cn=%d", ocl::typeToStr(type), ocl::typeToStr(depth), cn) ; + + ocl::Kernel k(kname.c_str(), ocl::imgproc::medianFilter_oclsrc, kdefs.c_str() ); + + if (k.empty()) + return false; + + UMat src = _src.getUMat(); + _dst.create(src.size(), type); + UMat dst = _dst.getUMat(); + + k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst)); + + if( useOptimized ) + { + globalsize[0] = DIVUP(src.cols / 4, localsize[0]) * localsize[0]; + globalsize[1] = DIVUP(src.rows / 4, localsize[1]) * localsize[1]; + } + else + { + globalsize[0] = (src.cols + localsize[0] + 2) / localsize[0] * localsize[0]; + globalsize[1] = (src.rows + localsize[1] - 1) / localsize[1] * localsize[1]; + } + + return k.run(2, globalsize, localsize, false); +} + +#undef DIVUP + +#endif + +#ifdef HAVE_OPENVX +namespace ovx { + template <> inline bool skipSmallImages(int w, int h) { return w*h < 1280 * 720; } +} +static bool openvx_medianFilter(InputArray _src, OutputArray _dst, int ksize) +{ + if (_src.type() != CV_8UC1 || _dst.type() != CV_8U +#ifndef VX_VERSION_1_1 + || ksize != 3 +#endif + ) + return false; + + Mat src = _src.getMat(); + Mat dst = _dst.getMat(); + + if ( +#ifdef VX_VERSION_1_1 + ksize != 3 ? ovx::skipSmallImages(src.cols, src.rows) : +#endif + ovx::skipSmallImages(src.cols, src.rows) + ) + return false; + + try + { + ivx::Context ctx = ovx::getOpenVXContext(); +#ifdef VX_VERSION_1_1 + if ((vx_size)ksize > ctx.nonlinearMaxDimension()) + return false; +#endif + + Mat a; + if (dst.data != src.data) + a = src; + else + src.copyTo(a); + + ivx::Image + ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, + ivx::Image::createAddressing(a.cols, a.rows, 1, (vx_int32)(a.step)), a.data), + ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, + ivx::Image::createAddressing(dst.cols, dst.rows, 1, (vx_int32)(dst.step)), dst.data); + + //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments + //since OpenVX standard says nothing about thread-safety for now + ivx::border_t prevBorder = ctx.immediateBorder(); + ctx.setImmediateBorder(VX_BORDER_REPLICATE); +#ifdef VX_VERSION_1_1 + if (ksize == 3) +#endif + { + ivx::IVX_CHECK_STATUS(vxuMedian3x3(ctx, ia, ib)); + } +#ifdef VX_VERSION_1_1 + else + { + ivx::Matrix mtx; + if(ksize == 5) + mtx = ivx::Matrix::createFromPattern(ctx, VX_PATTERN_BOX, ksize, ksize); + else + { + vx_size supportedSize; + ivx::IVX_CHECK_STATUS(vxQueryContext(ctx, VX_CONTEXT_NONLINEAR_MAX_DIMENSION, &supportedSize, sizeof(supportedSize))); + if ((vx_size)ksize > supportedSize) + { + ctx.setImmediateBorder(prevBorder); + return false; + } + Mat mask(ksize, ksize, CV_8UC1, Scalar(255)); + mtx = ivx::Matrix::create(ctx, VX_TYPE_UINT8, ksize, ksize); + mtx.copyFrom(mask); + } + ivx::IVX_CHECK_STATUS(vxuNonLinearFilter(ctx, VX_NONLINEAR_FILTER_MEDIAN, ia, mtx, ib)); + } +#endif + ctx.setImmediateBorder(prevBorder); + } + catch (const ivx::RuntimeError & e) + { + VX_DbgThrow(e.what()); + } + catch (const ivx::WrapperError & e) + { + VX_DbgThrow(e.what()); + } + + return true; +} +#endif + +#ifdef HAVE_IPP +static bool ipp_medianFilter(Mat &src0, Mat &dst, int ksize) +{ + CV_INSTRUMENT_REGION_IPP(); + +#if IPP_VERSION_X100 < 201801 + // Degradations for big kernel + if(ksize > 7) + return false; +#endif + + { + int bufSize; + IppiSize dstRoiSize = ippiSize(dst.cols, dst.rows), maskSize = ippiSize(ksize, ksize); + IppDataType ippType = ippiGetDataType(src0.type()); + int channels = src0.channels(); + IppAutoBuffer buffer; + + if(src0.isSubmatrix()) + return false; + + Mat src; + if(dst.data != src0.data) + src = src0; + else + src0.copyTo(src); + + if(ippiFilterMedianBorderGetBufferSize(dstRoiSize, maskSize, ippType, channels, &bufSize) < 0) + return false; + + buffer.allocate(bufSize); + + switch(ippType) + { + case ipp8u: + if(channels == 1) + return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_8u_C1R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; + else if(channels == 3) + return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_8u_C3R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; + else if(channels == 4) + return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_8u_C4R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; + else + return false; + case ipp16u: + if(channels == 1) + return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_16u_C1R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; + else if(channels == 3) + return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_16u_C3R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; + else if(channels == 4) + return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_16u_C4R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; + else + return false; + case ipp16s: + if(channels == 1) + return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_16s_C1R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; + else if(channels == 3) + return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_16s_C3R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; + else if(channels == 4) + return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_16s_C4R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; + else + return false; + case ipp32f: + if(channels == 1) + return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_32f_C1R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; + else + return false; + default: + return false; + } + } +} +#endif + +void medianBlur( InputArray _src0, OutputArray _dst, int ksize ) +{ + CV_INSTRUMENT_REGION(); + + CV_Assert( (ksize % 2 == 1) && (_src0.dims() <= 2 )); + + if( ksize <= 1 || _src0.empty() ) + { + _src0.copyTo(_dst); + return; + } + + CV_OCL_RUN(_dst.isUMat(), + ocl_medianFilter(_src0,_dst, ksize)) + + Mat src0 = _src0.getMat(); + _dst.create( src0.size(), src0.type() ); + Mat dst = _dst.getMat(); + + CALL_HAL(medianBlur, cv_hal_medianBlur, src0.data, src0.step, dst.data, dst.step, src0.cols, src0.rows, src0.depth(), + src0.channels(), ksize); + + CV_OVX_RUN(true, + openvx_medianFilter(_src0, _dst, ksize)) + + CV_IPP_RUN_FAST(ipp_medianFilter(src0, dst, ksize)); + +#ifdef HAVE_TEGRA_OPTIMIZATION + if (tegra::useTegra() && tegra::medianBlur(src0, dst, ksize)) + return; +#endif + + CV_CPU_DISPATCH(medianBlur, (src0, dst, ksize), + CV_CPU_DISPATCH_MODES_ALL); +} + +} // namespace + +/* End of file. */ diff --git a/modules/imgproc/src/median_blur.cpp b/modules/imgproc/src/median_blur.simd.hpp similarity index 80% rename from modules/imgproc/src/median_blur.cpp rename to modules/imgproc/src/median_blur.simd.hpp index c98cd9215a..c3203f2a07 100644 --- a/modules/imgproc/src/median_blur.cpp +++ b/modules/imgproc/src/median_blur.simd.hpp @@ -46,9 +46,11 @@ #include #include "opencv2/core/hal/intrin.hpp" -#include "opencl_kernels_imgproc.hpp" -#include "opencv2/core/openvx/ovx_defs.hpp" +#ifdef _MSC_VER +#pragma warning(disable: 4244) // warning C4244: 'argument': conversion from 'int' to 'ushort', possible loss of data + // triggered on intrinsic code from medianBlur_8u_O1() +#endif /* * This file includes the code, contributed by Simon Perreault @@ -71,12 +73,18 @@ Median Filter \****************************************************************************************/ -namespace cv -{ +namespace cv { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +// forward declarations +void medianBlur(const Mat& src0, /*const*/ Mat& dst, int ksize); + +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY static void medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) { + CV_INSTRUMENT_REGION(); + typedef ushort HT; /** @@ -330,9 +338,6 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) } } } -#if CV_SIMD - vx_cleanup(); -#endif } #undef HOP @@ -342,6 +347,8 @@ medianBlur_8u_O1( const Mat& _src, Mat& _dst, int ksize ) static void medianBlur_8u_Om( const Mat& _src, Mat& _dst, int m ) { + CV_INSTRUMENT_REGION(); + #define N 16 int zone0[4][N]; int zone1[4][N*N]; @@ -671,6 +678,8 @@ template static void medianBlur_SortNet( const Mat& _src, Mat& _dst, int m ) { + CV_INSTRUMENT_REGION(); + typedef typename Op::value_type T; typedef typename Op::arg_type WT; typedef typename VecOp::arg_type VT; @@ -770,9 +779,6 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m ) limit = size.width; } } -#if CV_SIMD - vx_cleanup(); -#endif } else if( m == 5 ) { @@ -934,266 +940,15 @@ medianBlur_SortNet( const Mat& _src, Mat& _dst, int m ) limit = size.width; } } -#if CV_SIMD - vx_cleanup(); -#endif - } -} - -#ifdef HAVE_OPENCL - -#define DIVUP(total, grain) ((total + grain - 1) / (grain)) - -static bool ocl_medianFilter(InputArray _src, OutputArray _dst, int m) -{ - size_t localsize[2] = { 16, 16 }; - size_t globalsize[2]; - int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - - if ( !((depth == CV_8U || depth == CV_16U || depth == CV_16S || depth == CV_32F) && cn <= 4 && (m == 3 || m == 5)) ) - return false; - - Size imgSize = _src.size(); - bool useOptimized = (1 == cn) && - (size_t)imgSize.width >= localsize[0] * 8 && - (size_t)imgSize.height >= localsize[1] * 8 && - imgSize.width % 4 == 0 && - imgSize.height % 4 == 0 && - (ocl::Device::getDefault().isIntel()); - - cv::String kname = format( useOptimized ? "medianFilter%d_u" : "medianFilter%d", m) ; - cv::String kdefs = useOptimized ? - format("-D T=%s -D T1=%s -D T4=%s%d -D cn=%d -D USE_4OPT", ocl::typeToStr(type), - ocl::typeToStr(depth), ocl::typeToStr(depth), cn*4, cn) - : - format("-D T=%s -D T1=%s -D cn=%d", ocl::typeToStr(type), ocl::typeToStr(depth), cn) ; - - ocl::Kernel k(kname.c_str(), ocl::imgproc::medianFilter_oclsrc, kdefs.c_str() ); - - if (k.empty()) - return false; - - UMat src = _src.getUMat(); - _dst.create(src.size(), type); - UMat dst = _dst.getUMat(); - - k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst)); - - if( useOptimized ) - { - globalsize[0] = DIVUP(src.cols / 4, localsize[0]) * localsize[0]; - globalsize[1] = DIVUP(src.rows / 4, localsize[1]) * localsize[1]; - } - else - { - globalsize[0] = (src.cols + localsize[0] + 2) / localsize[0] * localsize[0]; - globalsize[1] = (src.rows + localsize[1] - 1) / localsize[1] * localsize[1]; - } - - return k.run(2, globalsize, localsize, false); -} - -#undef DIVUP - -#endif - -#ifdef HAVE_OPENVX -namespace ovx { - template <> inline bool skipSmallImages(int w, int h) { return w*h < 1280 * 720; } -} -static bool openvx_medianFilter(InputArray _src, OutputArray _dst, int ksize) -{ - if (_src.type() != CV_8UC1 || _dst.type() != CV_8U -#ifndef VX_VERSION_1_1 - || ksize != 3 -#endif - ) - return false; - - Mat src = _src.getMat(); - Mat dst = _dst.getMat(); - - if ( -#ifdef VX_VERSION_1_1 - ksize != 3 ? ovx::skipSmallImages(src.cols, src.rows) : -#endif - ovx::skipSmallImages(src.cols, src.rows) - ) - return false; - - try - { - ivx::Context ctx = ovx::getOpenVXContext(); -#ifdef VX_VERSION_1_1 - if ((vx_size)ksize > ctx.nonlinearMaxDimension()) - return false; -#endif - - Mat a; - if (dst.data != src.data) - a = src; - else - src.copyTo(a); - - ivx::Image - ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, - ivx::Image::createAddressing(a.cols, a.rows, 1, (vx_int32)(a.step)), a.data), - ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, - ivx::Image::createAddressing(dst.cols, dst.rows, 1, (vx_int32)(dst.step)), dst.data); - - //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments - //since OpenVX standard says nothing about thread-safety for now - ivx::border_t prevBorder = ctx.immediateBorder(); - ctx.setImmediateBorder(VX_BORDER_REPLICATE); -#ifdef VX_VERSION_1_1 - if (ksize == 3) -#endif - { - ivx::IVX_CHECK_STATUS(vxuMedian3x3(ctx, ia, ib)); - } -#ifdef VX_VERSION_1_1 - else - { - ivx::Matrix mtx; - if(ksize == 5) - mtx = ivx::Matrix::createFromPattern(ctx, VX_PATTERN_BOX, ksize, ksize); - else - { - vx_size supportedSize; - ivx::IVX_CHECK_STATUS(vxQueryContext(ctx, VX_CONTEXT_NONLINEAR_MAX_DIMENSION, &supportedSize, sizeof(supportedSize))); - if ((vx_size)ksize > supportedSize) - { - ctx.setImmediateBorder(prevBorder); - return false; - } - Mat mask(ksize, ksize, CV_8UC1, Scalar(255)); - mtx = ivx::Matrix::create(ctx, VX_TYPE_UINT8, ksize, ksize); - mtx.copyFrom(mask); - } - ivx::IVX_CHECK_STATUS(vxuNonLinearFilter(ctx, VX_NONLINEAR_FILTER_MEDIAN, ia, mtx, ib)); - } -#endif - ctx.setImmediateBorder(prevBorder); - } - catch (const ivx::RuntimeError & e) - { - VX_DbgThrow(e.what()); } - catch (const ivx::WrapperError & e) - { - VX_DbgThrow(e.what()); - } - - return true; } -#endif - -#ifdef HAVE_IPP -static bool ipp_medianFilter(Mat &src0, Mat &dst, int ksize) -{ - CV_INSTRUMENT_REGION_IPP(); - -#if IPP_VERSION_X100 < 201801 - // Degradations for big kernel - if(ksize > 7) - return false; -#endif - - { - int bufSize; - IppiSize dstRoiSize = ippiSize(dst.cols, dst.rows), maskSize = ippiSize(ksize, ksize); - IppDataType ippType = ippiGetDataType(src0.type()); - int channels = src0.channels(); - IppAutoBuffer buffer; - - if(src0.isSubmatrix()) - return false; - - Mat src; - if(dst.data != src0.data) - src = src0; - else - src0.copyTo(src); - - if(ippiFilterMedianBorderGetBufferSize(dstRoiSize, maskSize, ippType, channels, &bufSize) < 0) - return false; - buffer.allocate(bufSize); +} // namespace anon - switch(ippType) - { - case ipp8u: - if(channels == 1) - return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_8u_C1R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; - else if(channels == 3) - return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_8u_C3R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; - else if(channels == 4) - return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_8u_C4R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; - else - return false; - case ipp16u: - if(channels == 1) - return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_16u_C1R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; - else if(channels == 3) - return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_16u_C3R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; - else if(channels == 4) - return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_16u_C4R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; - else - return false; - case ipp16s: - if(channels == 1) - return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_16s_C1R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; - else if(channels == 3) - return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_16s_C3R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; - else if(channels == 4) - return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_16s_C4R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; - else - return false; - case ipp32f: - if(channels == 1) - return CV_INSTRUMENT_FUN_IPP(ippiFilterMedianBorder_32f_C1R, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, dstRoiSize, maskSize, ippBorderRepl, 0, buffer) >= 0; - else - return false; - default: - return false; - } - } -} -#endif -} - -void medianBlur( InputArray _src0, OutputArray _dst, int ksize ) +void medianBlur(const Mat& src0, /*const*/ Mat& dst, int ksize) { CV_INSTRUMENT_REGION(); - CV_Assert( (ksize % 2 == 1) && (_src0.dims() <= 2 )); - - if( ksize <= 1 || _src0.empty() ) - { - _src0.copyTo(_dst); - return; - } - - CV_OCL_RUN(_dst.isUMat(), - ocl_medianFilter(_src0,_dst, ksize)) - - Mat src0 = _src0.getMat(); - _dst.create( src0.size(), src0.type() ); - Mat dst = _dst.getMat(); - - CALL_HAL(medianBlur, cv_hal_medianBlur, src0.data, src0.step, dst.data, dst.step, src0.cols, src0.rows, src0.depth(), - src0.channels(), ksize); - - CV_OVX_RUN(true, - openvx_medianFilter(_src0, _dst, ksize)) - - CV_IPP_RUN_FAST(ipp_medianFilter(src0, dst, ksize)); - -#ifdef HAVE_TEGRA_OPTIMIZATION - if (tegra::useTegra() && tegra::medianBlur(src0, dst, ksize)) - return; -#endif - bool useSortNet = ksize == 3 || (ksize == 5 #if !(CV_SIMD) && ( src0.depth() > CV_8U || src0.channels() == 2 || src0.channels() > 4 ) @@ -1223,6 +978,7 @@ void medianBlur( InputArray _src0, OutputArray _dst, int ksize ) } else { + // TODO AVX guard (external call) cv::copyMakeBorder( src0, src, 0, 0, ksize/2, ksize/2, BORDER_REPLICATE|BORDER_ISOLATED); int cn = src0.channels(); @@ -1237,6 +993,6 @@ void medianBlur( InputArray _src0, OutputArray _dst, int ksize ) } } -} - -/* End of file. */ +#endif +CV_CPU_OPTIMIZATION_NAMESPACE_END +} // namespace diff --git a/modules/imgproc/src/morph.cpp b/modules/imgproc/src/morph.dispatch.cpp similarity index 64% rename from modules/imgproc/src/morph.cpp rename to modules/imgproc/src/morph.dispatch.cpp index c18e5c8066..326bc66593 100644 --- a/modules/imgproc/src/morph.cpp +++ b/modules/imgproc/src/morph.dispatch.cpp @@ -48,779 +48,49 @@ #include "opencv2/core/hal/intrin.hpp" #include +#include "morph.simd.hpp" +#include "morph.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content + + /****************************************************************************************\ Basic Morphological Operations: Erosion & Dilation \****************************************************************************************/ -using namespace std; - -namespace cv -{ - -template struct MinOp -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator ()(const T a, const T b) const { return std::min(a, b); } -}; - -template struct MaxOp -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator ()(const T a, const T b) const { return std::max(a, b); } -}; - -#undef CV_MIN_8U -#undef CV_MAX_8U -#define CV_MIN_8U(a,b) ((a) - CV_FAST_CAST_8U((a) - (b))) -#define CV_MAX_8U(a,b) ((a) + CV_FAST_CAST_8U((b) - (a))) - -template<> inline uchar MinOp::operator ()(const uchar a, const uchar b) const { return CV_MIN_8U(a, b); } -template<> inline uchar MaxOp::operator ()(const uchar a, const uchar b) const { return CV_MAX_8U(a, b); } - -struct MorphRowNoVec -{ - MorphRowNoVec(int, int) {} - int operator()(const uchar*, uchar*, int, int) const { return 0; } -}; - -struct MorphColumnNoVec -{ - MorphColumnNoVec(int, int) {} - int operator()(const uchar**, uchar*, int, int, int) const { return 0; } -}; - -struct MorphNoVec -{ - int operator()(uchar**, int, uchar*, int) const { return 0; } -}; - -#if CV_SIMD - -template struct MorphRowVec -{ - typedef typename VecUpdate::vtype vtype; - typedef typename vtype::lane_type stype; - MorphRowVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {} - int operator()(const uchar* src, uchar* dst, int width, int cn) const - { - int i, k, _ksize = ksize*cn; - width *= cn; - VecUpdate updateOp; - - for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes ) - { - vtype s0 = vx_load((const stype*)src + i); - vtype s1 = vx_load((const stype*)src + i + vtype::nlanes); - vtype s2 = vx_load((const stype*)src + i + 2*vtype::nlanes); - vtype s3 = vx_load((const stype*)src + i + 3*vtype::nlanes); - for (k = cn; k < _ksize; k += cn) - { - s0 = updateOp(s0, vx_load((const stype*)src + i + k)); - s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes)); - s2 = updateOp(s2, vx_load((const stype*)src + i + k + 2*vtype::nlanes)); - s3 = updateOp(s3, vx_load((const stype*)src + i + k + 3*vtype::nlanes)); - } - v_store((stype*)dst + i, s0); - v_store((stype*)dst + i + vtype::nlanes, s1); - v_store((stype*)dst + i + 2*vtype::nlanes, s2); - v_store((stype*)dst + i + 3*vtype::nlanes, s3); - } - if( i <= width - 2*vtype::nlanes ) - { - vtype s0 = vx_load((const stype*)src + i); - vtype s1 = vx_load((const stype*)src + i + vtype::nlanes); - for( k = cn; k < _ksize; k += cn ) - { - s0 = updateOp(s0, vx_load((const stype*)src + i + k)); - s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes)); - } - v_store((stype*)dst + i, s0); - v_store((stype*)dst + i + vtype::nlanes, s1); - i += 2*vtype::nlanes; - } - if( i <= width - vtype::nlanes ) - { - vtype s = vx_load((const stype*)src + i); - for( k = cn; k < _ksize; k += cn ) - s = updateOp(s, vx_load((const stype*)src + i + k)); - v_store((stype*)dst + i, s); - i += vtype::nlanes; - } - if( i <= width - vtype::nlanes/2 ) - { - vtype s = vx_load_low((const stype*)src + i); - for( k = cn; k < _ksize; k += cn ) - s = updateOp(s, vx_load_low((const stype*)src + i + k)); - v_store_low((stype*)dst + i, s); - i += vtype::nlanes/2; - } - - return i - i % cn; - } - - int ksize, anchor; -}; - - -template struct MorphColumnVec -{ - typedef typename VecUpdate::vtype vtype; - typedef typename vtype::lane_type stype; - MorphColumnVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {} - int operator()(const uchar** _src, uchar* _dst, int dststep, int count, int width) const - { - int i = 0, k, _ksize = ksize; - VecUpdate updateOp; - - for( i = 0; i < count + ksize - 1; i++ ) - CV_Assert( ((size_t)_src[i] & (CV_SIMD_WIDTH-1)) == 0 ); - - const stype** src = (const stype**)_src; - stype* dst = (stype*)_dst; - dststep /= sizeof(dst[0]); - - for( ; _ksize > 1 && count > 1; count -= 2, dst += dststep*2, src += 2 ) - { - for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes) - { - const stype* sptr = src[1] + i; - vtype s0 = vx_load_aligned(sptr); - vtype s1 = vx_load_aligned(sptr + vtype::nlanes); - vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes); - vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes); - - for( k = 2; k < _ksize; k++ ) - { - sptr = src[k] + i; - s0 = updateOp(s0, vx_load_aligned(sptr)); - s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)); - s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)); - s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)); - } - - sptr = src[0] + i; - v_store(dst + i, updateOp(s0, vx_load_aligned(sptr))); - v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes))); - v_store(dst + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes))); - v_store(dst + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes))); - - sptr = src[k] + i; - v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr))); - v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes))); - v_store(dst + dststep + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes))); - v_store(dst + dststep + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes))); - } - if( i <= width - 2*vtype::nlanes ) - { - const stype* sptr = src[1] + i; - vtype s0 = vx_load_aligned(sptr); - vtype s1 = vx_load_aligned(sptr + vtype::nlanes); - - for( k = 2; k < _ksize; k++ ) - { - sptr = src[k] + i; - s0 = updateOp(s0, vx_load_aligned(sptr)); - s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)); - } - - sptr = src[0] + i; - v_store(dst + i, updateOp(s0, vx_load_aligned(sptr))); - v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes))); - - sptr = src[k] + i; - v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr))); - v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes))); - i += 2*vtype::nlanes; - } - if( i <= width - vtype::nlanes ) - { - vtype s0 = vx_load_aligned(src[1] + i); - - for( k = 2; k < _ksize; k++ ) - s0 = updateOp(s0, vx_load_aligned(src[k] + i)); - - v_store(dst + i, updateOp(s0, vx_load_aligned(src[0] + i))); - v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(src[k] + i))); - i += vtype::nlanes; - } - if( i <= width - vtype::nlanes/2 ) - { - vtype s0 = vx_load_low(src[1] + i); - - for( k = 2; k < _ksize; k++ ) - s0 = updateOp(s0, vx_load_low(src[k] + i)); - - v_store_low(dst + i, updateOp(s0, vx_load_low(src[0] + i))); - v_store_low(dst + dststep + i, updateOp(s0, vx_load_low(src[k] + i))); - i += vtype::nlanes/2; - } - } - - for( ; count > 0; count--, dst += dststep, src++ ) - { - for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes) - { - const stype* sptr = src[0] + i; - vtype s0 = vx_load_aligned(sptr); - vtype s1 = vx_load_aligned(sptr + vtype::nlanes); - vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes); - vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes); - - for( k = 1; k < _ksize; k++ ) - { - sptr = src[k] + i; - s0 = updateOp(s0, vx_load_aligned(sptr)); - s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)); - s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)); - s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)); - } - v_store(dst + i, s0); - v_store(dst + i + vtype::nlanes, s1); - v_store(dst + i + 2*vtype::nlanes, s2); - v_store(dst + i + 3*vtype::nlanes, s3); - } - if( i <= width - 2*vtype::nlanes ) - { - const stype* sptr = src[0] + i; - vtype s0 = vx_load_aligned(sptr); - vtype s1 = vx_load_aligned(sptr + vtype::nlanes); - - for( k = 1; k < _ksize; k++ ) - { - sptr = src[k] + i; - s0 = updateOp(s0, vx_load_aligned(sptr)); - s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)); - } - v_store(dst + i, s0); - v_store(dst + i + vtype::nlanes, s1); - i += 2*vtype::nlanes; - } - if( i <= width - vtype::nlanes ) - { - vtype s0 = vx_load_aligned(src[0] + i); - - for( k = 1; k < _ksize; k++ ) - s0 = updateOp(s0, vx_load_aligned(src[k] + i)); - v_store(dst + i, s0); - i += vtype::nlanes; - } - if( i <= width - vtype::nlanes/2 ) - { - vtype s0 = vx_load_low(src[0] + i); - - for( k = 1; k < _ksize; k++ ) - s0 = updateOp(s0, vx_load_low(src[k] + i)); - v_store_low(dst + i, s0); - i += vtype::nlanes/2; - } - } - - return i; - } - - int ksize, anchor; -}; - - -template struct MorphVec -{ - typedef typename VecUpdate::vtype vtype; - typedef typename vtype::lane_type stype; - int operator()(uchar** _src, int nz, uchar* _dst, int width) const - { - const stype** src = (const stype**)_src; - stype* dst = (stype*)_dst; - int i, k; - VecUpdate updateOp; - - for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes ) - { - const stype* sptr = src[0] + i; - vtype s0 = vx_load(sptr); - vtype s1 = vx_load(sptr + vtype::nlanes); - vtype s2 = vx_load(sptr + 2*vtype::nlanes); - vtype s3 = vx_load(sptr + 3*vtype::nlanes); - for( k = 1; k < nz; k++ ) - { - sptr = src[k] + i; - s0 = updateOp(s0, vx_load(sptr)); - s1 = updateOp(s1, vx_load(sptr + vtype::nlanes)); - s2 = updateOp(s2, vx_load(sptr + 2*vtype::nlanes)); - s3 = updateOp(s3, vx_load(sptr + 3*vtype::nlanes)); - } - v_store(dst + i, s0); - v_store(dst + i + vtype::nlanes, s1); - v_store(dst + i + 2*vtype::nlanes, s2); - v_store(dst + i + 3*vtype::nlanes, s3); - } - if( i <= width - 2*vtype::nlanes ) - { - const stype* sptr = src[0] + i; - vtype s0 = vx_load(sptr); - vtype s1 = vx_load(sptr + vtype::nlanes); - for( k = 1; k < nz; k++ ) - { - sptr = src[k] + i; - s0 = updateOp(s0, vx_load(sptr)); - s1 = updateOp(s1, vx_load(sptr + vtype::nlanes)); - } - v_store(dst + i, s0); - v_store(dst + i + vtype::nlanes, s1); - i += 2*vtype::nlanes; - } - if( i <= width - vtype::nlanes ) - { - vtype s0 = vx_load(src[0] + i); - for( k = 1; k < nz; k++ ) - s0 = updateOp(s0, vx_load(src[k] + i)); - v_store(dst + i, s0); - i += vtype::nlanes; - } - if( i <= width - vtype::nlanes/2 ) - { - vtype s0 = vx_load_low(src[0] + i); - for( k = 1; k < nz; k++ ) - s0 = updateOp(s0, vx_load_low(src[k] + i)); - v_store_low(dst + i, s0); - i += vtype::nlanes/2; - } - return i; - } -}; - -template struct VMin -{ - typedef T vtype; - vtype operator()(const vtype& a, const vtype& b) const { return v_min(a,b); } -}; -template struct VMax -{ - typedef T vtype; - vtype operator()(const vtype& a, const vtype& b) const { return v_max(a,b); } -}; - -typedef MorphRowVec > ErodeRowVec8u; -typedef MorphRowVec > DilateRowVec8u; -typedef MorphRowVec > ErodeRowVec16u; -typedef MorphRowVec > DilateRowVec16u; -typedef MorphRowVec > ErodeRowVec16s; -typedef MorphRowVec > DilateRowVec16s; -typedef MorphRowVec > ErodeRowVec32f; -typedef MorphRowVec > DilateRowVec32f; - -typedef MorphColumnVec > ErodeColumnVec8u; -typedef MorphColumnVec > DilateColumnVec8u; -typedef MorphColumnVec > ErodeColumnVec16u; -typedef MorphColumnVec > DilateColumnVec16u; -typedef MorphColumnVec > ErodeColumnVec16s; -typedef MorphColumnVec > DilateColumnVec16s; -typedef MorphColumnVec > ErodeColumnVec32f; -typedef MorphColumnVec > DilateColumnVec32f; - -typedef MorphVec > ErodeVec8u; -typedef MorphVec > DilateVec8u; -typedef MorphVec > ErodeVec16u; -typedef MorphVec > DilateVec16u; -typedef MorphVec > ErodeVec16s; -typedef MorphVec > DilateVec16s; -typedef MorphVec > ErodeVec32f; -typedef MorphVec > DilateVec32f; - -#else - -typedef MorphRowNoVec ErodeRowVec8u; -typedef MorphRowNoVec DilateRowVec8u; - -typedef MorphColumnNoVec ErodeColumnVec8u; -typedef MorphColumnNoVec DilateColumnVec8u; - -typedef MorphRowNoVec ErodeRowVec16u; -typedef MorphRowNoVec DilateRowVec16u; -typedef MorphRowNoVec ErodeRowVec16s; -typedef MorphRowNoVec DilateRowVec16s; -typedef MorphRowNoVec ErodeRowVec32f; -typedef MorphRowNoVec DilateRowVec32f; - -typedef MorphColumnNoVec ErodeColumnVec16u; -typedef MorphColumnNoVec DilateColumnVec16u; -typedef MorphColumnNoVec ErodeColumnVec16s; -typedef MorphColumnNoVec DilateColumnVec16s; -typedef MorphColumnNoVec ErodeColumnVec32f; -typedef MorphColumnNoVec DilateColumnVec32f; - -typedef MorphNoVec ErodeVec8u; -typedef MorphNoVec DilateVec8u; -typedef MorphNoVec ErodeVec16u; -typedef MorphNoVec DilateVec16u; -typedef MorphNoVec ErodeVec16s; -typedef MorphNoVec DilateVec16s; -typedef MorphNoVec ErodeVec32f; -typedef MorphNoVec DilateVec32f; - -#endif - -typedef MorphRowNoVec ErodeRowVec64f; -typedef MorphRowNoVec DilateRowVec64f; -typedef MorphColumnNoVec ErodeColumnVec64f; -typedef MorphColumnNoVec DilateColumnVec64f; -typedef MorphNoVec ErodeVec64f; -typedef MorphNoVec DilateVec64f; - - -template struct MorphRowFilter : public BaseRowFilter -{ - typedef typename Op::rtype T; - - MorphRowFilter( int _ksize, int _anchor ) : vecOp(_ksize, _anchor) - { - ksize = _ksize; - anchor = _anchor; - } - - void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE - { - int i, j, k, _ksize = ksize*cn; - const T* S = (const T*)src; - Op op; - T* D = (T*)dst; - - if( _ksize == cn ) - { - for( i = 0; i < width*cn; i++ ) - D[i] = S[i]; - return; - } - - int i0 = vecOp(src, dst, width, cn); - width *= cn; - - for( k = 0; k < cn; k++, S++, D++ ) - { - for( i = i0; i <= width - cn*2; i += cn*2 ) - { - const T* s = S + i; - T m = s[cn]; - for( j = cn*2; j < _ksize; j += cn ) - m = op(m, s[j]); - D[i] = op(m, s[0]); - D[i+cn] = op(m, s[j]); - } - - for( ; i < width; i += cn ) - { - const T* s = S + i; - T m = s[0]; - for( j = cn; j < _ksize; j += cn ) - m = op(m, s[j]); - D[i] = m; - } - } - } - - VecOp vecOp; -}; - - -template struct MorphColumnFilter : public BaseColumnFilter -{ - typedef typename Op::rtype T; - - MorphColumnFilter( int _ksize, int _anchor ) : vecOp(_ksize, _anchor) - { - ksize = _ksize; - anchor = _anchor; - } - - void operator()(const uchar** _src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE - { - int i, k, _ksize = ksize; - const T** src = (const T**)_src; - T* D = (T*)dst; - Op op; - - int i0 = vecOp(_src, dst, dststep, count, width); - dststep /= sizeof(D[0]); - - for( ; _ksize > 1 && count > 1; count -= 2, D += dststep*2, src += 2 ) - { - i = i0; - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - const T* sptr = src[1] + i; - T s0 = sptr[0], s1 = sptr[1], s2 = sptr[2], s3 = sptr[3]; - - for( k = 2; k < _ksize; k++ ) - { - sptr = src[k] + i; - s0 = op(s0, sptr[0]); s1 = op(s1, sptr[1]); - s2 = op(s2, sptr[2]); s3 = op(s3, sptr[3]); - } - - sptr = src[0] + i; - D[i] = op(s0, sptr[0]); - D[i+1] = op(s1, sptr[1]); - D[i+2] = op(s2, sptr[2]); - D[i+3] = op(s3, sptr[3]); - - sptr = src[k] + i; - D[i+dststep] = op(s0, sptr[0]); - D[i+dststep+1] = op(s1, sptr[1]); - D[i+dststep+2] = op(s2, sptr[2]); - D[i+dststep+3] = op(s3, sptr[3]); - } - #endif - for( ; i < width; i++ ) - { - T s0 = src[1][i]; - - for( k = 2; k < _ksize; k++ ) - s0 = op(s0, src[k][i]); - - D[i] = op(s0, src[0][i]); - D[i+dststep] = op(s0, src[k][i]); - } - } - - for( ; count > 0; count--, D += dststep, src++ ) - { - i = i0; - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - const T* sptr = src[0] + i; - T s0 = sptr[0], s1 = sptr[1], s2 = sptr[2], s3 = sptr[3]; - - for( k = 1; k < _ksize; k++ ) - { - sptr = src[k] + i; - s0 = op(s0, sptr[0]); s1 = op(s1, sptr[1]); - s2 = op(s2, sptr[2]); s3 = op(s3, sptr[3]); - } - - D[i] = s0; D[i+1] = s1; - D[i+2] = s2; D[i+3] = s3; - } - #endif - for( ; i < width; i++ ) - { - T s0 = src[0][i]; - for( k = 1; k < _ksize; k++ ) - s0 = op(s0, src[k][i]); - D[i] = s0; - } - } - } - - VecOp vecOp; -}; - - -template struct MorphFilter : BaseFilter -{ - typedef typename Op::rtype T; - - MorphFilter( const Mat& _kernel, Point _anchor ) - { - anchor = _anchor; - ksize = _kernel.size(); - CV_Assert( _kernel.type() == CV_8U ); - - std::vector coeffs; // we do not really the values of non-zero - // kernel elements, just their locations - preprocess2DKernel( _kernel, coords, coeffs ); - ptrs.resize( coords.size() ); - } - - void operator()(const uchar** src, uchar* dst, int dststep, int count, int width, int cn) CV_OVERRIDE - { - const Point* pt = &coords[0]; - const T** kp = (const T**)&ptrs[0]; - int i, k, nz = (int)coords.size(); - Op op; - - width *= cn; - for( ; count > 0; count--, dst += dststep, src++ ) - { - T* D = (T*)dst; - - for( k = 0; k < nz; k++ ) - kp[k] = (const T*)src[pt[k].y] + pt[k].x*cn; - - i = vecOp(&ptrs[0], nz, dst, width); - #if CV_ENABLE_UNROLLED - for( ; i <= width - 4; i += 4 ) - { - const T* sptr = kp[0] + i; - T s0 = sptr[0], s1 = sptr[1], s2 = sptr[2], s3 = sptr[3]; - - for( k = 1; k < nz; k++ ) - { - sptr = kp[k] + i; - s0 = op(s0, sptr[0]); s1 = op(s1, sptr[1]); - s2 = op(s2, sptr[2]); s3 = op(s3, sptr[3]); - } - - D[i] = s0; D[i+1] = s1; - D[i+2] = s2; D[i+3] = s3; - } - #endif - for( ; i < width; i++ ) - { - T s0 = kp[0][i]; - for( k = 1; k < nz; k++ ) - s0 = op(s0, kp[k][i]); - D[i] = s0; - } - } - } - - std::vector coords; - std::vector ptrs; - VecOp vecOp; -}; - -} +namespace cv { /////////////////////////////////// External Interface ///////////////////////////////////// -cv::Ptr cv::getMorphologyRowFilter(int op, int type, int ksize, int anchor) +Ptr getMorphologyRowFilter(int op, int type, int ksize, int anchor) { - int depth = CV_MAT_DEPTH(type); - if( anchor < 0 ) - anchor = ksize/2; - CV_Assert( op == MORPH_ERODE || op == MORPH_DILATE ); - if( op == MORPH_ERODE ) - { - if( depth == CV_8U ) - return makePtr, - ErodeRowVec8u> >(ksize, anchor); - if( depth == CV_16U ) - return makePtr, - ErodeRowVec16u> >(ksize, anchor); - if( depth == CV_16S ) - return makePtr, - ErodeRowVec16s> >(ksize, anchor); - if( depth == CV_32F ) - return makePtr, - ErodeRowVec32f> >(ksize, anchor); - if( depth == CV_64F ) - return makePtr, - ErodeRowVec64f> >(ksize, anchor); - } - else - { - if( depth == CV_8U ) - return makePtr, - DilateRowVec8u> >(ksize, anchor); - if( depth == CV_16U ) - return makePtr, - DilateRowVec16u> >(ksize, anchor); - if( depth == CV_16S ) - return makePtr, - DilateRowVec16s> >(ksize, anchor); - if( depth == CV_32F ) - return makePtr, - DilateRowVec32f> >(ksize, anchor); - if( depth == CV_64F ) - return makePtr, - DilateRowVec64f> >(ksize, anchor); - } + CV_INSTRUMENT_REGION(); - CV_Error_( CV_StsNotImplemented, ("Unsupported data type (=%d)", type)); + CV_CPU_DISPATCH(getMorphologyRowFilter, (op, type, ksize, anchor), + CV_CPU_DISPATCH_MODES_ALL); } -cv::Ptr cv::getMorphologyColumnFilter(int op, int type, int ksize, int anchor) +Ptr getMorphologyColumnFilter(int op, int type, int ksize, int anchor) { - int depth = CV_MAT_DEPTH(type); - if( anchor < 0 ) - anchor = ksize/2; - CV_Assert( op == MORPH_ERODE || op == MORPH_DILATE ); - if( op == MORPH_ERODE ) - { - if( depth == CV_8U ) - return makePtr, - ErodeColumnVec8u> >(ksize, anchor); - if( depth == CV_16U ) - return makePtr, - ErodeColumnVec16u> >(ksize, anchor); - if( depth == CV_16S ) - return makePtr, - ErodeColumnVec16s> >(ksize, anchor); - if( depth == CV_32F ) - return makePtr, - ErodeColumnVec32f> >(ksize, anchor); - if( depth == CV_64F ) - return makePtr, - ErodeColumnVec64f> >(ksize, anchor); - } - else - { - if( depth == CV_8U ) - return makePtr, - DilateColumnVec8u> >(ksize, anchor); - if( depth == CV_16U ) - return makePtr, - DilateColumnVec16u> >(ksize, anchor); - if( depth == CV_16S ) - return makePtr, - DilateColumnVec16s> >(ksize, anchor); - if( depth == CV_32F ) - return makePtr, - DilateColumnVec32f> >(ksize, anchor); - if( depth == CV_64F ) - return makePtr, - DilateColumnVec64f> >(ksize, anchor); - } + CV_INSTRUMENT_REGION(); - CV_Error_( CV_StsNotImplemented, ("Unsupported data type (=%d)", type)); + CV_CPU_DISPATCH(getMorphologyColumnFilter, (op, type, ksize, anchor), + CV_CPU_DISPATCH_MODES_ALL); } -cv::Ptr cv::getMorphologyFilter(int op, int type, InputArray _kernel, Point anchor) +Ptr getMorphologyFilter(int op, int type, InputArray _kernel, Point anchor) { - Mat kernel = _kernel.getMat(); - int depth = CV_MAT_DEPTH(type); - anchor = normalizeAnchor(anchor, kernel.size()); - CV_Assert( op == MORPH_ERODE || op == MORPH_DILATE ); - if( op == MORPH_ERODE ) - { - if( depth == CV_8U ) - return makePtr, ErodeVec8u> >(kernel, anchor); - if( depth == CV_16U ) - return makePtr, ErodeVec16u> >(kernel, anchor); - if( depth == CV_16S ) - return makePtr, ErodeVec16s> >(kernel, anchor); - if( depth == CV_32F ) - return makePtr, ErodeVec32f> >(kernel, anchor); - if( depth == CV_64F ) - return makePtr, ErodeVec64f> >(kernel, anchor); - } - else - { - if( depth == CV_8U ) - return makePtr, DilateVec8u> >(kernel, anchor); - if( depth == CV_16U ) - return makePtr, DilateVec16u> >(kernel, anchor); - if( depth == CV_16S ) - return makePtr, DilateVec16s> >(kernel, anchor); - if( depth == CV_32F ) - return makePtr, DilateVec32f> >(kernel, anchor); - if( depth == CV_64F ) - return makePtr, DilateVec64f> >(kernel, anchor); - } + CV_INSTRUMENT_REGION(); - CV_Error_( CV_StsNotImplemented, ("Unsupported data type (=%d)", type)); + Mat kernel = _kernel.getMat(); + CV_CPU_DISPATCH(getMorphologyFilter, (op, type, kernel, anchor), + CV_CPU_DISPATCH_MODES_ALL); } -cv::Ptr cv::createMorphologyFilter( int op, int type, InputArray _kernel, - Point anchor, int _rowBorderType, int _columnBorderType, - const Scalar& _borderValue ) +Ptr createMorphologyFilter( + int op, int type, InputArray _kernel, + Point anchor, int _rowBorderType, int _columnBorderType, + const Scalar& _borderValue) { Mat kernel = _kernel.getMat(); anchor = normalizeAnchor(anchor, kernel.size()); @@ -862,7 +132,7 @@ cv::Ptr cv::createMorphologyFilter( int op, int type, InputArr } -cv::Mat cv::getStructuringElement(int shape, Size ksize, Point anchor) +Mat getStructuringElement(int shape, Size ksize, Point anchor) { int i, j; int r = 0, c = 0; @@ -915,9 +185,6 @@ cv::Mat cv::getStructuringElement(int shape, Size ksize, Point anchor) return elem; } -namespace cv -{ - // ===== 1. replacement implementation static bool halMorph(int op, int src_type, int dst_type, @@ -1732,9 +999,7 @@ static void morphOp( int op, InputArray _src, OutputArray _dst, (src.isSubmatrix() && !isolated)); } -} - -void cv::erode( InputArray src, OutputArray dst, InputArray kernel, +void erode( InputArray src, OutputArray dst, InputArray kernel, Point anchor, int iterations, int borderType, const Scalar& borderValue ) { @@ -1744,7 +1009,7 @@ void cv::erode( InputArray src, OutputArray dst, InputArray kernel, } -void cv::dilate( InputArray src, OutputArray dst, InputArray kernel, +void dilate( InputArray src, OutputArray dst, InputArray kernel, Point anchor, int iterations, int borderType, const Scalar& borderValue ) { @@ -1755,8 +1020,6 @@ void cv::dilate( InputArray src, OutputArray dst, InputArray kernel, #ifdef HAVE_OPENCL -namespace cv { - static bool ocl_morphologyEx(InputArray _src, OutputArray _dst, int op, InputArray kernel, Point anchor, int iterations, int borderType, const Scalar& borderValue) @@ -1813,13 +1076,11 @@ static bool ocl_morphologyEx(InputArray _src, OutputArray _dst, int op, return true; } -} #endif #define IPP_DISABLE_MORPH_ADV 1 #ifdef HAVE_IPP #if !IPP_DISABLE_MORPH_ADV -namespace cv { static bool ipp_morphologyEx(int op, InputArray _src, OutputArray _dst, InputArray _kernel, Point anchor, int iterations, @@ -1884,11 +1145,10 @@ static bool ipp_morphologyEx(int op, InputArray _src, OutputArray _dst, return false; #endif } -} #endif #endif -void cv::morphologyEx( InputArray _src, OutputArray _dst, int op, +void morphologyEx( InputArray _src, OutputArray _dst, int op, InputArray _kernel, Point anchor, int iterations, int borderType, const Scalar& borderValue ) { @@ -1985,6 +1245,8 @@ void cv::morphologyEx( InputArray _src, OutputArray _dst, int op, } } +} // namespace cv + CV_IMPL IplConvKernel * cvCreateStructuringElementEx( int cols, int rows, int anchorX, int anchorY, diff --git a/modules/imgproc/src/morph.simd.hpp b/modules/imgproc/src/morph.simd.hpp new file mode 100644 index 0000000000..9b3023f8f0 --- /dev/null +++ b/modules/imgproc/src/morph.simd.hpp @@ -0,0 +1,846 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" +#include +#include "opencv2/core/hal/intrin.hpp" + +/****************************************************************************************\ + Basic Morphological Operations: Erosion & Dilation +\****************************************************************************************/ + +namespace cv { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +// forward declarations +Ptr getMorphologyRowFilter(int op, int type, int ksize, int anchor); +Ptr getMorphologyColumnFilter(int op, int type, int ksize, int anchor); +Ptr getMorphologyFilter(int op, int type, const Mat& kernel, Point anchor); + +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY + +namespace { +template struct MinOp +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator ()(const T a, const T b) const { return std::min(a, b); } +}; + +template struct MaxOp +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator ()(const T a, const T b) const { return std::max(a, b); } +}; + + +#if !defined(CV_SIMD) // min/max operation are usually fast enough (without using of control flow 'if' statements) + +#undef CV_MIN_8U +#undef CV_MAX_8U +#define CV_MIN_8U(a,b) ((a) - CV_FAST_CAST_8U((a) - (b))) +#define CV_MAX_8U(a,b) ((a) + CV_FAST_CAST_8U((b) - (a))) + +template<> inline uchar MinOp::operator ()(const uchar a, const uchar b) const { return CV_MIN_8U(a, b); } +template<> inline uchar MaxOp::operator ()(const uchar a, const uchar b) const { return CV_MAX_8U(a, b); } + +#endif + + + +struct MorphRowNoVec +{ + MorphRowNoVec(int, int) {} + int operator()(const uchar*, uchar*, int, int) const { return 0; } +}; + +struct MorphColumnNoVec +{ + MorphColumnNoVec(int, int) {} + int operator()(const uchar**, uchar*, int, int, int) const { return 0; } +}; + +struct MorphNoVec +{ + int operator()(uchar**, int, uchar*, int) const { return 0; } +}; + +#if CV_SIMD + +template struct MorphRowVec +{ + typedef typename VecUpdate::vtype vtype; + typedef typename vtype::lane_type stype; + MorphRowVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {} + int operator()(const uchar* src, uchar* dst, int width, int cn) const + { + CV_INSTRUMENT_REGION(); + + int i, k, _ksize = ksize*cn; + width *= cn; + VecUpdate updateOp; + + for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes ) + { + vtype s0 = vx_load((const stype*)src + i); + vtype s1 = vx_load((const stype*)src + i + vtype::nlanes); + vtype s2 = vx_load((const stype*)src + i + 2*vtype::nlanes); + vtype s3 = vx_load((const stype*)src + i + 3*vtype::nlanes); + for (k = cn; k < _ksize; k += cn) + { + s0 = updateOp(s0, vx_load((const stype*)src + i + k)); + s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes)); + s2 = updateOp(s2, vx_load((const stype*)src + i + k + 2*vtype::nlanes)); + s3 = updateOp(s3, vx_load((const stype*)src + i + k + 3*vtype::nlanes)); + } + v_store((stype*)dst + i, s0); + v_store((stype*)dst + i + vtype::nlanes, s1); + v_store((stype*)dst + i + 2*vtype::nlanes, s2); + v_store((stype*)dst + i + 3*vtype::nlanes, s3); + } + if( i <= width - 2*vtype::nlanes ) + { + vtype s0 = vx_load((const stype*)src + i); + vtype s1 = vx_load((const stype*)src + i + vtype::nlanes); + for( k = cn; k < _ksize; k += cn ) + { + s0 = updateOp(s0, vx_load((const stype*)src + i + k)); + s1 = updateOp(s1, vx_load((const stype*)src + i + k + vtype::nlanes)); + } + v_store((stype*)dst + i, s0); + v_store((stype*)dst + i + vtype::nlanes, s1); + i += 2*vtype::nlanes; + } + if( i <= width - vtype::nlanes ) + { + vtype s = vx_load((const stype*)src + i); + for( k = cn; k < _ksize; k += cn ) + s = updateOp(s, vx_load((const stype*)src + i + k)); + v_store((stype*)dst + i, s); + i += vtype::nlanes; + } + if( i <= width - vtype::nlanes/2 ) + { + vtype s = vx_load_low((const stype*)src + i); + for( k = cn; k < _ksize; k += cn ) + s = updateOp(s, vx_load_low((const stype*)src + i + k)); + v_store_low((stype*)dst + i, s); + i += vtype::nlanes/2; + } + + return i - i % cn; + } + + int ksize, anchor; +}; + + +template struct MorphColumnVec +{ + typedef typename VecUpdate::vtype vtype; + typedef typename vtype::lane_type stype; + MorphColumnVec(int _ksize, int _anchor) : ksize(_ksize), anchor(_anchor) {} + int operator()(const uchar** _src, uchar* _dst, int dststep, int count, int width) const + { + CV_INSTRUMENT_REGION(); + + int i = 0, k, _ksize = ksize; + VecUpdate updateOp; + + for( i = 0; i < count + ksize - 1; i++ ) + CV_Assert( ((size_t)_src[i] & (CV_SIMD_WIDTH-1)) == 0 ); + + const stype** src = (const stype**)_src; + stype* dst = (stype*)_dst; + dststep /= sizeof(dst[0]); + + for( ; _ksize > 1 && count > 1; count -= 2, dst += dststep*2, src += 2 ) + { + for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes) + { + const stype* sptr = src[1] + i; + vtype s0 = vx_load_aligned(sptr); + vtype s1 = vx_load_aligned(sptr + vtype::nlanes); + vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes); + vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes); + + for( k = 2; k < _ksize; k++ ) + { + sptr = src[k] + i; + s0 = updateOp(s0, vx_load_aligned(sptr)); + s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)); + s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)); + s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)); + } + + sptr = src[0] + i; + v_store(dst + i, updateOp(s0, vx_load_aligned(sptr))); + v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes))); + v_store(dst + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes))); + v_store(dst + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes))); + + sptr = src[k] + i; + v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr))); + v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes))); + v_store(dst + dststep + i + 2*vtype::nlanes, updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes))); + v_store(dst + dststep + i + 3*vtype::nlanes, updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes))); + } + if( i <= width - 2*vtype::nlanes ) + { + const stype* sptr = src[1] + i; + vtype s0 = vx_load_aligned(sptr); + vtype s1 = vx_load_aligned(sptr + vtype::nlanes); + + for( k = 2; k < _ksize; k++ ) + { + sptr = src[k] + i; + s0 = updateOp(s0, vx_load_aligned(sptr)); + s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)); + } + + sptr = src[0] + i; + v_store(dst + i, updateOp(s0, vx_load_aligned(sptr))); + v_store(dst + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes))); + + sptr = src[k] + i; + v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(sptr))); + v_store(dst + dststep + i + vtype::nlanes, updateOp(s1, vx_load_aligned(sptr + vtype::nlanes))); + i += 2*vtype::nlanes; + } + if( i <= width - vtype::nlanes ) + { + vtype s0 = vx_load_aligned(src[1] + i); + + for( k = 2; k < _ksize; k++ ) + s0 = updateOp(s0, vx_load_aligned(src[k] + i)); + + v_store(dst + i, updateOp(s0, vx_load_aligned(src[0] + i))); + v_store(dst + dststep + i, updateOp(s0, vx_load_aligned(src[k] + i))); + i += vtype::nlanes; + } + if( i <= width - vtype::nlanes/2 ) + { + vtype s0 = vx_load_low(src[1] + i); + + for( k = 2; k < _ksize; k++ ) + s0 = updateOp(s0, vx_load_low(src[k] + i)); + + v_store_low(dst + i, updateOp(s0, vx_load_low(src[0] + i))); + v_store_low(dst + dststep + i, updateOp(s0, vx_load_low(src[k] + i))); + i += vtype::nlanes/2; + } + } + + for( ; count > 0; count--, dst += dststep, src++ ) + { + for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes) + { + const stype* sptr = src[0] + i; + vtype s0 = vx_load_aligned(sptr); + vtype s1 = vx_load_aligned(sptr + vtype::nlanes); + vtype s2 = vx_load_aligned(sptr + 2*vtype::nlanes); + vtype s3 = vx_load_aligned(sptr + 3*vtype::nlanes); + + for( k = 1; k < _ksize; k++ ) + { + sptr = src[k] + i; + s0 = updateOp(s0, vx_load_aligned(sptr)); + s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)); + s2 = updateOp(s2, vx_load_aligned(sptr + 2*vtype::nlanes)); + s3 = updateOp(s3, vx_load_aligned(sptr + 3*vtype::nlanes)); + } + v_store(dst + i, s0); + v_store(dst + i + vtype::nlanes, s1); + v_store(dst + i + 2*vtype::nlanes, s2); + v_store(dst + i + 3*vtype::nlanes, s3); + } + if( i <= width - 2*vtype::nlanes ) + { + const stype* sptr = src[0] + i; + vtype s0 = vx_load_aligned(sptr); + vtype s1 = vx_load_aligned(sptr + vtype::nlanes); + + for( k = 1; k < _ksize; k++ ) + { + sptr = src[k] + i; + s0 = updateOp(s0, vx_load_aligned(sptr)); + s1 = updateOp(s1, vx_load_aligned(sptr + vtype::nlanes)); + } + v_store(dst + i, s0); + v_store(dst + i + vtype::nlanes, s1); + i += 2*vtype::nlanes; + } + if( i <= width - vtype::nlanes ) + { + vtype s0 = vx_load_aligned(src[0] + i); + + for( k = 1; k < _ksize; k++ ) + s0 = updateOp(s0, vx_load_aligned(src[k] + i)); + v_store(dst + i, s0); + i += vtype::nlanes; + } + if( i <= width - vtype::nlanes/2 ) + { + vtype s0 = vx_load_low(src[0] + i); + + for( k = 1; k < _ksize; k++ ) + s0 = updateOp(s0, vx_load_low(src[k] + i)); + v_store_low(dst + i, s0); + i += vtype::nlanes/2; + } + } + + return i; + } + + int ksize, anchor; +}; + + +template struct MorphVec +{ + typedef typename VecUpdate::vtype vtype; + typedef typename vtype::lane_type stype; + int operator()(uchar** _src, int nz, uchar* _dst, int width) const + { + CV_INSTRUMENT_REGION(); + + const stype** src = (const stype**)_src; + stype* dst = (stype*)_dst; + int i, k; + VecUpdate updateOp; + + for( i = 0; i <= width - 4*vtype::nlanes; i += 4*vtype::nlanes ) + { + const stype* sptr = src[0] + i; + vtype s0 = vx_load(sptr); + vtype s1 = vx_load(sptr + vtype::nlanes); + vtype s2 = vx_load(sptr + 2*vtype::nlanes); + vtype s3 = vx_load(sptr + 3*vtype::nlanes); + for( k = 1; k < nz; k++ ) + { + sptr = src[k] + i; + s0 = updateOp(s0, vx_load(sptr)); + s1 = updateOp(s1, vx_load(sptr + vtype::nlanes)); + s2 = updateOp(s2, vx_load(sptr + 2*vtype::nlanes)); + s3 = updateOp(s3, vx_load(sptr + 3*vtype::nlanes)); + } + v_store(dst + i, s0); + v_store(dst + i + vtype::nlanes, s1); + v_store(dst + i + 2*vtype::nlanes, s2); + v_store(dst + i + 3*vtype::nlanes, s3); + } + if( i <= width - 2*vtype::nlanes ) + { + const stype* sptr = src[0] + i; + vtype s0 = vx_load(sptr); + vtype s1 = vx_load(sptr + vtype::nlanes); + for( k = 1; k < nz; k++ ) + { + sptr = src[k] + i; + s0 = updateOp(s0, vx_load(sptr)); + s1 = updateOp(s1, vx_load(sptr + vtype::nlanes)); + } + v_store(dst + i, s0); + v_store(dst + i + vtype::nlanes, s1); + i += 2*vtype::nlanes; + } + if( i <= width - vtype::nlanes ) + { + vtype s0 = vx_load(src[0] + i); + for( k = 1; k < nz; k++ ) + s0 = updateOp(s0, vx_load(src[k] + i)); + v_store(dst + i, s0); + i += vtype::nlanes; + } + if( i <= width - vtype::nlanes/2 ) + { + vtype s0 = vx_load_low(src[0] + i); + for( k = 1; k < nz; k++ ) + s0 = updateOp(s0, vx_load_low(src[k] + i)); + v_store_low(dst + i, s0); + i += vtype::nlanes/2; + } + return i; + } +}; + +template struct VMin +{ + typedef T vtype; + vtype operator()(const vtype& a, const vtype& b) const { return v_min(a,b); } +}; +template struct VMax +{ + typedef T vtype; + vtype operator()(const vtype& a, const vtype& b) const { return v_max(a,b); } +}; + +typedef MorphRowVec > ErodeRowVec8u; +typedef MorphRowVec > DilateRowVec8u; +typedef MorphRowVec > ErodeRowVec16u; +typedef MorphRowVec > DilateRowVec16u; +typedef MorphRowVec > ErodeRowVec16s; +typedef MorphRowVec > DilateRowVec16s; +typedef MorphRowVec > ErodeRowVec32f; +typedef MorphRowVec > DilateRowVec32f; + +typedef MorphColumnVec > ErodeColumnVec8u; +typedef MorphColumnVec > DilateColumnVec8u; +typedef MorphColumnVec > ErodeColumnVec16u; +typedef MorphColumnVec > DilateColumnVec16u; +typedef MorphColumnVec > ErodeColumnVec16s; +typedef MorphColumnVec > DilateColumnVec16s; +typedef MorphColumnVec > ErodeColumnVec32f; +typedef MorphColumnVec > DilateColumnVec32f; + +typedef MorphVec > ErodeVec8u; +typedef MorphVec > DilateVec8u; +typedef MorphVec > ErodeVec16u; +typedef MorphVec > DilateVec16u; +typedef MorphVec > ErodeVec16s; +typedef MorphVec > DilateVec16s; +typedef MorphVec > ErodeVec32f; +typedef MorphVec > DilateVec32f; + +#else + +typedef MorphRowNoVec ErodeRowVec8u; +typedef MorphRowNoVec DilateRowVec8u; + +typedef MorphColumnNoVec ErodeColumnVec8u; +typedef MorphColumnNoVec DilateColumnVec8u; + +typedef MorphRowNoVec ErodeRowVec16u; +typedef MorphRowNoVec DilateRowVec16u; +typedef MorphRowNoVec ErodeRowVec16s; +typedef MorphRowNoVec DilateRowVec16s; +typedef MorphRowNoVec ErodeRowVec32f; +typedef MorphRowNoVec DilateRowVec32f; + +typedef MorphColumnNoVec ErodeColumnVec16u; +typedef MorphColumnNoVec DilateColumnVec16u; +typedef MorphColumnNoVec ErodeColumnVec16s; +typedef MorphColumnNoVec DilateColumnVec16s; +typedef MorphColumnNoVec ErodeColumnVec32f; +typedef MorphColumnNoVec DilateColumnVec32f; + +typedef MorphNoVec ErodeVec8u; +typedef MorphNoVec DilateVec8u; +typedef MorphNoVec ErodeVec16u; +typedef MorphNoVec DilateVec16u; +typedef MorphNoVec ErodeVec16s; +typedef MorphNoVec DilateVec16s; +typedef MorphNoVec ErodeVec32f; +typedef MorphNoVec DilateVec32f; + +#endif + +typedef MorphRowNoVec ErodeRowVec64f; +typedef MorphRowNoVec DilateRowVec64f; +typedef MorphColumnNoVec ErodeColumnVec64f; +typedef MorphColumnNoVec DilateColumnVec64f; +typedef MorphNoVec ErodeVec64f; +typedef MorphNoVec DilateVec64f; + + +template struct MorphRowFilter : public BaseRowFilter +{ + typedef typename Op::rtype T; + + MorphRowFilter( int _ksize, int _anchor ) : vecOp(_ksize, _anchor) + { + ksize = _ksize; + anchor = _anchor; + } + + void operator()(const uchar* src, uchar* dst, int width, int cn) CV_OVERRIDE + { + CV_INSTRUMENT_REGION(); + + int i, j, k, _ksize = ksize*cn; + const T* S = (const T*)src; + Op op; + T* D = (T*)dst; + + if( _ksize == cn ) + { + for( i = 0; i < width*cn; i++ ) + D[i] = S[i]; + return; + } + + int i0 = vecOp(src, dst, width, cn); + width *= cn; + + for( k = 0; k < cn; k++, S++, D++ ) + { + for( i = i0; i <= width - cn*2; i += cn*2 ) + { + const T* s = S + i; + T m = s[cn]; + for( j = cn*2; j < _ksize; j += cn ) + m = op(m, s[j]); + D[i] = op(m, s[0]); + D[i+cn] = op(m, s[j]); + } + + for( ; i < width; i += cn ) + { + const T* s = S + i; + T m = s[0]; + for( j = cn; j < _ksize; j += cn ) + m = op(m, s[j]); + D[i] = m; + } + } + } + + VecOp vecOp; +}; + + +template struct MorphColumnFilter : public BaseColumnFilter +{ + typedef typename Op::rtype T; + + MorphColumnFilter( int _ksize, int _anchor ) : vecOp(_ksize, _anchor) + { + ksize = _ksize; + anchor = _anchor; + } + + void operator()(const uchar** _src, uchar* dst, int dststep, int count, int width) CV_OVERRIDE + { + CV_INSTRUMENT_REGION(); + + int i, k, _ksize = ksize; + const T** src = (const T**)_src; + T* D = (T*)dst; + Op op; + + int i0 = vecOp(_src, dst, dststep, count, width); + dststep /= sizeof(D[0]); + + for( ; _ksize > 1 && count > 1; count -= 2, D += dststep*2, src += 2 ) + { + i = i0; + #if CV_ENABLE_UNROLLED + for( ; i <= width - 4; i += 4 ) + { + const T* sptr = src[1] + i; + T s0 = sptr[0], s1 = sptr[1], s2 = sptr[2], s3 = sptr[3]; + + for( k = 2; k < _ksize; k++ ) + { + sptr = src[k] + i; + s0 = op(s0, sptr[0]); s1 = op(s1, sptr[1]); + s2 = op(s2, sptr[2]); s3 = op(s3, sptr[3]); + } + + sptr = src[0] + i; + D[i] = op(s0, sptr[0]); + D[i+1] = op(s1, sptr[1]); + D[i+2] = op(s2, sptr[2]); + D[i+3] = op(s3, sptr[3]); + + sptr = src[k] + i; + D[i+dststep] = op(s0, sptr[0]); + D[i+dststep+1] = op(s1, sptr[1]); + D[i+dststep+2] = op(s2, sptr[2]); + D[i+dststep+3] = op(s3, sptr[3]); + } + #endif + for( ; i < width; i++ ) + { + T s0 = src[1][i]; + + for( k = 2; k < _ksize; k++ ) + s0 = op(s0, src[k][i]); + + D[i] = op(s0, src[0][i]); + D[i+dststep] = op(s0, src[k][i]); + } + } + + for( ; count > 0; count--, D += dststep, src++ ) + { + i = i0; + #if CV_ENABLE_UNROLLED + for( ; i <= width - 4; i += 4 ) + { + const T* sptr = src[0] + i; + T s0 = sptr[0], s1 = sptr[1], s2 = sptr[2], s3 = sptr[3]; + + for( k = 1; k < _ksize; k++ ) + { + sptr = src[k] + i; + s0 = op(s0, sptr[0]); s1 = op(s1, sptr[1]); + s2 = op(s2, sptr[2]); s3 = op(s3, sptr[3]); + } + + D[i] = s0; D[i+1] = s1; + D[i+2] = s2; D[i+3] = s3; + } + #endif + for( ; i < width; i++ ) + { + T s0 = src[0][i]; + for( k = 1; k < _ksize; k++ ) + s0 = op(s0, src[k][i]); + D[i] = s0; + } + } + } + + VecOp vecOp; +}; + + +template struct MorphFilter : BaseFilter +{ + typedef typename Op::rtype T; + + MorphFilter( const Mat& _kernel, Point _anchor ) + { + anchor = _anchor; + ksize = _kernel.size(); + CV_Assert( _kernel.type() == CV_8U ); + + std::vector coeffs; // we do not really the values of non-zero + // kernel elements, just their locations + preprocess2DKernel( _kernel, coords, coeffs ); + ptrs.resize( coords.size() ); + } + + void operator()(const uchar** src, uchar* dst, int dststep, int count, int width, int cn) CV_OVERRIDE + { + CV_INSTRUMENT_REGION(); + + const Point* pt = &coords[0]; + const T** kp = (const T**)&ptrs[0]; + int i, k, nz = (int)coords.size(); + Op op; + + width *= cn; + for( ; count > 0; count--, dst += dststep, src++ ) + { + T* D = (T*)dst; + + for( k = 0; k < nz; k++ ) + kp[k] = (const T*)src[pt[k].y] + pt[k].x*cn; + + i = vecOp(&ptrs[0], nz, dst, width); + #if CV_ENABLE_UNROLLED + for( ; i <= width - 4; i += 4 ) + { + const T* sptr = kp[0] + i; + T s0 = sptr[0], s1 = sptr[1], s2 = sptr[2], s3 = sptr[3]; + + for( k = 1; k < nz; k++ ) + { + sptr = kp[k] + i; + s0 = op(s0, sptr[0]); s1 = op(s1, sptr[1]); + s2 = op(s2, sptr[2]); s3 = op(s3, sptr[3]); + } + + D[i] = s0; D[i+1] = s1; + D[i+2] = s2; D[i+3] = s3; + } + #endif + for( ; i < width; i++ ) + { + T s0 = kp[0][i]; + for( k = 1; k < nz; k++ ) + s0 = op(s0, kp[k][i]); + D[i] = s0; + } + } + } + + std::vector coords; + std::vector ptrs; + VecOp vecOp; +}; + +} // namespace anon + +/////////////////////////////////// External Interface ///////////////////////////////////// + +Ptr getMorphologyRowFilter(int op, int type, int ksize, int anchor) +{ + CV_INSTRUMENT_REGION(); + + int depth = CV_MAT_DEPTH(type); + if( anchor < 0 ) + anchor = ksize/2; + CV_Assert( op == MORPH_ERODE || op == MORPH_DILATE ); + if( op == MORPH_ERODE ) + { + if( depth == CV_8U ) + return makePtr, + ErodeRowVec8u> >(ksize, anchor); + if( depth == CV_16U ) + return makePtr, + ErodeRowVec16u> >(ksize, anchor); + if( depth == CV_16S ) + return makePtr, + ErodeRowVec16s> >(ksize, anchor); + if( depth == CV_32F ) + return makePtr, + ErodeRowVec32f> >(ksize, anchor); + if( depth == CV_64F ) + return makePtr, + ErodeRowVec64f> >(ksize, anchor); + } + else + { + if( depth == CV_8U ) + return makePtr, + DilateRowVec8u> >(ksize, anchor); + if( depth == CV_16U ) + return makePtr, + DilateRowVec16u> >(ksize, anchor); + if( depth == CV_16S ) + return makePtr, + DilateRowVec16s> >(ksize, anchor); + if( depth == CV_32F ) + return makePtr, + DilateRowVec32f> >(ksize, anchor); + if( depth == CV_64F ) + return makePtr, + DilateRowVec64f> >(ksize, anchor); + } + + CV_Error_( CV_StsNotImplemented, ("Unsupported data type (=%d)", type)); +} + +Ptr getMorphologyColumnFilter(int op, int type, int ksize, int anchor) +{ + CV_INSTRUMENT_REGION(); + + int depth = CV_MAT_DEPTH(type); + if( anchor < 0 ) + anchor = ksize/2; + CV_Assert( op == MORPH_ERODE || op == MORPH_DILATE ); + if( op == MORPH_ERODE ) + { + if( depth == CV_8U ) + return makePtr, + ErodeColumnVec8u> >(ksize, anchor); + if( depth == CV_16U ) + return makePtr, + ErodeColumnVec16u> >(ksize, anchor); + if( depth == CV_16S ) + return makePtr, + ErodeColumnVec16s> >(ksize, anchor); + if( depth == CV_32F ) + return makePtr, + ErodeColumnVec32f> >(ksize, anchor); + if( depth == CV_64F ) + return makePtr, + ErodeColumnVec64f> >(ksize, anchor); + } + else + { + if( depth == CV_8U ) + return makePtr, + DilateColumnVec8u> >(ksize, anchor); + if( depth == CV_16U ) + return makePtr, + DilateColumnVec16u> >(ksize, anchor); + if( depth == CV_16S ) + return makePtr, + DilateColumnVec16s> >(ksize, anchor); + if( depth == CV_32F ) + return makePtr, + DilateColumnVec32f> >(ksize, anchor); + if( depth == CV_64F ) + return makePtr, + DilateColumnVec64f> >(ksize, anchor); + } + + CV_Error_( CV_StsNotImplemented, ("Unsupported data type (=%d)", type)); +} + +Ptr getMorphologyFilter(int op, int type, const Mat& kernel, Point anchor) +{ + CV_INSTRUMENT_REGION(); + + int depth = CV_MAT_DEPTH(type); + anchor = normalizeAnchor(anchor, kernel.size()); + CV_Assert( op == MORPH_ERODE || op == MORPH_DILATE ); + if( op == MORPH_ERODE ) + { + if( depth == CV_8U ) + return makePtr, ErodeVec8u> >(kernel, anchor); + if( depth == CV_16U ) + return makePtr, ErodeVec16u> >(kernel, anchor); + if( depth == CV_16S ) + return makePtr, ErodeVec16s> >(kernel, anchor); + if( depth == CV_32F ) + return makePtr, ErodeVec32f> >(kernel, anchor); + if( depth == CV_64F ) + return makePtr, ErodeVec64f> >(kernel, anchor); + } + else + { + if( depth == CV_8U ) + return makePtr, DilateVec8u> >(kernel, anchor); + if( depth == CV_16U ) + return makePtr, DilateVec16u> >(kernel, anchor); + if( depth == CV_16S ) + return makePtr, DilateVec16s> >(kernel, anchor); + if( depth == CV_32F ) + return makePtr, DilateVec32f> >(kernel, anchor); + if( depth == CV_64F ) + return makePtr, DilateVec64f> >(kernel, anchor); + } + + CV_Error_( CV_StsNotImplemented, ("Unsupported data type (=%d)", type)); +} + +#endif +CV_CPU_OPTIMIZATION_NAMESPACE_END +} // namespace diff --git a/modules/imgproc/src/smooth.dispatch.cpp b/modules/imgproc/src/smooth.dispatch.cpp new file mode 100644 index 0000000000..4e514eb8b8 --- /dev/null +++ b/modules/imgproc/src/smooth.dispatch.cpp @@ -0,0 +1,582 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +#include + +#include "opencv2/core/hal/intrin.hpp" +#include "opencl_kernels_imgproc.hpp" + +#include "opencv2/core/openvx/ovx_defs.hpp" + +#include "filter.hpp" + +#include "opencv2/core/softfloat.hpp" + +namespace cv { +#include "fixedpoint.inl.hpp" +} + +#include "smooth.simd.hpp" +#include "smooth.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content + +namespace cv { + +/****************************************************************************************\ + Gaussian Blur +\****************************************************************************************/ + +Mat getGaussianKernel(int n, double sigma, int ktype) +{ + CV_Assert(n > 0); + const int SMALL_GAUSSIAN_SIZE = 7; + static const float small_gaussian_tab[][SMALL_GAUSSIAN_SIZE] = + { + {1.f}, + {0.25f, 0.5f, 0.25f}, + {0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f}, + {0.03125f, 0.109375f, 0.21875f, 0.28125f, 0.21875f, 0.109375f, 0.03125f} + }; + + const float* fixed_kernel = n % 2 == 1 && n <= SMALL_GAUSSIAN_SIZE && sigma <= 0 ? + small_gaussian_tab[n>>1] : 0; + + CV_Assert( ktype == CV_32F || ktype == CV_64F ); + Mat kernel(n, 1, ktype); + float* cf = kernel.ptr(); + double* cd = kernel.ptr(); + + double sigmaX = sigma > 0 ? sigma : ((n-1)*0.5 - 1)*0.3 + 0.8; + double scale2X = -0.5/(sigmaX*sigmaX); + double sum = 0; + + int i; + for( i = 0; i < n; i++ ) + { + double x = i - (n-1)*0.5; + double t = fixed_kernel ? (double)fixed_kernel[i] : std::exp(scale2X*x*x); + if( ktype == CV_32F ) + { + cf[i] = (float)t; + sum += cf[i]; + } + else + { + cd[i] = t; + sum += cd[i]; + } + } + + CV_DbgAssert(fabs(sum) > 0); + sum = 1./sum; + for( i = 0; i < n; i++ ) + { + if( ktype == CV_32F ) + cf[i] = (float)(cf[i]*sum); + else + cd[i] *= sum; + } + + return kernel; +} + +template +static std::vector getFixedpointGaussianKernel( int n, double sigma ) +{ + if (sigma <= 0) + { + if(n == 1) + return std::vector(1, softdouble(1.0)); + else if(n == 3) + { + T v3[] = { softdouble(0.25), softdouble(0.5), softdouble(0.25) }; + return std::vector(v3, v3 + 3); + } + else if(n == 5) + { + T v5[] = { softdouble(0.0625), softdouble(0.25), softdouble(0.375), softdouble(0.25), softdouble(0.0625) }; + return std::vector(v5, v5 + 5); + } + else if(n == 7) + { + T v7[] = { softdouble(0.03125), softdouble(0.109375), softdouble(0.21875), softdouble(0.28125), softdouble(0.21875), softdouble(0.109375), softdouble(0.03125) }; + return std::vector(v7, v7 + 7); + } + } + + + softdouble sigmaX = sigma > 0 ? softdouble(sigma) : mulAdd(softdouble(n),softdouble(0.15),softdouble(0.35));// softdouble(((n-1)*0.5 - 1)*0.3 + 0.8) + softdouble scale2X = softdouble(-0.5*0.25)/(sigmaX*sigmaX); + std::vector values(n); + softdouble sum(0.); + for(int i = 0, x = 1 - n; i < n; i++, x+=2 ) + { + // x = i - (n - 1)*0.5 + // t = std::exp(scale2X*x*x) + values[i] = exp(softdouble(x*x)*scale2X); + sum += values[i]; + } + sum = softdouble::one()/sum; + + std::vector kernel(n); + for(int i = 0; i < n; i++ ) + { + kernel[i] = values[i] * sum; + } + + return kernel; +}; + +static void getGaussianKernel(int n, double sigma, int ktype, Mat& res) { res = getGaussianKernel(n, sigma, ktype); } +template static void getGaussianKernel(int n, double sigma, int, std::vector& res) { res = getFixedpointGaussianKernel(n, sigma); } + +template +static void createGaussianKernels( T & kx, T & ky, int type, Size &ksize, + double sigma1, double sigma2 ) +{ + int depth = CV_MAT_DEPTH(type); + if( sigma2 <= 0 ) + sigma2 = sigma1; + + // automatic detection of kernel size from sigma + if( ksize.width <= 0 && sigma1 > 0 ) + ksize.width = cvRound(sigma1*(depth == CV_8U ? 3 : 4)*2 + 1)|1; + if( ksize.height <= 0 && sigma2 > 0 ) + ksize.height = cvRound(sigma2*(depth == CV_8U ? 3 : 4)*2 + 1)|1; + + CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 && + ksize.height > 0 && ksize.height % 2 == 1 ); + + sigma1 = std::max( sigma1, 0. ); + sigma2 = std::max( sigma2, 0. ); + + getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F), kx ); + if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON ) + ky = kx; + else + getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F), ky ); +} + +Ptr createGaussianFilter( int type, Size ksize, + double sigma1, double sigma2, + int borderType ) +{ + Mat kx, ky; + createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2); + + return createSeparableLinearFilter( type, type, kx, ky, Point(-1,-1), 0, borderType ); +} + +#ifdef HAVE_OPENCL + +static bool ocl_GaussianBlur_8UC1(InputArray _src, OutputArray _dst, Size ksize, int ddepth, + InputArray _kernelX, InputArray _kernelY, int borderType) +{ + const ocl::Device & dev = ocl::Device::getDefault(); + int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + + if ( !(dev.isIntel() && (type == CV_8UC1) && + (_src.offset() == 0) && (_src.step() % 4 == 0) && + ((ksize.width == 5 && (_src.cols() % 4 == 0)) || + (ksize.width == 3 && (_src.cols() % 16 == 0) && (_src.rows() % 2 == 0)))) ) + return false; + + Mat kernelX = _kernelX.getMat().reshape(1, 1); + if (kernelX.cols % 2 != 1) + return false; + Mat kernelY = _kernelY.getMat().reshape(1, 1); + if (kernelY.cols % 2 != 1) + return false; + + if (ddepth < 0) + ddepth = sdepth; + + Size size = _src.size(); + size_t globalsize[2] = { 0, 0 }; + size_t localsize[2] = { 0, 0 }; + + if (ksize.width == 3) + { + globalsize[0] = size.width / 16; + globalsize[1] = size.height / 2; + } + else if (ksize.width == 5) + { + globalsize[0] = size.width / 4; + globalsize[1] = size.height / 1; + } + + const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; + char build_opts[1024]; + sprintf(build_opts, "-D %s %s%s", borderMap[borderType & ~BORDER_ISOLATED], + ocl::kernelToStr(kernelX, CV_32F, "KERNEL_MATRIX_X").c_str(), + ocl::kernelToStr(kernelY, CV_32F, "KERNEL_MATRIX_Y").c_str()); + + ocl::Kernel kernel; + + if (ksize.width == 3) + kernel.create("gaussianBlur3x3_8UC1_cols16_rows2", cv::ocl::imgproc::gaussianBlur3x3_oclsrc, build_opts); + else if (ksize.width == 5) + kernel.create("gaussianBlur5x5_8UC1_cols4", cv::ocl::imgproc::gaussianBlur5x5_oclsrc, build_opts); + + if (kernel.empty()) + return false; + + UMat src = _src.getUMat(); + _dst.create(size, CV_MAKETYPE(ddepth, cn)); + if (!(_dst.offset() == 0 && _dst.step() % 4 == 0)) + return false; + UMat dst = _dst.getUMat(); + + int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src)); + idxArg = kernel.set(idxArg, (int)src.step); + idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst)); + idxArg = kernel.set(idxArg, (int)dst.step); + idxArg = kernel.set(idxArg, (int)dst.rows); + idxArg = kernel.set(idxArg, (int)dst.cols); + + return kernel.run(2, globalsize, (localsize[0] == 0) ? NULL : localsize, false); +} + +#endif + +#ifdef HAVE_OPENVX + +namespace ovx { + template <> inline bool skipSmallImages(int w, int h) { return w*h < 320 * 240; } +} +static bool openvx_gaussianBlur(InputArray _src, OutputArray _dst, Size ksize, + double sigma1, double sigma2, int borderType) +{ + if (sigma2 <= 0) + sigma2 = sigma1; + // automatic detection of kernel size from sigma + if (ksize.width <= 0 && sigma1 > 0) + ksize.width = cvRound(sigma1*6 + 1) | 1; + if (ksize.height <= 0 && sigma2 > 0) + ksize.height = cvRound(sigma2*6 + 1) | 1; + + if (_src.type() != CV_8UC1 || + _src.cols() < 3 || _src.rows() < 3 || + ksize.width != 3 || ksize.height != 3) + return false; + + sigma1 = std::max(sigma1, 0.); + sigma2 = std::max(sigma2, 0.); + + if (!(sigma1 == 0.0 || (sigma1 - 0.8) < DBL_EPSILON) || !(sigma2 == 0.0 || (sigma2 - 0.8) < DBL_EPSILON) || + ovx::skipSmallImages(_src.cols(), _src.rows())) + return false; + + Mat src = _src.getMat(); + Mat dst = _dst.getMat(); + + if ((borderType & BORDER_ISOLATED) == 0 && src.isSubmatrix()) + return false; //Process isolated borders only + vx_enum border; + switch (borderType & ~BORDER_ISOLATED) + { + case BORDER_CONSTANT: + border = VX_BORDER_CONSTANT; + break; + case BORDER_REPLICATE: + border = VX_BORDER_REPLICATE; + break; + default: + return false; + } + + try + { + ivx::Context ctx = ovx::getOpenVXContext(); + + Mat a; + if (dst.data != src.data) + a = src; + else + src.copyTo(a); + + ivx::Image + ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, + ivx::Image::createAddressing(a.cols, a.rows, 1, (vx_int32)(a.step)), a.data), + ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, + ivx::Image::createAddressing(dst.cols, dst.rows, 1, (vx_int32)(dst.step)), dst.data); + + //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments + //since OpenVX standard says nothing about thread-safety for now + ivx::border_t prevBorder = ctx.immediateBorder(); + ctx.setImmediateBorder(border, (vx_uint8)(0)); + ivx::IVX_CHECK_STATUS(vxuGaussian3x3(ctx, ia, ib)); + ctx.setImmediateBorder(prevBorder); + } + catch (const ivx::RuntimeError & e) + { + VX_DbgThrow(e.what()); + } + catch (const ivx::WrapperError & e) + { + VX_DbgThrow(e.what()); + } + return true; +} + +#endif + +#ifdef HAVE_IPP +// IW 2017u2 has bug which doesn't allow use of partial inMem with tiling +#if IPP_DISABLE_GAUSSIANBLUR_PARALLEL +#define IPP_GAUSSIANBLUR_PARALLEL 0 +#else +#define IPP_GAUSSIANBLUR_PARALLEL 1 +#endif + +#ifdef HAVE_IPP_IW + +class ipp_gaussianBlurParallel: public ParallelLoopBody +{ +public: + ipp_gaussianBlurParallel(::ipp::IwiImage &src, ::ipp::IwiImage &dst, int kernelSize, float sigma, ::ipp::IwiBorderType &border, bool *pOk): + m_src(src), m_dst(dst), m_kernelSize(kernelSize), m_sigma(sigma), m_border(border), m_pOk(pOk) { + *m_pOk = true; + } + ~ipp_gaussianBlurParallel() + { + } + + virtual void operator() (const Range& range) const CV_OVERRIDE + { + CV_INSTRUMENT_REGION_IPP(); + + if(!*m_pOk) + return; + + try + { + ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, m_dst.m_size.width, range.end - range.start); + CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterGaussian, m_src, m_dst, m_kernelSize, m_sigma, ::ipp::IwDefault(), m_border, tile); + } + catch(const ::ipp::IwException &) + { + *m_pOk = false; + return; + } + } +private: + ::ipp::IwiImage &m_src; + ::ipp::IwiImage &m_dst; + + int m_kernelSize; + float m_sigma; + ::ipp::IwiBorderType &m_border; + + volatile bool *m_pOk; + const ipp_gaussianBlurParallel& operator= (const ipp_gaussianBlurParallel&); +}; + +#endif + +static bool ipp_GaussianBlur(InputArray _src, OutputArray _dst, Size ksize, + double sigma1, double sigma2, int borderType ) +{ +#ifdef HAVE_IPP_IW + CV_INSTRUMENT_REGION_IPP(); + +#if IPP_VERSION_X100 < 201800 && ((defined _MSC_VER && defined _M_IX86) || (defined __GNUC__ && defined __i386__)) + CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(ksize); CV_UNUSED(sigma1); CV_UNUSED(sigma2); CV_UNUSED(borderType); + return false; // bug on ia32 +#else + if(sigma1 != sigma2) + return false; + + if(sigma1 < FLT_EPSILON) + return false; + + if(ksize.width != ksize.height) + return false; + + // Acquire data and begin processing + try + { + Mat src = _src.getMat(); + Mat dst = _dst.getMat(); + ::ipp::IwiImage iwSrc = ippiGetImage(src); + ::ipp::IwiImage iwDst = ippiGetImage(dst); + ::ipp::IwiBorderSize borderSize = ::ipp::iwiSizeToBorderSize(ippiGetSize(ksize)); + ::ipp::IwiBorderType ippBorder(ippiGetBorder(iwSrc, borderType, borderSize)); + if(!ippBorder) + return false; + + const int threads = ippiSuggestThreadsNum(iwDst, 2); + if(IPP_GAUSSIANBLUR_PARALLEL && threads > 1) { + bool ok; + ipp_gaussianBlurParallel invoker(iwSrc, iwDst, ksize.width, (float) sigma1, ippBorder, &ok); + + if(!ok) + return false; + const Range range(0, (int) iwDst.m_size.height); + parallel_for_(range, invoker, threads*4); + + if(!ok) + return false; + } else { + CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterGaussian, iwSrc, iwDst, ksize.width, sigma1, ::ipp::IwDefault(), ippBorder); + } + } + catch (const ::ipp::IwException &) + { + return false; + } + + return true; +#endif +#else + CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(ksize); CV_UNUSED(sigma1); CV_UNUSED(sigma2); CV_UNUSED(borderType); + return false; +#endif +} +#endif + +void GaussianBlur(InputArray _src, OutputArray _dst, Size ksize, + double sigma1, double sigma2, + int borderType) +{ + CV_INSTRUMENT_REGION(); + + int type = _src.type(); + Size size = _src.size(); + _dst.create( size, type ); + + if( (borderType & ~BORDER_ISOLATED) != BORDER_CONSTANT && + ((borderType & BORDER_ISOLATED) != 0 || !_src.getMat().isSubmatrix()) ) + { + if( size.height == 1 ) + ksize.height = 1; + if( size.width == 1 ) + ksize.width = 1; + } + + if( ksize.width == 1 && ksize.height == 1 ) + { + _src.copyTo(_dst); + return; + } + + bool useOpenCL = (ocl::isOpenCLActivated() && _dst.isUMat() && _src.dims() <= 2 && + ((ksize.width == 3 && ksize.height == 3) || + (ksize.width == 5 && ksize.height == 5)) && + _src.rows() > ksize.height && _src.cols() > ksize.width); + CV_UNUSED(useOpenCL); + + int sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + + Mat kx, ky; + createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2); + + CV_OCL_RUN(useOpenCL, ocl_GaussianBlur_8UC1(_src, _dst, ksize, CV_MAT_DEPTH(type), kx, ky, borderType)); + + CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && (size_t)_src.rows() > kx.total() && (size_t)_src.cols() > kx.total(), + ocl_sepFilter2D(_src, _dst, sdepth, kx, ky, Point(-1, -1), 0, borderType)) + + Mat src = _src.getMat(); + Mat dst = _dst.getMat(); + + Point ofs; + Size wsz(src.cols, src.rows); + if(!(borderType & BORDER_ISOLATED)) + src.locateROI( wsz, ofs ); + + CALL_HAL(gaussianBlur, cv_hal_gaussianBlur, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, cn, + ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height, + sigma1, sigma2, borderType&~BORDER_ISOLATED); + + CV_OVX_RUN(true, + openvx_gaussianBlur(src, dst, ksize, sigma1, sigma2, borderType)) + + CV_IPP_RUN_FAST(ipp_GaussianBlur(src, dst, ksize, sigma1, sigma2, borderType)); + + if(sdepth == CV_8U && ((borderType & BORDER_ISOLATED) || !_src.getMat().isSubmatrix())) + { + std::vector fkx, fky; + createGaussianKernels(fkx, fky, type, ksize, sigma1, sigma2); + if (src.data == dst.data) + src = src.clone(); + CV_CPU_DISPATCH(GaussianBlurFixedPoint, (src, dst, (const uint16_t*)&fkx[0], (int)fkx.size(), (const uint16_t*)&fky[0], (int)fky.size(), borderType), + CV_CPU_DISPATCH_MODES_ALL); + return; + } + + sepFilter2D(src, dst, sdepth, kx, ky, Point(-1, -1), 0, borderType); +} + +} // namespace + +////////////////////////////////////////////////////////////////////////////////////////// + +CV_IMPL void +cvSmooth( const void* srcarr, void* dstarr, int smooth_type, + int param1, int param2, double param3, double param4 ) +{ + cv::Mat src = cv::cvarrToMat(srcarr), dst0 = cv::cvarrToMat(dstarr), dst = dst0; + + CV_Assert( dst.size() == src.size() && + (smooth_type == CV_BLUR_NO_SCALE || dst.type() == src.type()) ); + + if( param2 <= 0 ) + param2 = param1; + + if( smooth_type == CV_BLUR || smooth_type == CV_BLUR_NO_SCALE ) + cv::boxFilter( src, dst, dst.depth(), cv::Size(param1, param2), cv::Point(-1,-1), + smooth_type == CV_BLUR, cv::BORDER_REPLICATE ); + else if( smooth_type == CV_GAUSSIAN ) + cv::GaussianBlur( src, dst, cv::Size(param1, param2), param3, param4, cv::BORDER_REPLICATE ); + else if( smooth_type == CV_MEDIAN ) + cv::medianBlur( src, dst, param1 ); + else + cv::bilateralFilter( src, dst, param1, param3, param4, cv::BORDER_REPLICATE ); + + if( dst.data != dst0.data ) + CV_Error( CV_StsUnmatchedFormats, "The destination image does not have the proper type" ); +} + +/* End of file. */ diff --git a/modules/imgproc/src/smooth.cpp b/modules/imgproc/src/smooth.simd.hpp similarity index 84% rename from modules/imgproc/src/smooth.cpp rename to modules/imgproc/src/smooth.simd.hpp index 909ffa919c..4f52bc0d80 100644 --- a/modules/imgproc/src/smooth.cpp +++ b/modules/imgproc/src/smooth.simd.hpp @@ -46,120 +46,28 @@ #include #include "opencv2/core/hal/intrin.hpp" -#include "opencl_kernels_imgproc.hpp" - -#include "opencv2/core/openvx/ovx_defs.hpp" #include "filter.hpp" -#include "fixedpoint.inl.hpp" - -/****************************************************************************************\ - Gaussian Blur -\****************************************************************************************/ - -cv::Mat cv::getGaussianKernel( int n, double sigma, int ktype ) -{ - CV_Assert(n > 0); - const int SMALL_GAUSSIAN_SIZE = 7; - static const float small_gaussian_tab[][SMALL_GAUSSIAN_SIZE] = - { - {1.f}, - {0.25f, 0.5f, 0.25f}, - {0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f}, - {0.03125f, 0.109375f, 0.21875f, 0.28125f, 0.21875f, 0.109375f, 0.03125f} - }; - - const float* fixed_kernel = n % 2 == 1 && n <= SMALL_GAUSSIAN_SIZE && sigma <= 0 ? - small_gaussian_tab[n>>1] : 0; - - CV_Assert( ktype == CV_32F || ktype == CV_64F ); - Mat kernel(n, 1, ktype); - float* cf = kernel.ptr(); - double* cd = kernel.ptr(); - - double sigmaX = sigma > 0 ? sigma : ((n-1)*0.5 - 1)*0.3 + 0.8; - double scale2X = -0.5/(sigmaX*sigmaX); - double sum = 0; - - int i; - for( i = 0; i < n; i++ ) - { - double x = i - (n-1)*0.5; - double t = fixed_kernel ? (double)fixed_kernel[i] : std::exp(scale2X*x*x); - if( ktype == CV_32F ) - { - cf[i] = (float)t; - sum += cf[i]; - } - else - { - cd[i] = t; - sum += cd[i]; - } - } - - CV_DbgAssert(fabs(sum) > 0); - sum = 1./sum; - for( i = 0; i < n; i++ ) - { - if( ktype == CV_32F ) - cf[i] = (float)(cf[i]*sum); - else - cd[i] *= sum; - } - - return kernel; -} +#include "opencv2/core/softfloat.hpp" namespace cv { +CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN +// forward declarations +void GaussianBlurFixedPoint(const Mat& src, /*const*/ Mat& dst, + const uint16_t/*ufixedpoint16*/* fkx, int fkx_size, + const uint16_t/*ufixedpoint16*/* fky, int fky_size, + int borderType); -template -static std::vector getFixedpointGaussianKernel( int n, double sigma ) -{ - if (sigma <= 0) - { - if(n == 1) - return std::vector(1, softdouble(1.0)); - else if(n == 3) - { - T v3[] = { softdouble(0.25), softdouble(0.5), softdouble(0.25) }; - return std::vector(v3, v3 + 3); - } - else if(n == 5) - { - T v5[] = { softdouble(0.0625), softdouble(0.25), softdouble(0.375), softdouble(0.25), softdouble(0.0625) }; - return std::vector(v5, v5 + 5); - } - else if(n == 7) - { - T v7[] = { softdouble(0.03125), softdouble(0.109375), softdouble(0.21875), softdouble(0.28125), softdouble(0.21875), softdouble(0.109375), softdouble(0.03125) }; - return std::vector(v7, v7 + 7); - } - } - - - softdouble sigmaX = sigma > 0 ? softdouble(sigma) : mulAdd(softdouble(n),softdouble(0.15),softdouble(0.35));// softdouble(((n-1)*0.5 - 1)*0.3 + 0.8) - softdouble scale2X = softdouble(-0.5*0.25)/(sigmaX*sigmaX); - std::vector values(n); - softdouble sum(0.); - for(int i = 0, x = 1 - n; i < n; i++, x+=2 ) - { - // x = i - (n - 1)*0.5 - // t = std::exp(scale2X*x*x) - values[i] = exp(softdouble(x*x)*scale2X); - sum += values[i]; - } - sum = softdouble::one()/sum; +#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY - std::vector kernel(n); - for(int i = 0; i < n; i++ ) - { - kernel[i] = values[i] * sum; - } +#if defined(CV_CPU_BASELINE_MODE) +// included in dispatch.cpp +#else +#include "fixedpoint.inl.hpp" +#endif - return kernel; -}; +namespace { template void hlineSmooth1N(const ET* src, int cn, const FT* m, int, FT* dst, int len, int) @@ -2119,418 +2027,27 @@ private: fixedSmoothInvoker& operator=(const fixedSmoothInvoker&); }; -static void getGaussianKernel(int n, double sigma, int ktype, Mat& res) { res = getGaussianKernel(n, sigma, ktype); } -template static void getGaussianKernel(int n, double sigma, int, std::vector& res) { res = getFixedpointGaussianKernel(n, sigma); } - -template -static void createGaussianKernels( T & kx, T & ky, int type, Size &ksize, - double sigma1, double sigma2 ) -{ - int depth = CV_MAT_DEPTH(type); - if( sigma2 <= 0 ) - sigma2 = sigma1; - - // automatic detection of kernel size from sigma - if( ksize.width <= 0 && sigma1 > 0 ) - ksize.width = cvRound(sigma1*(depth == CV_8U ? 3 : 4)*2 + 1)|1; - if( ksize.height <= 0 && sigma2 > 0 ) - ksize.height = cvRound(sigma2*(depth == CV_8U ? 3 : 4)*2 + 1)|1; - - CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 && - ksize.height > 0 && ksize.height % 2 == 1 ); - - sigma1 = std::max( sigma1, 0. ); - sigma2 = std::max( sigma2, 0. ); +} // namespace anon - getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F), kx ); - if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON ) - ky = kx; - else - getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F), ky ); -} - -} - -cv::Ptr cv::createGaussianFilter( int type, Size ksize, - double sigma1, double sigma2, - int borderType ) -{ - Mat kx, ky; - createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2); - - return createSeparableLinearFilter( type, type, kx, ky, Point(-1,-1), 0, borderType ); -} - -namespace cv -{ -#ifdef HAVE_OPENCL - -static bool ocl_GaussianBlur_8UC1(InputArray _src, OutputArray _dst, Size ksize, int ddepth, - InputArray _kernelX, InputArray _kernelY, int borderType) -{ - const ocl::Device & dev = ocl::Device::getDefault(); - int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - - if ( !(dev.isIntel() && (type == CV_8UC1) && - (_src.offset() == 0) && (_src.step() % 4 == 0) && - ((ksize.width == 5 && (_src.cols() % 4 == 0)) || - (ksize.width == 3 && (_src.cols() % 16 == 0) && (_src.rows() % 2 == 0)))) ) - return false; - - Mat kernelX = _kernelX.getMat().reshape(1, 1); - if (kernelX.cols % 2 != 1) - return false; - Mat kernelY = _kernelY.getMat().reshape(1, 1); - if (kernelY.cols % 2 != 1) - return false; - - if (ddepth < 0) - ddepth = sdepth; - - Size size = _src.size(); - size_t globalsize[2] = { 0, 0 }; - size_t localsize[2] = { 0, 0 }; - - if (ksize.width == 3) - { - globalsize[0] = size.width / 16; - globalsize[1] = size.height / 2; - } - else if (ksize.width == 5) - { - globalsize[0] = size.width / 4; - globalsize[1] = size.height / 1; - } - - const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 0, "BORDER_REFLECT_101" }; - char build_opts[1024]; - sprintf(build_opts, "-D %s %s%s", borderMap[borderType & ~BORDER_ISOLATED], - ocl::kernelToStr(kernelX, CV_32F, "KERNEL_MATRIX_X").c_str(), - ocl::kernelToStr(kernelY, CV_32F, "KERNEL_MATRIX_Y").c_str()); - - ocl::Kernel kernel; - - if (ksize.width == 3) - kernel.create("gaussianBlur3x3_8UC1_cols16_rows2", cv::ocl::imgproc::gaussianBlur3x3_oclsrc, build_opts); - else if (ksize.width == 5) - kernel.create("gaussianBlur5x5_8UC1_cols4", cv::ocl::imgproc::gaussianBlur5x5_oclsrc, build_opts); - - if (kernel.empty()) - return false; - - UMat src = _src.getUMat(); - _dst.create(size, CV_MAKETYPE(ddepth, cn)); - if (!(_dst.offset() == 0 && _dst.step() % 4 == 0)) - return false; - UMat dst = _dst.getUMat(); - - int idxArg = kernel.set(0, ocl::KernelArg::PtrReadOnly(src)); - idxArg = kernel.set(idxArg, (int)src.step); - idxArg = kernel.set(idxArg, ocl::KernelArg::PtrWriteOnly(dst)); - idxArg = kernel.set(idxArg, (int)dst.step); - idxArg = kernel.set(idxArg, (int)dst.rows); - idxArg = kernel.set(idxArg, (int)dst.cols); - - return kernel.run(2, globalsize, (localsize[0] == 0) ? NULL : localsize, false); -} - -#endif - -#ifdef HAVE_OPENVX - -namespace ovx { - template <> inline bool skipSmallImages(int w, int h) { return w*h < 320 * 240; } -} -static bool openvx_gaussianBlur(InputArray _src, OutputArray _dst, Size ksize, - double sigma1, double sigma2, int borderType) -{ - if (sigma2 <= 0) - sigma2 = sigma1; - // automatic detection of kernel size from sigma - if (ksize.width <= 0 && sigma1 > 0) - ksize.width = cvRound(sigma1*6 + 1) | 1; - if (ksize.height <= 0 && sigma2 > 0) - ksize.height = cvRound(sigma2*6 + 1) | 1; - - if (_src.type() != CV_8UC1 || - _src.cols() < 3 || _src.rows() < 3 || - ksize.width != 3 || ksize.height != 3) - return false; - - sigma1 = std::max(sigma1, 0.); - sigma2 = std::max(sigma2, 0.); - - if (!(sigma1 == 0.0 || (sigma1 - 0.8) < DBL_EPSILON) || !(sigma2 == 0.0 || (sigma2 - 0.8) < DBL_EPSILON) || - ovx::skipSmallImages(_src.cols(), _src.rows())) - return false; - - Mat src = _src.getMat(); - Mat dst = _dst.getMat(); - - if ((borderType & BORDER_ISOLATED) == 0 && src.isSubmatrix()) - return false; //Process isolated borders only - vx_enum border; - switch (borderType & ~BORDER_ISOLATED) - { - case BORDER_CONSTANT: - border = VX_BORDER_CONSTANT; - break; - case BORDER_REPLICATE: - border = VX_BORDER_REPLICATE; - break; - default: - return false; - } - - try - { - ivx::Context ctx = ovx::getOpenVXContext(); - - Mat a; - if (dst.data != src.data) - a = src; - else - src.copyTo(a); - - ivx::Image - ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, - ivx::Image::createAddressing(a.cols, a.rows, 1, (vx_int32)(a.step)), a.data), - ib = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8, - ivx::Image::createAddressing(dst.cols, dst.rows, 1, (vx_int32)(dst.step)), dst.data); - - //ATTENTION: VX_CONTEXT_IMMEDIATE_BORDER attribute change could lead to strange issues in multi-threaded environments - //since OpenVX standard says nothing about thread-safety for now - ivx::border_t prevBorder = ctx.immediateBorder(); - ctx.setImmediateBorder(border, (vx_uint8)(0)); - ivx::IVX_CHECK_STATUS(vxuGaussian3x3(ctx, ia, ib)); - ctx.setImmediateBorder(prevBorder); - } - catch (const ivx::RuntimeError & e) - { - VX_DbgThrow(e.what()); - } - catch (const ivx::WrapperError & e) - { - VX_DbgThrow(e.what()); - } - return true; -} - -#endif - -#ifdef HAVE_IPP -// IW 2017u2 has bug which doesn't allow use of partial inMem with tiling -#if IPP_DISABLE_GAUSSIANBLUR_PARALLEL -#define IPP_GAUSSIANBLUR_PARALLEL 0 -#else -#define IPP_GAUSSIANBLUR_PARALLEL 1 -#endif - -#ifdef HAVE_IPP_IW - -class ipp_gaussianBlurParallel: public ParallelLoopBody -{ -public: - ipp_gaussianBlurParallel(::ipp::IwiImage &src, ::ipp::IwiImage &dst, int kernelSize, float sigma, ::ipp::IwiBorderType &border, bool *pOk): - m_src(src), m_dst(dst), m_kernelSize(kernelSize), m_sigma(sigma), m_border(border), m_pOk(pOk) { - *m_pOk = true; - } - ~ipp_gaussianBlurParallel() - { - } - - virtual void operator() (const Range& range) const CV_OVERRIDE - { - CV_INSTRUMENT_REGION_IPP(); - - if(!*m_pOk) - return; - - try - { - ::ipp::IwiTile tile = ::ipp::IwiRoi(0, range.start, m_dst.m_size.width, range.end - range.start); - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterGaussian, m_src, m_dst, m_kernelSize, m_sigma, ::ipp::IwDefault(), m_border, tile); - } - catch(const ::ipp::IwException &) - { - *m_pOk = false; - return; - } - } -private: - ::ipp::IwiImage &m_src; - ::ipp::IwiImage &m_dst; - - int m_kernelSize; - float m_sigma; - ::ipp::IwiBorderType &m_border; - - volatile bool *m_pOk; - const ipp_gaussianBlurParallel& operator= (const ipp_gaussianBlurParallel&); -}; - -#endif - -static bool ipp_GaussianBlur(InputArray _src, OutputArray _dst, Size ksize, - double sigma1, double sigma2, int borderType ) -{ -#ifdef HAVE_IPP_IW - CV_INSTRUMENT_REGION_IPP(); - -#if IPP_VERSION_X100 < 201800 && ((defined _MSC_VER && defined _M_IX86) || (defined __GNUC__ && defined __i386__)) - CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(ksize); CV_UNUSED(sigma1); CV_UNUSED(sigma2); CV_UNUSED(borderType); - return false; // bug on ia32 -#else - if(sigma1 != sigma2) - return false; - - if(sigma1 < FLT_EPSILON) - return false; - - if(ksize.width != ksize.height) - return false; - - // Acquire data and begin processing - try - { - Mat src = _src.getMat(); - Mat dst = _dst.getMat(); - ::ipp::IwiImage iwSrc = ippiGetImage(src); - ::ipp::IwiImage iwDst = ippiGetImage(dst); - ::ipp::IwiBorderSize borderSize = ::ipp::iwiSizeToBorderSize(ippiGetSize(ksize)); - ::ipp::IwiBorderType ippBorder(ippiGetBorder(iwSrc, borderType, borderSize)); - if(!ippBorder) - return false; - - const int threads = ippiSuggestThreadsNum(iwDst, 2); - if(IPP_GAUSSIANBLUR_PARALLEL && threads > 1) { - bool ok; - ipp_gaussianBlurParallel invoker(iwSrc, iwDst, ksize.width, (float) sigma1, ippBorder, &ok); - - if(!ok) - return false; - const Range range(0, (int) iwDst.m_size.height); - parallel_for_(range, invoker, threads*4); - - if(!ok) - return false; - } else { - CV_INSTRUMENT_FUN_IPP(::ipp::iwiFilterGaussian, iwSrc, iwDst, ksize.width, sigma1, ::ipp::IwDefault(), ippBorder); - } - } - catch (const ::ipp::IwException &) - { - return false; - } - - return true; -#endif -#else - CV_UNUSED(_src); CV_UNUSED(_dst); CV_UNUSED(ksize); CV_UNUSED(sigma1); CV_UNUSED(sigma2); CV_UNUSED(borderType); - return false; -#endif -} -#endif -} - -void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize, - double sigma1, double sigma2, - int borderType ) +void GaussianBlurFixedPoint(const Mat& src, /*const*/ Mat& dst, + const uint16_t/*ufixedpoint16*/* fkx, int fkx_size, + const uint16_t/*ufixedpoint16*/* fky, int fky_size, + int borderType) { CV_INSTRUMENT_REGION(); - int type = _src.type(); - Size size = _src.size(); - _dst.create( size, type ); - - if( (borderType & ~BORDER_ISOLATED) != BORDER_CONSTANT && - ((borderType & BORDER_ISOLATED) != 0 || !_src.getMat().isSubmatrix()) ) + CV_Assert(src.depth() == CV_8U && ((borderType & BORDER_ISOLATED) || !src.isSubmatrix())); + fixedSmoothInvoker invoker( + src.ptr(), src.step1(), + dst.ptr(), dst.step1(), dst.cols, dst.rows, dst.channels(), + (const ufixedpoint16*)fkx, fkx_size, (const ufixedpoint16*)fky, fky_size, + borderType & ~BORDER_ISOLATED); { - if( size.height == 1 ) - ksize.height = 1; - if( size.width == 1 ) - ksize.width = 1; - } - - if( ksize.width == 1 && ksize.height == 1 ) - { - _src.copyTo(_dst); - return; - } - - bool useOpenCL = (ocl::isOpenCLActivated() && _dst.isUMat() && _src.dims() <= 2 && - ((ksize.width == 3 && ksize.height == 3) || - (ksize.width == 5 && ksize.height == 5)) && - _src.rows() > ksize.height && _src.cols() > ksize.width); - CV_UNUSED(useOpenCL); - - int sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - - Mat kx, ky; - createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2); - - CV_OCL_RUN(useOpenCL, ocl_GaussianBlur_8UC1(_src, _dst, ksize, CV_MAT_DEPTH(type), kx, ky, borderType)); - - CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2 && (size_t)_src.rows() > kx.total() && (size_t)_src.cols() > kx.total(), - ocl_sepFilter2D(_src, _dst, sdepth, kx, ky, Point(-1, -1), 0, borderType)) - - Mat src = _src.getMat(); - Mat dst = _dst.getMat(); - - Point ofs; - Size wsz(src.cols, src.rows); - if(!(borderType & BORDER_ISOLATED)) - src.locateROI( wsz, ofs ); - - CALL_HAL(gaussianBlur, cv_hal_gaussianBlur, src.ptr(), src.step, dst.ptr(), dst.step, src.cols, src.rows, sdepth, cn, - ofs.x, ofs.y, wsz.width - src.cols - ofs.x, wsz.height - src.rows - ofs.y, ksize.width, ksize.height, - sigma1, sigma2, borderType&~BORDER_ISOLATED); - - CV_OVX_RUN(true, - openvx_gaussianBlur(src, dst, ksize, sigma1, sigma2, borderType)) - - CV_IPP_RUN_FAST(ipp_GaussianBlur(src, dst, ksize, sigma1, sigma2, borderType)); - - if(sdepth == CV_8U && ((borderType & BORDER_ISOLATED) || !_src.getMat().isSubmatrix())) - { - std::vector fkx, fky; - createGaussianKernels(fkx, fky, type, ksize, sigma1, sigma2); - if (src.data == dst.data) - src = src.clone(); - fixedSmoothInvoker invoker(src.ptr(), src.step1(), dst.ptr(), dst.step1(), dst.cols, dst.rows, dst.channels(), &fkx[0], (int)fkx.size(), &fky[0], (int)fky.size(), borderType & ~BORDER_ISOLATED); + // TODO AVX guard (external call) parallel_for_(Range(0, dst.rows), invoker, std::max(1, std::min(getNumThreads(), getNumberOfCPUs()))); - return; } - - sepFilter2D(src, dst, sdepth, kx, ky, Point(-1, -1), 0, borderType); } -////////////////////////////////////////////////////////////////////////////////////////// - -CV_IMPL void -cvSmooth( const void* srcarr, void* dstarr, int smooth_type, - int param1, int param2, double param3, double param4 ) -{ - cv::Mat src = cv::cvarrToMat(srcarr), dst0 = cv::cvarrToMat(dstarr), dst = dst0; - - CV_Assert( dst.size() == src.size() && - (smooth_type == CV_BLUR_NO_SCALE || dst.type() == src.type()) ); - - if( param2 <= 0 ) - param2 = param1; - - if( smooth_type == CV_BLUR || smooth_type == CV_BLUR_NO_SCALE ) - cv::boxFilter( src, dst, dst.depth(), cv::Size(param1, param2), cv::Point(-1,-1), - smooth_type == CV_BLUR, cv::BORDER_REPLICATE ); - else if( smooth_type == CV_GAUSSIAN ) - cv::GaussianBlur( src, dst, cv::Size(param1, param2), param3, param4, cv::BORDER_REPLICATE ); - else if( smooth_type == CV_MEDIAN ) - cv::medianBlur( src, dst, param1 ); - else - cv::bilateralFilter( src, dst, param1, param3, param4, cv::BORDER_REPLICATE ); - - if( dst.data != dst0.data ) - CV_Error( CV_StsUnmatchedFormats, "The destination image does not have the proper type" ); -} - -/* End of file. */ +#endif +CV_CPU_OPTIMIZATION_NAMESPACE_END +} // namespace