From 4d0f789e0a4c884381ac3a89cf37b4bcf850a293 Mon Sep 17 00:00:00 2001 From: Vitaly Tuzov Date: Thu, 29 Jun 2017 19:38:01 +0300 Subject: [PATCH] AVX optimized implementation of separable filters migrated to separate file --- modules/imgproc/src/filter.avx2.cpp | 200 ++++++++++++++++++++++++++++ modules/imgproc/src/filter.cpp | 126 +++--------------- modules/imgproc/src/filter.hpp | 57 ++++++++ 3 files changed, 274 insertions(+), 109 deletions(-) create mode 100644 modules/imgproc/src/filter.avx2.cpp create mode 100644 modules/imgproc/src/filter.hpp diff --git a/modules/imgproc/src/filter.avx2.cpp b/modules/imgproc/src/filter.avx2.cpp new file mode 100644 index 0000000000..b469329598 --- /dev/null +++ b/modules/imgproc/src/filter.avx2.cpp @@ -0,0 +1,200 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "precomp.hpp" +#include "filter.hpp" + +namespace cv +{ + +int RowVec_32f_AVX(const float* src0, const float* _kx, float* dst, int width, int cn, int _ksize) +{ + int i = 0, k; + for (; i <= width - 8; i += 8) + { + const float* src = src0 + i; + __m256 f, x0; + __m256 s0 = _mm256_set1_ps(0.0f); + for (k = 0; k < _ksize; k++, src += cn) + { + f = _mm256_set1_ps(_kx[k]); + x0 = _mm256_loadu_ps(src); +#if CV_FMA3 + s0 = _mm256_fmadd_ps(x0, f, s0); +#else + s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); +#endif + } + _mm256_storeu_ps(dst + i, s0); + } + _mm256_zeroupper(); + return i; +} + +int SymmColumnVec_32f_Symm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2) +{ + int i = 0, k; + const float *S, *S2; + const __m128 d4 = _mm_set1_ps(delta); + const __m256 d8 = _mm256_set1_ps(delta); + + for( ; i <= width - 16; i += 16 ) + { + __m256 f = _mm256_set1_ps(ky[0]); + __m256 s0, s1; + __m256 x0; + S = src[0] + i; + s0 = _mm256_loadu_ps(S); +#if CV_FMA3 + s0 = _mm256_fmadd_ps(s0, f, d8); +#else + s0 = _mm256_add_ps(_mm256_mul_ps(s0, f), d8); +#endif + s1 = _mm256_loadu_ps(S+8); +#if CV_FMA3 + s1 = _mm256_fmadd_ps(s1, f, d8); +#else + s1 = _mm256_add_ps(_mm256_mul_ps(s1, f), d8); +#endif + + for( k = 1; k <= ksize2; k++ ) + { + S = src[k] + i; + S2 = src[-k] + i; + f = _mm256_set1_ps(ky[k]); + x0 = _mm256_add_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2)); +#if CV_FMA3 + s0 = _mm256_fmadd_ps(x0, f, s0); +#else + s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); +#endif + x0 = _mm256_add_ps(_mm256_loadu_ps(S+8), _mm256_loadu_ps(S2+8)); +#if CV_FMA3 + s1 = _mm256_fmadd_ps(x0, f, s1); +#else + s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f)); +#endif + } + + _mm256_storeu_ps(dst + i, s0); + _mm256_storeu_ps(dst + i + 8, s1); + } + + for( ; i <= width - 4; i += 4 ) + { + __m128 f = _mm_set1_ps(ky[0]); + __m128 x0, s0 = _mm_load_ps(src[0] + i); + s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4); + + for( k = 1; k <= ksize2; k++ ) + { + f = _mm_set1_ps(ky[k]); + S = src[k] + i; + S2 = src[-k] + i; + x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i)); + s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); + } + + _mm_storeu_ps(dst + i, s0); + } + + _mm256_zeroupper(); + return i; +} + +int SymmColumnVec_32f_Unsymm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2) +{ + int i = 0, k; + const float *S, *S2; + const __m128 d4 = _mm_set1_ps(delta); + const __m256 d8 = _mm256_set1_ps(delta); + + for (; i <= width - 16; i += 16) + { + __m256 f, s0 = d8, s1 = d8; + __m256 x0; + S = src[0] + i; + + for (k = 1; k <= ksize2; k++) + { + S = src[k] + i; + S2 = src[-k] + i; + f = _mm256_set1_ps(ky[k]); + x0 = _mm256_sub_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2)); +#if CV_FMA3 + s0 = _mm256_fmadd_ps(x0, f, s0); +#else + s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); +#endif + x0 = _mm256_sub_ps(_mm256_loadu_ps(S + 8), _mm256_loadu_ps(S2 + 8)); +#if CV_FMA3 + s1 = _mm256_fmadd_ps(x0, f, s1); +#else + s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f)); +#endif + } + + _mm256_storeu_ps(dst + i, s0); + _mm256_storeu_ps(dst + i + 8, s1); + } + + for (; i <= width - 4; i += 4) + { + __m128 f, x0, s0 = d4; + + for (k = 1; k <= ksize2; k++) + { + f = _mm_set1_ps(ky[k]); + x0 = _mm_sub_ps(_mm_load_ps(src[k] + i), _mm_load_ps(src[-k] + i)); + s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); + } + + _mm_storeu_ps(dst + i, s0); + } + + _mm256_zeroupper(); + return i; +} + +} + +/* End of file. 
*/ diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp index 5f25f428b4..50f71eefce 100644 --- a/modules/imgproc/src/filter.cpp +++ b/modules/imgproc/src/filter.cpp @@ -44,6 +44,8 @@ #include "opencv2/core/opencl/ocl_defs.hpp" #include "opencl_kernels_imgproc.hpp" #include "hal_replacement.hpp" +#include "filter.hpp" + /****************************************************************************************\ Base Image Filter @@ -1362,7 +1364,7 @@ struct RowVec_32f RowVec_32f() { haveSSE = checkHardwareSupport(CV_CPU_SSE); - haveAVX2 = checkHardwareSupport(CV_CPU_AVX2); + haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; #if defined USE_IPP_SEP_FILTERS bufsz = -1; #endif @@ -1372,7 +1374,7 @@ struct RowVec_32f { kernel = _kernel; haveSSE = checkHardwareSupport(CV_CPU_SSE); - haveAVX2 = checkHardwareSupport(CV_CPU_AVX2); + haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; #if defined USE_IPP_SEP_FILTERS bufsz = -1; #endif @@ -1399,28 +1401,9 @@ struct RowVec_32f int i = 0, k; width *= cn; -#if CV_AVX2 -if ( haveAVX2 ) -{ - for( ; i <= width - 8; i += 8 ) - { - const float* src = src0 + i; - __m256 f, x0; - __m256 s0 = _mm256_set1_ps(0.0f); - for( k = 0; k < _ksize; k++, src += cn ) - { - f = _mm256_set1_ps(_kx[k]); - x0 = _mm256_loadu_ps(src); -#if CV_FMA3 - s0 = _mm256_fmadd_ps(x0, f, s0); -#else - s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); -#endif - } - _mm256_storeu_ps(dst + i, s0); - } - return i; -} +#if CV_TRY_AVX2 + if (haveAVX2) + return RowVec_32f_AVX(src0, _kx, dst, width, cn, _ksize); #endif for( ; i <= width - 8; i += 8 ) { @@ -1685,7 +1668,7 @@ struct SymmColumnVec_32f SymmColumnVec_32f() { symmetryType=0; haveSSE = checkHardwareSupport(CV_CPU_SSE); - haveAVX2 = checkHardwareSupport(CV_CPU_AVX2); + haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; delta = 0; } SymmColumnVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta) @@ -1694,7 +1677,7 @@ struct SymmColumnVec_32f kernel = _kernel; delta = (float)_delta; haveSSE = checkHardwareSupport(CV_CPU_SSE); - haveAVX2 = checkHardwareSupport(CV_CPU_AVX2); + haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2; CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); } @@ -1710,61 +1693,15 @@ struct SymmColumnVec_32f const float** src = (const float**)_src; const float *S, *S2; float* dst = (float*)_dst; - const __m128 d4 = _mm_set1_ps(delta); -#if CV_AVX2 - const __m256 d8 = _mm256_set1_ps(delta); -#endif if( symmetrical ) { -#if CV_AVX2 -if ( haveAVX2 ) -{ - for( ; i <= width - 16; i += 16 ) - { - __m256 f = _mm256_set1_ps(ky[0]); - __m256 s0, s1; - __m256 x0; - S = src[0] + i; - s0 = _mm256_loadu_ps(S); -#if CV_FMA3 - s0 = _mm256_fmadd_ps(s0, f, d8); -#else - s0 = _mm256_add_ps(_mm256_mul_ps(s0, f), d8); -#endif - s1 = _mm256_loadu_ps(S+8); -#if CV_FMA3 - s1 = _mm256_fmadd_ps(s1, f, d8); -#else - s1 = _mm256_add_ps(_mm256_mul_ps(s1, f), d8); -#endif - - for( k = 1; k <= ksize2; k++ ) - { - S = src[k] + i; - S2 = src[-k] + i; - f = _mm256_set1_ps(ky[k]); - x0 = _mm256_add_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2)); -#if CV_FMA3 - s0 = _mm256_fmadd_ps(x0, f, s0); -#else - s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); -#endif - x0 = _mm256_add_ps(_mm256_loadu_ps(S+8), _mm256_loadu_ps(S2+8)); -#if CV_FMA3 - s1 = _mm256_fmadd_ps(x0, f, s1); -#else - s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f)); -#endif - } - - _mm256_storeu_ps(dst + i, s0); - _mm256_storeu_ps(dst + i + 8, s1); - } - _mm256_zeroupper(); -} +#if CV_TRY_AVX2 + if (haveAVX2) + return SymmColumnVec_32f_Symm_AVX(src, ky, dst, delta, width, ksize2); #endif + const __m128 
d4 = _mm_set1_ps(delta); for( ; i <= width - 16; i += 16 ) { __m128 f = _mm_set1_ps(ky[0]); @@ -1821,40 +1758,11 @@ if ( haveAVX2 ) } else { -#if CV_AVX2 -if ( haveAVX2 ) -{ - for( ; i <= width - 16; i += 16 ) - { - __m256 f, s0 = d8, s1 = d8; - __m256 x0; - S = src[0] + i; - - for( k = 1; k <= ksize2; k++ ) - { - S = src[k] + i; - S2 = src[-k] + i; - f = _mm256_set1_ps(ky[k]); - x0 = _mm256_sub_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2)); -#if CV_FMA3 - s0 = _mm256_fmadd_ps(x0, f, s0); -#else - s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); -#endif - x0 = _mm256_sub_ps(_mm256_loadu_ps(S+8), _mm256_loadu_ps(S2+8)); -#if CV_FMA3 - s1 = _mm256_fmadd_ps(x0, f, s1); -#else - s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f)); -#endif - } - - _mm256_storeu_ps(dst + i, s0); - _mm256_storeu_ps(dst + i + 8, s1); - } - _mm256_zeroupper(); -} +#if CV_TRY_AVX2 + if (haveAVX2) + return SymmColumnVec_32f_Unsymm_AVX(src, ky, dst, delta, width, ksize2); #endif + const __m128 d4 = _mm_set1_ps(delta); for( ; i <= width - 16; i += 16 ) { __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4; diff --git a/modules/imgproc/src/filter.hpp b/modules/imgproc/src/filter.hpp new file mode 100644 index 0000000000..c878fea281 --- /dev/null +++ b/modules/imgproc/src/filter.hpp @@ -0,0 +1,57 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef OPENCV_IMGPROC_FILTER_HPP +#define OPENCV_IMGPROC_FILTER_HPP + +namespace cv +{ +#if CV_TRY_AVX2 + int RowVec_32f_AVX(const float* src0, const float* _kx, float* dst, int width, int cn, int _ksize); + int SymmColumnVec_32f_Symm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2); + int SymmColumnVec_32f_Unsymm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2); +#endif +} + +#endif + +/* End of file. */
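
Note for reviewers: the three routines in filter.avx2.cpp compute, element for element, the same sums as the scalar loops they replace. A plain C++ reference is sketched below for cross-checking the intrinsics; it is illustration only, not part of the patch, and the helper names are placeholders. Unlike these references, the AVX routines only process full 16/8-wide (and 4-wide SSE tail) blocks and return the number of elements handled, leaving the remainder to the caller's existing scalar loop.

// Scalar reference for the vectorized kernels (illustration only, not part of this patch).
//
// RowVec_32f_AVX:               dst[i] = sum_{k=0..ksize-1} kx[k] * src0[i + k*cn]
// SymmColumnVec_32f_Symm_AVX:   dst[i] = delta + ky[0]*src[0][i]
//                                       + sum_{k=1..ksize2} ky[k]*(src[k][i] + src[-k][i])
// SymmColumnVec_32f_Unsymm_AVX: dst[i] = delta
//                                       + sum_{k=1..ksize2} ky[k]*(src[k][i] - src[-k][i])

static void rowFilterRef(const float* src0, const float* kx, float* dst,
                         int width, int cn, int ksize)
{
    for( int i = 0; i < width; i++ )        // width has already been multiplied by cn in the caller
    {
        float s = 0.f;
        for( int k = 0; k < ksize; k++ )
            s += kx[k] * src0[i + k*cn];
        dst[i] = s;
    }
}

static void symmColumnRef(const float** src, const float* ky, float* dst,
                          float delta, int width, int ksize2)
{
    for( int i = 0; i < width; i++ )
    {
        float s = delta + ky[0]*src[0][i];
        for( int k = 1; k <= ksize2; k++ )
            s += ky[k]*(src[k][i] + src[-k][i]);
        dst[i] = s;
    }
}

static void asymmColumnRef(const float** src, const float* ky, float* dst,
                           float delta, int width, int ksize2)
{
    for( int i = 0; i < width; i++ )
    {
        float s = delta;                    // ky[0] contributes nothing for an anti-symmetric kernel
        for( int k = 1; k <= ksize2; k++ )
            s += ky[k]*(src[k][i] - src[-k][i]);
        dst[i] = s;
    }
}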
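
A quick way to exercise the migrated paths from the public API is a separable filter on CV_32F data, which goes through RowVec_32f and SymmColumnVec_32f and, with this patch, dispatches to the AVX routines when CV_CPU_HAS_SUPPORT_AVX2 reports AVX2 at run time. The sketch below is a smoke test only; the image size, kernel size and sigma are arbitrary, and on non-AVX2 machines the SSE/scalar branches run instead.

// Minimal smoke test for the new AVX2 paths (assumption: OpenCV built with
// CV_TRY_AVX2 enabled and run on an AVX2-capable CPU).
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>

int main()
{
    cv::Mat src(480, 640, CV_32FC1);
    cv::randu(src, cv::Scalar::all(0), cv::Scalar::all(1));

    // Symmetric column kernel -> should take the SymmColumnVec_32f_Symm_AVX branch.
    cv::Mat g = cv::getGaussianKernel(7, 1.5, CV_32F);
    cv::Mat blurred;
    cv::sepFilter2D(src, blurred, CV_32F, g, g);

    // Anti-symmetric column kernel (derivative) -> SymmColumnVec_32f_Unsymm_AVX branch.
    cv::Mat kd = (cv::Mat_<float>(3, 1) << -1.f, 0.f, 1.f);
    cv::Mat ks = (cv::Mat_<float>(3, 1) << 1.f, 2.f, 1.f);
    cv::Mat dy;
    cv::sepFilter2D(src, dy, CV_32F, ks, kd);   // Sobel-like y derivative

    return 0;
}

Comparing the output of this program (and its timing) before and after the patch on an AVX2 machine is the simplest regression check, since the migration is intended to be behavior-preserving.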