mirror of https://github.com/opencv/opencv.git
- initialize arithmetic dispatcher - add new universal intrinsic v_absdiffs - add new universal intrinsic v_pack_b - add accumulate version of universal intrinsic v_round - fix sse/avx2:uint8 multiplication overflow - reimplement arithmetic, logic and comparison operations into wide universal intrinsics with full support for all types - reimplement IPP arithmetic, logic and comparison operations in a separate file arithm_ipp.hpp - avoid scalar multiplication if scaling factor eq 1 and use integer multiplication - move C arithmetic operations to precomp.hpp and delete [arithm_simd|arithm_core].hpp - add compatibility with new opencv4 divide policy (pull/12064/head)
parent
d61ad04f11
commit
93ffebc273
14 changed files with 2896 additions and 3702 deletions
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,11 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html
|
||||
|
||||
#include "precomp.hpp" |
||||
#include "arithm_ipp.hpp" |
||||
#include "arithm.simd.hpp" |
||||
#include "arithm.simd_declarations.hpp" |
||||
|
||||
#define ARITHM_DISPATCHING_ONLY |
||||
#include "arithm.simd.hpp" |
File diff suppressed because it is too large
Load Diff
@ -1,623 +0,0 @@ |
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Copyright (C) 2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_ARITHM_CORE_HPP__ |
||||
#define __OPENCV_ARITHM_CORE_HPP__ |
||||
|
||||
#include "arithm_simd.hpp" |
||||
|
||||
namespace cv { |
||||
|
||||
// Saturated element-wise addition functor: dst = saturate_cast<T3>(a + b).
// type1/type2/rtype typedefs are part of the functor contract used by the
// dispatch templates below.
template<typename T1, typename T2=T1, typename T3=T1> struct OpAdd
{
    typedef T1 type1;
    typedef T2 type2;
    typedef T3 rtype;
    T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a + b); }
};

// Saturated element-wise subtraction functor: dst = saturate_cast<T3>(a - b).
template<typename T1, typename T2=T1, typename T3=T1> struct OpSub
{
    typedef T1 type1;
    typedef T2 type2;
    typedef T3 rtype;
    T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a - b); }
};

// Reversed saturated subtraction functor: dst = saturate_cast<T3>(b - a).
// Used to implement "subtract from scalar"-style operations without a
// separate code path.
template<typename T1, typename T2=T1, typename T3=T1> struct OpRSub
{
    typedef T1 type1;
    typedef T2 type2;
    typedef T3 rtype;
    T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(b - a); }
};
||||
|
||||
// Element-wise minimum functor: returns the smaller of the two operands.
template<typename T> struct OpMin
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator ()(const T x, const T y) const
    {
        return std::min(x, y);
    }
};

// Element-wise maximum functor: returns the larger of the two operands.
template<typename T> struct OpMax
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator ()(const T x, const T y) const
    {
        return std::max(x, y);
    }
};
||||
|
||||
// Absolute-difference functor: |a - b|. The branch keeps the subtraction
// in "larger minus smaller" order, so unsigned types never wrap.
template<typename T> struct OpAbsDiff
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()(T x, T y) const
    {
        return x < y ? y - x : x - y;
    }
};

// Floating-point specializations use std::abs so the result can never
// come out as "-0".
template<> struct OpAbsDiff<float>
{
    typedef float type1;
    typedef float type2;
    typedef float rtype;
    float operator()(float x, float y) const
    {
        return std::abs(x - y);
    }
};

template<> struct OpAbsDiff<double>
{
    typedef double type1;
    typedef double type2;
    typedef double rtype;
    double operator()(double x, double y) const
    {
        return std::abs(x - y);
    }
};
||||
|
||||
// Bitwise AND functor.
template<typename T> struct OpAnd
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T x, T y ) const
    {
        return x & y;
    }
};

// Bitwise OR functor.
template<typename T> struct OpOr
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T x, T y ) const
    {
        return x | y;
    }
};

// Bitwise XOR functor.
template<typename T> struct OpXor
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T x, T y ) const
    {
        return x ^ y;
    }
};

// Bitwise NOT functor. The second operand is ignored; it exists only so
// the functor fits the common binary-operation signature used by the
// dispatch templates.
template<typename T> struct OpNot
{
    typedef T type1;
    typedef T type2;
    typedef T rtype;
    T operator()( T x, T ) const
    {
        return ~x;
    }
};
||||
|
||||
//=============================================================================
|
||||
|
||||
// Generic element-wise binary operation over two 2D arrays.
// Op is the scalar functor (e.g. OpAdd<T>); VOp is its SIMD counterpart.
// step1/step2/step are row strides in BYTES; width is in elements.
// Each row goes through up to four passes: a 32-byte vector loop
// (one AVX2 register or a pair of SSE2/NEON registers), a 64-bit SSE2
// tail loop, a 4x-unrolled scalar loop, and a final scalar remainder.
template<typename T, class Op, class VOp>
void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, int width, int height)
{
#if CV_SSE2 || CV_NEON
    VOp vop;
#endif
    Op op;

    // Strides are in bytes, so rows are advanced via uchar* arithmetic.
    for( ; height--; src1 = (const T *)((const uchar *)src1 + step1),
                     src2 = (const T *)((const uchar *)src2 + step2),
                     dst = (T *)((uchar *)dst + step) )
    {
        int x = 0;

#if CV_NEON || CV_SSE2
#if CV_AVX2
        if( USE_AVX2 )
        {
            // One 256-bit register (32 bytes) per iteration.
            for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) )
            {
                typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x);
                r0 = vop(r0, VLoadStore256<T>::load(src2 + x));
                VLoadStore256<T>::store(dst + x, r0);
            }
        }
#else
#if CV_SSE2
        if( USE_SSE2 )
        {
#endif // CV_SSE2
        // Two 128-bit registers (32 bytes total) per iteration.
        for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) )
        {
            typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x );
            typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 16/sizeof(T));
            r0 = vop(r0, VLoadStore128<T>::load(src2 + x ));
            r1 = vop(r1, VLoadStore128<T>::load(src2 + x + 16/sizeof(T)));
            VLoadStore128<T>::store(dst + x , r0);
            VLoadStore128<T>::store(dst + x + 16/sizeof(T), r1);
        }
#if CV_SSE2
        }
#endif // CV_SSE2
#endif // CV_AVX2
#endif // CV_NEON || CV_SSE2

#if CV_AVX2
        // nothing
#elif CV_SSE2
        // 64-bit vector tail for elements the 32-byte loop left over.
        if( USE_SSE2 )
        {
            for( ; x <= width - 8/(int)sizeof(T); x += 8/sizeof(T) )
            {
                typename VLoadStore64<T>::reg_type r = VLoadStore64<T>::load(src1 + x);
                r = vop(r, VLoadStore64<T>::load(src2 + x));
                VLoadStore64<T>::store(dst + x, r);
            }
        }
#endif

#if CV_ENABLE_UNROLLED
        // 4x manually unrolled scalar loop.
        for( ; x <= width - 4; x += 4 )
        {
            T v0 = op(src1[x], src2[x]);
            T v1 = op(src1[x+1], src2[x+1]);
            dst[x] = v0; dst[x+1] = v1;
            v0 = op(src1[x+2], src2[x+2]);
            v1 = op(src1[x+3], src2[x+3]);
            dst[x+2] = v0; dst[x+3] = v1;
        }
#endif

        // Scalar remainder.
        for( ; x < width; x++ )
            dst[x] = op(src1[x], src2[x]);
    }
}
||||
|
||||
// Element-wise binary operation for 32-bit element types (int/float).
// Same layered structure as vBinOp, with one addition: when all three
// pointers are suitably aligned (32-byte for AVX2, 16-byte for SSE2),
// an aligned-load/store fast path runs first.
template<typename T, class Op, class Op32>
void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
              T* dst, size_t step, int width, int height)
{
#if CV_SSE2 || CV_NEON
    Op32 op32;
#endif
    Op op;

    // Strides are in bytes; advance rows via uchar* arithmetic.
    for( ; height--; src1 = (const T *)((const uchar *)src1 + step1),
                     src2 = (const T *)((const uchar *)src2 + step2),
                     dst = (T *)((uchar *)dst + step) )
    {
        int x = 0;

#if CV_AVX2
        if( USE_AVX2 )
        {
            // Aligned 256-bit path: 8 x 32-bit lanes per iteration.
            if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 )
            {
                for( ; x <= width - 8; x += 8 )
                {
                    typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x);
                    r0 = op32(r0, VLoadStore256Aligned<T>::load(src2 + x));
                    VLoadStore256Aligned<T>::store(dst + x, r0);
                }
            }
        }
#elif CV_SSE2
        if( USE_SSE2 )
        {
            // Aligned 128-bit path: two registers (8 lanes) per iteration.
            if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
            {
                for( ; x <= width - 8; x += 8 )
                {
                    typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x );
                    typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 4);
                    r0 = op32(r0, VLoadStore128Aligned<T>::load(src2 + x ));
                    r1 = op32(r1, VLoadStore128Aligned<T>::load(src2 + x + 4));
                    VLoadStore128Aligned<T>::store(dst + x , r0);
                    VLoadStore128Aligned<T>::store(dst + x + 4, r1);
                }
            }
        }
#endif // CV_AVX2

#if CV_NEON || CV_SSE2
#if CV_AVX2
        if( USE_AVX2 )
        {
            // Unaligned 256-bit path (also handles the not-aligned case above).
            for( ; x <= width - 8; x += 8 )
            {
                typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x);
                r0 = op32(r0, VLoadStore256<T>::load(src2 + x));
                VLoadStore256<T>::store(dst + x, r0);
            }
        }
#else
#if CV_SSE2
        if( USE_SSE2 )
        {
#endif // CV_SSE2
        // Unaligned 128-bit path, two registers per iteration.
        for( ; x <= width - 8; x += 8 )
        {
            typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x );
            typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 4);
            r0 = op32(r0, VLoadStore128<T>::load(src2 + x ));
            r1 = op32(r1, VLoadStore128<T>::load(src2 + x + 4));
            VLoadStore128<T>::store(dst + x , r0);
            VLoadStore128<T>::store(dst + x + 4, r1);
        }
#if CV_SSE2
        }
#endif // CV_SSE2
#endif // CV_AVX2
#endif // CV_NEON || CV_SSE2

#if CV_ENABLE_UNROLLED
        // 4x manually unrolled scalar loop.
        for( ; x <= width - 4; x += 4 )
        {
            T v0 = op(src1[x], src2[x]);
            T v1 = op(src1[x+1], src2[x+1]);
            dst[x] = v0; dst[x+1] = v1;
            v0 = op(src1[x+2], src2[x+2]);
            v1 = op(src1[x+3], src2[x+3]);
            dst[x+2] = v0; dst[x+3] = v1;
        }
#endif

        // Scalar remainder.
        for( ; x < width; x++ )
            dst[x] = op(src1[x], src2[x]);
    }
}
||||
|
||||
|
||||
// Element-wise binary operation for 64-bit element types (double).
// Only the aligned SIMD paths exist here (AVX2 or SSE2); everything
// else falls through to the unrolled and plain scalar loops.
template<typename T, class Op, class Op64>
void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2,
              T* dst, size_t step, int width, int height)
{
#if CV_SSE2
    Op64 op64;
#endif
    Op op;

    // Strides are in bytes; advance rows via uchar* arithmetic.
    for( ; height--; src1 = (const T *)((const uchar *)src1 + step1),
                     src2 = (const T *)((const uchar *)src2 + step2),
                     dst = (T *)((uchar *)dst + step) )
    {
        int x = 0;

#if CV_AVX2
        if( USE_AVX2 )
        {
            // Aligned 256-bit path: 4 x 64-bit lanes per iteration.
            if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 )
            {
                for( ; x <= width - 4; x += 4 )
                {
                    typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x);
                    r0 = op64(r0, VLoadStore256Aligned<T>::load(src2 + x));
                    VLoadStore256Aligned<T>::store(dst + x, r0);
                }
            }
        }
#elif CV_SSE2
        if( USE_SSE2 )
        {
            // Aligned 128-bit path: two registers (4 lanes) per iteration.
            if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
            {
                for( ; x <= width - 4; x += 4 )
                {
                    typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x );
                    typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 2);
                    r0 = op64(r0, VLoadStore128Aligned<T>::load(src2 + x ));
                    r1 = op64(r1, VLoadStore128Aligned<T>::load(src2 + x + 2));
                    VLoadStore128Aligned<T>::store(dst + x , r0);
                    VLoadStore128Aligned<T>::store(dst + x + 2, r1);
                }
            }
        }
#endif

        // 4x unrolled scalar loop (unconditional here, unlike vBinOp/vBinOp32).
        for( ; x <= width - 4; x += 4 )
        {
            T v0 = op(src1[x], src2[x]);
            T v1 = op(src1[x+1], src2[x+1]);
            dst[x] = v0; dst[x+1] = v1;
            v0 = op(src1[x+2], src2[x+2]);
            v1 = op(src1[x+3], src2[x+3]);
            dst[x+2] = v0; dst[x+3] = v1;
        }

        // Scalar remainder.
        for( ; x < width; x++ )
            dst[x] = op(src1[x], src2[x]);
    }
}
||||
|
||||
// Element-wise comparison producing an 8-bit mask (255 where the
// predicate holds, 0 otherwise).
// The four ordering predicates are reduced to two kernels:
//   CMP_GE/CMP_LT are rewritten as CMP_LE/CMP_GT with swapped sources,
//   then GT and LE share one loop (m == 0 keeps the mask, m == 255
//   inverts it); EQ and NE share the other the same way.
// -(cond) yields 0 or -1 (0xFF as uchar), so "^ m" implements the
// optional inversion branch-free.
template<typename T> static void
cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
     uchar* dst, size_t step, int width, int height, int code)
{
    // Convert byte strides to element strides (dst stays in bytes: uchar).
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    if( code == CMP_GE || code == CMP_LT )
    {
        std::swap(src1, src2);
        std::swap(step1, step2);
        code = code == CMP_GE ? CMP_LE : CMP_GT;
    }

    Cmp_SIMD<T> vop(code);

    if( code == CMP_GT || code == CMP_LE )
    {
        int m = code == CMP_GT ? 0 : 255;  // LE = NOT(GT)
        for( ; height--; src1 += step1, src2 += step2, dst += step )
        {
            // vop handles the vectorizable prefix and returns the next index.
            int x = vop(src1, src2, dst, width);
#if CV_ENABLE_UNROLLED
            for( ; x <= width - 4; x += 4 )
            {
                int t0, t1;
                t0 = -(src1[x] > src2[x]) ^ m;
                t1 = -(src1[x+1] > src2[x+1]) ^ m;
                dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
                t0 = -(src1[x+2] > src2[x+2]) ^ m;
                t1 = -(src1[x+3] > src2[x+3]) ^ m;
                dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
            }
#endif
            for( ; x < width; x++ )
                dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
        }
    }
    else if( code == CMP_EQ || code == CMP_NE )
    {
        int m = code == CMP_EQ ? 0 : 255;  // NE = NOT(EQ)
        for( ; height--; src1 += step1, src2 += step2, dst += step )
        {
            // NOTE: no SIMD prefix here; the EQ/NE loop starts at 0.
            int x = 0;
#if CV_ENABLE_UNROLLED
            for( ; x <= width - 4; x += 4 )
            {
                int t0, t1;
                t0 = -(src1[x] == src2[x]) ^ m;
                t1 = -(src1[x+1] == src2[x+1]) ^ m;
                dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
                t0 = -(src1[x+2] == src2[x+2]) ^ m;
                t1 = -(src1[x+3] == src2[x+3]) ^ m;
                dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
            }
#endif
            for( ; x < width; x++ )
                dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
        }
    }
}
||||
|
||||
// Element-wise multiplication with optional scaling:
//   dst = saturate_cast<T>(scale * src1 * src2)
// computed in the wider working type WT. When scale == 1 the scalar
// multiply (and the WT widening of the product expression) is skipped.
// Strides arrive in bytes and are converted to element units here.
template<typename T, typename WT> static void
mul_( const T* src1, size_t step1, const T* src2, size_t step2,
      T* dst, size_t step, int width, int height, WT scale )
{
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    Mul_SIMD<T, WT> vop;

    if( scale == (WT)1. )
    {
        // Fast path: plain saturated product, no scaling.
        for( ; height--; src1 += step1, src2 += step2, dst += step )
        {
            // vop processes the vectorizable prefix, returns the next index.
            int i = vop(src1, src2, dst, width, scale);
#if CV_ENABLE_UNROLLED
            for(; i <= width - 4; i += 4 )
            {
                T t0;
                T t1;
                t0 = saturate_cast<T>(src1[i  ] * src2[i  ]);
                t1 = saturate_cast<T>(src1[i+1] * src2[i+1]);
                dst[i  ] = t0;
                dst[i+1] = t1;

                t0 = saturate_cast<T>(src1[i+2] * src2[i+2]);
                t1 = saturate_cast<T>(src1[i+3] * src2[i+3]);
                dst[i+2] = t0;
                dst[i+3] = t1;
            }
#endif
            for( ; i < width; i++ )
                dst[i] = saturate_cast<T>(src1[i] * src2[i]);
        }
    }
    else
    {
        // General path: widen to WT, scale, then saturate back to T.
        for( ; height--; src1 += step1, src2 += step2, dst += step )
        {
            int i = vop(src1, src2, dst, width, scale);
#if CV_ENABLE_UNROLLED
            for(; i <= width - 4; i += 4 )
            {
                T t0 = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
                T t1 = saturate_cast<T>(scale*(WT)src1[i+1]*src2[i+1]);
                dst[i] = t0; dst[i+1] = t1;

                t0 = saturate_cast<T>(scale*(WT)src1[i+2]*src2[i+2]);
                t1 = saturate_cast<T>(scale*(WT)src1[i+3]*src2[i+3]);
                dst[i+2] = t0; dst[i+3] = t1;
            }
#endif
            for( ; i < width; i++ )
                dst[i] = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
        }
    }
}
||||
|
||||
|
||||
// Scaled element-wise division for integer types:
//   dst = saturate_cast<T>(src1 * scale / src2), with 0 where src2 == 0
// (OpenCV's divide-by-zero convention).
// NOTE(review): the SIMD functor receives the double `scale` while the
// scalar tail uses the float `scale_f` — presumably intentional for
// speed, but the two paths may round differently; confirm.
template<typename T> static void
div_i( const T* src1, size_t step1, const T* src2, size_t step2,
       T* dst, size_t step, int width, int height, double scale )
{
    // Convert byte strides to element strides.
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    Div_SIMD<T> vop;
    float scale_f = (float)scale;

    for( ; height--; src1 += step1, src2 += step2, dst += step )
    {
        // vop handles the vectorizable prefix and returns the next index.
        int i = vop(src1, src2, dst, width, scale);
        for( ; i < width; i++ )
        {
            T num = src1[i], denom = src2[i];
            dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
        }
    }
}
||||
|
||||
// Scaled element-wise division for floating-point types:
//   dst = saturate_cast<T>(src1 * scale / src2), with 0 where src2 == 0.
// The scale is narrowed to T once up front so the inner loop works
// entirely in the element type.
template<typename T> static void
div_f( const T* src1, size_t step1, const T* src2, size_t step2,
       T* dst, size_t step, int width, int height, double scale )
{
    T scale_f = (T)scale;
    // Convert byte strides to element strides.
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    Div_SIMD<T> vop;

    for( ; height--; src1 += step1, src2 += step2, dst += step )
    {
        // vop handles the vectorizable prefix and returns the next index.
        int i = vop(src1, src2, dst, width, scale);
        for( ; i < width; i++ )
        {
            T num = src1[i], denom = src2[i];
            dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
        }
    }
}
||||
|
||||
// Scaled element-wise reciprocal for integer types:
//   dst = saturate_cast<T>(scale / src2), with 0 where src2 == 0.
// Only one source array is involved. As in div_i, the SIMD functor gets
// the double scale while the scalar tail uses a float copy.
template<typename T> static void
recip_i( const T* src2, size_t step2,
         T* dst, size_t step, int width, int height, double scale )
{
    // Convert byte strides to element strides.
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    Recip_SIMD<T> vop;
    float scale_f = (float)scale;

    for( ; height--; src2 += step2, dst += step )
    {
        // vop handles the vectorizable prefix and returns the next index.
        int i = vop(src2, dst, width, scale);
        for( ; i < width; i++ )
        {
            T denom = src2[i];
            dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
        }
    }
}
||||
|
||||
// Scaled element-wise reciprocal for floating-point types:
//   dst = saturate_cast<T>(scale / src2), with 0 where src2 == 0.
// The scale is narrowed to the element type T once up front.
template<typename T> static void
recip_f( const T* src2, size_t step2,
         T* dst, size_t step, int width, int height, double scale )
{
    T scale_f = (T)scale;
    // Convert byte strides to element strides.
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    Recip_SIMD<T> vop;

    for( ; height--; src2 += step2, dst += step )
    {
        // vop handles the vectorizable prefix and returns the next index.
        int i = vop(src2, dst, width, scale);
        for( ; i < width; i++ )
        {
            T denom = src2[i];
            dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
        }
    }
}
||||
|
||||
// Weighted sum: dst = saturate_cast<T>(src1*alpha + src2*beta + gamma),
// accumulated in the wider working type WT.
// _scalars points at three doubles: { alpha, beta, gamma } (passed as
// void* to fit the common binary-op function-pointer signature).
template<typename T, typename WT> static void
addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2,
              T* dst, size_t step, int width, int height, void* _scalars )
{
    const double* scalars = (const double*)_scalars;
    WT alpha = (WT)scalars[0], beta = (WT)scalars[1], gamma = (WT)scalars[2];
    // Convert byte strides to element strides.
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    step /= sizeof(dst[0]);

    AddWeighted_SIMD<T, WT> vop;

    for( ; height--; src1 += step1, src2 += step2, dst += step )
    {
        // vop handles the vectorizable prefix and returns the next index.
        int x = vop(src1, src2, dst, width, alpha, beta, gamma);
#if CV_ENABLE_UNROLLED
        for( ; x <= width - 4; x += 4 )
        {
            T t0 = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
            T t1 = saturate_cast<T>(src1[x+1]*alpha + src2[x+1]*beta + gamma);
            dst[x] = t0; dst[x+1] = t1;

            t0 = saturate_cast<T>(src1[x+2]*alpha + src2[x+2]*beta + gamma);
            t1 = saturate_cast<T>(src1[x+3]*alpha + src2[x+3]*beta + gamma);
            dst[x+2] = t0; dst[x+3] = t1;
        }
#endif
        for( ; x < width; x++ )
            dst[x] = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
    }
}
||||
|
||||
} // cv::
|
||||
|
||||
|
||||
#endif // __OPENCV_ARITHM_CORE_HPP__
|
@ -0,0 +1,417 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html
|
||||
#if ARITHM_USE_IPP |
||||
|
||||
namespace cv { namespace hal { |
||||
|
||||
//=======================================
|
||||
// Arithmetic and logical operations
|
||||
// +, -, *, /, &, |, ^, ~, abs ...
|
||||
//=======================================
|
||||
|
||||
// Common body for IPP-backed binary array wrappers. Expands inside a
// function that has step1/step2/step/width/height/dst in scope and
// returns int: 1 if IPP handled the operation (recording the IPP hit),
// 0 to make the caller fall back to the generic implementation.
// For single-row images the (possibly padded) strides are replaced by
// the tightly packed row size so IPP processes one contiguous run.
// (Comments must stay outside the macro body: a '//' on a continuation
// line would swallow the trailing backslash.)
#define ARITHM_IPP_BIN(fun, ...) \
do { \
    if (!CV_IPP_CHECK_COND) \
        return 0; \
    if (height == 1) \
        step1 = step2 = step = width * sizeof(dst[0]); \
    if (0 <= CV_INSTRUMENT_FUN_IPP(fun, __VA_ARGS__)) \
    { \
        CV_IMPL_ADD(CV_IMPL_IPP); \
        return 1; \
    } \
    setIppErrorStatus(); \
    return 0; \
} while(0)
||||
|
||||
//=======================================
|
||||
// Addition
|
||||
//=======================================
|
||||
|
||||
// IPP-backed saturated addition wrappers. Each returns 1 when IPP
// handled the operation, 0 to request the generic fallback.
// The integer variants use the *_Sfs primitives with scale factor 0,
// i.e. a plain saturated add.
inline int arithm_ipp_add8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                            uchar* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiAdd_8u_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0);
}

inline int arithm_ipp_add16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
                             ushort* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiAdd_16u_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0);
}

inline int arithm_ipp_add16s(const short* src1, size_t step1, const short* src2, size_t step2,
                             short* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiAdd_16s_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0);
}

inline int arithm_ipp_add32f(const float* src1, size_t step1, const float* src2, size_t step2,
                             float* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiAdd_32f_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
}

// No IPP primitive for these element types: always report "not handled"
// so the caller uses the generic implementation.
#define arithm_ipp_add8s(...) 0
#define arithm_ipp_add32s(...) 0
#define arithm_ipp_add64f(...) 0
||||
|
||||
//=======================================
|
||||
// Subtract
|
||||
//=======================================
|
||||
|
||||
// IPP-backed saturated subtraction wrappers (dst = src1 - src2).
// The sources are passed to IPP in swapped order: ippiSub computes
// (second operand) - (first operand), so passing (src2, src1)
// presumably yields src1 - src2 — confirm against the IPP reference.
inline int arithm_ipp_sub8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                            uchar* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiSub_8u_C1RSfs, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0);
}

inline int arithm_ipp_sub16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
                             ushort* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiSub_16u_C1RSfs, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0);
}

inline int arithm_ipp_sub16s(const short* src1, size_t step1, const short* src2, size_t step2,
                             short* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiSub_16s_C1RSfs, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0);
}

inline int arithm_ipp_sub32f(const float* src1, size_t step1, const float* src2, size_t step2,
                             float* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiSub_32f_C1R, src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height));
}

// No IPP primitive for these element types: always fall back.
#define arithm_ipp_sub8s(...) 0
#define arithm_ipp_sub32s(...) 0
#define arithm_ipp_sub64f(...) 0
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Common body for IPP-backed min/max wrappers. IPP only offers 1D
// "every element" primitives (ippsMinEvery/ippsMaxEvery), so the image
// is processed row by row, advancing the pointers by the byte strides.
// If any row fails, the loop stops early, the IPP error status is
// recorded, and 0 is returned so the caller falls back; only a fully
// processed image returns 1.
// (Comments must stay outside the macro body: a '//' on a continuation
// line would swallow the trailing backslash.)
#define ARITHM_IPP_MIN_MAX(fun, type) \
do { \
    if (!CV_IPP_CHECK_COND) \
        return 0; \
    type* s1 = (type*)src1; \
    type* s2 = (type*)src2; \
    type* d = dst; \
    if (height == 1) \
        step1 = step2 = step = width * sizeof(dst[0]); \
    int i = 0; \
    for(; i < height; i++) \
    { \
        if (0 > CV_INSTRUMENT_FUN_IPP(fun, s1, s2, d, width)) \
            break; \
        s1 = (type*)((uchar*)s1 + step1); \
        s2 = (type*)((uchar*)s2 + step2); \
        d = (type*)((uchar*)d + step); \
    } \
    if (i == height) \
    { \
        CV_IMPL_ADD(CV_IMPL_IPP); \
        return 1; \
    } \
    setIppErrorStatus(); \
    return 0; \
} while(0)
||||
|
||||
//=======================================
|
||||
// Max
|
||||
//=======================================
|
||||
|
||||
// IPP-backed element-wise maximum wrappers (row-by-row via
// ippsMaxEvery_*). Return 1 on success, 0 to request the fallback.
inline int arithm_ipp_max8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                            uchar* dst, size_t step, int width, int height)
{
    ARITHM_IPP_MIN_MAX(ippsMaxEvery_8u, uchar);
}

inline int arithm_ipp_max16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
                             ushort* dst, size_t step, int width, int height)
{
    ARITHM_IPP_MIN_MAX(ippsMaxEvery_16u, ushort);
}

inline int arithm_ipp_max32f(const float* src1, size_t step1, const float* src2, size_t step2,
                             float* dst, size_t step, int width, int height)
{
    ARITHM_IPP_MIN_MAX(ippsMaxEvery_32f, float);
}

inline int arithm_ipp_max64f(const double* src1, size_t step1, const double* src2, size_t step2,
                             double* dst, size_t step, int width, int height)
{
    ARITHM_IPP_MIN_MAX(ippsMaxEvery_64f, double);
}

// No IPP primitive for the signed integer types: always fall back.
#define arithm_ipp_max8s(...) 0
#define arithm_ipp_max16s(...) 0
#define arithm_ipp_max32s(...) 0
||||
|
||||
//=======================================
|
||||
// Min
|
||||
//=======================================
|
||||
|
||||
// IPP-backed element-wise minimum wrappers (row-by-row via
// ippsMinEvery_*). Return 1 on success, 0 to request the fallback.
inline int arithm_ipp_min8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                            uchar* dst, size_t step, int width, int height)
{
    ARITHM_IPP_MIN_MAX(ippsMinEvery_8u, uchar);
}

inline int arithm_ipp_min16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
                             ushort* dst, size_t step, int width, int height)
{
    ARITHM_IPP_MIN_MAX(ippsMinEvery_16u, ushort);
}

inline int arithm_ipp_min32f(const float* src1, size_t step1, const float* src2,size_t step2,
                             float* dst, size_t step, int width, int height)
{
    ARITHM_IPP_MIN_MAX(ippsMinEvery_32f, float);
}

inline int arithm_ipp_min64f(const double* src1, size_t step1, const double* src2, size_t step2,
                             double* dst, size_t step, int width, int height)
{
    ARITHM_IPP_MIN_MAX(ippsMinEvery_64f, double);
}

// No IPP primitive for the signed integer types: always fall back.
#define arithm_ipp_min8s(...) 0
#define arithm_ipp_min16s(...) 0
#define arithm_ipp_min32s(...) 0
||||
|
||||
//=======================================
|
||||
// AbsDiff
|
||||
//=======================================
|
||||
|
||||
// IPP-backed absolute-difference wrappers (dst = |src1 - src2|).
// Return 1 on success, 0 to request the generic fallback.
inline int arithm_ipp_absdiff8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                                uchar* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiAbsDiff_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
}

inline int arithm_ipp_absdiff16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
                                 ushort* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiAbsDiff_16u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
}

inline int arithm_ipp_absdiff32f(const float* src1, size_t step1, const float* src2, size_t step2,
                                 float* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiAbsDiff_32f_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
}

// No IPP primitive for these element types: always fall back.
#define arithm_ipp_absdiff8s(...) 0
#define arithm_ipp_absdiff16s(...) 0
#define arithm_ipp_absdiff32s(...) 0
#define arithm_ipp_absdiff64f(...) 0
||||
|
||||
//=======================================
|
||||
// Logical
|
||||
//=======================================
|
||||
|
||||
// IPP-backed bitwise logical wrappers (8-bit only; wider types are
// handled by reinterpreting the data as bytes at the call site —
// presumably, confirm against the dispatcher).
// Return 1 on success, 0 to request the generic fallback.
inline int arithm_ipp_and8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                            uchar* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiAnd_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
}

inline int arithm_ipp_or8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                           uchar* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiOr_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
}

inline int arithm_ipp_xor8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                            uchar* dst, size_t step, int width, int height)
{
    ARITHM_IPP_BIN(ippiXor_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
}
||||
|
||||
// IPP-backed bitwise NOT (unary, so it cannot use ARITHM_IPP_BIN; the
// same pattern is written out by hand). Returns 1 when IPP handled the
// operation, 0 to request the generic fallback.
inline int arithm_ipp_not8u(const uchar* src1, size_t step1, uchar* dst, size_t step, int width, int height)
{
    if (!CV_IPP_CHECK_COND)
        return 0;
    // Single row: collapse possibly padded strides to the packed row size.
    if (height == 1)
        step1 = step = width * sizeof(dst[0]);
    if (0 <= CV_INSTRUMENT_FUN_IPP(ippiNot_8u_C1R, src1, (int)step1, dst, (int)step, ippiSize(width, height)))
    {
        CV_IMPL_ADD(CV_IMPL_IPP);
        return 1;
    }
    setIppErrorStatus();
    return 0;
}
||||
|
||||
//=======================================
|
||||
// Compare
|
||||
//=======================================
|
||||
|
||||
// Common driver for the arithm_ipp_cmp* wrappers below. It is a statement
// macro and expects the caller's scope to provide cmpop, step1, step2, step,
// dst, width and height. Behavior:
//   - returns 0 (generic fallback) when IPP is disabled or the comparison
//     op has no IPP equivalent (arithm_ipp_convert_cmp yields -1),
//   - collapses a single-row image into one contiguous span,
//   - returns 1 once the IPP call succeeds; on IPP failure records the
//     error status and returns 0.
// (Comments cannot appear inside the backslash-continued body, hence this
// header block.)
#define ARITHM_IPP_CMP(fun, ...)                          \
do {                                                      \
    if (!CV_IPP_CHECK_COND)                               \
        return 0;                                         \
    IppCmpOp op = arithm_ipp_convert_cmp(cmpop);          \
    if (op < 0)                                           \
        return 0;                                         \
    if (height == 1)                                      \
        step1 = step2 = step = width * sizeof(dst[0]);    \
    if (0 <= CV_INSTRUMENT_FUN_IPP(fun, __VA_ARGS__, op)) \
    {                                                     \
        CV_IMPL_ADD(CV_IMPL_IPP);                         \
        return 1;                                         \
    }                                                     \
    setIppErrorStatus();                                  \
    return 0;                                             \
} while(0)
||||
|
||||
// Translate an OpenCV comparison code (CMP_*) into the matching IPP
// IppCmpOp value. Codes without an IPP counterpart (e.g. CMP_NE) map to
// (IppCmpOp)-1, which ARITHM_IPP_CMP treats as "use the generic path".
inline IppCmpOp arithm_ipp_convert_cmp(int cmpop)
{
    switch (cmpop)
    {
    case CMP_EQ:
        return ippCmpEq;
    case CMP_GT:
        return ippCmpGreater;
    case CMP_GE:
        return ippCmpGreaterEq;
    case CMP_LT:
        return ippCmpLess;
    case CMP_LE:
        return ippCmpLessEq;
    }
    return (IppCmpOp)-1; // unsupported comparison
}
||||
|
||||
// Element-wise comparison of two 8-bit images via IPP (ippiCompare_8u_C1R);
// cmpop is consumed by ARITHM_IPP_CMP, which appends the converted IPP op.
// Returns 1 when IPP handled the operation, 0 for generic fallback.
inline int arithm_ipp_cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                            uchar* dst, size_t step, int width, int height, int cmpop)
{
    ARITHM_IPP_CMP(ippiCompare_8u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
}
||||
|
||||
// Element-wise comparison of two 16-bit unsigned images via IPP
// (ippiCompare_16u_C1R); cmpop is consumed by ARITHM_IPP_CMP.
// Returns 1 when IPP handled the operation, 0 for generic fallback.
inline int arithm_ipp_cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
                             uchar* dst, size_t step, int width, int height, int cmpop)
{
    ARITHM_IPP_CMP(ippiCompare_16u_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
}
||||
|
||||
// Element-wise comparison of two 16-bit signed images via IPP
// (ippiCompare_16s_C1R); cmpop is consumed by ARITHM_IPP_CMP.
// Returns 1 when IPP handled the operation, 0 for generic fallback.
inline int arithm_ipp_cmp16s(const short* src1, size_t step1, const short* src2, size_t step2,
                             uchar* dst, size_t step, int width, int height, int cmpop)
{
    ARITHM_IPP_CMP(ippiCompare_16s_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
}
||||
|
||||
// Element-wise comparison of two 32-bit float images via IPP
// (ippiCompare_32f_C1R); cmpop is consumed by ARITHM_IPP_CMP.
// Returns 1 when IPP handled the operation, 0 for generic fallback.
inline int arithm_ipp_cmp32f(const float* src1, size_t step1, const float* src2, size_t step2,
                             uchar* dst, size_t step, int width, int height, int cmpop)
{
    ARITHM_IPP_CMP(ippiCompare_32f_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
}
||||
|
||||
// No IPP path for compare on these types: always 0 => generic fallback.
#define arithm_ipp_cmp8s(...) 0
#define arithm_ipp_cmp32s(...) 0
#define arithm_ipp_cmp64f(...) 0
||||
|
||||
//=======================================
|
||||
// Multiply
|
||||
//=======================================
|
||||
|
||||
// Common driver for the arithm_ipp_mul* wrappers below; `scale` comes from
// the caller's scope. The IPP path is used only for plain (unscaled)
// multiplication: any scale factor measurably different from 1 returns 0
// so the generic scaled implementation runs instead. On IPP failure the
// error status is recorded and 0 is returned; on success returns 1.
// (Comments cannot appear inside the backslash-continued body, hence this
// header block.)
#define ARITHM_IPP_MUL(fun, ...)                    \
do {                                                \
    if (!CV_IPP_CHECK_COND)                         \
        return 0;                                   \
    float fscale = (float)scale;                    \
    if (std::fabs(fscale - 1) > FLT_EPSILON)        \
        return 0;                                   \
    if (0 <= CV_INSTRUMENT_FUN_IPP(fun, __VA_ARGS__)) \
    {                                               \
        CV_IMPL_ADD(CV_IMPL_IPP);                   \
        return 1;                                   \
    }                                               \
    setIppErrorStatus();                            \
    return 0;                                       \
} while(0)
||||
|
||||
// Element-wise 8-bit multiplication via IPP (ippiMul_8u_C1RSfs).
// `scale` is consumed by ARITHM_IPP_MUL (only scale ~= 1 uses IPP); the
// trailing 0 is the IPP scaleFactor argument of the *Sfs variant, i.e. no
// additional result downscaling. Returns 1 if IPP handled it, 0 otherwise.
inline int arithm_ipp_mul8u(const uchar *src1, size_t step1, const uchar *src2, size_t step2,
                            uchar *dst, size_t step, int width, int height, double scale)
{
    ARITHM_IPP_MUL(ippiMul_8u_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0);
}
||||
// Element-wise 16-bit unsigned multiplication via IPP (ippiMul_16u_C1RSfs).
// `scale` is consumed by ARITHM_IPP_MUL; trailing 0 is the IPP scaleFactor
// (no downscaling). Returns 1 if IPP handled it, 0 otherwise.
inline int arithm_ipp_mul16u(const ushort *src1, size_t step1, const ushort *src2, size_t step2,
                             ushort *dst, size_t step, int width, int height, double scale)
{
    ARITHM_IPP_MUL(ippiMul_16u_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0);
}
||||
|
||||
// Element-wise 16-bit signed multiplication via IPP (ippiMul_16s_C1RSfs).
// `scale` is consumed by ARITHM_IPP_MUL; trailing 0 is the IPP scaleFactor
// (no downscaling). Returns 1 if IPP handled it, 0 otherwise.
inline int arithm_ipp_mul16s(const short *src1, size_t step1, const short *src2, size_t step2,
                             short *dst, size_t step, int width, int height, double scale)
{
    ARITHM_IPP_MUL(ippiMul_16s_C1RSfs, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0);
}
||||
|
||||
// Element-wise 32-bit float multiplication via IPP (ippiMul_32f_C1R).
// `scale` is consumed by ARITHM_IPP_MUL; the float variant has no
// scaleFactor argument. Returns 1 if IPP handled it, 0 otherwise.
inline int arithm_ipp_mul32f(const float *src1, size_t step1, const float *src2, size_t step2,
                             float *dst, size_t step, int width, int height, double scale)
{
    ARITHM_IPP_MUL(ippiMul_32f_C1R, src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height));
}
||||
|
||||
// No IPP path for multiply on these types: always 0 => generic fallback.
#define arithm_ipp_mul8s(...) 0
#define arithm_ipp_mul32s(...) 0
#define arithm_ipp_mul64f(...) 0
||||
|
||||
//=======================================
|
||||
// Div
|
||||
//=======================================
|
||||
|
||||
// No IPP path is provided for divide on any type: every macro expands to 0,
// so the generic implementation always runs.
#define arithm_ipp_div8u(...) 0
#define arithm_ipp_div8s(...) 0
#define arithm_ipp_div16u(...) 0
#define arithm_ipp_div16s(...) 0
#define arithm_ipp_div32s(...) 0
#define arithm_ipp_div32f(...) 0
#define arithm_ipp_div64f(...) 0
||||
|
||||
//=======================================
|
||||
// AddWeighted
|
||||
//=======================================
|
||||
|
||||
// No IPP path is provided for addWeighted on any type: every macro expands
// to 0, so the generic implementation always runs.
#define arithm_ipp_addWeighted8u(...) 0
#define arithm_ipp_addWeighted8s(...) 0
#define arithm_ipp_addWeighted16u(...) 0
#define arithm_ipp_addWeighted16s(...) 0
#define arithm_ipp_addWeighted32s(...) 0
#define arithm_ipp_addWeighted32f(...) 0
#define arithm_ipp_addWeighted64f(...) 0
||||
|
||||
//=======================================
|
||||
// Reciprocal
|
||||
//=======================================
|
||||
|
||||
// No IPP path is provided for reciprocal on any type: every macro expands
// to 0, so the generic implementation always runs.
#define arithm_ipp_recip8u(...) 0
#define arithm_ipp_recip8s(...) 0
#define arithm_ipp_recip16u(...) 0
#define arithm_ipp_recip16s(...) 0
#define arithm_ipp_recip32s(...) 0
#define arithm_ipp_recip32f(...) 0
#define arithm_ipp_recip64f(...) 0
||||
|
||||
/** Stub template: copy this block when adding a new operation "fun" that has no IPP path yet.
|
||||
#define arithm_ipp_8u(...) 0 |
||||
#define arithm_ipp_8s(...) 0 |
||||
#define arithm_ipp_16u(...) 0 |
||||
#define arithm_ipp_16s(...) 0 |
||||
#define arithm_ipp_32s(...) 0 |
||||
#define arithm_ipp_32f(...) 0 |
||||
#define arithm_ipp_64f(...) 0 |
||||
**/ |
||||
|
||||
}} // cv::hal::
|
||||
|
||||
// Try the IPP implementation first: if `fun` reports success (non-zero),
// return from the ENCLOSING (void) function immediately; otherwise fall
// through to the generic code that follows the macro.
// NOTE(review): this is a bare { } block rather than do { } while(0) —
// presumably because call sites invoke it without a trailing semicolon
// (the no-IPP fallback below expands to nothing); confirm against usage in
// arithm.simd.hpp before changing the form.
#define ARITHM_CALL_IPP(fun, ...)       \
{                                       \
    if (__CV_EXPAND(fun(__VA_ARGS__)))  \
        return;                         \
}
||||
|
||||
#endif // ARITHM_USE_IPP



// When IPP support is compiled out, ARITHM_CALL_IPP expands to nothing so
// the generic implementation always runs.
#if !ARITHM_USE_IPP
#define ARITHM_CALL_IPP(...)
#endif
File diff suppressed because it is too large
Load Diff
Loading…
Reference in new issue