/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include "arithm_simd.hpp"
#include "arithm_core.hpp"
#include "replacement.hpp"
namespace cv { namespace hal {
//=======================================
#undef CALL_HAL
#define CALL_HAL(fun) \
int res = fun(src1, step1, src2, step2, dst, step, width, height); \
if (res == Error::Ok) \
return; \
else if (res != Error::NotImplemented) \
throw Failure(res);
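// Each exported entry point below tries up to three implementations, in order:
//   1. a pluggable HAL hook (CALL_HAL), declared in replacement.hpp;
//   2. an Intel IPP primitive, when ARITHM_USE_IPP is enabled;
//   3. the generic SIMD/scalar loops from arithm_core.hpp / arithm_simd.hpp.
// A hook opts out by returning Error::NotImplemented, which lets the dispatch
// fall through. Illustrative sketch of a default hook (the real stubs live in
// replacement.hpp and may differ):
//
//   inline int hal_add8u(const uchar*, size_t, const uchar*, size_t,
//                        uchar*, size_t, int, int)
//   { return Error::NotImplemented; } // fall through to IPP/SIMD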
#if (ARITHM_USE_IPP == 1)
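// OpenCV may pass degenerate row steps for single-row images; fixSteps()
// rewrites them to the contiguous row size so the IPP calls see valid strides.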
static inline void fixSteps(int width, int height, size_t elemSize, size_t& step1, size_t& step2, size_t& step)
{
if( height == 1 )
step1 = step2 = step = width*elemSize;
}
#define CALL_IPP_BIN_E_12(fun) \
CV_IPP_CHECK() \
{ \
fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \
if (0 <= fun(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0)) \
{ \
CV_IMPL_ADD(CV_IMPL_IPP); \
return; \
} \
setIppErrorStatus(); \
}
#define CALL_IPP_BIN_12(fun) \
CV_IPP_CHECK() \
{ \
fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \
if (0 <= fun(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height))) \
{ \
CV_IMPL_ADD(CV_IMPL_IPP); \
return; \
} \
setIppErrorStatus(); \
}
#define CALL_IPP_UN(fun) \
CV_IPP_CHECK() \
{ \
fixSteps(width, height, sizeof(dst[0]), step1, step2, step); (void)src2; \
if (0 <= fun(src1, (int)step1, dst, (int)step, ippiSize(width, height))) \
{ \
CV_IMPL_ADD(CV_IMPL_IPP); \
return; \
} \
setIppErrorStatus(); \
}
#else
#define CALL_IPP_BIN_E_12(fun)
#define CALL_IPP_BIN_12(fun)
#define CALL_IPP_UN(fun)
#endif
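// Macro naming: the _E_ flavors wrap the IPP *Sfs functions, which take a
// trailing integer scale factor (always 0 here, i.e. no extra scaling); the
// plain flavors wrap the unscaled functions, and CALL_IPP_UN the unary ones.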
//=======================================
// Add
//=======================================
void add8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_add8u)
CALL_IPP_BIN_E_12(ippiAdd_8u_C1RSfs)
(vBinOp<uchar, cv::OpAdd<uchar>, IF_SIMD(VAdd<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
}
void add8s( const schar* src1, size_t step1,
const schar* src2, size_t step2,
schar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_add8s)
vBinOp<schar, cv::OpAdd<schar>, IF_SIMD(VAdd<schar>)>(src1, step1, src2, step2, dst, step, width, height);
}
void add16u( const ushort* src1, size_t step1,
const ushort* src2, size_t step2,
ushort* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_add16u)
CALL_IPP_BIN_E_12(ippiAdd_16u_C1RSfs)
(vBinOp<ushort, cv::OpAdd<ushort>, IF_SIMD(VAdd<ushort>)>(src1, step1, src2, step2, dst, step, width, height));
}
void add16s( const short* src1, size_t step1,
const short* src2, size_t step2,
short* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_add16s)
CALL_IPP_BIN_E_12(ippiAdd_16s_C1RSfs)
(vBinOp<short, cv::OpAdd<short>, IF_SIMD(VAdd<short>)>(src1, step1, src2, step2, dst, step, width, height));
}
void add32s( const int* src1, size_t step1,
const int* src2, size_t step2,
int* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_add32s)
vBinOp32<int, cv::OpAdd<int>, IF_SIMD(VAdd<int>)>(src1, step1, src2, step2, dst, step, width, height);
}
void add32f( const float* src1, size_t step1,
const float* src2, size_t step2,
float* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_add32f)
CALL_IPP_BIN_12(ippiAdd_32f_C1R)
(vBinOp32<float, cv::OpAdd<float>, IF_SIMD(VAdd<float>)>(src1, step1, src2, step2, dst, step, width, height));
}
void add64f( const double* src1, size_t step1,
const double* src2, size_t step2,
double* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_add64f)
vBinOp64<double, cv::OpAdd<double>, IF_SIMD(VAdd<double>)>(src1, step1, src2, step2, dst, step, width, height);
}
//=======================================
#if (ARITHM_USE_IPP == 1)
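// The "21" flavors pass the operands to IPP in swapped order: ippiSub computes
// (second argument) - (first argument), so swapping yields dst = src1 - src2.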
#define CALL_IPP_BIN_E_21(fun) \
CV_IPP_CHECK() \
{ \
fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \
if (0 <= fun(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0)) \
{ \
CV_IMPL_ADD(CV_IMPL_IPP); \
return; \
} \
setIppErrorStatus(); \
}
#define CALL_IPP_BIN_21(fun) \
CV_IPP_CHECK() \
{ \
fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \
if (0 <= fun(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height))) \
{ \
CV_IMPL_ADD(CV_IMPL_IPP); \
return; \
} \
setIppErrorStatus(); \
}
#else
#define CALL_IPP_BIN_E_21(fun)
#define CALL_IPP_BIN_21(fun)
#endif
//=======================================
// Subtract
//=======================================
void sub8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_sub8u)
CALL_IPP_BIN_E_21(ippiSub_8u_C1RSfs)
(vBinOp<uchar, cv::OpSub<uchar>, IF_SIMD(VSub<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
}
void sub8s( const schar* src1, size_t step1,
const schar* src2, size_t step2,
schar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_sub8s)
vBinOp<schar, cv::OpSub<schar>, IF_SIMD(VSub<schar>)>(src1, step1, src2, step2, dst, step, width, height);
}
void sub16u( const ushort* src1, size_t step1,
const ushort* src2, size_t step2,
ushort* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_sub16u)
CALL_IPP_BIN_E_21(ippiSub_16u_C1RSfs)
(vBinOp<ushort, cv::OpSub<ushort>, IF_SIMD(VSub<ushort>)>(src1, step1, src2, step2, dst, step, width, height));
}
void sub16s( const short* src1, size_t step1,
const short* src2, size_t step2,
short* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_sub16s)
CALL_IPP_BIN_E_21(ippiSub_16s_C1RSfs)
(vBinOp<short, cv::OpSub<short>, IF_SIMD(VSub<short>)>(src1, step1, src2, step2, dst, step, width, height));
}
void sub32s( const int* src1, size_t step1,
const int* src2, size_t step2,
int* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_sub32s)
vBinOp32<int, cv::OpSub<int>, IF_SIMD(VSub<int>)>(src1, step1, src2, step2, dst, step, width, height);
}
void sub32f( const float* src1, size_t step1,
const float* src2, size_t step2,
float* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_sub32f)
CALL_IPP_BIN_21(ippiSub_32f_C1R)
(vBinOp32<float, cv::OpSub<float>, IF_SIMD(VSub<float>)>(src1, step1, src2, step2, dst, step, width, height));
}
void sub64f( const double* src1, size_t step1,
const double* src2, size_t step2,
double* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_sub64f)
vBinOp64<double, cv::OpSub<double>, IF_SIMD(VSub<double>)>(src1, step1, src2, step2, dst, step, width, height);
}
//=======================================
#if (ARITHM_USE_IPP == 1)
#define CALL_IPP_MIN_MAX(fun, type) \
CV_IPP_CHECK() \
{ \
type* s1 = (type*)src1; \
type* s2 = (type*)src2; \
type* d = dst; \
fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \
int i = 0; \
for(; i < height; i++) \
{ \
if (0 > fun(s1, s2, d, width)) \
break; \
s1 = (type*)((uchar*)s1 + step1); \
s2 = (type*)((uchar*)s2 + step2); \
d = (type*)((uchar*)d + step); \
} \
if (i == height) \
{ \
CV_IMPL_ADD(CV_IMPL_IPP); \
return; \
} \
setIppErrorStatus(); \
}
#else
#define CALL_IPP_MIN_MAX(fun, type)
#endif
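// Min/max go through the row-wise ipps (signal) *Every primitives rather than
// a strided image call, presumably because no suitable ippi variant was
// available; the macro therefore walks the rows manually, and a failure on
// any row abandons IPP and falls through to the SIMD loop below.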
//=======================================
// Max
//=======================================
void max8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_max8u)
CALL_IPP_MIN_MAX(ippsMaxEvery_8u, uchar)
vBinOp<uchar, cv::OpMax<uchar>, IF_SIMD(VMax<uchar>)>(src1, step1, src2, step2, dst, step, width, height);
}
void max8s( const schar* src1, size_t step1,
const schar* src2, size_t step2,
schar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_max8s)
vBinOp<schar, cv::OpMax<schar>, IF_SIMD(VMax<schar>)>(src1, step1, src2, step2, dst, step, width, height);
}
void max16u( const ushort* src1, size_t step1,
const ushort* src2, size_t step2,
ushort* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_max16u)
CALL_IPP_MIN_MAX(ippsMaxEvery_16u, ushort)
vBinOp<ushort, cv::OpMax<ushort>, IF_SIMD(VMax<ushort>)>(src1, step1, src2, step2, dst, step, width, height);
}
void max16s( const short* src1, size_t step1,
const short* src2, size_t step2,
short* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_max16s)
vBinOp<short, cv::OpMax<short>, IF_SIMD(VMax<short>)>(src1, step1, src2, step2, dst, step, width, height);
}
void max32s( const int* src1, size_t step1,
const int* src2, size_t step2,
int* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_max32s)
vBinOp32<int, cv::OpMax<int>, IF_SIMD(VMax<int>)>(src1, step1, src2, step2, dst, step, width, height);
}
void max32f( const float* src1, size_t step1,
const float* src2, size_t step2,
float* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_max32f)
CALL_IPP_MIN_MAX(ippsMaxEvery_32f, float)
vBinOp32<float, cv::OpMax<float>, IF_SIMD(VMax<float>)>(src1, step1, src2, step2, dst, step, width, height);
}
void max64f( const double* src1, size_t step1,
const double* src2, size_t step2,
double* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_max64f)
CALL_IPP_MIN_MAX(ippsMaxEvery_64f, double)
vBinOp64<double, cv::OpMax<double>, IF_SIMD(VMax<double>)>(src1, step1, src2, step2, dst, step, width, height);
}
//=======================================
// Min
//=======================================
void min8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_min8u)
CALL_IPP_MIN_MAX(ippsMinEvery_8u, uchar)
vBinOp<uchar, cv::OpMin<uchar>, IF_SIMD(VMin<uchar>)>(src1, step1, src2, step2, dst, step, width, height);
}
void min8s( const schar* src1, size_t step1,
const schar* src2, size_t step2,
schar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_min8s)
vBinOp<schar, cv::OpMin<schar>, IF_SIMD(VMin<schar>)>(src1, step1, src2, step2, dst, step, width, height);
}
void min16u( const ushort* src1, size_t step1,
const ushort* src2, size_t step2,
ushort* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_min16u)
CALL_IPP_MIN_MAX(ippsMinEvery_16u, ushort)
vBinOp<ushort, cv::OpMin<ushort>, IF_SIMD(VMin<ushort>)>(src1, step1, src2, step2, dst, step, width, height);
}
void min16s( const short* src1, size_t step1,
const short* src2, size_t step2,
short* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_min16s)
vBinOp<short, cv::OpMin<short>, IF_SIMD(VMin<short>)>(src1, step1, src2, step2, dst, step, width, height);
}
void min32s( const int* src1, size_t step1,
const int* src2, size_t step2,
int* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_min32s)
vBinOp32<int, cv::OpMin<int>, IF_SIMD(VMin<int>)>(src1, step1, src2, step2, dst, step, width, height);
}
void min32f( const float* src1, size_t step1,
const float* src2, size_t step2,
float* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_min32f)
CALL_IPP_MIN_MAX(ippsMinEvery_32f, float)
vBinOp32<float, cv::OpMin<float>, IF_SIMD(VMin<float>)>(src1, step1, src2, step2, dst, step, width, height);
}
void min64f( const double* src1, size_t step1,
const double* src2, size_t step2,
double* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_min64f)
CALL_IPP_MIN_MAX(ippsMinEvery_64f, double)
vBinOp64<double, cv::OpMin<double>, IF_SIMD(VMin<double>)>(src1, step1, src2, step2, dst, step, width, height);
}
//=======================================
// AbsDiff
//=======================================
void absdiff8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_absdiff8u)
CALL_IPP_BIN_12(ippiAbsDiff_8u_C1R)
(vBinOp<uchar, cv::OpAbsDiff<uchar>, IF_SIMD(VAbsDiff<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
}
void absdiff8s( const schar* src1, size_t step1,
const schar* src2, size_t step2,
schar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_absdiff8s)
vBinOp<schar, cv::OpAbsDiff<schar>, IF_SIMD(VAbsDiff<schar>)>(src1, step1, src2, step2, dst, step, width, height);
}
void absdiff16u( const ushort* src1, size_t step1,
const ushort* src2, size_t step2,
ushort* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_absdiff16u)
CALL_IPP_BIN_12(ippiAbsDiff_16u_C1R)
(vBinOp<ushort, cv::OpAbsDiff<ushort>, IF_SIMD(VAbsDiff<ushort>)>(src1, step1, src2, step2, dst, step, width, height));
}
void absdiff16s( const short* src1, size_t step1,
const short* src2, size_t step2,
short* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_absdiff16s)
vBinOp<short, cv::OpAbsDiff<short>, IF_SIMD(VAbsDiff<short>)>(src1, step1, src2, step2, dst, step, width, height);
}
void absdiff32s( const int* src1, size_t step1,
const int* src2, size_t step2,
int* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_absdiff32s)
vBinOp32<int, cv::OpAbsDiff<int>, IF_SIMD(VAbsDiff<int>)>(src1, step1, src2, step2, dst, step, width, height);
}
void absdiff32f( const float* src1, size_t step1,
const float* src2, size_t step2,
float* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_absdiff32f)
CALL_IPP_BIN_12(ippiAbsDiff_32f_C1R)
(vBinOp32<float, cv::OpAbsDiff<float>, IF_SIMD(VAbsDiff<float>)>(src1, step1, src2, step2, dst, step, width, height));
}
void absdiff64f( const double* src1, size_t step1,
const double* src2, size_t step2,
double* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_absdiff64f)
vBinOp64<double, cv::OpAbsDiff<double>, IF_SIMD(VAbsDiff<double>)>(src1, step1, src2, step2, dst, step, width, height);
}
//=======================================
// Logical
//=======================================
void and8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_and8u)
CALL_IPP_BIN_12(ippiAnd_8u_C1R)
(vBinOp<uchar, cv::OpAnd<uchar>, IF_SIMD(VAnd<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
}
void or8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_or8u)
CALL_IPP_BIN_12(ippiOr_8u_C1R)
(vBinOp<uchar, cv::OpOr<uchar>, IF_SIMD(VOr<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
}
void xor8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_xor8u)
CALL_IPP_BIN_12(ippiXor_8u_C1R)
(vBinOp<uchar, cv::OpXor<uchar>, IF_SIMD(VXor<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
}
void not8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* )
{
CALL_HAL(hal_not8u)
CALL_IPP_UN(ippiNot_8u_C1R)
(vBinOp<uchar, cv::OpNot<uchar>, IF_SIMD(VNot<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
}
//=======================================
#undef CALL_HAL
#define CALL_HAL(fun) \
int res = fun(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); \
if (res == Error::Ok) \
return; \
else if (res != Error::NotImplemented) \
throw Failure(res);
#if ARITHM_USE_IPP
inline static IppCmpOp convert_cmp(int _cmpop)
{
return _cmpop == CMP_EQ ? ippCmpEq :
_cmpop == CMP_GT ? ippCmpGreater :
_cmpop == CMP_GE ? ippCmpGreaterEq :
_cmpop == CMP_LT ? ippCmpLess :
_cmpop == CMP_LE ? ippCmpLessEq :
(IppCmpOp)-1;
}
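// CMP_NE has no IppCmpOp counterpart, so convert_cmp() returns -1 for it and
// CALL_IPP_CMP silently falls through to the generic comparison code.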
#define CALL_IPP_CMP(fun) \
CV_IPP_CHECK() \
{ \
IppCmpOp op = convert_cmp(*(int *)_cmpop); \
if( op >= 0 ) \
{ \
fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \
if (0 <= fun(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), op)) \
{ \
CV_IMPL_ADD(CV_IMPL_IPP); \
return; \
} \
setIppErrorStatus(); \
} \
}
#else
#define CALL_IPP_CMP(fun)
#endif
//=======================================
// Compare
//=======================================
void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* _cmpop)
{
CALL_HAL(hal_cmp8u)
CALL_IPP_CMP(ippiCompare_8u_C1R)
// hand-vectorized below; replaces the generic cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop)
int code = *(int*)_cmpop;
step1 /= sizeof(src1[0]);
step2 /= sizeof(src2[0]);
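// Reduce the six comparison codes to two kernels: GE/LT become LE/GT by
// swapping the operands, and NE reuses the EQ kernel with an inverting mask.
// The scalar tails rely on -(cond) being 0x00 or 0xFF, so XOR-ing with
// m (0 or 255) optionally inverts the result.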
if( code == CMP_GE || code == CMP_LT )
{
std::swap(src1, src2);
std::swap(step1, step2);
code = code == CMP_GE ? CMP_LE : CMP_GT;
}
if( code == CMP_GT || code == CMP_LE )
{
int m = code == CMP_GT ? 0 : 255;
for( ; height--; src1 += step1, src2 += step2, dst += step )
{
int x = 0;
#if CV_SSE2
if( USE_SSE2 )
{
__m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi8 (-1);
__m128i c128 = _mm_set1_epi8 (-128);
for( ; x <= width - 16; x += 16 )
{
__m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
__m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
// SSE2 has no unsigned 8-bit compare: bias both operands by 128 so that the signed compare reproduces the unsigned ordering
r00 = _mm_sub_epi8(r00,c128);
r10 = _mm_sub_epi8(r10,c128);
r00 =_mm_xor_si128(_mm_cmpgt_epi8(r00, r10), m128);
_mm_storeu_si128((__m128i*)(dst + x),r00);
}
}
#elif CV_NEON
uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255);
for( ; x <= width - 16; x += 16 )
{
vst1q_u8(dst+x, veorq_u8(vcgtq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask));
}
#endif
for( ; x < width; x++ ){
dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
}
}
}
else if( code == CMP_EQ || code == CMP_NE )
{
int m = code == CMP_EQ ? 0 : 255;
for( ; height--; src1 += step1, src2 += step2, dst += step )
{
int x = 0;
#if CV_SSE2
if( USE_SSE2 )
{
__m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi8 (-1);
for( ; x <= width - 16; x += 16 )
{
__m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
__m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
r00 = _mm_xor_si128 ( _mm_cmpeq_epi8 (r00, r10), m128);
_mm_storeu_si128((__m128i*)(dst + x), r00);
}
}
#elif CV_NEON
uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255);
for( ; x <= width - 16; x += 16 )
{
vst1q_u8(dst+x, veorq_u8(vceqq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask));
}
#endif
for( ; x < width; x++ )
dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
}
}
}
void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* _cmpop)
{
CALL_HAL(hal_cmp8s)
cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);
}
void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* _cmpop)
{
CALL_HAL(hal_cmp16u)
CALL_IPP_CMP(ippiCompare_16u_C1R)
cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);
}
void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* _cmpop)
{
CALL_HAL(hal_cmp16s)
CALL_IPP_CMP(ippiCompare_16s_C1R)
// hand-vectorized below; replaces the generic cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop)
int code = *(int*)_cmpop;
step1 /= sizeof(src1[0]);
step2 /= sizeof(src2[0]);
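// Same reduction as in cmp8u; the 16-bit compare results (0 or -1 per element)
// are packed to bytes with signed saturation, mapping -1 to 0xFF and 0 to 0x00.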
if( code == CMP_GE || code == CMP_LT )
{
std::swap(src1, src2);
std::swap(step1, step2);
code = code == CMP_GE ? CMP_LE : CMP_GT;
}
if( code == CMP_GT || code == CMP_LE )
{
int m = code == CMP_GT ? 0 : 255;
for( ; height--; src1 += step1, src2 += step2, dst += step )
{
int x = 0;
#if CV_SSE2
if( USE_SSE2 )
{
__m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi16 (-1);
for( ; x <= width - 16; x += 16 )
{
__m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
__m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128);
__m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8));
__m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8));
r01 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r01, r11), m128);
r11 = _mm_packs_epi16(r00, r01);
_mm_storeu_si128((__m128i*)(dst + x), r11);
}
if( x <= width - 8 )
{
__m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
__m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128);
r10 = _mm_packs_epi16(r00, r00);
_mm_storel_epi64((__m128i*)(dst + x), r10);
x += 8;
}
}
#elif CV_NEON
uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255);
for( ; x <= width - 16; x += 16 )
{
int16x8_t in1 = vld1q_s16(src1 + x);
int16x8_t in2 = vld1q_s16(src2 + x);
uint8x8_t t1 = vmovn_u16(vcgtq_s16(in1, in2));
in1 = vld1q_s16(src1 + x + 8);
in2 = vld1q_s16(src2 + x + 8);
uint8x8_t t2 = vmovn_u16(vcgtq_s16(in1, in2));
vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask));
}
#endif
for( ; x < width; x++ ){
dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
}
}
}
else if( code == CMP_EQ || code == CMP_NE )
{
int m = code == CMP_EQ ? 0 : 255;
for( ; height--; src1 += step1, src2 += step2, dst += step )
{
int x = 0;
#if CV_SSE2
if( USE_SSE2 )
{
__m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi16 (-1);
for( ; x <= width - 16; x += 16 )
{
__m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
__m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128);
__m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8));
__m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8));
r01 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r01, r11), m128);
r11 = _mm_packs_epi16(r00, r01);
_mm_storeu_si128((__m128i*)(dst + x), r11);
}
if( x <= width - 8)
{
__m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
__m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128);
r10 = _mm_packs_epi16(r00, r00);
_mm_storel_epi64((__m128i*)(dst + x), r10);
x += 8;
}
}
#elif CV_NEON
uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255);
for( ; x <= width - 16; x += 16 )
{
int16x8_t in1 = vld1q_s16(src1 + x);
int16x8_t in2 = vld1q_s16(src2 + x);
uint8x8_t t1 = vmovn_u16(vceqq_s16(in1, in2));
in1 = vld1q_s16(src1 + x + 8);
in2 = vld1q_s16(src2 + x + 8);
uint8x8_t t2 = vmovn_u16(vceqq_s16(in1, in2));
vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask));
}
#endif
for( ; x < width; x++ )
dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
}
}
}
void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* _cmpop)
{
CALL_HAL(hal_cmp32s)
cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);
}
void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* _cmpop)
{
CALL_HAL(hal_cmp32f)
CALL_IPP_CMP(ippiCompare_32f_C1R)
cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);
}
void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* _cmpop)
{
CALL_HAL(hal_cmp64f)
cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop);
}
//=======================================
#undef CALL_HAL
#define CALL_HAL(fun) \
int res = fun(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); \
if (res == Error::Ok) \
return; \
else if (res != Error::NotImplemented) \
throw Failure(res);
#if defined HAVE_IPP
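// IPP multiplication is only attempted when the scale factor is exactly 1;
// scaled products stay on the generic mul_() path.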
#define CALL_IPP_MUL(fun) \
CV_IPP_CHECK() \
{ \
if (std::fabs(fscale - 1) <= FLT_EPSILON) \
{ \
if (fun(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0) >= 0) \
{ \
CV_IMPL_ADD(CV_IMPL_IPP); \
return; \
} \
setIppErrorStatus(); \
} \
}
#define CALL_IPP_MUL_2(fun) \
CV_IPP_CHECK() \
{ \
if (std::fabs(fscale - 1) <= FLT_EPSILON) \
{ \
if (fun(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height)) >= 0) \
{ \
CV_IMPL_ADD(CV_IMPL_IPP); \
return; \
} \
setIppErrorStatus(); \
} \
}
#else
#define CALL_IPP_MUL(fun)
#define CALL_IPP_MUL_2(fun)
#endif
//=======================================
// Multiply
//=======================================
void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_mul8u)
float fscale = (float)*(const double*)scale;
CALL_IPP_MUL(ippiMul_8u_C1RSfs)
mul_(src1, step1, src2, step2, dst, step, width, height, fscale);
}
void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
schar* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_mul8s)
mul_(src1, step1, src2, step2, dst, step, width, height, (float)*(const double*)scale);
}
void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
ushort* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_mul16u)
float fscale = (float)*(const double*)scale;
CALL_IPP_MUL(ippiMul_16u_C1RSfs)
mul_(src1, step1, src2, step2, dst, step, width, height, fscale);
}
void mul16s( const short* src1, size_t step1, const short* src2, size_t step2,
short* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_mul16s)
float fscale = (float)*(const double*)scale;
CALL_IPP_MUL(ippiMul_16s_C1RSfs)
mul_(src1, step1, src2, step2, dst, step, width, height, fscale);
}
void mul32s( const int* src1, size_t step1, const int* src2, size_t step2,
int* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_mul32s)
mul_(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
void mul32f( const float* src1, size_t step1, const float* src2, size_t step2,
float* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_mul32f)
float fscale = (float)*(const double*)scale;
CALL_IPP_MUL_2(ippiMul_32f_C1R)
mul_(src1, step1, src2, step2, dst, step, width, height, fscale);
}
void mul64f( const double* src1, size_t step1, const double* src2, size_t step2,
double* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_mul64f)
mul_(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
//=======================================
// Divide
//=======================================
void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_div8u)
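// A null src1 selects the reciprocal path (dst = saturate(scale / src2));
// the dedicated recip* entry points below always take that path.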
if( src1 )
div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
else
recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
schar* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_div8s)
div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
ushort* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_div16u)
div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
void div16s( const short* src1, size_t step1, const short* src2, size_t step2,
short* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_div16s)
div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
void div32s( const int* src1, size_t step1, const int* src2, size_t step2,
int* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_div32s)
div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
void div32f( const float* src1, size_t step1, const float* src2, size_t step2,
float* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_div32f)
div_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
void div64f( const double* src1, size_t step1, const double* src2, size_t step2,
double* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_div64f)
div_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
//=======================================
// Reciprocal
//=======================================
void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_recip8u)
recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
schar* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_recip8s)
recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
ushort* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_recip16u)
recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
void recip16s( const short* src1, size_t step1, const short* src2, size_t step2,
short* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_recip16s)
recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
void recip32s( const int* src1, size_t step1, const int* src2, size_t step2,
int* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_recip32s)
recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
void recip32f( const float* src1, size_t step1, const float* src2, size_t step2,
float* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_recip32f)
recip_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
void recip64f( const double* src1, size_t step1, const double* src2, size_t step2,
double* dst, size_t step, int width, int height, void* scale)
{
CALL_HAL(hal_recip64f)
recip_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale);
}
//=======================================
#undef CALL_HAL
#define CALL_HAL(fun) \
int res = fun(src1, step1, src2, step2, dst, step, width, height, scalars); \
if (res == Error::Ok) \
return; \
else if (res != Error::NotImplemented) \
throw Failure(res);
//=======================================
// Add weighted
//=======================================
void
addWeighted8u( const uchar* src1, size_t step1,
const uchar* src2, size_t step2,
uchar* dst, size_t step, int width, int height,
void* scalars )
{
CALL_HAL(hal_addWeighted8u)
const double* scalars_ = (const double*)scalars;
float alpha = (float)scalars_[0], beta = (float)scalars_[1], gamma = (float)scalars_[2];
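// Per row: widen u8 to f32, compute alpha*src1 + beta*src2 + gamma, and
// saturate back to u8. The SSE2 and NEON paths handle 8 pixels per iteration;
// the unrolled and scalar tails cover the remainder.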
for( ; height--; src1 += step1, src2 += step2, dst += step )
{
int x = 0;
#if CV_SSE2
if( USE_SSE2 )
{
__m128 a4 = _mm_set1_ps(alpha), b4 = _mm_set1_ps(beta), g4 = _mm_set1_ps(gamma);
__m128i z = _mm_setzero_si128();
for( ; x <= width - 8; x += 8 )
{
__m128i u = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src1 + x)), z);
__m128i v = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src2 + x)), z);
__m128 u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(u, z));
__m128 u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(u, z));
__m128 v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v, z));
__m128 v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, z));
u0 = _mm_add_ps(_mm_mul_ps(u0, a4), _mm_mul_ps(v0, b4));
u1 = _mm_add_ps(_mm_mul_ps(u1, a4), _mm_mul_ps(v1, b4));
u0 = _mm_add_ps(u0, g4); u1 = _mm_add_ps(u1, g4);
u = _mm_packs_epi32(_mm_cvtps_epi32(u0), _mm_cvtps_epi32(u1));
u = _mm_packus_epi16(u, u);
_mm_storel_epi64((__m128i*)(dst + x), u);
}
}
#elif CV_NEON
float32x4_t g = vdupq_n_f32 (gamma);
for( ; x <= width - 8; x += 8 )
{
uint8x8_t in1 = vld1_u8(src1+x);
uint16x8_t in1_16 = vmovl_u8(in1);
float32x4_t in1_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in1_16)));
float32x4_t in1_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in1_16)));
uint8x8_t in2 = vld1_u8(src2+x);
uint16x8_t in2_16 = vmovl_u8(in2);
float32x4_t in2_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in2_16)));
float32x4_t in2_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in2_16)));
float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta));
float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta));
out_f_l = vaddq_f32(out_f_l, g);
out_f_h = vaddq_f32(out_f_h, g);
uint16x4_t out_16_l = vqmovun_s32(cv_vrndq_s32_f32(out_f_l));
uint16x4_t out_16_h = vqmovun_s32(cv_vrndq_s32_f32(out_f_h));
uint16x8_t out_16 = vcombine_u16(out_16_l, out_16_h);
uint8x8_t out = vqmovn_u16(out_16);
vst1_u8(dst+x, out);
}
#endif
#if CV_ENABLE_UNROLLED
for( ; x <= width - 4; x += 4 )
{
float t0, t1;
t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma;
t1 = CV_8TO32F(src1[x+1])*alpha + CV_8TO32F(src2[x+1])*beta + gamma;
dst[x] = saturate_cast<uchar>(t0);
dst[x+1] = saturate_cast<uchar>(t1);
t0 = CV_8TO32F(src1[x+2])*alpha + CV_8TO32F(src2[x+2])*beta + gamma;
t1 = CV_8TO32F(src1[x+3])*alpha + CV_8TO32F(src2[x+3])*beta + gamma;
dst[x+2] = saturate_cast<uchar>(t0);
dst[x+3] = saturate_cast<uchar>(t1);
}
#endif
for( ; x < width; x++ )
{
float t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma;
dst[x] = saturate_cast<uchar>(t0);
}
}
}
void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
schar* dst, size_t step, int width, int height, void* scalars )
{
CALL_HAL(hal_addWeighted8s)
addWeighted_<schar, float>(src1, step1, src2, step2, dst, step, width, height, scalars);
}
void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
ushort* dst, size_t step, int width, int height, void* scalars )
{
CALL_HAL(hal_addWeighted16u)
addWeighted_<ushort, float>(src1, step1, src2, step2, dst, step, width, height, scalars);
}
void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2,
short* dst, size_t step, int width, int height, void* scalars )
{
CALL_HAL(hal_addWeighted16s)
addWeighted_<short, float>(src1, step1, src2, step2, dst, step, width, height, scalars);
}
void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2,
int* dst, size_t step, int width, int height, void* scalars )
{
CALL_HAL(hal_addWeighted32s)
addWeighted_<int, double>(src1, step1, src2, step2, dst, step, width, height, scalars);
}
void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2,
float* dst, size_t step, int width, int height, void* scalars )
{
CALL_HAL(hal_addWeighted32f)
addWeighted_<float, double>(src1, step1, src2, step2, dst, step, width, height, scalars);
}
void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2,
double* dst, size_t step, int width, int height, void* scalars )
{
CALL_HAL(hal_addWeighted64f)
addWeighted_<double, double>(src1, step1, src2, step2, dst, step, width, height, scalars);
}
}} // cv::hal::