mirror of https://github.com/opencv/opencv.git
parent
ab0f0f26a1
commit
4437e0c3b9
9 changed files with 4517 additions and 4504 deletions
@ -0,0 +1,339 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html
|
||||
|
||||
|
||||
#include "precomp.hpp" |
||||
#include "stat.hpp" |
||||
|
||||
namespace cv |
||||
{ |
||||
|
||||
template<typename _Tp, typename _Rt> |
||||
void batchDistL1_(const _Tp* src1, const _Tp* src2, size_t step2, |
||||
int nvecs, int len, _Rt* dist, const uchar* mask) |
||||
{ |
||||
step2 /= sizeof(src2[0]); |
||||
if( !mask ) |
||||
{ |
||||
for( int i = 0; i < nvecs; i++ ) |
||||
dist[i] = normL1<_Tp, _Rt>(src1, src2 + step2*i, len); |
||||
} |
||||
else |
||||
{ |
||||
_Rt val0 = std::numeric_limits<_Rt>::max(); |
||||
for( int i = 0; i < nvecs; i++ ) |
||||
dist[i] = mask[i] ? normL1<_Tp, _Rt>(src1, src2 + step2*i, len) : val0; |
||||
} |
||||
} |
||||
|
||||
template<typename _Tp, typename _Rt> |
||||
void batchDistL2Sqr_(const _Tp* src1, const _Tp* src2, size_t step2, |
||||
int nvecs, int len, _Rt* dist, const uchar* mask) |
||||
{ |
||||
step2 /= sizeof(src2[0]); |
||||
if( !mask ) |
||||
{ |
||||
for( int i = 0; i < nvecs; i++ ) |
||||
dist[i] = normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len); |
||||
} |
||||
else |
||||
{ |
||||
_Rt val0 = std::numeric_limits<_Rt>::max(); |
||||
for( int i = 0; i < nvecs; i++ ) |
||||
dist[i] = mask[i] ? normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len) : val0; |
||||
} |
||||
} |
||||
|
||||
template<typename _Tp, typename _Rt> |
||||
void batchDistL2_(const _Tp* src1, const _Tp* src2, size_t step2, |
||||
int nvecs, int len, _Rt* dist, const uchar* mask) |
||||
{ |
||||
step2 /= sizeof(src2[0]); |
||||
if( !mask ) |
||||
{ |
||||
for( int i = 0; i < nvecs; i++ ) |
||||
dist[i] = std::sqrt(normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len)); |
||||
} |
||||
else |
||||
{ |
||||
_Rt val0 = std::numeric_limits<_Rt>::max(); |
||||
for( int i = 0; i < nvecs; i++ ) |
||||
dist[i] = mask[i] ? std::sqrt(normL2Sqr<_Tp, _Rt>(src1, src2 + step2*i, len)) : val0; |
||||
} |
||||
} |
||||
|
||||
// Hamming distances between one query descriptor and nvecs database
// descriptors; bit counting is delegated to the HAL. Masked-out vectors
// receive INT_MAX so they are never selected as nearest neighbours.
static void batchDistHamming(const uchar* src1, const uchar* src2, size_t step2,
                             int nvecs, int len, int* dist, const uchar* mask)
{
    step2 /= sizeof(src2[0]); // byte stride -> element stride (no-op for uchar)
    if( !mask )
    {
        for( int i = 0; i < nvecs; i++ )
            dist[i] = hal::normHamming(src1, src2 + step2*i, len);
    }
    else
    {
        int val0 = INT_MAX; // sentinel distance for masked-out vectors
        for( int i = 0; i < nvecs; i++ )
        {
            if (mask[i])
                dist[i] = hal::normHamming(src1, src2 + step2*i, len);
            else
                dist[i] = val0;
        }
    }
}
||||
|
||||
// Same as batchDistHamming, but computes NORM_HAMMING2 (bits are compared in
// pairs; the trailing argument 2 is the cell size passed to the HAL).
static void batchDistHamming2(const uchar* src1, const uchar* src2, size_t step2,
                              int nvecs, int len, int* dist, const uchar* mask)
{
    step2 /= sizeof(src2[0]); // byte stride -> element stride (no-op for uchar)
    if( !mask )
    {
        for( int i = 0; i < nvecs; i++ )
            dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2);
    }
    else
    {
        int val0 = INT_MAX; // sentinel distance for masked-out vectors
        for( int i = 0; i < nvecs; i++ )
        {
            if (mask[i])
                dist[i] = hal::normHamming(src1, src2 + step2*i, len, 2);
            else
                dist[i] = val0;
        }
    }
}
||||
|
||||
// Type-erasure wrapper: L1 distance, 8-bit input, 32-bit integer output.
static void batchDistL1_8u32s(const uchar* src1, const uchar* src2, size_t step2,
                              int nvecs, int len, int* dist, const uchar* mask)
{
    batchDistL1_<uchar, int>(src1, src2, step2, nvecs, len, dist, mask);
}
||||
|
||||
// Type-erasure wrapper: L1 distance, 8-bit input, 32-bit float output.
static void batchDistL1_8u32f(const uchar* src1, const uchar* src2, size_t step2,
                              int nvecs, int len, float* dist, const uchar* mask)
{
    batchDistL1_<uchar, float>(src1, src2, step2, nvecs, len, dist, mask);
}
||||
|
||||
// Type-erasure wrapper: squared L2 distance, 8-bit input, 32-bit integer output.
static void batchDistL2Sqr_8u32s(const uchar* src1, const uchar* src2, size_t step2,
                                 int nvecs, int len, int* dist, const uchar* mask)
{
    batchDistL2Sqr_<uchar, int>(src1, src2, step2, nvecs, len, dist, mask);
}
||||
|
||||
// Type-erasure wrapper: squared L2 distance, 8-bit input, 32-bit float output.
static void batchDistL2Sqr_8u32f(const uchar* src1, const uchar* src2, size_t step2,
                                 int nvecs, int len, float* dist, const uchar* mask)
{
    batchDistL2Sqr_<uchar, float>(src1, src2, step2, nvecs, len, dist, mask);
}
||||
|
||||
// Type-erasure wrapper: L2 distance, 8-bit input, 32-bit float output.
static void batchDistL2_8u32f(const uchar* src1, const uchar* src2, size_t step2,
                              int nvecs, int len, float* dist, const uchar* mask)
{
    batchDistL2_<uchar, float>(src1, src2, step2, nvecs, len, dist, mask);
}
||||
|
||||
// Type-erasure wrapper: L1 distance, 32-bit float input and output.
static void batchDistL1_32f(const float* src1, const float* src2, size_t step2,
                            int nvecs, int len, float* dist, const uchar* mask)
{
    batchDistL1_<float, float>(src1, src2, step2, nvecs, len, dist, mask);
}
||||
|
||||
// Type-erasure wrapper: squared L2 distance, 32-bit float input and output.
static void batchDistL2Sqr_32f(const float* src1, const float* src2, size_t step2,
                               int nvecs, int len, float* dist, const uchar* mask)
{
    batchDistL2Sqr_<float, float>(src1, src2, step2, nvecs, len, dist, mask);
}
||||
|
||||
// Type-erasure wrapper: L2 distance, 32-bit float input and output.
static void batchDistL2_32f(const float* src1, const float* src2, size_t step2,
                            int nvecs, int len, float* dist, const uchar* mask)
{
    batchDistL2_<float, float>(src1, src2, step2, nvecs, len, dist, mask);
}
||||
|
||||
// Common signature all batchDist* kernels are cast to; element types of the
// sources and of dist are erased to uchar* and recovered inside each kernel.
typedef void (*BatchDistFunc)(const uchar* src1, const uchar* src2, size_t step2,
                              int nvecs, int len, uchar* dist, const uchar* mask);


// Parallel body that processes a range of rows of src1: for each query row it
// computes distances to every row of src2 and, when K > 0, keeps only the K
// smallest distances (with their indices) via insertion into sorted arrays.
struct BatchDistInvoker : public ParallelLoopBody
{
    BatchDistInvoker( const Mat& _src1, const Mat& _src2,
                      Mat& _dist, Mat& _nidx, int _K,
                      const Mat& _mask, int _update,
                      BatchDistFunc _func)
    {
        src1 = &_src1;
        src2 = &_src2;
        dist = &_dist;
        nidx = &_nidx;
        K = _K;
        mask = &_mask;
        update = _update;
        func = _func;
    }

    void operator()(const Range& range) const
    {
        // Scratch array of distances to all rows of src2 for one query row;
        // only used when a top-K selection follows (K > 0).
        AutoBuffer<int> buf(src2->rows);
        int* bufptr = buf;

        for( int i = range.start; i < range.end; i++ )
        {
            // K == 0: kernel writes straight into the output row.
            // K > 0 : kernel writes into the scratch buffer first.
            func(src1->ptr(i), src2->ptr(), src2->step, src2->rows, src2->cols,
                 K > 0 ? (uchar*)bufptr : dist->ptr(i), mask->data ? mask->ptr(i) : 0);

            if( K > 0 )
            {
                int* nidxptr = nidx->ptr<int>(i);
                // since positive float's can be compared just like int's,
                // we handle both CV_32S and CV_32F cases with a single branch
                int* distptr = (int*)dist->ptr(i);

                int j, k;

                for( j = 0; j < src2->rows; j++ )
                {
                    int d = bufptr[j];
                    // Insert candidate j into the ascending top-K list:
                    // shift strictly-larger entries right, then drop it in.
                    if( d < distptr[K-1] )
                    {
                        for( k = K-2; k >= 0 && distptr[k] > d; k-- )
                        {
                            nidxptr[k+1] = nidxptr[k];
                            distptr[k+1] = distptr[k];
                        }
                        nidxptr[k+1] = j + update;
                        distptr[k+1] = d;
                    }
                }
            }
        }
    }

    const Mat *src1;     // query vectors, one per row
    const Mat *src2;     // database vectors, one per row
    Mat *dist;           // full distance matrix (K == 0) or top-K distances per row
    Mat *nidx;           // indices of the top-K neighbours (only valid when K > 0)
    const Mat *mask;     // optional per-(query,database) mask
    int K;               // neighbours to keep; 0 = keep all distances
    int update;          // offset added to stored indices (for incremental updates)
    BatchDistFunc func;  // type-erased distance kernel
};
||||
|
||||
} |
||||
|
||||
// Computes distances between every row of src1 and every row of src2 using
// the requested norm, optionally keeping only the K nearest neighbours per
// query row (_nidx receives their indices) and optionally cross-checking
// matches in both directions.
//   dtype = -1 selects CV_32S for Hamming norms and CV_32F otherwise.
//   update is added to stored neighbour indices (supports incremental search
//   over several src2 chunks); crosscheck requires K == 1, update == 0 and
//   no mask.
void cv::batchDistance( InputArray _src1, InputArray _src2,
                        OutputArray _dist, int dtype, OutputArray _nidx,
                        int normType, int K, InputArray _mask,
                        int update, bool crosscheck )
{
    CV_INSTRUMENT_REGION()

    Mat src1 = _src1.getMat(), src2 = _src2.getMat(), mask = _mask.getMat();
    int type = src1.type();
    // Inputs must agree in type and dimensionality; only 8-bit and float data
    // are supported by the kernels below.
    CV_Assert( type == src2.type() && src1.cols == src2.cols &&
               (type == CV_32F || type == CV_8U));
    CV_Assert( _nidx.needed() == (K > 0) );

    if( dtype == -1 )
    {
        dtype = normType == NORM_HAMMING || normType == NORM_HAMMING2 ? CV_32S : CV_32F;
    }
    CV_Assert( (type == CV_8U && dtype == CV_32S) || dtype == CV_32F);

    // Cannot keep more neighbours than there are database vectors.
    K = std::min(K, src2.rows);

    _dist.create(src1.rows, (K > 0 ? K : src2.rows), dtype);
    Mat dist = _dist.getMat(), nidx;
    if( _nidx.needed() )
    {
        _nidx.create(dist.size(), CV_32S);
        nidx = _nidx.getMat();
    }

    // Fresh top-K search: seed distances with "infinity" and indices with -1.
    if( update == 0 && K > 0 )
    {
        dist = Scalar::all(dtype == CV_32S ? (double)INT_MAX : (double)FLT_MAX);
        nidx = Scalar::all(-1);
    }

    if( crosscheck )
    {
        CV_Assert( K == 1 && update == 0 && mask.empty() );
        // Run the search in the reverse direction (src2 against src1).
        Mat tdist, tidx;
        batchDistance(src2, src1, tdist, dtype, tidx, normType, K, mask, 0, false);

        // if an idx-th element from src1 appeared to be the nearest to i-th element of src2,
        // we update the minimum mutual distance between idx-th element of src1 and the whole src2 set.
        // As a result, if nidx[idx] = i*, it means that idx-th element of src1 is the nearest
        // to i*-th element of src2 and i*-th element of src2 is the closest to idx-th element of src1.
        // If nidx[idx] = -1, it means that there is no such ideal couple for it in src2.
        // This O(N) procedure is called cross-check and it helps to eliminate some false matches.
        if( dtype == CV_32S )
        {
            for( int i = 0; i < tdist.rows; i++ )
            {
                int idx = tidx.at<int>(i);
                int d = tdist.at<int>(i), d0 = dist.at<int>(idx);
                if( d < d0 )
                {
                    dist.at<int>(idx) = d;
                    nidx.at<int>(idx) = i + update;
                }
            }
        }
        else
        {
            for( int i = 0; i < tdist.rows; i++ )
            {
                int idx = tidx.at<int>(i);
                float d = tdist.at<float>(i), d0 = dist.at<float>(idx);
                if( d < d0 )
                {
                    dist.at<float>(idx) = d;
                    nidx.at<int>(idx) = i + update;
                }
            }
        }
        return;
    }

    // Select the kernel matching (element type, output type, norm).
    BatchDistFunc func = 0;
    if( type == CV_8U )
    {
        if( normType == NORM_L1 && dtype == CV_32S )
            func = (BatchDistFunc)batchDistL1_8u32s;
        else if( normType == NORM_L1 && dtype == CV_32F )
            func = (BatchDistFunc)batchDistL1_8u32f;
        else if( normType == NORM_L2SQR && dtype == CV_32S )
            func = (BatchDistFunc)batchDistL2Sqr_8u32s;
        else if( normType == NORM_L2SQR && dtype == CV_32F )
            func = (BatchDistFunc)batchDistL2Sqr_8u32f;
        else if( normType == NORM_L2 && dtype == CV_32F )
            func = (BatchDistFunc)batchDistL2_8u32f;
        else if( normType == NORM_HAMMING && dtype == CV_32S )
            func = (BatchDistFunc)batchDistHamming;
        else if( normType == NORM_HAMMING2 && dtype == CV_32S )
            func = (BatchDistFunc)batchDistHamming2;
    }
    else if( type == CV_32F && dtype == CV_32F )
    {
        if( normType == NORM_L1 )
            func = (BatchDistFunc)batchDistL1_32f;
        else if( normType == NORM_L2SQR )
            func = (BatchDistFunc)batchDistL2Sqr_32f;
        else if( normType == NORM_L2 )
            func = (BatchDistFunc)batchDistL2_32f;
    }

    if( func == 0 )
        CV_Error_(CV_StsUnsupportedFormat,
                  ("The combination of type=%d, dtype=%d and normType=%d is not supported",
                   type, dtype, normType));

    // Each worker handles a contiguous range of query rows.
    parallel_for_(Range(0, src1.rows),
                  BatchDistInvoker(src1, src2, dist, nidx, K, mask, update, func));
}
@ -0,0 +1,417 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html
|
||||
|
||||
|
||||
#include "precomp.hpp" |
||||
#include "opencl_kernels_core.hpp" |
||||
#include "stat.hpp" |
||||
|
||||
namespace cv { |
||||
|
||||
template<typename T> |
||||
static int countNonZero_(const T* src, int len ) |
||||
{ |
||||
int i=0, nz = 0; |
||||
#if CV_ENABLE_UNROLLED |
||||
for(; i <= len - 4; i += 4 ) |
||||
nz += (src[i] != 0) + (src[i+1] != 0) + (src[i+2] != 0) + (src[i+3] != 0); |
||||
#endif |
||||
for( ; i < len; i++ ) |
||||
nz += src[i] != 0; |
||||
return nz; |
||||
} |
||||
|
||||
// Counts non-zero bytes. The SIMD paths count *zero* bytes in vector
// registers and subtract that from the number of bytes processed; any
// remaining tail is handled by the scalar loop at the end.
static int countNonZero8u( const uchar* src, int len )
{
    int i=0, nz = 0;
#if CV_SSE2
    if(USE_SSE2)//5x-6x
    {
        __m128i v_zero = _mm_setzero_si128();
        __m128i sum = _mm_setzero_si128();

        for (; i<=len-16; i+=16)
        {
            __m128i r0 = _mm_loadu_si128((const __m128i*)(src+i));
            // cmpeq yields 0xFF per zero byte; 0 - 0xFF == 1 per zero byte,
            // and SAD-against-zero sums those 1s into two 64-bit halves.
            sum = _mm_add_epi32(sum, _mm_sad_epu8(_mm_sub_epi8(v_zero, _mm_cmpeq_epi8(r0, v_zero)), v_zero));
        }
        // i bytes were processed; subtract the zero-byte count.
        nz = i - _mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum)));
    }
#elif CV_NEON
    // Blocked accumulation: 8-bit lane counters are widened to 32 bits every
    // blockSize1 bytes so they cannot overflow.
    int len0 = len & -16, blockSize1 = (1 << 8) - 16, blockSize0 = blockSize1 << 6;
    uint32x4_t v_nz = vdupq_n_u32(0u);
    uint8x16_t v_zero = vdupq_n_u8(0), v_1 = vdupq_n_u8(1);
    const uchar * src0 = src;

    while( i < len0 )
    {
        int blockSizei = std::min(len0 - i, blockSize0), j = 0;

        while (j < blockSizei)
        {
            int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
            uint8x16_t v_pz = v_zero;

            // One 8-bit counter increment per zero byte.
            for( ; k <= blockSizej - 16; k += 16 )
                v_pz = vaddq_u8(v_pz, vandq_u8(vceqq_u8(vld1q_u8(src0 + k), v_zero), v_1));

            // Widen the 16 byte counters into the 32-bit accumulator.
            uint16x8_t v_p1 = vmovl_u8(vget_low_u8(v_pz)), v_p2 = vmovl_u8(vget_high_u8(v_pz));
            v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_p1), vget_high_u16(v_p1)), v_nz);
            v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_p2), vget_high_u16(v_p2)), v_nz);

            src0 += blockSizej;
            j += blockSizej;
        }

        i += blockSizei;
    }

    CV_DECL_ALIGNED(16) unsigned int buf[4];
    vst1q_u32(buf, v_nz);
    // i bytes were processed; subtract the zero-byte count.
    nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
#endif
    // Scalar tail (also the whole job when no SIMD path is compiled in).
    for( ; i < len; i++ )
        nz += src[i] != 0;
    return nz;
}
||||
|
||||
// Counts non-zero 16-bit elements. SIMD paths count *zero* elements and
// subtract from the number processed; note that both SIMD branches advance
// src, so the tail is counted via countNonZero_(src, len - i).
static int countNonZero16u( const ushort* src, int len )
{
    int i = 0, nz = 0;
#if CV_SSE2
    if (USE_SSE2)
    {
        __m128i v_zero = _mm_setzero_si128 ();
        __m128i sum = _mm_setzero_si128();

        for ( ; i <= len - 8; i += 8)
        {
            __m128i r0 = _mm_loadu_si128((const __m128i*)(src + i));
            // cmpeq_epi16 yields 0xFFFF per zero element; negation gives two
            // 0x01 bytes per zero element, which SAD then sums.
            sum = _mm_add_epi32(sum, _mm_sad_epu8(_mm_sub_epi8(v_zero, _mm_cmpeq_epi16(r0, v_zero)), v_zero));
        }

        // >> 1: SAD counted 2 bytes per zero 16-bit element.
        nz = i - (_mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum))) >> 1);
        src += i;
    }
#elif CV_NEON
    // Blocked accumulation keeps the 16-bit lane counters from overflowing
    // before they are widened to 32 bits.
    int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
    uint32x4_t v_nz = vdupq_n_u32(0u);
    uint16x8_t v_zero = vdupq_n_u16(0), v_1 = vdupq_n_u16(1);

    while( i < len0 )
    {
        int blockSizei = std::min(len0 - i, blockSize0), j = 0;

        while (j < blockSizei)
        {
            int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
            uint16x8_t v_pz = v_zero;

            // One counter increment per zero element.
            for( ; k <= blockSizej - 8; k += 8 )
                v_pz = vaddq_u16(v_pz, vandq_u16(vceqq_u16(vld1q_u16(src + k), v_zero), v_1));

            v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz);

            src += blockSizej;
            j += blockSizej;
        }

        i += blockSizei;
    }

    CV_DECL_ALIGNED(16) unsigned int buf[4];
    vst1q_u32(buf, v_nz);
    nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
#endif
    // src already advanced by i elements in the SIMD paths.
    return nz + countNonZero_(src, len - i);
}
||||
|
||||
static int countNonZero32s( const int* src, int len ) |
||||
{ |
||||
int i = 0, nz = 0; |
||||
#if CV_SSE2 |
||||
if (USE_SSE2) |
||||
{ |
||||
__m128i v_zero = _mm_setzero_si128 (); |
||||
__m128i sum = _mm_setzero_si128(); |
||||
|
||||
for ( ; i <= len - 4; i += 4) |
||||
{ |
||||
__m128i r0 = _mm_loadu_si128((const __m128i*)(src + i)); |
||||
sum = _mm_add_epi32(sum, _mm_sad_epu8(_mm_sub_epi8(v_zero, _mm_cmpeq_epi32(r0, v_zero)), v_zero)); |
||||
} |
||||
|
||||
nz = i - (_mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum))) >> 2); |
||||
src += i; |
||||
} |
||||
#elif CV_NEON |
||||
int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6; |
||||
uint32x4_t v_nz = vdupq_n_u32(0u); |
||||
int32x4_t v_zero = vdupq_n_s32(0.0f); |
||||
uint16x8_t v_1 = vdupq_n_u16(1u), v_zerou = vdupq_n_u16(0u); |
||||
|
||||
while( i < len0 ) |
||||
{ |
||||
int blockSizei = std::min(len0 - i, blockSize0), j = 0; |
||||
|
||||
while (j < blockSizei) |
||||
{ |
||||
int blockSizej = std::min(blockSizei - j, blockSize1), k = 0; |
||||
uint16x8_t v_pz = v_zerou; |
||||
|
||||
for( ; k <= blockSizej - 8; k += 8 ) |
||||
v_pz = vaddq_u16(v_pz, vandq_u16(vcombine_u16(vmovn_u32(vceqq_s32(vld1q_s32(src + k), v_zero)), |
||||
vmovn_u32(vceqq_s32(vld1q_s32(src + k + 4), v_zero))), v_1)); |
||||
|
||||
v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz); |
||||
|
||||
src += blockSizej; |
||||
j += blockSizej; |
||||
} |
||||
|
||||
i += blockSizei; |
||||
} |
||||
|
||||
CV_DECL_ALIGNED(16) unsigned int buf[4]; |
||||
vst1q_u32(buf, v_nz); |
||||
nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]); |
||||
#endif |
||||
return nz + countNonZero_(src, len - i); |
||||
} |
||||
|
||||
// Counts non-zero 32-bit floats via exact bitwise-equality against 0.0f
// (matches the scalar "!= 0" semantics, so +0.0 and -0.0 both count as zero).
// SIMD paths count zero elements and subtract; both advance src, so the tail
// is counted via countNonZero_(src, len - i).
static int countNonZero32f( const float* src, int len )
{
    int i = 0, nz = 0;
#if CV_SSE2
    if (USE_SSE2)
    {
        __m128 v_zero_f = _mm_setzero_ps();
        __m128i v_zero = _mm_setzero_si128 ();
        __m128i sum = _mm_setzero_si128();

        for ( ; i <= len - 4; i += 4)
        {
            __m128 r0 = _mm_loadu_ps(src + i);
            // cmpeq_ps yields an all-ones 32-bit mask per zero element;
            // negation gives four 0x01 bytes per zero element for SAD.
            sum = _mm_add_epi32(sum, _mm_sad_epu8(_mm_sub_epi8(v_zero, _mm_castps_si128(_mm_cmpeq_ps(r0, v_zero_f))), v_zero));
        }

        // >> 2: SAD counted 4 bytes per zero 32-bit element.
        nz = i - (_mm_cvtsi128_si32(_mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum))) >> 2);
        src += i;
    }
#elif CV_NEON
    // Blocked accumulation keeps the 16-bit lane counters from overflowing
    // before they are widened to 32 bits.
    int len0 = len & -8, blockSize1 = (1 << 15), blockSize0 = blockSize1 << 6;
    uint32x4_t v_nz = vdupq_n_u32(0u);
    float32x4_t v_zero = vdupq_n_f32(0.0f);
    uint16x8_t v_1 = vdupq_n_u16(1u), v_zerou = vdupq_n_u16(0u);

    while( i < len0 )
    {
        int blockSizei = std::min(len0 - i, blockSize0), j = 0;

        while (j < blockSizei)
        {
            int blockSizej = std::min(blockSizei - j, blockSize1), k = 0;
            uint16x8_t v_pz = v_zerou;

            // Two 4-lane compares are narrowed and fused into one 8-lane
            // counter increment per zero element.
            for( ; k <= blockSizej - 8; k += 8 )
                v_pz = vaddq_u16(v_pz, vandq_u16(vcombine_u16(vmovn_u32(vceqq_f32(vld1q_f32(src + k), v_zero)),
                    vmovn_u32(vceqq_f32(vld1q_f32(src + k + 4), v_zero))), v_1));

            v_nz = vaddq_u32(vaddl_u16(vget_low_u16(v_pz), vget_high_u16(v_pz)), v_nz);

            src += blockSizej;
            j += blockSizej;
        }

        i += blockSizei;
    }

    CV_DECL_ALIGNED(16) unsigned int buf[4];
    vst1q_u32(buf, v_nz);
    nz += i - saturate_cast<int>(buf[0] + buf[1] + buf[2] + buf[3]);
#endif
    // src already advanced by i elements in the SIMD paths.
    return nz + countNonZero_(src, len - i);
}
||||
|
||||
// No SIMD specialization for doubles; defer to the generic scalar counter.
static int countNonZero64f( const double* src, int len )
{
    return countNonZero_(src, len);
}
||||
|
||||
// Common function-pointer type for the per-depth counters; the element
// pointer is erased to uchar*.
typedef int (*CountNonZeroFunc)(const uchar*, int);

// Depth-indexed dispatch table (CV_8U..CV_64F). CV_8S reuses the unsigned
// 8-bit counter and CV_16S the 16-bit one: "!= 0" is signedness-agnostic.
// CV_16F (last slot) is unsupported and maps to 0.
static CountNonZeroFunc getCountNonZeroTab(int depth)
{
    static CountNonZeroFunc countNonZeroTab[] =
    {
        (CountNonZeroFunc)GET_OPTIMIZED(countNonZero8u), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero8u),
        (CountNonZeroFunc)GET_OPTIMIZED(countNonZero16u), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero16u),
        (CountNonZeroFunc)GET_OPTIMIZED(countNonZero32s), (CountNonZeroFunc)GET_OPTIMIZED(countNonZero32f),
        (CountNonZeroFunc)GET_OPTIMIZED(countNonZero64f), 0
    };

    return countNonZeroTab[depth];
}
||||
|
||||
|
||||
#ifdef HAVE_OPENCL |
||||
// OpenCL implementation of countNonZero: launches the "reduce" kernel with
// OP_COUNT_NON_ZERO so that each compute unit produces one partial count,
// then sums the partials on the host. Returns false (so the caller falls
// back) when the device or kernel cannot handle the request.
static bool ocl_countNonZero( InputArray _src, int & res )
{
    int type = _src.type(), depth = CV_MAT_DEPTH(type), kercn = ocl::predictOptimalVectorWidth(_src);
    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;

    if (depth == CV_64F && !doubleSupport)
        return false;

    // One partial result per compute unit.
    int dbsize = ocl::Device::getDefault().maxComputeUnits();
    size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();

    // Largest power of two not exceeding the max work-group size.
    int wgs2_aligned = 1;
    while (wgs2_aligned < (int)wgs)
        wgs2_aligned <<= 1;
    wgs2_aligned >>= 1;

    ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
                  format("-D srcT=%s -D srcT1=%s -D cn=1 -D OP_COUNT_NON_ZERO"
                         " -D WGS=%d -D kercn=%d -D WGS2_ALIGNED=%d%s%s",
                         ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
                         ocl::typeToStr(depth), (int)wgs, kercn,
                         wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                         _src.isContinuous() ? " -D HAVE_SRC_CONT" : ""));
    if (k.empty())
        return false;

    UMat src = _src.getUMat(), db(1, dbsize, CV_32SC1);
    k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
           dbsize, ocl::KernelArg::PtrWriteOnly(db));

    size_t globalsize = dbsize * wgs;
    if (k.run(1, &globalsize, &wgs, true))
        // Host-side reduction of the per-compute-unit partial counts.
        return res = saturate_cast<int>(cv::sum(db.getMat(ACCESS_READ))[0]), true;
    return false;
}
||||
#endif |
||||
|
||||
#if defined HAVE_IPP |
||||
// IPP implementation of countNonZero (CV_8U / CV_32F only): counts elements
// *in the range [0, 0]* (i.e. zeros) and subtracts from the total element
// count. Returns false so the caller falls back for unsupported cases.
static bool ipp_countNonZero( Mat &src, int &res )
{
    CV_INSTRUMENT_REGION_IPP()

#if IPP_VERSION_X100 < 201801
    // Poor performance of SSE42
    if(cv::ipp::getIppTopFeatures() == ippCPUID_SSE42)
        return false;
#endif

    Ipp32s count = 0;
    int depth = src.depth();

    if(src.dims <= 2)
    {
        IppStatus status;
        // Channels are folded into the width: countNonZero only accepts
        // single-channel input, and zero-testing is per element anyway.
        IppiSize size = {src.cols*src.channels(), src.rows};

        if(depth == CV_8U)
            status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_8u_C1R, (const Ipp8u *)src.ptr(), (int)src.step, size, &count, 0, 0);
        else if(depth == CV_32F)
            status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_32f_C1R, (const Ipp32f *)src.ptr(), (int)src.step, size, &count, 0, 0);
        else
            return false;

        if(status < 0)
            return false;

        // Non-zero count = total elements - zero count.
        res = size.width*size.height - count;
    }
    else
    {
        // N-dimensional case: iterate over contiguous planes.
        IppStatus status;
        const Mat *arrays[] = {&src, NULL};
        Mat planes[1];
        NAryMatIterator it(arrays, planes, 1);
        IppiSize size = {(int)it.size*src.channels(), 1};
        res = 0;
        for (size_t i = 0; i < it.nplanes; i++, ++it)
        {
            if(depth == CV_8U)
                status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_8u_C1R, it.planes->ptr<Ipp8u>(), (int)it.planes->step, size, &count, 0, 0);
            else if(depth == CV_32F)
                status = CV_INSTRUMENT_FUN_IPP(ippiCountInRange_32f_C1R, it.planes->ptr<Ipp32f>(), (int)it.planes->step, size, &count, 0, 0);
            else
                return false;

            // Sanity check: the zero count can never exceed the plane size.
            if(status < 0 || (int)it.planes->total()*src.channels() < count)
                return false;

            res += (int)it.planes->total()*src.channels() - count;
        }
    }

    return true;
}
||||
#endif |
||||
|
||||
} // cv::
|
||||
|
||||
// Counts non-zero elements of a single-channel array. Tries the OpenCL and
// IPP paths first (each falls through on failure), then dispatches on depth
// to a scalar/SIMD counter applied plane by plane.
int cv::countNonZero( InputArray _src )
{
    CV_INSTRUMENT_REGION()

    int type = _src.type(), cn = CV_MAT_CN(type);
    CV_Assert( cn == 1 );

#if defined HAVE_OPENCL || defined HAVE_IPP
    int res = -1;
#endif

#ifdef HAVE_OPENCL
    // Returns res from this function if the OpenCL path succeeds.
    CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
                ocl_countNonZero(_src, res),
                res)
#endif

    Mat src = _src.getMat();
    // Returns res from this function if the IPP path succeeds.
    CV_IPP_RUN_FAST(ipp_countNonZero(src, res), res);

    CountNonZeroFunc func = getCountNonZeroTab(src.depth());
    CV_Assert( func != 0 );

    // Iterate over contiguous planes and accumulate per-plane counts.
    const Mat* arrays[] = {&src, 0};
    uchar* ptrs[1];
    NAryMatIterator it(arrays, ptrs);
    int total = (int)it.size, nz = 0;

    for( size_t i = 0; i < it.nplanes; i++, ++it )
        nz += func( ptrs[0], total );

    return nz;
}
||||
|
||||
// Collects the coordinates of all non-zero pixels of a single-channel 8-bit
// matrix into _idx as an n x 1 CV_32SC2 array of (x, y) points, in row-major
// scan order. Releases _idx when there are no non-zero pixels.
void cv::findNonZero( InputArray _src, OutputArray _idx )
{
    CV_INSTRUMENT_REGION()

    Mat src = _src.getMat();
    CV_Assert( src.type() == CV_8UC1 );

    const int total = countNonZero(src);
    if( total == 0 )
    {
        _idx.release();
        return;
    }

    // A non-continuous user-supplied Mat cannot hold the packed output;
    // drop it so create() allocates a fresh continuous one.
    if( _idx.kind() == _InputArray::MAT && !_idx.getMatRef().isContinuous() )
        _idx.release();
    _idx.create(total, 1, CV_32SC2);
    Mat idx = _idx.getMat();
    CV_Assert(idx.isContinuous());

    Point* out = idx.ptr<Point>();
    for( int y = 0; y < src.rows; y++ )
    {
        const uchar* row = src.ptr(y);
        for( int x = 0; x < src.cols; x++ )
        {
            if( row[x] )
                *out++ = Point(x, y);
        }
    }
}
@ -0,0 +1,869 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html
|
||||
|
||||
|
||||
#include "precomp.hpp" |
||||
#include "opencl_kernels_core.hpp" |
||||
#include "opencv2/core/openvx/ovx_defs.hpp" |
||||
#include "stat.hpp" |
||||
|
||||
#if defined HAVE_IPP |
||||
namespace cv |
||||
{ |
||||
// IPP implementation of cv::mean. Handles 2-D matrices (or continuous N-D
// data viewed as rows x cols) with up to 4 channels; masked means are
// supported for 1- and 3-channel 8u/16u/32f data. Returns false so the
// caller falls back when no matching IPP primitive exists or a call fails.
static bool ipp_mean( Mat &src, Mat &mask, Scalar &ret )
{
    CV_INSTRUMENT_REGION_IPP()

#if IPP_VERSION_X100 >= 700
    size_t total_size = src.total();
    int cn = src.channels();
    if (cn > 4)
        return false;
    // View continuous N-D data as a rows x cols 2-D image.
    int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
    if( src.dims == 2 || (src.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
    {
        IppiSize sz = { cols, rows };
        int type = src.type();
        if( !mask.empty() )
        {
            // Masked mean, single channel.
            typedef IppStatus (CV_STDCALL* ippiMaskMeanFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *);
            ippiMaskMeanFuncC1 ippiMean_C1MR =
            type == CV_8UC1 ? (ippiMaskMeanFuncC1)ippiMean_8u_C1MR :
            type == CV_16UC1 ? (ippiMaskMeanFuncC1)ippiMean_16u_C1MR :
            type == CV_32FC1 ? (ippiMaskMeanFuncC1)ippiMean_32f_C1MR :
            0;
            if( ippiMean_C1MR )
            {
                Ipp64f res;
                if( CV_INSTRUMENT_FUN_IPP(ippiMean_C1MR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, &res) >= 0 )
                {
                    ret = Scalar(res);
                    return true;
                }
            }
            // Masked mean, 3 channels: one call per channel (channel-of-
            // interest index is the 1-based argument before the result).
            typedef IppStatus (CV_STDCALL* ippiMaskMeanFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *);
            ippiMaskMeanFuncC3 ippiMean_C3MR =
            type == CV_8UC3 ? (ippiMaskMeanFuncC3)ippiMean_8u_C3CMR :
            type == CV_16UC3 ? (ippiMaskMeanFuncC3)ippiMean_16u_C3CMR :
            type == CV_32FC3 ? (ippiMaskMeanFuncC3)ippiMean_32f_C3CMR :
            0;
            if( ippiMean_C3MR )
            {
                Ipp64f res1, res2, res3;
                if( CV_INSTRUMENT_FUN_IPP(ippiMean_C3MR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 1, &res1) >= 0 &&
                    CV_INSTRUMENT_FUN_IPP(ippiMean_C3MR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 2, &res2) >= 0 &&
                    CV_INSTRUMENT_FUN_IPP(ippiMean_C3MR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 3, &res3) >= 0 )
                {
                    ret = Scalar(res1, res2, res3);
                    return true;
                }
            }
        }
        else
        {
            // Unmasked mean: float variants take an algorithm hint, integer
            // variants do not, hence the two function-pointer families.
            typedef IppStatus (CV_STDCALL* ippiMeanFuncHint)(const void*, int, IppiSize, double *, IppHintAlgorithm);
            typedef IppStatus (CV_STDCALL* ippiMeanFuncNoHint)(const void*, int, IppiSize, double *);
            ippiMeanFuncHint ippiMeanHint =
                type == CV_32FC1 ? (ippiMeanFuncHint)ippiMean_32f_C1R :
                type == CV_32FC3 ? (ippiMeanFuncHint)ippiMean_32f_C3R :
                type == CV_32FC4 ? (ippiMeanFuncHint)ippiMean_32f_C4R :
                0;
            ippiMeanFuncNoHint ippiMean =
                type == CV_8UC1 ? (ippiMeanFuncNoHint)ippiMean_8u_C1R :
                type == CV_8UC3 ? (ippiMeanFuncNoHint)ippiMean_8u_C3R :
                type == CV_8UC4 ? (ippiMeanFuncNoHint)ippiMean_8u_C4R :
                type == CV_16UC1 ? (ippiMeanFuncNoHint)ippiMean_16u_C1R :
                type == CV_16UC3 ? (ippiMeanFuncNoHint)ippiMean_16u_C3R :
                type == CV_16UC4 ? (ippiMeanFuncNoHint)ippiMean_16u_C4R :
                type == CV_16SC1 ? (ippiMeanFuncNoHint)ippiMean_16s_C1R :
                type == CV_16SC3 ? (ippiMeanFuncNoHint)ippiMean_16s_C3R :
                type == CV_16SC4 ? (ippiMeanFuncNoHint)ippiMean_16s_C4R :
                0;
            // Make sure only zero or one version of the function pointer is valid
            CV_Assert(!ippiMeanHint || !ippiMean);
            if( ippiMeanHint || ippiMean )
            {
                Ipp64f res[4];
                IppStatus status = ippiMeanHint ? CV_INSTRUMENT_FUN_IPP(ippiMeanHint, src.ptr(), (int)src.step[0], sz, res, ippAlgHintAccurate) :
                                CV_INSTRUMENT_FUN_IPP(ippiMean, src.ptr(), (int)src.step[0], sz, res);
                if( status >= 0 )
                {
                    for( int i = 0; i < cn; i++ )
                        ret[i] = res[i];
                    return true;
                }
            }
        }
    }
    return false;
#else
    return false;
#endif
}
||||
} |
||||
#endif |
||||
|
||||
// Computes the per-channel mean of src over the (optional) mask. Tries the
// IPP path first; otherwise sums the data with a depth-specific kernel and
// divides by the number of contributing elements. For integer depths up to
// 16 bits, sums are accumulated in a temporary int buffer that is flushed
// into the double-precision Scalar before it can overflow.
cv::Scalar cv::mean( InputArray _src, InputArray _mask )
{
    CV_INSTRUMENT_REGION()

    Mat src = _src.getMat(), mask = _mask.getMat();
    CV_Assert( mask.empty() || mask.type() == CV_8U );

    int k, cn = src.channels(), depth = src.depth();
    Scalar s;

    // Returns s from this function if the IPP path succeeds.
    CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_mean(src, mask, s), s)

    SumFunc func = getSumFunc(depth);

    CV_Assert( cn <= 4 && func != 0 );

    const Mat* arrays[] = {&src, &mask, 0};
    uchar* ptrs[2];
    NAryMatIterator it(arrays, ptrs);
    int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
    int j, count = 0;
    AutoBuffer<int> _buf;
    // By default the kernel accumulates directly into the Scalar's storage;
    // for narrow integer depths it is redirected to an int buffer below.
    int* buf = (int*)&s[0];
    bool blockSum = depth <= CV_16S;
    size_t esz = 0, nz0 = 0;

    if( blockSum )
    {
        // Block length chosen so an int accumulator of 8-bit (2^23 elements)
        // or 16-bit (2^15 elements) values cannot overflow within one block.
        intSumBlockSize = depth <= CV_8S ? (1 << 23) : (1 << 15);
        blockSize = std::min(blockSize, intSumBlockSize);
        _buf.allocate(cn);
        buf = _buf;

        for( k = 0; k < cn; k++ )
            buf[k] = 0;
        esz = src.elemSize();
    }

    for( size_t i = 0; i < it.nplanes; i++, ++it )
    {
        for( j = 0; j < total; j += blockSize )
        {
            int bsz = std::min(total - j, blockSize);
            // nz = number of elements that passed the mask in this block.
            int nz = func( ptrs[0], ptrs[1], (uchar*)buf, bsz, cn );
            count += nz;
            nz0 += nz;
            // Flush the int accumulators into the double Scalar when the next
            // block might overflow them, or at the very end of the data.
            if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
            {
                for( k = 0; k < cn; k++ )
                {
                    s[k] += buf[k];
                    buf[k] = 0;
                }
                count = 0;
            }
            ptrs[0] += bsz*esz;
            if( ptrs[1] )
                ptrs[1] += bsz;
        }
    }
    // Divide by the number of contributing elements (0 if everything masked).
    return s*(nz0 ? 1./nz0 : 0);
}
||||
|
||||
//==================================================================================================
|
||||
|
||||
namespace cv { |
||||
|
||||
// Fallback SIMD helper for simultaneous sum and sum-of-squares: reports 0
// elements processed, which makes the caller use the scalar path for type
// combinations that have no vectorized specialization.
template <typename T, typename ST, typename SQT>
struct SumSqr_SIMD
{
    int operator () (const T *, const uchar *, ST *, SQT *, int, int) const
    {
        return 0;
    }
};
||||
|
||||
template <typename T> |
||||
inline void addSqrChannels(T * sum, T * sqsum, T * buf, int cn) |
||||
{ |
||||
for (int i = 0; i < 4; ++i) |
||||
{ |
||||
sum[i % cn] += buf[i]; |
||||
sqsum[i % cn] += buf[4 + i]; |
||||
} |
||||
} |
||||
|
||||
#if CV_SSE2 |
||||
|
||||
// SSE2 specialization: vectorized sum and sum-of-squares for unsigned 8-bit
// data with 1 or 2 channels and no mask. Returns the number of processed
// "pixels" (x / cn); the caller finishes any remaining tail with scalar code.
template <>
struct SumSqr_SIMD<uchar, int, int>
{
    int operator () (const uchar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2) || !USE_SSE2)
            return 0;

        int x = 0;
        __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero, v_sqsum = v_zero;
        const int len_16 = len & ~15;

        for ( ; x <= len_16 - 16; )
        {
            // Accumulate at most 2048 bytes into 16-bit partial sums before
            // widening, so the 16-bit lanes cannot overflow (2048 * 255 per
            // pair of lanes stays within range after the widening below).
            const int len_tmp = min(x + 2048, len_16);
            __m128i v_sum_tmp = v_zero;
            for ( ; x <= len_tmp - 16; x += 16)
            {
                __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
                // Zero-extend the 16 bytes into two 8x16-bit halves.
                __m128i v_half_0 = _mm_unpacklo_epi8(v_src, v_zero);
                __m128i v_half_1 = _mm_unpackhi_epi8(v_src, v_zero);
                v_sum_tmp = _mm_add_epi16(v_sum_tmp, _mm_add_epi16(v_half_0, v_half_1));
                __m128i v_half_2 = _mm_unpacklo_epi16(v_half_0, v_half_1);
                __m128i v_half_3 = _mm_unpackhi_epi16(v_half_0, v_half_1);
                // madd squares each 16-bit lane and sums adjacent pairs into
                // 32-bit results.
                v_sqsum = _mm_add_epi32(v_sqsum, _mm_madd_epi16(v_half_2, v_half_2));
                v_sqsum = _mm_add_epi32(v_sqsum, _mm_madd_epi16(v_half_3, v_half_3));
            }
            // Widen the 16-bit partial sums into the 32-bit accumulator.
            v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_sum_tmp, v_zero));
            v_sum = _mm_add_epi32(v_sum, _mm_unpackhi_epi16(v_sum_tmp, v_zero));
        }

        // 8-byte tail of the vector loop.
        for ( ; x <= len - 8; x += 8)
        {
            __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src0 + x)), v_zero);
            __m128i v_half_0 = _mm_unpackhi_epi64(v_src, v_src);
            __m128i v_sum_tmp = _mm_add_epi16(v_src, v_half_0);
            __m128i v_half_1 = _mm_unpacklo_epi16(v_src, v_half_0);

            v_sum = _mm_add_epi32(v_sum, _mm_unpacklo_epi16(v_sum_tmp, v_zero));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_madd_epi16(v_half_1, v_half_1));
        }

        // Spill the SIMD lanes (4 sums then 4 squared sums) and fold them
        // into the per-channel outputs.
        int CV_DECL_ALIGNED(16) ar[8];
        _mm_store_si128((__m128i*)ar, v_sum);
        _mm_store_si128((__m128i*)(ar + 4), v_sqsum);

        addSqrChannels(sum, sqsum, ar, cn);

        return x / cn;
    }
};
||||
|
||||
// SSE2 sum + sum-of-squares kernel for 8-bit signed data, cn == 1 or 2.
// Same structure as the uchar specialization; bytes are sign-extended to
// 16 bits via "unpack into the high byte, then arithmetic shift right by 8".
template <>
struct SumSqr_SIMD<schar, int, int>
{
    int operator () (const schar * src0, const uchar * mask, int * sum, int * sqsum, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2) || !USE_SSE2)
            return 0;

        int x = 0;
        __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero, v_sqsum = v_zero;
        const int len_16 = len & ~15;

        for ( ; x <= len_16 - 16; )
        {
            // 2048-byte chunks keep the signed 16-bit partial sums in range
            const int len_tmp = min(x + 2048, len_16);
            __m128i v_sum_tmp = v_zero;
            for ( ; x <= len_tmp - 16; x += 16)
            {
                __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
                // sign-extend bytes to 16-bit lanes
                __m128i v_half_0 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src), 8);
                __m128i v_half_1 = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_src), 8);
                v_sum_tmp = _mm_add_epi16(v_sum_tmp, _mm_add_epi16(v_half_0, v_half_1));
                __m128i v_half_2 = _mm_unpacklo_epi16(v_half_0, v_half_1);
                __m128i v_half_3 = _mm_unpackhi_epi16(v_half_0, v_half_1);
                v_sqsum = _mm_add_epi32(v_sqsum, _mm_madd_epi16(v_half_2, v_half_2));
                v_sqsum = _mm_add_epi32(v_sqsum, _mm_madd_epi16(v_half_3, v_half_3));
            }
            // sign-extend the 16-bit chunk sums into the 32-bit total
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_sum_tmp), 16));
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_sum_tmp), 16));
        }

        // tail: 8 elements at a time
        for ( ; x <= len - 8; x += 8)
        {
            __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src0 + x))), 8);
            __m128i v_half_0 = _mm_unpackhi_epi64(v_src, v_src);
            __m128i v_sum_tmp = _mm_add_epi16(v_src, v_half_0);
            __m128i v_half_1 = _mm_unpacklo_epi16(v_src, v_half_0);

            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_sum_tmp), 16));
            v_sqsum = _mm_add_epi32(v_sqsum, _mm_madd_epi16(v_half_1, v_half_1));
        }

        int CV_DECL_ALIGNED(16) ar[8];
        _mm_store_si128((__m128i*)ar, v_sum);
        _mm_store_si128((__m128i*)(ar + 4), v_sqsum);

        addSqrChannels(sum, sqsum, ar, cn);

        return x / cn;
    }
};
||||
|
||||
#endif |
||||
|
||||
template<typename T, typename ST, typename SQT> |
||||
static int sumsqr_(const T* src0, const uchar* mask, ST* sum, SQT* sqsum, int len, int cn ) |
||||
{ |
||||
const T* src = src0; |
||||
|
||||
if( !mask ) |
||||
{ |
||||
SumSqr_SIMD<T, ST, SQT> vop; |
||||
int i = vop(src0, mask, sum, sqsum, len, cn), k = cn % 4; |
||||
src += i * cn; |
||||
|
||||
if( k == 1 ) |
||||
{ |
||||
ST s0 = sum[0]; |
||||
SQT sq0 = sqsum[0]; |
||||
for( ; i < len; i++, src += cn ) |
||||
{ |
||||
T v = src[0]; |
||||
s0 += v; sq0 += (SQT)v*v; |
||||
} |
||||
sum[0] = s0; |
||||
sqsum[0] = sq0; |
||||
} |
||||
else if( k == 2 ) |
||||
{ |
||||
ST s0 = sum[0], s1 = sum[1]; |
||||
SQT sq0 = sqsum[0], sq1 = sqsum[1]; |
||||
for( ; i < len; i++, src += cn ) |
||||
{ |
||||
T v0 = src[0], v1 = src[1]; |
||||
s0 += v0; sq0 += (SQT)v0*v0; |
||||
s1 += v1; sq1 += (SQT)v1*v1; |
||||
} |
||||
sum[0] = s0; sum[1] = s1; |
||||
sqsum[0] = sq0; sqsum[1] = sq1; |
||||
} |
||||
else if( k == 3 ) |
||||
{ |
||||
ST s0 = sum[0], s1 = sum[1], s2 = sum[2]; |
||||
SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2]; |
||||
for( ; i < len; i++, src += cn ) |
||||
{ |
||||
T v0 = src[0], v1 = src[1], v2 = src[2]; |
||||
s0 += v0; sq0 += (SQT)v0*v0; |
||||
s1 += v1; sq1 += (SQT)v1*v1; |
||||
s2 += v2; sq2 += (SQT)v2*v2; |
||||
} |
||||
sum[0] = s0; sum[1] = s1; sum[2] = s2; |
||||
sqsum[0] = sq0; sqsum[1] = sq1; sqsum[2] = sq2; |
||||
} |
||||
|
||||
for( ; k < cn; k += 4 ) |
||||
{ |
||||
src = src0 + k; |
||||
ST s0 = sum[k], s1 = sum[k+1], s2 = sum[k+2], s3 = sum[k+3]; |
||||
SQT sq0 = sqsum[k], sq1 = sqsum[k+1], sq2 = sqsum[k+2], sq3 = sqsum[k+3]; |
||||
for( ; i < len; i++, src += cn ) |
||||
{ |
||||
T v0, v1; |
||||
v0 = src[0], v1 = src[1]; |
||||
s0 += v0; sq0 += (SQT)v0*v0; |
||||
s1 += v1; sq1 += (SQT)v1*v1; |
||||
v0 = src[2], v1 = src[3]; |
||||
s2 += v0; sq2 += (SQT)v0*v0; |
||||
s3 += v1; sq3 += (SQT)v1*v1; |
||||
} |
||||
sum[k] = s0; sum[k+1] = s1; |
||||
sum[k+2] = s2; sum[k+3] = s3; |
||||
sqsum[k] = sq0; sqsum[k+1] = sq1; |
||||
sqsum[k+2] = sq2; sqsum[k+3] = sq3; |
||||
} |
||||
return len; |
||||
} |
||||
|
||||
int i, nzm = 0; |
||||
|
||||
if( cn == 1 ) |
||||
{ |
||||
ST s0 = sum[0]; |
||||
SQT sq0 = sqsum[0]; |
||||
for( i = 0; i < len; i++ ) |
||||
if( mask[i] ) |
||||
{ |
||||
T v = src[i]; |
||||
s0 += v; sq0 += (SQT)v*v; |
||||
nzm++; |
||||
} |
||||
sum[0] = s0; |
||||
sqsum[0] = sq0; |
||||
} |
||||
else if( cn == 3 ) |
||||
{ |
||||
ST s0 = sum[0], s1 = sum[1], s2 = sum[2]; |
||||
SQT sq0 = sqsum[0], sq1 = sqsum[1], sq2 = sqsum[2]; |
||||
for( i = 0; i < len; i++, src += 3 ) |
||||
if( mask[i] ) |
||||
{ |
||||
T v0 = src[0], v1 = src[1], v2 = src[2]; |
||||
s0 += v0; sq0 += (SQT)v0*v0; |
||||
s1 += v1; sq1 += (SQT)v1*v1; |
||||
s2 += v2; sq2 += (SQT)v2*v2; |
||||
nzm++; |
||||
} |
||||
sum[0] = s0; sum[1] = s1; sum[2] = s2; |
||||
sqsum[0] = sq0; sqsum[1] = sq1; sqsum[2] = sq2; |
||||
} |
||||
else |
||||
{ |
||||
for( i = 0; i < len; i++, src += cn ) |
||||
if( mask[i] ) |
||||
{ |
||||
for( int k = 0; k < cn; k++ ) |
||||
{ |
||||
T v = src[k]; |
||||
ST s = sum[k] + v; |
||||
SQT sq = sqsum[k] + (SQT)v*v; |
||||
sum[k] = s; sqsum[k] = sq; |
||||
} |
||||
nzm++; |
||||
} |
||||
} |
||||
return nzm; |
||||
} |
||||
|
||||
|
||||
// Depth-specific instantiations of sumsqr_ for the dispatch table below.
// Accumulator types match meanStdDev's block-accumulation scheme: int sums
// for 8-bit inputs, int sum / double sqsum for 16-bit, double for the rest.

static int sqsum8u( const uchar* src, const uchar* mask, int* sum, int* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }

static int sqsum8s( const schar* src, const uchar* mask, int* sum, int* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }

static int sqsum16u( const ushort* src, const uchar* mask, int* sum, double* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }

static int sqsum16s( const short* src, const uchar* mask, int* sum, double* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }

static int sqsum32s( const int* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }

static int sqsum32f( const float* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }

static int sqsum64f( const double* src, const uchar* mask, double* sum, double* sqsum, int len, int cn )
{ return sumsqr_(src, mask, sum, sqsum, len, cn); }
||||
|
||||
// Type-erased signature of the sqsum* workers; buffer pointers are cast to
// uchar* so one table can hold all depth-specific instantiations.
typedef int (*SumSqrFunc)(const uchar*, const uchar* mask, uchar*, uchar*, int, int);

// Returns the sum/sum-of-squares worker for a matrix depth (CV_8U..CV_64F);
// the last slot is 0 for the remaining unsupported depth.
static SumSqrFunc getSumSqrTab(int depth)
{
    static SumSqrFunc sumSqrTab[] =
    {
        (SumSqrFunc)GET_OPTIMIZED(sqsum8u), (SumSqrFunc)sqsum8s, (SumSqrFunc)sqsum16u, (SumSqrFunc)sqsum16s,
        (SumSqrFunc)sqsum32s, (SumSqrFunc)GET_OPTIMIZED(sqsum32f), (SumSqrFunc)sqsum64f, 0
    };

    return sumSqrTab[depth];
}
||||
|
||||
#ifdef HAVE_OPENCL |
||||
// OpenCL implementation of meanStdDev. Runs a reduction kernel producing
// per-workgroup partial sums / squared sums (and, with a mask, non-zero
// counts), then finishes the reduction and the mean/stddev math on the host.
// Returns false to fall back to the CPU path when the case is unsupported.
static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
{
    CV_INSTRUMENT_REGION_OPENCL()

    bool haveMask = _mask.kind() != _InputArray::NONE;
    // Without a mask every element counts; with a mask the kernel counts them.
    int nz = haveMask ? -1 : (int)_src.total();
    Scalar mean(0), stddev(0);
    const int cn = _src.channels();
    if (cn > 4)
        return false;

    {
        int type = _src.type(), depth = CV_MAT_DEPTH(type);
        bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
                isContinuous = _src.isContinuous(),
                isMaskContinuous = _mask.isContinuous();
        const ocl::Device &defDev = ocl::Device::getDefault();
        int groups = defDev.maxComputeUnits();
        if (defDev.isIntel())
        {
            // Intel-specific workgroup-count heuristic (2 groups per subslice)
            static const int subSliceEUCount = 10;
            groups = (groups / subSliceEUCount) * 2;
        }
        size_t wgs = defDev.maxWorkGroupSize();

        // Accumulator types: at least 32-bit int for sums, float for sq-sums.
        int ddepth = std::max(CV_32S, depth), sqddepth = std::max(CV_32F, depth),
                dtype = CV_MAKE_TYPE(ddepth, cn),
                sqdtype = CV_MAKETYPE(sqddepth, cn);
        CV_Assert(!haveMask || _mask.type() == CV_8UC1);

        // largest power of two <= wgs
        int wgs2_aligned = 1;
        while (wgs2_aligned < (int)wgs)
            wgs2_aligned <<= 1;
        wgs2_aligned >>= 1;

        if ( (!doubleSupport && depth == CV_64F) )
            return false;

        char cvt[2][40];
        String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D sqddepth=%d"
                             " -D sqdstT=%s -D sqdstT1=%s -D convertToSDT=%s -D cn=%d%s%s"
                             " -D convertToDT=%s -D WGS=%d -D WGS2_ALIGNED=%d%s%s",
                             ocl::typeToStr(type), ocl::typeToStr(depth),
                             ocl::typeToStr(dtype), ocl::typeToStr(ddepth), sqddepth,
                             ocl::typeToStr(sqdtype), ocl::typeToStr(sqddepth),
                             ocl::convertTypeStr(depth, sqddepth, cn, cvt[0]),
                             cn, isContinuous ? " -D HAVE_SRC_CONT" : "",
                             isMaskContinuous ? " -D HAVE_MASK_CONT" : "",
                             ocl::convertTypeStr(depth, ddepth, cn, cvt[1]),
                             (int)wgs, wgs2_aligned, haveMask ? " -D HAVE_MASK" : "",
                             doubleSupport ? " -D DOUBLE_SUPPORT" : "");

        ocl::Kernel k("meanStdDev", ocl::core::meanstddev_oclsrc, opts);
        if (k.empty())
            return false;

        // Scratch buffer layout per group: [sums][sq-sums][count if masked]
        int dbsize = groups * ((haveMask ? CV_ELEM_SIZE1(CV_32S) : 0) +
                               CV_ELEM_SIZE(sqdtype) + CV_ELEM_SIZE(dtype));
        UMat src = _src.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat();

        ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
                dbarg = ocl::KernelArg::PtrWriteOnly(db),
                maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);

        if (haveMask)
            k.args(srcarg, src.cols, (int)src.total(), groups, dbarg, maskarg);
        else
            k.args(srcarg, src.cols, (int)src.total(), groups, dbarg);

        size_t globalsize = groups * wgs;

        if(!k.run(1, &globalsize, &wgs, false))
            return false;

        // Sum the per-group partials on the host, typed by accumulator depth.
        typedef Scalar (* part_sum)(Mat m);
        part_sum funcs[3] = { ocl_part_sum<int>, ocl_part_sum<float>, ocl_part_sum<double> };
        Mat dbm = db.getMat(ACCESS_READ);

        mean = funcs[ddepth - CV_32S](Mat(1, groups, dtype, dbm.ptr()));
        stddev = funcs[sqddepth - CV_32S](Mat(1, groups, sqdtype, dbm.ptr() + groups * CV_ELEM_SIZE(dtype)));

        if (haveMask)
            nz = saturate_cast<int>(funcs[0](Mat(1, groups, CV_32SC1, dbm.ptr() +
                                                 groups * (CV_ELEM_SIZE(dtype) +
                                                           CV_ELEM_SIZE(sqdtype))))[0]);
    }

    // Finish: mean = sum/nz, stddev = sqrt(E[x^2] - E[x]^2), clamped at 0.
    double total = nz != 0 ? 1.0 / nz : 0;
    int k, j;
    for (int i = 0; i < cn; ++i)
    {
        mean[i] *= total;
        stddev[i] = std::sqrt(std::max(stddev[i] * total - mean[i] * mean[i] , 0.));
    }

    // Write mean (j==0) then stddev (j==1) into the requested outputs,
    // zero-padding any extra destination channels.
    for( j = 0; j < 2; j++ )
    {
        const double * const sptr = j == 0 ? &mean[0] : &stddev[0];
        _OutputArray _dst = j == 0 ? _mean : _sdv;
        if( !_dst.needed() )
            continue;

        if( !_dst.fixedSize() )
            _dst.create(cn, 1, CV_64F, -1, true);
        Mat dst = _dst.getMat();
        int dcn = (int)dst.total();
        CV_Assert( dst.type() == CV_64F && dst.isContinuous() &&
                   (dst.cols == 1 || dst.rows == 1) && dcn >= cn );
        double* dptr = dst.ptr<double>();
        for( k = 0; k < cn; k++ )
            dptr[k] = sptr[k];
        for( ; k < dcn; k++ )
            dptr[k] = 0;
    }

    return true;
}
||||
#endif |
||||
|
||||
#ifdef HAVE_OPENVX |
||||
// OpenVX implementation of meanStdDev. Only handles unmasked CV_8UC1 input
// that is 2D or continuous; returns false to fall back to other paths.
static bool openvx_meanStdDev(Mat& src, OutputArray _mean, OutputArray _sdv, Mat& mask)
{
    size_t total_size = src.total();
    // Treat a continuous N-D array as a rows x cols 2D image.
    int rows = src.size[0], cols = rows ? (int)(total_size / rows) : 0;
    if (src.type() != CV_8UC1|| !mask.empty() ||
        (src.dims != 2 && !(src.isContinuous() && cols > 0 && (size_t)rows*cols == total_size))
        )
        return false;

    try
    {
        ivx::Context ctx = ovx::getOpenVXContext();
#ifndef VX_VERSION_1_1
        if (ctx.vendorID() == VX_ID_KHRONOS)
            return false; // Do not use OpenVX meanStdDev estimation for sample 1.0.1 implementation due to lack of accuracy
#endif

        // Wrap the Mat's memory as an OpenVX image (no copy).
        ivx::Image
            ia = ivx::Image::createFromHandle(ctx, VX_DF_IMAGE_U8,
                ivx::Image::createAddressing(cols, rows, 1, (vx_int32)(src.step[0])), src.ptr());

        vx_float32 mean_temp, stddev_temp;
        ivx::IVX_CHECK_STATUS(vxuMeanStdDev(ctx, ia, &mean_temp, &stddev_temp));

        // Single-channel result; zero-pad any extra destination elements.
        if (_mean.needed())
        {
            if (!_mean.fixedSize())
                _mean.create(1, 1, CV_64F, -1, true);
            Mat mean = _mean.getMat();
            CV_Assert(mean.type() == CV_64F && mean.isContinuous() &&
                (mean.cols == 1 || mean.rows == 1) && mean.total() >= 1);
            double *pmean = mean.ptr<double>();
            pmean[0] = mean_temp;
            for (int c = 1; c < (int)mean.total(); c++)
                pmean[c] = 0;
        }

        if (_sdv.needed())
        {
            if (!_sdv.fixedSize())
                _sdv.create(1, 1, CV_64F, -1, true);
            Mat stddev = _sdv.getMat();
            CV_Assert(stddev.type() == CV_64F && stddev.isContinuous() &&
                (stddev.cols == 1 || stddev.rows == 1) && stddev.total() >= 1);
            double *pstddev = stddev.ptr<double>();
            pstddev[0] = stddev_temp;
            for (int c = 1; c < (int)stddev.total(); c++)
                pstddev[c] = 0;
        }
    }
    catch (ivx::RuntimeError & e)
    {
        VX_DbgThrow(e.what());
    }
    catch (ivx::WrapperError & e)
    {
        VX_DbgThrow(e.what());
    }

    return true;
}
||||
#endif |
||||
|
||||
#ifdef HAVE_IPP |
||||
// Intel IPP implementation of meanStdDev for 2D (or continuous) images.
// Dispatches to the masked/unmasked, C1/C3 ippiMean_StdDev variants;
// returns false to fall back to the generic CPU path.
static bool ipp_meanStdDev(Mat& src, OutputArray _mean, OutputArray _sdv, Mat& mask)
{
    CV_INSTRUMENT_REGION_IPP()

#if IPP_VERSION_X100 >= 700
    int cn = src.channels();

#if IPP_VERSION_X100 < 201801
    // IPP_DISABLE: C3C functions can read outside of allocated memory
    if (cn > 1)
        return false;
#endif

    size_t total_size = src.total();
    // Treat a continuous N-D array as a rows x cols 2D image.
    int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
    if( src.dims == 2 || (src.isContinuous() && mask.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
    {
        // Default to stack storage; point at the caller's buffers if provided.
        Ipp64f mean_temp[3];
        Ipp64f stddev_temp[3];
        Ipp64f *pmean = &mean_temp[0];
        Ipp64f *pstddev = &stddev_temp[0];
        Mat mean, stddev;
        int dcn_mean = -1;
        if( _mean.needed() )
        {
            if( !_mean.fixedSize() )
                _mean.create(cn, 1, CV_64F, -1, true);
            mean = _mean.getMat();
            dcn_mean = (int)mean.total();
            pmean = mean.ptr<Ipp64f>();
        }
        int dcn_stddev = -1;
        if( _sdv.needed() )
        {
            if( !_sdv.fixedSize() )
                _sdv.create(cn, 1, CV_64F, -1, true);
            stddev = _sdv.getMat();
            dcn_stddev = (int)stddev.total();
            pstddev = stddev.ptr<Ipp64f>();
        }
        // Zero-pad destination channels beyond cn up front.
        for( int c = cn; c < dcn_mean; c++ )
            pmean[c] = 0;
        for( int c = cn; c < dcn_stddev; c++ )
            pstddev[c] = 0;
        IppiSize sz = { cols, rows };
        int type = src.type();
        if( !mask.empty() )
        {
            // Masked single-channel variant.
            typedef IppStatus (CV_STDCALL* ippiMaskMeanStdDevFuncC1)(const void *, int, const void *, int, IppiSize, Ipp64f *, Ipp64f *);
            ippiMaskMeanStdDevFuncC1 ippiMean_StdDev_C1MR =
            type == CV_8UC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_8u_C1MR :
            type == CV_16UC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_16u_C1MR :
            type == CV_32FC1 ? (ippiMaskMeanStdDevFuncC1)ippiMean_StdDev_32f_C1MR :
            0;
            if( ippiMean_StdDev_C1MR )
            {
                if( CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C1MR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, pmean, pstddev) >= 0 )
                {
                    return true;
                }
            }
            // Masked 3-channel variant: one IPP call per channel (1-based
            // channel-of-interest index).
            typedef IppStatus (CV_STDCALL* ippiMaskMeanStdDevFuncC3)(const void *, int, const void *, int, IppiSize, int, Ipp64f *, Ipp64f *);
            ippiMaskMeanStdDevFuncC3 ippiMean_StdDev_C3CMR =
            type == CV_8UC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_8u_C3CMR :
            type == CV_16UC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_16u_C3CMR :
            type == CV_32FC3 ? (ippiMaskMeanStdDevFuncC3)ippiMean_StdDev_32f_C3CMR :
            0;
            if( ippiMean_StdDev_C3CMR )
            {
                if( CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C3CMR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 1, &pmean[0], &pstddev[0]) >= 0 &&
                    CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C3CMR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 2, &pmean[1], &pstddev[1]) >= 0 &&
                    CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C3CMR, src.ptr(), (int)src.step[0], mask.ptr(), (int)mask.step[0], sz, 3, &pmean[2], &pstddev[2]) >= 0 )
                {
                    return true;
                }
            }
        }
        else
        {
            // Unmasked single-channel variant.
            typedef IppStatus (CV_STDCALL* ippiMeanStdDevFuncC1)(const void *, int, IppiSize, Ipp64f *, Ipp64f *);
            ippiMeanStdDevFuncC1 ippiMean_StdDev_C1R =
            type == CV_8UC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_8u_C1R :
            type == CV_16UC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_16u_C1R :
#if (IPP_VERSION_X100 >= 810)
            type == CV_32FC1 ? (ippiMeanStdDevFuncC1)ippiMean_StdDev_32f_C1R ://Aug 2013: bug in IPP 7.1, 8.0
#endif
            0;
            if( ippiMean_StdDev_C1R )
            {
                if( CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C1R, src.ptr(), (int)src.step[0], sz, pmean, pstddev) >= 0 )
                {
                    return true;
                }
            }
            // Unmasked 3-channel variant, one call per channel.
            typedef IppStatus (CV_STDCALL* ippiMeanStdDevFuncC3)(const void *, int, IppiSize, int, Ipp64f *, Ipp64f *);
            ippiMeanStdDevFuncC3 ippiMean_StdDev_C3CR =
            type == CV_8UC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_8u_C3CR :
            type == CV_16UC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_16u_C3CR :
            type == CV_32FC3 ? (ippiMeanStdDevFuncC3)ippiMean_StdDev_32f_C3CR :
            0;
            if( ippiMean_StdDev_C3CR )
            {
                if( CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C3CR, src.ptr(), (int)src.step[0], sz, 1, &pmean[0], &pstddev[0]) >= 0 &&
                    CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C3CR, src.ptr(), (int)src.step[0], sz, 2, &pmean[1], &pstddev[1]) >= 0 &&
                    CV_INSTRUMENT_FUN_IPP(ippiMean_StdDev_C3CR, src.ptr(), (int)src.step[0], sz, 3, &pmean[2], &pstddev[2]) >= 0 )
                {
                    return true;
                }
            }
        }
    }
#else
    CV_UNUSED(src); CV_UNUSED(_mean); CV_UNUSED(_sdv); CV_UNUSED(mask);
#endif
    return false;
}
||||
#endif |
||||
|
||||
} // cv::
|
||||
|
||||
// Computes per-channel mean and standard deviation of _src, optionally over
// the non-zero elements of _mask. Tries OpenCL/OpenVX/IPP first, then runs a
// generic block-wise accumulation: for small integer depths partial sums are
// kept in int buffers and flushed into double accumulators before the int
// totals could overflow.
void cv::meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv, InputArray _mask )
{
    CV_INSTRUMENT_REGION()

    CV_OCL_RUN(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
               ocl_meanStdDev(_src, _mean, _sdv, _mask))

    Mat src = _src.getMat(), mask = _mask.getMat();
    CV_Assert( mask.empty() || mask.type() == CV_8UC1 );

    CV_OVX_RUN(!ovx::skipSmallImages<VX_KERNEL_MEAN_STDDEV>(src.cols, src.rows),
        openvx_meanStdDev(src, _mean, _sdv, mask))

    CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_meanStdDev(src, _mean, _sdv, mask));

    int k, cn = src.channels(), depth = src.depth();

    SumSqrFunc func = getSumSqrTab(depth);

    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &mask, 0};
    uchar* ptrs[2];
    NAryMatIterator it(arrays, ptrs);
    int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
    int j, count = 0, nz0 = 0;
    // _buf holds: double sums [0,cn), double sq-sums [cn,2cn), and for small
    // depths also the int partial-sum buffers carved out of the tail.
    AutoBuffer<double> _buf(cn*4);
    double *s = (double*)_buf, *sq = s + cn;
    int *sbuf = (int*)s, *sqbuf = (int*)sq;
    // blockSum: keep partial sums in int; blockSqSum: same for sq-sums
    // (only 8-bit data keeps squared sums in int without overflow risk).
    bool blockSum = depth <= CV_16S, blockSqSum = depth <= CV_8S;
    size_t esz = 0;

    for( k = 0; k < cn; k++ )
        s[k] = sq[k] = 0;

    if( blockSum )
    {
        // Flush int partials at most every 2^15 counted pixels.
        intSumBlockSize = 1 << 15;
        blockSize = std::min(blockSize, intSumBlockSize);
        sbuf = (int*)(sq + cn);
        if( blockSqSum )
            sqbuf = sbuf + cn;
        for( k = 0; k < cn; k++ )
            sbuf[k] = sqbuf[k] = 0;
        esz = src.elemSize();
    }

    for( size_t i = 0; i < it.nplanes; i++, ++it )
    {
        for( j = 0; j < total; j += blockSize )
        {
            int bsz = std::min(total - j, blockSize);
            // nz = pixels actually counted in this block (mask-aware)
            int nz = func( ptrs[0], ptrs[1], (uchar*)sbuf, (uchar*)sqbuf, bsz, cn );
            count += nz;
            nz0 += nz;
            // Flush int partials into the double accumulators before the next
            // block could overflow them, and at the very end of the data.
            if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
            {
                for( k = 0; k < cn; k++ )
                {
                    s[k] += sbuf[k];
                    sbuf[k] = 0;
                }
                if( blockSqSum )
                {
                    for( k = 0; k < cn; k++ )
                    {
                        sq[k] += sqbuf[k];
                        sqbuf[k] = 0;
                    }
                }
                count = 0;
            }
            ptrs[0] += bsz*esz;
            if( ptrs[1] )
                ptrs[1] += bsz;
        }
    }

    // mean = sum/nz0; stddev = sqrt(E[x^2] - E[x]^2), clamped at 0 against
    // rounding error. nz0 == 0 (empty mask) yields zeros.
    double scale = nz0 ? 1./nz0 : 0.;
    for( k = 0; k < cn; k++ )
    {
        s[k] *= scale;
        sq[k] = std::sqrt(std::max(sq[k]*scale - s[k]*s[k], 0.));
    }

    // Write mean (j==0) then stddev (j==1), zero-padding extra channels.
    for( j = 0; j < 2; j++ )
    {
        const double* sptr = j == 0 ? s : sq;
        _OutputArray _dst = j == 0 ? _mean : _sdv;
        if( !_dst.needed() )
            continue;

        if( !_dst.fixedSize() )
            _dst.create(cn, 1, CV_64F, -1, true);
        Mat dst = _dst.getMat();
        int dcn = (int)dst.total();
        CV_Assert( dst.type() == CV_64F && dst.isContinuous() &&
                   (dst.cols == 1 || dst.rows == 1) && dcn >= cn );
        double* dptr = dst.ptr<double>();
        for( k = 0; k < cn; k++ )
            dptr[k] = sptr[k];
        for( ; k < dcn; k++ )
            dptr[k] = 0;
    }
}
@ -0,0 +1,830 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html
|
||||
|
||||
|
||||
#include "precomp.hpp" |
||||
#include "opencl_kernels_core.hpp" |
||||
#include "opencv2/core/openvx/ovx_defs.hpp" |
||||
#include "stat.hpp" |
||||
|
||||
/****************************************************************************************\
|
||||
* minMaxLoc * |
||||
\****************************************************************************************/ |
||||
|
||||
namespace cv |
||||
{ |
||||
|
||||
template<typename T, typename WT> static void |
||||
minMaxIdx_( const T* src, const uchar* mask, WT* _minVal, WT* _maxVal, |
||||
size_t* _minIdx, size_t* _maxIdx, int len, size_t startIdx ) |
||||
{ |
||||
WT minVal = *_minVal, maxVal = *_maxVal; |
||||
size_t minIdx = *_minIdx, maxIdx = *_maxIdx; |
||||
|
||||
if( !mask ) |
||||
{ |
||||
for( int i = 0; i < len; i++ ) |
||||
{ |
||||
T val = src[i]; |
||||
if( val < minVal ) |
||||
{ |
||||
minVal = val; |
||||
minIdx = startIdx + i; |
||||
} |
||||
if( val > maxVal ) |
||||
{ |
||||
maxVal = val; |
||||
maxIdx = startIdx + i; |
||||
} |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
for( int i = 0; i < len; i++ ) |
||||
{ |
||||
T val = src[i]; |
||||
if( mask[i] && val < minVal ) |
||||
{ |
||||
minVal = val; |
||||
minIdx = startIdx + i; |
||||
} |
||||
if( mask[i] && val > maxVal ) |
||||
{ |
||||
maxVal = val; |
||||
maxIdx = startIdx + i; |
||||
} |
||||
} |
||||
} |
||||
|
||||
*_minIdx = minIdx; |
||||
*_maxIdx = maxIdx; |
||||
*_minVal = minVal; |
||||
*_maxVal = maxVal; |
||||
} |
||||
|
||||
// Depth-specific instantiations of minMaxIdx_ for the dispatch table below.
// Small integer depths track min/max in int; float/double use their own type.

static void minMaxIdx_8u(const uchar* src, const uchar* mask, int* minval, int* maxval,
                         size_t* minidx, size_t* maxidx, int len, size_t startidx )
{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }

static void minMaxIdx_8s(const schar* src, const uchar* mask, int* minval, int* maxval,
                         size_t* minidx, size_t* maxidx, int len, size_t startidx )
{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }

static void minMaxIdx_16u(const ushort* src, const uchar* mask, int* minval, int* maxval,
                          size_t* minidx, size_t* maxidx, int len, size_t startidx )
{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }

static void minMaxIdx_16s(const short* src, const uchar* mask, int* minval, int* maxval,
                          size_t* minidx, size_t* maxidx, int len, size_t startidx )
{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }

static void minMaxIdx_32s(const int* src, const uchar* mask, int* minval, int* maxval,
                          size_t* minidx, size_t* maxidx, int len, size_t startidx )
{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }

static void minMaxIdx_32f(const float* src, const uchar* mask, float* minval, float* maxval,
                          size_t* minidx, size_t* maxidx, int len, size_t startidx )
{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }

static void minMaxIdx_64f(const double* src, const uchar* mask, double* minval, double* maxval,
                          size_t* minidx, size_t* maxidx, int len, size_t startidx )
{ minMaxIdx_(src, mask, minval, maxval, minidx, maxidx, len, startidx ); }
||||
|
||||
// Type-erased signature of the minMaxIdx_* workers; value pointers are cast
// to int* so one table can hold all depth-specific instantiations.
typedef void (*MinMaxIdxFunc)(const uchar*, const uchar*, int*, int*, size_t*, size_t*, int, size_t);

// Returns the min/max worker for a matrix depth (CV_8U..CV_64F); the last
// slot is 0 for the remaining unsupported depth.
static MinMaxIdxFunc getMinmaxTab(int depth)
{
    static MinMaxIdxFunc minmaxTab[] =
    {
        (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_8u), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_8s),
        (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_16u), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_16s),
        (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_32s),
        (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_32f), (MinMaxIdxFunc)GET_OPTIMIZED(minMaxIdx_64f),
        0
    };

    return minmaxTab[depth];
}
||||
|
||||
static void ofs2idx(const Mat& a, size_t ofs, int* idx) |
||||
{ |
||||
int i, d = a.dims; |
||||
if( ofs > 0 ) |
||||
{ |
||||
ofs--; |
||||
for( i = d-1; i >= 0; i-- ) |
||||
{ |
||||
int sz = a.size[i]; |
||||
idx[i] = (int)(ofs % sz); |
||||
ofs /= sz; |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
for( i = d-1; i >= 0; i-- ) |
||||
idx[i] = -1; |
||||
} |
||||
} |
||||
|
||||
#ifdef HAVE_OPENCL |
||||
|
||||
#define MINMAX_STRUCT_ALIGNMENT 8 // sizeof double
|
||||
|
||||
// Combine the per-workgroup partial results produced by the OpenCL minmax
// kernel into final min/max values and 2D locations. The buffer `db` packs,
// in order and each aligned to MINMAX_STRUCT_ALIGNMENT: min values, max
// values, min locations, max locations, and (optionally) second-max values;
// sections are present only when the corresponding output is requested.
template <typename T>
void getMinMaxRes(const Mat & db, double * minVal, double * maxVal,
                  int* minLoc, int* maxLoc,
                  int groupnum, int cols, double * maxVal2)
{
    uint index_max = std::numeric_limits<uint>::max();
    T minval = std::numeric_limits<T>::max();
    // most negative representable value for both signed and unsigned /
    // floating-point T
    T maxval = std::numeric_limits<T>::min() > 0 ? -std::numeric_limits<T>::max() : std::numeric_limits<T>::min(), maxval2 = maxval;
    uint minloc = index_max, maxloc = index_max;

    // Walk the packed buffer, picking up only the sections the kernel wrote.
    size_t index = 0;
    const T * minptr = NULL, * maxptr = NULL, * maxptr2 = NULL;
    const uint * minlocptr = NULL, * maxlocptr = NULL;
    if (minVal || minLoc)
    {
        minptr = db.ptr<T>();
        index += sizeof(T) * groupnum;
        index = alignSize(index, MINMAX_STRUCT_ALIGNMENT);
    }
    if (maxVal || maxLoc)
    {
        maxptr = (const T *)(db.ptr() + index);
        index += sizeof(T) * groupnum;
        index = alignSize(index, MINMAX_STRUCT_ALIGNMENT);
    }
    if (minLoc)
    {
        minlocptr = (const uint *)(db.ptr() + index);
        index += sizeof(uint) * groupnum;
        index = alignSize(index, MINMAX_STRUCT_ALIGNMENT);
    }
    if (maxLoc)
    {
        maxlocptr = (const uint *)(db.ptr() + index);
        index += sizeof(uint) * groupnum;
        index = alignSize(index, MINMAX_STRUCT_ALIGNMENT);
    }
    if (maxVal2)
        maxptr2 = (const T *)(db.ptr() + index);

    // Reduce over workgroups; on ties keep the smallest linear location so
    // the result matches the sequential scan order.
    for (int i = 0; i < groupnum; i++)
    {
        if (minptr && minptr[i] <= minval)
        {
            if (minptr[i] == minval)
            {
                if (minlocptr)
                    minloc = std::min(minlocptr[i], minloc);
            }
            else
            {
                if (minlocptr)
                    minloc = minlocptr[i];
                minval = minptr[i];
            }
        }
        if (maxptr && maxptr[i] >= maxval)
        {
            if (maxptr[i] == maxval)
            {
                if (maxlocptr)
                    maxloc = std::min(maxlocptr[i], maxloc);
            }
            else
            {
                if (maxlocptr)
                    maxloc = maxlocptr[i];
                maxval = maxptr[i];
            }
        }
        if (maxptr2 && maxptr2[i] > maxval2)
            maxval2 = maxptr2[i];
    }
    // An untouched location slot means the mask rejected every element.
    bool zero_mask = (minLoc && minloc == index_max) ||
                     (maxLoc && maxloc == index_max);

    if (minVal)
        *minVal = zero_mask ? 0 : (double)minval;
    if (maxVal)
        *maxVal = zero_mask ? 0 : (double)maxval;
    if (maxVal2)
        *maxVal2 = zero_mask ? 0 : (double)maxval2;

    // Decode linear locations into (row, col); (-1,-1) for an all-zero mask.
    if (minLoc)
    {
        minLoc[0] = zero_mask ? -1 : minloc / cols;
        minLoc[1] = zero_mask ? -1 : minloc % cols;
    }
    if (maxLoc)
    {
        maxLoc[0] = zero_mask ? -1 : maxloc / cols;
        maxLoc[1] = zero_mask ? -1 : maxloc % cols;
    }
}
||||
|
||||
// Signature of the per-depth finalizers (getMinMaxRes<T>) that merge the
// per-workgroup partial results stored in the device buffer 'db' into the
// final min/max values and (row-major) locations.
// NOTE(review): 'gropunum' is a historical typo for "groupnum" (number of
// OpenCL workgroups); renaming it would touch the instantiations above.
typedef void (*getMinMaxResFunc)(const Mat & db, double * minVal, double * maxVal,
                                 int * minLoc, int *maxLoc, int gropunum, int cols, double * maxVal2);

// OpenCL implementation of minMaxIdx/minMaxLoc (and the abs/src2 variants used
// by norm computations).
// Returns false when the OpenCL path cannot or should not be used, so the
// caller falls back to the CPU implementation; returns true on success with
// the requested outputs filled in.
//   minVal/maxVal   - optional outputs for the extremal values
//   minLoc/maxLoc   - optional outputs, each receiving {row, col}
//   ddepth          - accumulator depth (< 0 means "same as source depth")
//   absValues       - compute min/max of |src| instead of src
//   _src2, maxVal2  - optional second operand / secondary maximum (norm support)
bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* minLoc, int* maxLoc, InputArray _mask,
                    int ddepth, bool absValues, InputArray _src2, double * maxVal2)
{
    const ocl::Device & dev = ocl::Device::getDefault();

#ifdef __ANDROID__
    // NVidia OpenCL on Android is known-problematic for this kernel; bail out.
    if (dev.isNVidia())
        return false;
#endif

    bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(),
        haveSrc2 = _src2.kind() != _InputArray::NONE;
    // With a mask the kernel must process whole pixels (kercn == cn); otherwise
    // the data is treated as a flat array and a vector width up to 4 is used.
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
            kercn = haveMask ? cn : std::min(4, ocl::predictOptimalVectorWidth(_src, _src2));

    // disabled following modes since it occasionally fails on AMD devices (e.g. A10-6800K, sep. 2014)
    if ((haveMask || type == CV_32FC1) && dev.isAMD())
        return false;

    // Locations are only supported for single-channel input.
    CV_Assert( (cn == 1 && (!haveMask || _mask.type() == CV_8U)) ||
               (cn >= 1 && !minLoc && !maxLoc) );

    if (ddepth < 0)
        ddepth = depth;

    CV_Assert(!haveSrc2 || _src2.type() == type);

    if (depth == CV_32S)
        return false;

    if ((depth == CV_64F || ddepth == CV_64F) && !doubleSupport)
        return false;

    int groupnum = dev.maxComputeUnits();
    size_t wgs = dev.maxWorkGroupSize();

    // Largest power of two strictly below the device workgroup size; the kernel
    // uses it for its tree reduction.
    int wgs2_aligned = 1;
    while (wgs2_aligned < (int)wgs)
        wgs2_aligned <<= 1;
    wgs2_aligned >>= 1;

    bool needMinVal = minVal || minLoc, needMinLoc = minLoc != NULL,
            needMaxVal = maxVal || maxLoc, needMaxLoc = maxLoc != NULL;

    // in case of mask we must know whether mask is filled with zeros or not
    // so let's calculate min or max location, if it's undefined, so mask is zeros
    if (!(needMaxLoc || needMinLoc) && haveMask)
    {
        if (needMinVal)
            needMinLoc = true;
        else
            needMaxLoc = true;
    }

    // Build-option string selecting types, requested outputs and memory layout
    // specializations for the "minmaxloc" kernel.
    char cvt[2][40];
    String opts = format("-D DEPTH_%d -D srcT1=%s%s -D WGS=%d -D srcT=%s"
                         " -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d%s%s%s%s"
                         " -D dstT1=%s -D dstT=%s -D convertToDT=%s%s%s%s%s -D wdepth=%d -D convertFromU=%s"
                         " -D MINMAX_STRUCT_ALIGNMENT=%d",
                         depth, ocl::typeToStr(depth), haveMask ? " -D HAVE_MASK" : "", (int)wgs,
                         ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), wgs2_aligned,
                         doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                         _src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
                         _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn,
                         needMinVal ? " -D NEED_MINVAL" : "", needMaxVal ? " -D NEED_MAXVAL" : "",
                         needMinLoc ? " -D NEED_MINLOC" : "", needMaxLoc ? " -D NEED_MAXLOC" : "",
                         ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)),
                         ocl::convertTypeStr(depth, ddepth, kercn, cvt[0]),
                         absValues ? " -D OP_ABS" : "",
                         haveSrc2 ? " -D HAVE_SRC2" : "", maxVal2 ? " -D OP_CALC2" : "",
                         haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "", ddepth,
                         depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, kercn, cvt[1]) : "noconvert",
                         MINMAX_STRUCT_ALIGNMENT);

    ocl::Kernel k("minmaxloc", ocl::core::minmaxloc_oclsrc, opts);
    if (k.empty())
        return false;

    // One slot per workgroup for each requested output, plus alignment slack
    // for the five possible sections of the result structure.
    int esz = CV_ELEM_SIZE(ddepth), esz32s = CV_ELEM_SIZE1(CV_32S),
            dbsize = groupnum * ((needMinVal ? esz : 0) + (needMaxVal ? esz : 0) +
                                 (needMinLoc ? esz32s : 0) + (needMaxLoc ? esz32s : 0) +
                                 (maxVal2 ? esz : 0))
                     + 5 * MINMAX_STRUCT_ALIGNMENT;
    UMat src = _src.getUMat(), src2 = _src2.getUMat(), db(1, dbsize, CV_8UC1), mask = _mask.getUMat();

    // Without a mask, multi-channel data can be flattened to a single channel.
    if (cn > 1 && !haveMask)
    {
        src = src.reshape(1);
        src2 = src2.reshape(1);
    }

    if (haveSrc2)
    {
        if (!haveMask)
            k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
                   groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(src2));
        else
            k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
                   groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask),
                   ocl::KernelArg::ReadOnlyNoSize(src2));
    }
    else
    {
        if (!haveMask)
            k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
                   groupnum, ocl::KernelArg::PtrWriteOnly(db));
        else
            k.args(ocl::KernelArg::ReadOnlyNoSize(src), src.cols, (int)src.total(),
                   groupnum, ocl::KernelArg::PtrWriteOnly(db), ocl::KernelArg::ReadOnlyNoSize(mask));
    }

    size_t globalsize = groupnum * wgs;
    if (!k.run(1, &globalsize, &wgs, true))
        return false;

    // Finalizers indexed by accumulator depth (CV_8U..CV_64F).
    static const getMinMaxResFunc functab[7] =
    {
        getMinMaxRes<uchar>,
        getMinMaxRes<char>,
        getMinMaxRes<ushort>,
        getMinMaxRes<short>,
        getMinMaxRes<int>,
        getMinMaxRes<float>,
        getMinMaxRes<double>
    };

    getMinMaxResFunc func = functab[ddepth];

    // locTemp receives a location the caller did not ask for but that was
    // force-enabled above to detect an all-zero mask.
    int locTemp[2];
    func(db.getMat(ACCESS_READ), minVal, maxVal,
         needMinLoc ? minLoc ? minLoc : locTemp : minLoc,
         needMaxLoc ? maxLoc ? maxLoc : locTemp : maxLoc,
         groupnum, src.cols, maxVal2);

    return true;
}
||||
|
||||
#endif |
||||
|
||||
#ifdef HAVE_OPENVX |
||||
namespace ovx { |
||||
template <> inline bool skipSmallImages<VX_KERNEL_MINMAXLOC>(int w, int h) { return w*h < 3840 * 2160; } |
||||
} |
||||
static bool openvx_minMaxIdx(Mat &src, double* minVal, double* maxVal, int* minIdx, int* maxIdx, Mat &mask) |
||||
{ |
||||
int stype = src.type(); |
||||
size_t total_size = src.total(); |
||||
int rows = src.size[0], cols = rows ? (int)(total_size / rows) : 0; |
||||
if ((stype != CV_8UC1 && stype != CV_16SC1) || !mask.empty() || |
||||
(src.dims != 2 && !(src.isContinuous() && cols > 0 && (size_t)rows*cols == total_size)) |
||||
) |
||||
return false; |
||||
|
||||
try |
||||
{ |
||||
ivx::Context ctx = ovx::getOpenVXContext(); |
||||
ivx::Image |
||||
ia = ivx::Image::createFromHandle(ctx, stype == CV_8UC1 ? VX_DF_IMAGE_U8 : VX_DF_IMAGE_S16, |
||||
ivx::Image::createAddressing(cols, rows, stype == CV_8UC1 ? 1 : 2, (vx_int32)(src.step[0])), src.ptr()); |
||||
|
||||
ivx::Scalar vxMinVal = ivx::Scalar::create(ctx, stype == CV_8UC1 ? VX_TYPE_UINT8 : VX_TYPE_INT16, 0); |
||||
ivx::Scalar vxMaxVal = ivx::Scalar::create(ctx, stype == CV_8UC1 ? VX_TYPE_UINT8 : VX_TYPE_INT16, 0); |
||||
ivx::Array vxMinInd, vxMaxInd; |
||||
ivx::Scalar vxMinCount, vxMaxCount; |
||||
if (minIdx) |
||||
{ |
||||
vxMinInd = ivx::Array::create(ctx, VX_TYPE_COORDINATES2D, 1); |
||||
vxMinCount = ivx::Scalar::create(ctx, VX_TYPE_UINT32, 0); |
||||
} |
||||
if (maxIdx) |
||||
{ |
||||
vxMaxInd = ivx::Array::create(ctx, VX_TYPE_COORDINATES2D, 1); |
||||
vxMaxCount = ivx::Scalar::create(ctx, VX_TYPE_UINT32, 0); |
||||
} |
||||
|
||||
ivx::IVX_CHECK_STATUS(vxuMinMaxLoc(ctx, ia, vxMinVal, vxMaxVal, vxMinInd, vxMaxInd, vxMinCount, vxMaxCount)); |
||||
|
||||
if (minVal) |
||||
{ |
||||
*minVal = stype == CV_8UC1 ? vxMinVal.getValue<vx_uint8>() : vxMinVal.getValue<vx_int16>(); |
||||
} |
||||
if (maxVal) |
||||
{ |
||||
*maxVal = stype == CV_8UC1 ? vxMaxVal.getValue<vx_uint8>() : vxMaxVal.getValue<vx_int16>(); |
||||
} |
||||
if (minIdx) |
||||
{ |
||||
if(vxMinCount.getValue<vx_uint32>()<1) throw ivx::RuntimeError(VX_ERROR_INVALID_VALUE, std::string(__func__) + "(): minimum value location not found"); |
||||
vx_coordinates2d_t loc; |
||||
vxMinInd.copyRangeTo(0, 1, &loc); |
||||
size_t minidx = loc.y * cols + loc.x + 1; |
||||
ofs2idx(src, minidx, minIdx); |
||||
} |
||||
if (maxIdx) |
||||
{ |
||||
if (vxMaxCount.getValue<vx_uint32>()<1) throw ivx::RuntimeError(VX_ERROR_INVALID_VALUE, std::string(__func__) + "(): maximum value location not found"); |
||||
vx_coordinates2d_t loc; |
||||
vxMaxInd.copyRangeTo(0, 1, &loc); |
||||
size_t maxidx = loc.y * cols + loc.x + 1; |
||||
ofs2idx(src, maxidx, maxIdx); |
||||
} |
||||
} |
||||
catch (ivx::RuntimeError & e) |
||||
{ |
||||
VX_DbgThrow(e.what()); |
||||
} |
||||
catch (ivx::WrapperError & e) |
||||
{ |
||||
VX_DbgThrow(e.what()); |
||||
} |
||||
|
||||
return true; |
||||
} |
||||
#endif |
||||
|
||||
#ifdef HAVE_IPP |
||||
// Dispatch ippiMinMaxIndx_*_C1R by runtime data type (unmasked variant).
// The trailing mask pointer/step parameters exist only to match IppMinMaxSelector.
static IppStatus ipp_minMaxIndex_wrap(const void* pSrc, int srcStep, IppiSize size, IppDataType dataType,
                                      float* pMinVal, float* pMaxVal, IppiPoint* pMinIndex, IppiPoint* pMaxIndex, const Ipp8u*, int)
{
    if (dataType == ipp8u)
        return CV_INSTRUMENT_FUN_IPP(ippiMinMaxIndx_8u_C1R, (const Ipp8u*)pSrc, srcStep, size, pMinVal, pMaxVal, pMinIndex, pMaxIndex);
    if (dataType == ipp16u)
        return CV_INSTRUMENT_FUN_IPP(ippiMinMaxIndx_16u_C1R, (const Ipp16u*)pSrc, srcStep, size, pMinVal, pMaxVal, pMinIndex, pMaxIndex);
    if (dataType == ipp32f)
        return CV_INSTRUMENT_FUN_IPP(ippiMinMaxIndx_32f_C1R, (const Ipp32f*)pSrc, srcStep, size, pMinVal, pMaxVal, pMinIndex, pMaxIndex);
    return ippStsDataTypeErr;
}
||||
|
||||
// Dispatch ippiMinMaxIndx_*_C1MR by runtime data type (masked variant).
static IppStatus ipp_minMaxIndexMask_wrap(const void* pSrc, int srcStep, IppiSize size, IppDataType dataType,
                                          float* pMinVal, float* pMaxVal, IppiPoint* pMinIndex, IppiPoint* pMaxIndex, const Ipp8u* pMask, int maskStep)
{
    if (dataType == ipp8u)
        return CV_INSTRUMENT_FUN_IPP(ippiMinMaxIndx_8u_C1MR, (const Ipp8u*)pSrc, srcStep, pMask, maskStep, size, pMinVal, pMaxVal, pMinIndex, pMaxIndex);
    if (dataType == ipp16u)
        return CV_INSTRUMENT_FUN_IPP(ippiMinMaxIndx_16u_C1MR, (const Ipp16u*)pSrc, srcStep, pMask, maskStep, size, pMinVal, pMaxVal, pMinIndex, pMaxIndex);
    if (dataType == ipp32f)
        return CV_INSTRUMENT_FUN_IPP(ippiMinMaxIndx_32f_C1MR, (const Ipp32f*)pSrc, srcStep, pMask, maskStep, size, pMinVal, pMaxVal, pMinIndex, pMaxIndex);
    return ippStsDataTypeErr;
}
||||
|
||||
// Values-only fast path: dispatch ippiMinMax_*_C1R when no locations are
// requested. The unused IppiPoint/mask parameters keep the signature
// compatible with IppMinMaxSelector. Falls back to the index-computing
// variant for types ippiMinMax does not cover.
static IppStatus ipp_minMax_wrap(const void* pSrc, int srcStep, IppiSize size, IppDataType dataType,
                                 float* pMinVal, float* pMaxVal, IppiPoint*, IppiPoint*, const Ipp8u*, int)
{
    IppStatus status;

    switch(dataType)
    {
#if IPP_VERSION_X100 > 201701 // wrong min values
    case ipp8u:
    {
        Ipp8u val[2];
        status = CV_INSTRUMENT_FUN_IPP(ippiMinMax_8u_C1R, (const Ipp8u*)pSrc, srcStep, size, &val[0], &val[1]);
        // Widen the native-type results to the float out-parameters.
        *pMinVal = val[0];
        *pMaxVal = val[1];
        return status;
    }
#endif
    case ipp16u:
    {
        Ipp16u val[2];
        status = CV_INSTRUMENT_FUN_IPP(ippiMinMax_16u_C1R, (const Ipp16u*)pSrc, srcStep, size, &val[0], &val[1]);
        *pMinVal = val[0];
        *pMaxVal = val[1];
        return status;
    }
    case ipp16s:
    {
        Ipp16s val[2];
        status = CV_INSTRUMENT_FUN_IPP(ippiMinMax_16s_C1R, (const Ipp16s*)pSrc, srcStep, size, &val[0], &val[1]);
        *pMinVal = val[0];
        *pMaxVal = val[1];
        return status;
    }
    case ipp32f: return CV_INSTRUMENT_FUN_IPP(ippiMinMax_32f_C1R, (const Ipp32f*)pSrc, srcStep, size, pMinVal, pMaxVal);
    default: return ipp_minMaxIndex_wrap(pSrc, srcStep, size, dataType, pMinVal, pMaxVal, NULL, NULL, NULL, 0);
    }
}
||||
|
||||
// Minimum-only fast path: dispatch ippiMinIndx_*_C1R when the caller asked
// for the minimum value/index but not the maximum. Unsupported types fall
// back to the full min+max index variant.
static IppStatus ipp_minIdx_wrap(const void* pSrc, int srcStep, IppiSize size, IppDataType dataType,
                                 float* pMinVal, float*, IppiPoint* pMinIndex, IppiPoint*, const Ipp8u*, int)
{
    IppStatus status;

    switch(dataType)
    {
    case ipp8u:
    {
        Ipp8u val;
        status = CV_INSTRUMENT_FUN_IPP(ippiMinIndx_8u_C1R, (const Ipp8u*)pSrc, srcStep, size, &val, &pMinIndex->x, &pMinIndex->y);
        *pMinVal = val;
        return status;
    }
    case ipp16u:
    {
        Ipp16u val;
        status = CV_INSTRUMENT_FUN_IPP(ippiMinIndx_16u_C1R, (const Ipp16u*)pSrc, srcStep, size, &val, &pMinIndex->x, &pMinIndex->y);
        *pMinVal = val;
        return status;
    }
    case ipp16s:
    {
        Ipp16s val;
        status = CV_INSTRUMENT_FUN_IPP(ippiMinIndx_16s_C1R, (const Ipp16s*)pSrc, srcStep, size, &val, &pMinIndex->x, &pMinIndex->y);
        *pMinVal = val;
        return status;
    }
    case ipp32f: return CV_INSTRUMENT_FUN_IPP(ippiMinIndx_32f_C1R, (const Ipp32f*)pSrc, srcStep, size, pMinVal, &pMinIndex->x, &pMinIndex->y);
    default: return ipp_minMaxIndex_wrap(pSrc, srcStep, size, dataType, pMinVal, NULL, pMinIndex, NULL, NULL, 0);
    }
}
||||
|
||||
// Maximum-only fast path: dispatch ippiMaxIndx_*_C1R when the caller asked
// for the maximum value/index but not the minimum. Unsupported types fall
// back to the full min+max index variant.
static IppStatus ipp_maxIdx_wrap(const void* pSrc, int srcStep, IppiSize size, IppDataType dataType,
                                 float*, float* pMaxVal, IppiPoint*, IppiPoint* pMaxIndex, const Ipp8u*, int)
{
    IppStatus status;

    switch(dataType)
    {
    case ipp8u:
    {
        Ipp8u val;
        status = CV_INSTRUMENT_FUN_IPP(ippiMaxIndx_8u_C1R, (const Ipp8u*)pSrc, srcStep, size, &val, &pMaxIndex->x, &pMaxIndex->y);
        *pMaxVal = val;
        return status;
    }
    case ipp16u:
    {
        Ipp16u val;
        status = CV_INSTRUMENT_FUN_IPP(ippiMaxIndx_16u_C1R, (const Ipp16u*)pSrc, srcStep, size, &val, &pMaxIndex->x, &pMaxIndex->y);
        *pMaxVal = val;
        return status;
    }
    case ipp16s:
    {
        Ipp16s val;
        status = CV_INSTRUMENT_FUN_IPP(ippiMaxIndx_16s_C1R, (const Ipp16s*)pSrc, srcStep, size, &val, &pMaxIndex->x, &pMaxIndex->y);
        *pMaxVal = val;
        return status;
    }
    case ipp32f: return CV_INSTRUMENT_FUN_IPP(ippiMaxIndx_32f_C1R, (const Ipp32f*)pSrc, srcStep, size, pMaxVal, &pMaxIndex->x, &pMaxIndex->y);
    default: return ipp_minMaxIndex_wrap(pSrc, srcStep, size, dataType, NULL, pMaxVal, NULL, pMaxIndex, NULL, 0);
    }
}
||||
|
||||
typedef IppStatus (*IppMinMaxSelector)(const void* pSrc, int srcStep, IppiSize size, IppDataType dataType, |
||||
float* pMinVal, float* pMaxVal, IppiPoint* pMinIndex, IppiPoint* pMaxIndex, const Ipp8u* pMask, int maskStep); |
||||
|
||||
static bool ipp_minMaxIdx(Mat &src, double* _minVal, double* _maxVal, int* _minIdx, int* _maxIdx, Mat &mask) |
||||
{ |
||||
#if IPP_VERSION_X100 >= 700 |
||||
CV_INSTRUMENT_REGION_IPP() |
||||
|
||||
#if IPP_VERSION_X100 < 201800 |
||||
// cv::minMaxIdx problem with NaN input
|
||||
// Disable 32F processing only
|
||||
if(src.depth() == CV_32F && cv::ipp::getIppTopFeatures() == ippCPUID_SSE42) |
||||
return false; |
||||
#endif |
||||
|
||||
#if IPP_VERSION_X100 < 201801 |
||||
// cv::minMaxIdx problem with index positions on AVX
|
||||
if(!mask.empty() && _maxIdx && cv::ipp::getIppTopFeatures() != ippCPUID_SSE42) |
||||
return false; |
||||
#endif |
||||
|
||||
IppStatus status; |
||||
IppDataType dataType = ippiGetDataType(src.depth()); |
||||
float minVal = 0; |
||||
float maxVal = 0; |
||||
IppiPoint minIdx = {-1, -1}; |
||||
IppiPoint maxIdx = {-1, -1}; |
||||
|
||||
float *pMinVal = (_minVal || _minIdx)?&minVal:NULL; |
||||
float *pMaxVal = (_maxVal || _maxIdx)?&maxVal:NULL; |
||||
IppiPoint *pMinIdx = (_minIdx)?&minIdx:NULL; |
||||
IppiPoint *pMaxIdx = (_maxIdx)?&maxIdx:NULL; |
||||
|
||||
IppMinMaxSelector ippMinMaxFun = ipp_minMaxIndexMask_wrap; |
||||
if(mask.empty()) |
||||
{ |
||||
if(_maxVal && _maxIdx && !_minVal && !_minIdx) |
||||
ippMinMaxFun = ipp_maxIdx_wrap; |
||||
else if(!_maxVal && !_maxIdx && _minVal && _minIdx) |
||||
ippMinMaxFun = ipp_minIdx_wrap; |
||||
else if(_maxVal && !_maxIdx && _minVal && !_minIdx) |
||||
ippMinMaxFun = ipp_minMax_wrap; |
||||
else if(!_maxVal && !_maxIdx && !_minVal && !_minIdx) |
||||
return false; |
||||
else |
||||
ippMinMaxFun = ipp_minMaxIndex_wrap; |
||||
} |
||||
|
||||
if(src.dims <= 2) |
||||
{ |
||||
IppiSize size = ippiSize(src.size()); |
||||
size.width *= src.channels(); |
||||
|
||||
status = ippMinMaxFun(src.ptr(), (int)src.step, size, dataType, pMinVal, pMaxVal, pMinIdx, pMaxIdx, (Ipp8u*)mask.ptr(), (int)mask.step); |
||||
if(status < 0) |
||||
return false; |
||||
if(_minVal) |
||||
*_minVal = minVal; |
||||
if(_maxVal) |
||||
*_maxVal = maxVal; |
||||
if(_minIdx) |
||||
{ |
||||
#if IPP_VERSION_X100 < 201801 |
||||
// Should be just ippStsNoOperation check, but there is a bug in the function so we need additional checks
|
||||
if(status == ippStsNoOperation && !mask.empty() && !pMinIdx->x && !pMinIdx->y) |
||||
#else |
||||
if(status == ippStsNoOperation) |
||||
#endif |
||||
{ |
||||
_minIdx[0] = -1; |
||||
_minIdx[1] = -1; |
||||
} |
||||
else |
||||
{ |
||||
_minIdx[0] = minIdx.y; |
||||
_minIdx[1] = minIdx.x; |
||||
} |
||||
} |
||||
if(_maxIdx) |
||||
{ |
||||
#if IPP_VERSION_X100 < 201801 |
||||
// Should be just ippStsNoOperation check, but there is a bug in the function so we need additional checks
|
||||
if(status == ippStsNoOperation && !mask.empty() && !pMaxIdx->x && !pMaxIdx->y) |
||||
#else |
||||
if(status == ippStsNoOperation) |
||||
#endif |
||||
{ |
||||
_maxIdx[0] = -1; |
||||
_maxIdx[1] = -1; |
||||
} |
||||
else |
||||
{ |
||||
_maxIdx[0] = maxIdx.y; |
||||
_maxIdx[1] = maxIdx.x; |
||||
} |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
const Mat *arrays[] = {&src, mask.empty()?NULL:&mask, NULL}; |
||||
uchar *ptrs[3] = {NULL}; |
||||
NAryMatIterator it(arrays, ptrs); |
||||
IppiSize size = ippiSize(it.size*src.channels(), 1); |
||||
int srcStep = (int)(size.width*src.elemSize1()); |
||||
int maskStep = size.width; |
||||
size_t idxPos = 1; |
||||
size_t minIdxAll = 0; |
||||
size_t maxIdxAll = 0; |
||||
float minValAll = IPP_MAXABS_32F; |
||||
float maxValAll = -IPP_MAXABS_32F; |
||||
|
||||
for(size_t i = 0; i < it.nplanes; i++, ++it, idxPos += size.width) |
||||
{ |
||||
status = ippMinMaxFun(ptrs[0], srcStep, size, dataType, pMinVal, pMaxVal, pMinIdx, pMaxIdx, ptrs[1], maskStep); |
||||
if(status < 0) |
||||
return false; |
||||
#if IPP_VERSION_X100 > 201701 |
||||
// Zero-mask check, function should return ippStsNoOperation warning
|
||||
if(status == ippStsNoOperation) |
||||
continue; |
||||
#else |
||||
// Crude zero-mask check, waiting for fix in IPP function
|
||||
if(ptrs[1]) |
||||
{ |
||||
Mat localMask(Size(size.width, 1), CV_8U, ptrs[1], maskStep); |
||||
if(!cv::countNonZero(localMask)) |
||||
continue; |
||||
} |
||||
#endif |
||||
|
||||
if(_minVal && minVal < minValAll) |
||||
{ |
||||
minValAll = minVal; |
||||
minIdxAll = idxPos+minIdx.x; |
||||
} |
||||
if(_maxVal && maxVal > maxValAll) |
||||
{ |
||||
maxValAll = maxVal; |
||||
maxIdxAll = idxPos+maxIdx.x; |
||||
} |
||||
} |
||||
if(!src.empty() && mask.empty()) |
||||
{ |
||||
if(minIdxAll == 0) |
||||
minIdxAll = 1; |
||||
if(maxValAll == 0) |
||||
maxValAll = 1; |
||||
} |
||||
|
||||
if(_minVal) |
||||
*_minVal = minValAll; |
||||
if(_maxVal) |
||||
*_maxVal = maxValAll; |
||||
if(_minIdx) |
||||
ofs2idx(src, minIdxAll, _minIdx); |
||||
if(_maxIdx) |
||||
ofs2idx(src, maxIdxAll, _maxIdx); |
||||
} |
||||
|
||||
return true; |
||||
#else |
||||
CV_UNUSED(src); CV_UNUSED(minVal); CV_UNUSED(maxVal); CV_UNUSED(minIdx); CV_UNUSED(maxIdx); CV_UNUSED(mask); |
||||
return false; |
||||
#endif |
||||
} |
||||
#endif |
||||
|
||||
} |
||||
|
||||
// Finds the global minimum and maximum of an array and, optionally, their
// positions. All output pointers are optional; _minIdx/_maxIdx each receive
// src.dims indices (for 2D input: {row, col}). Multi-channel input is allowed
// only without a mask and without index outputs. Dispatches to OpenCL, HAL,
// OpenVX and IPP fast paths before the generic CPU implementation.
void cv::minMaxIdx(InputArray _src, double* minVal,
                   double* maxVal, int* minIdx, int* maxIdx,
                   InputArray _mask)
{
    CV_INSTRUMENT_REGION()

    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    CV_Assert( (cn == 1 && (_mask.empty() || _mask.type() == CV_8U)) ||
               (cn > 1 && _mask.empty() && !minIdx && !maxIdx) );

    CV_OCL_RUN(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2 && (_mask.empty() || _src.size() == _mask.size()),
               ocl_minMaxIdx(_src, minVal, maxVal, minIdx, maxIdx, _mask))

    Mat src = _src.getMat(), mask = _mask.getMat();

    if (src.dims <= 2)
        CALL_HAL(minMaxIdx, cv_hal_minMaxIdx, src.data, src.step, src.cols, src.rows, src.depth(), minVal, maxVal,
                 minIdx, maxIdx, mask.data);

    CV_OVX_RUN(!ovx::skipSmallImages<VX_KERNEL_MINMAXLOC>(src.cols, src.rows),
               openvx_minMaxIdx(src, minVal, maxVal, minIdx, maxIdx, mask))

    CV_IPP_RUN_FAST(ipp_minMaxIdx(src, minVal, maxVal, minIdx, maxIdx, mask))

    MinMaxIdxFunc func = getMinmaxTab(depth);
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &mask, 0};
    uchar* ptrs[2];
    NAryMatIterator it(arrays, ptrs);

    // Running state across planes. minidx/maxidx are 1-based flat offsets;
    // 0 is the "not found" sentinel (e.g. fully-zero mask, all-NaN input).
    size_t minidx = 0, maxidx = 0;
    int iminval = INT_MAX, imaxval = INT_MIN;
    float fminval = std::numeric_limits<float>::infinity(), fmaxval = -fminval;
    double dminval = std::numeric_limits<double>::infinity(), dmaxval = -dminval;
    size_t startidx = 1;
    int *minval = &iminval, *maxval = &imaxval;
    int planeSize = (int)it.size*cn;

    // The per-depth worker writes through these pointers at its native depth;
    // int* here is just an opaque slot, retargeted for float/double input.
    if( depth == CV_32F )
        minval = (int*)&fminval, maxval = (int*)&fmaxval;
    else if( depth == CV_64F )
        minval = (int*)&dminval, maxval = (int*)&dmaxval;

    for( size_t i = 0; i < it.nplanes; i++, ++it, startidx += planeSize )
        func( ptrs[0], ptrs[1], minval, maxval, &minidx, &maxidx, planeSize, startidx );

    // A non-empty unmasked array always has extrema: map the "not found"
    // sentinel to the first element.
    if (!src.empty() && mask.empty())
    {
        if( minidx == 0 )
            minidx = 1;
        if( maxidx == 0 )
            maxidx = 1;
    }

    // Widen the depth-specific accumulators to double for the output.
    if( minidx == 0 )
        dminval = dmaxval = 0;
    else if( depth == CV_32F )
        dminval = fminval, dmaxval = fmaxval;
    else if( depth <= CV_32S )
        dminval = iminval, dmaxval = imaxval;

    if( minVal )
        *minVal = dminval;
    if( maxVal )
        *maxVal = dmaxval;

    if( minIdx )
        ofs2idx(src, minidx, minIdx);
    if( maxIdx )
        ofs2idx(src, maxidx, maxIdx);
}
||||
|
||||
// 2D-only convenience wrapper over minMaxIdx that reports positions as
// cv::Point. minMaxIdx fills each location as {row, col}, while Point stores
// (x, y) = (col, row), so both results are transposed before returning.
void cv::minMaxLoc( InputArray _img, double* minVal, double* maxVal,
                    Point* minLoc, Point* maxLoc, InputArray mask )
{
    CV_INSTRUMENT_REGION()

    CV_Assert(_img.dims() <= 2);

    minMaxIdx(_img, minVal, maxVal, (int*)minLoc, (int*)maxLoc, mask);
    if( minLoc )
    {
        int row = minLoc->x;
        minLoc->x = minLoc->y;
        minLoc->y = row;
    }
    if( maxLoc )
    {
        int row = maxLoc->x;
        maxLoc->x = maxLoc->y;
        maxLoc->y = row;
    }
}
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,43 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html
|
||||
|
||||
|
||||
#ifndef SRC_STAT_HPP |
||||
#define SRC_STAT_HPP |
||||
|
||||
#include "opencv2/core/mat.hpp" |
||||
|
||||
namespace cv { |
||||
|
||||
#ifdef HAVE_OPENCL

// Reduction operation selector for ocl_sum().
enum { OCL_OP_SUM = 0, OCL_OP_SUM_ABS = 1, OCL_OP_SUM_SQR = 2 };
// OpenCL sum/absdiff-sum reduction; returns false when the OpenCL path is unavailable.
bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask = noArray(),
              InputArray _src2 = noArray(), bool calc2 = false, const Scalar & res2 = Scalar() );
// OpenCL min/max reduction with optional locations; see minmax.cpp for details.
bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int* minLoc, int* maxLoc, InputArray _mask,
                    int ddepth = -1, bool absValues = false, InputArray _src2 = noArray(), double * maxVal2 = NULL);

// Sums a single-row matrix of element type T on the host, accumulating each
// channel into the corresponding Scalar component. Used to finalize partial
// sums produced by the OpenCL reduction kernel.
template <typename T> Scalar ocl_part_sum(Mat m)
{
    CV_Assert(m.rows == 1);

    Scalar s = Scalar::all(0);
    int cn = m.channels();
    const T * const ptr = m.ptr<T>(0);

    // x walks all scalar elements; c cycles through the channels.
    for (int x = 0, w = m.cols * cn; x < w; )
        for (int c = 0; c < cn; ++c, ++x)
            s[c] += ptr[x];

    return s;
}

#endif

// Per-depth CPU sum worker: (src, mask, accumulator, length, channels) -> count processed.
typedef int (*SumFunc)(const uchar*, const uchar* mask, uchar*, int, int);
SumFunc getSumFunc(int depth);
||||
|
||||
} |
||||
|
||||
#endif // SRC_STAT_HPP
|
@ -0,0 +1,119 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html
|
||||
|
||||
|
||||
#include "precomp.hpp" |
||||
|
||||
// Legacy C API: per-channel sum of an array. For IplImages with a channel of
// interest (COI) selected, only that channel's sum is returned in component 0.
CV_IMPL CvScalar cvSum( const CvArr* srcarr )
{
    cv::Scalar sum = cv::sum(cv::cvarrToMat(srcarr, false, true, 1));
    if( !CV_IS_IMAGE(srcarr) )
        return sum;

    int coi = cvGetImageCOI((IplImage*)srcarr);
    if( coi == 0 )
        return sum;

    CV_Assert( 0 < coi && coi <= 4 );
    return cv::Scalar(sum[coi-1]);
}
||||
|
||||
// Legacy C API: number of non-zero elements. Multi-channel arrays are reduced
// to their channel of interest before counting.
CV_IMPL int cvCountNonZero( const CvArr* imgarr )
{
    cv::Mat m = cv::cvarrToMat(imgarr, false, true, 1);
    bool multiChannel = m.channels() > 1;
    if( multiChannel )
        cv::extractImageCOI(imgarr, m);
    return countNonZero(m);
}
||||
|
||||
|
||||
// Legacy C API: per-channel mean, optionally restricted by a mask. For
// IplImages with a channel of interest (COI) selected, only that channel's
// mean is returned in component 0.
CV_IMPL CvScalar
cvAvg( const void* imgarr, const void* maskarr )
{
    cv::Mat img = cv::cvarrToMat(imgarr, false, true, 1);
    cv::Scalar mean = !maskarr ? cv::mean(img) : cv::mean(img, cv::cvarrToMat(maskarr));
    if( CV_IS_IMAGE(imgarr) )
    {
        int coi = cvGetImageCOI((IplImage*)imgarr);
        if( coi )
        {
            CV_Assert( 0 < coi && coi <= 4 );
            // Collapse to the selected channel.
            mean = cv::Scalar(mean[coi-1]);
        }
    }
    return mean;
}
||||
|
||||
|
||||
// Legacy C API: per-channel mean and standard deviation, optionally masked.
// Results are written to the optional out-parameters _mean and _sdv; for
// IplImages with a COI selected, only that channel's statistics are reported.
CV_IMPL void
cvAvgSdv( const CvArr* imgarr, CvScalar* _mean, CvScalar* _sdv, const void* maskarr )
{
    cv::Scalar mean, sdv;

    cv::Mat mask;
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);

    cv::meanStdDev(cv::cvarrToMat(imgarr, false, true, 1), mean, sdv, mask );

    if( CV_IS_IMAGE(imgarr) )
    {
        int coi = cvGetImageCOI((IplImage*)imgarr);
        if( coi )
        {
            CV_Assert( 0 < coi && coi <= 4 );
            // Collapse both statistics to the selected channel.
            mean = cv::Scalar(mean[coi-1]);
            sdv = cv::Scalar(sdv[coi-1]);
        }
    }

    if( _mean )
        *(cv::Scalar*)_mean = mean;
    if( _sdv )
        *(cv::Scalar*)_sdv = sdv;
}
||||
|
||||
|
||||
// Legacy C API: global minimum/maximum and their positions, optionally masked.
// Multi-channel inputs are reduced to their channel of interest first, since
// cv::minMaxLoc requires single-channel data when locations are requested.
CV_IMPL void
cvMinMaxLoc( const void* imgarr, double* _minVal, double* _maxVal,
             CvPoint* _minLoc, CvPoint* _maxLoc, const void* maskarr )
{
    cv::Mat mask, img = cv::cvarrToMat(imgarr, false, true, 1);
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    if( img.channels() > 1 )
        cv::extractImageCOI(imgarr, img);

    cv::minMaxLoc( img, _minVal, _maxVal,
                   (cv::Point*)_minLoc, (cv::Point*)_maxLoc, mask );
}
||||
|
||||
|
||||
// Legacy C API: norm of one array, or of the difference of two arrays, with
// an optional mask. If only imgB is supplied it is treated as the single
// operand. IplImages with a COI selected are reduced to that channel.
CV_IMPL double
cvNorm( const void* imgA, const void* imgB, int normType, const void* maskarr )
{
    cv::Mat a, mask;
    // Normalize the "only second operand given" case to a single-operand call.
    if( !imgA )
    {
        imgA = imgB;
        imgB = 0;
    }

    a = cv::cvarrToMat(imgA, false, true, 1);
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);

    if( a.channels() > 1 && CV_IS_IMAGE(imgA) && cvGetImageCOI((const IplImage*)imgA) > 0 )
        cv::extractImageCOI(imgA, a);

    if( !imgB )
        return !maskarr ? cv::norm(a, normType) : cv::norm(a, normType, mask);

    cv::Mat b = cv::cvarrToMat(imgB, false, true, 1);
    if( b.channels() > 1 && CV_IS_IMAGE(imgB) && cvGetImageCOI((const IplImage*)imgB) > 0 )
        cv::extractImageCOI(imgB, b);

    return !maskarr ? cv::norm(a, b, normType) : cv::norm(a, b, normType, mask);
}
@ -0,0 +1,647 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html
|
||||
|
||||
|
||||
#include "precomp.hpp" |
||||
#include "opencl_kernels_core.hpp" |
||||
#include "stat.hpp" |
||||
|
||||
namespace cv |
||||
{ |
||||
|
||||
// Generic (non-vectorized) fallback for the SIMD sum helper: reports that 0
// elements were processed so the caller's scalar loop handles everything.
// Specializations below override this for types/platforms with SIMD support.
template <typename T, typename ST>
struct Sum_SIMD
{
    int operator () (const T *, const uchar *, ST *, int, int) const
    {
        return 0;
    }
};
||||
|
||||
// Accumulates a 4-element SIMD lane buffer into a cn-channel destination:
// lane i contributes to channel i % cn (valid for cn == 1, 2 or 4, where the
// channel pattern repeats cleanly across the 4 lanes).
template <typename ST, typename DT>
inline void addChannels(DT * dst, ST * buf, int cn)
{
    int c = 0;
    for (int i = 0; i < 4; ++i)
    {
        dst[c] += buf[i];
        if (++c == cn)
            c = 0;
    }
}
||||
|
||||
#if CV_SSE2 |
||||
|
||||
// SSE2 sum for signed 8-bit data: processes 16 (then 8) elements per
// iteration, sign-extending schar -> int before accumulating into four
// 32-bit lanes. Returns the number of *vectors* (x / cn) processed; the
// caller finishes the tail with scalar code. Bails out (returns 0) for
// masked input or unsupported channel counts.
template <>
struct Sum_SIMD<schar, int>
{
    int operator () (const schar * src0, const uchar * mask, int * dst, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2)
            return 0;

        int x = 0;
        __m128i v_zero = _mm_setzero_si128(), v_sum = v_zero;

        for ( ; x <= len - 16; x += 16)
        {
            __m128i v_src = _mm_loadu_si128((const __m128i *)(src0 + x));
            // Interleave-with-zero + arithmetic shift right: sign-extends
            // 8-bit lanes to 16-bit (and below, 16-bit to 32-bit).
            __m128i v_half = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src), 8);

            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));

            v_half = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_src), 8);
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_half), 16));
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_half), 16));
        }

        for ( ; x <= len - 8; x += 8)
        {
            __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src0 + x))), 8);

            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
            v_sum = _mm_add_epi32(v_sum, _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
        }

        // Spill the four partial sums and fold them into the per-channel dst.
        int CV_DECL_ALIGNED(16) ar[4];
        _mm_store_si128((__m128i*)ar, v_sum);

        addChannels(dst, ar, cn);

        return x / cn;
    }
};

// SSE2 sum for 32-bit int data accumulated in double: converts each half of
// the 4-lane integer vector to two doubles and keeps two running double sums.
template <>
struct Sum_SIMD<int, double>
{
    int operator () (const int * src0, const uchar * mask, double * dst, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2)
            return 0;

        int x = 0;
        __m128d v_zero = _mm_setzero_pd(), v_sum0 = v_zero, v_sum1 = v_zero;

        for ( ; x <= len - 4; x += 4)
        {
            __m128i v_src = _mm_loadu_si128((__m128i const *)(src0 + x));
            v_sum0 = _mm_add_pd(v_sum0, _mm_cvtepi32_pd(v_src));
            v_sum1 = _mm_add_pd(v_sum1, _mm_cvtepi32_pd(_mm_srli_si128(v_src, 8)));
        }

        double CV_DECL_ALIGNED(16) ar[4];
        _mm_store_pd(ar, v_sum0);
        _mm_store_pd(ar + 2, v_sum1);

        addChannels(dst, ar, cn);

        return x / cn;
    }
};

// SSE2 sum for 32-bit float data accumulated in double, same two-accumulator
// scheme as the int specialization.
template <>
struct Sum_SIMD<float, double>
{
    int operator () (const float * src0, const uchar * mask, double * dst, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2 && cn != 4) || !USE_SSE2)
            return 0;

        int x = 0;
        __m128d v_zero = _mm_setzero_pd(), v_sum0 = v_zero, v_sum1 = v_zero;

        for ( ; x <= len - 4; x += 4)
        {
            __m128 v_src = _mm_loadu_ps(src0 + x);
            v_sum0 = _mm_add_pd(v_sum0, _mm_cvtps_pd(v_src));
            // Shift the upper two floats into the low half before converting.
            v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8));
            v_sum1 = _mm_add_pd(v_sum1, _mm_cvtps_pd(v_src));
        }

        double CV_DECL_ALIGNED(16) ar[4];
        _mm_store_pd(ar, v_sum0);
        _mm_store_pd(ar + 2, v_sum1);

        addChannels(dst, ar, cn);

        return x / cn;
    }
};
||||
|
||||
|
||||
#elif CV_NEON |
||||
|
||||
// NEON sum for unsigned 8-bit data: widens uchar lanes to u16 then
// accumulates into a 4-lane u32 sum. Returns the number of *vectors*
// (x / cn) processed; the caller finishes the tail with scalar code.
// Bails out (returns 0) for masked input or unsupported channel counts.
template <>
struct Sum_SIMD<uchar, int>
{
    int operator () (const uchar * src0, const uchar * mask, int * dst, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2 && cn != 4))
            return 0;

        int x = 0;
        uint32x4_t v_sum = vdupq_n_u32(0u);

        for ( ; x <= len - 16; x += 16)
        {
            uint8x16_t v_src = vld1q_u8(src0 + x);
            uint16x8_t v_half = vmovl_u8(vget_low_u8(v_src));

            v_sum = vaddw_u16(v_sum, vget_low_u16(v_half));
            v_sum = vaddw_u16(v_sum, vget_high_u16(v_half));

            v_half = vmovl_u8(vget_high_u8(v_src));
            v_sum = vaddw_u16(v_sum, vget_low_u16(v_half));
            v_sum = vaddw_u16(v_sum, vget_high_u16(v_half));
        }

        for ( ; x <= len - 8; x += 8)
        {
            uint16x8_t v_src = vmovl_u8(vld1_u8(src0 + x));

            v_sum = vaddw_u16(v_sum, vget_low_u16(v_src));
            v_sum = vaddw_u16(v_sum, vget_high_u16(v_src));
        }

        // Spill the four partial sums and fold them into the per-channel dst.
        unsigned int CV_DECL_ALIGNED(16) ar[4];
        vst1q_u32(ar, v_sum);

        addChannels(dst, ar, cn);

        return x / cn;
    }
};

// NEON sum for signed 8-bit data: same structure as the uchar specialization,
// using the signed widening intrinsics.
template <>
struct Sum_SIMD<schar, int>
{
    int operator () (const schar * src0, const uchar * mask, int * dst, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2 && cn != 4))
            return 0;

        int x = 0;
        int32x4_t v_sum = vdupq_n_s32(0);

        for ( ; x <= len - 16; x += 16)
        {
            int8x16_t v_src = vld1q_s8(src0 + x);
            int16x8_t v_half = vmovl_s8(vget_low_s8(v_src));

            v_sum = vaddw_s16(v_sum, vget_low_s16(v_half));
            v_sum = vaddw_s16(v_sum, vget_high_s16(v_half));

            v_half = vmovl_s8(vget_high_s8(v_src));
            v_sum = vaddw_s16(v_sum, vget_low_s16(v_half));
            v_sum = vaddw_s16(v_sum, vget_high_s16(v_half));
        }

        for ( ; x <= len - 8; x += 8)
        {
            int16x8_t v_src = vmovl_s8(vld1_s8(src0 + x));

            v_sum = vaddw_s16(v_sum, vget_low_s16(v_src));
            v_sum = vaddw_s16(v_sum, vget_high_s16(v_src));
        }

        int CV_DECL_ALIGNED(16) ar[4];
        vst1q_s32(ar, v_sum);

        addChannels(dst, ar, cn);

        return x / cn;
    }
};

// NEON sum for unsigned 16-bit data: widening-accumulates u16 lanes directly
// into the 4-lane u32 sum, 8 then 4 elements per iteration.
template <>
struct Sum_SIMD<ushort, int>
{
    int operator () (const ushort * src0, const uchar * mask, int * dst, int len, int cn) const
    {
        if (mask || (cn != 1 && cn != 2 && cn != 4))
            return 0;

        int x = 0;
        uint32x4_t v_sum = vdupq_n_u32(0u);

        for ( ; x <= len - 8; x += 8)
        {
            uint16x8_t v_src = vld1q_u16(src0 + x);

            v_sum = vaddw_u16(v_sum, vget_low_u16(v_src));
            v_sum = vaddw_u16(v_sum, vget_high_u16(v_src));
        }

        for ( ; x <= len - 4; x += 4)
            v_sum = vaddw_u16(v_sum, vld1_u16(src0 + x));

        unsigned int CV_DECL_ALIGNED(16) ar[4];
        vst1q_u32(ar, v_sum);

        addChannels(dst, ar, cn);

        return x / cn;
    }
};
||||
|
||||
template <> |
||||
struct Sum_SIMD<short, int> |
||||
{ |
||||
int operator () (const short * src0, const uchar * mask, int * dst, int len, int cn) const |
||||
{ |
||||
if (mask || (cn != 1 && cn != 2 && cn != 4)) |
||||
return 0; |
||||
|
||||
int x = 0; |
||||
int32x4_t v_sum = vdupq_n_s32(0u); |
||||
|
||||
for ( ; x <= len - 8; x += 8) |
||||
{ |
||||
int16x8_t v_src = vld1q_s16(src0 + x); |
||||
|
||||
v_sum = vaddw_s16(v_sum, vget_low_s16(v_src)); |
||||
v_sum = vaddw_s16(v_sum, vget_high_s16(v_src)); |
||||
} |
||||
|
||||
for ( ; x <= len - 4; x += 4) |
||||
v_sum = vaddw_s16(v_sum, vld1_s16(src0 + x)); |
||||
|
||||
int CV_DECL_ALIGNED(16) ar[4]; |
||||
vst1q_s32(ar, v_sum); |
||||
|
||||
addChannels(dst, ar, cn); |
||||
|
||||
return x / cn; |
||||
} |
||||
}; |
||||
|
||||
#endif |
||||
|
||||
// Generic scalar sum of an interleaved cn-channel array: accumulates each
// channel of src0 into dst[0..cn-1] (dst is updated, not overwritten).
// Unmasked path: a SIMD specialization (if any) consumes a prefix, scalar
// loops finish the tail; returns len. Masked path: only pixels with
// mask[i] != 0 contribute; returns the count of non-zero mask entries.
template<typename T, typename ST>
static int sum_(const T* src0, const uchar* mask, ST* dst, int len, int cn )
{
    const T* src = src0;
    if( !mask )
    {
        // i = number of pixels the SIMD helper already accumulated into dst.
        Sum_SIMD<T, ST> vop;
        int i = vop(src0, mask, dst, len, cn), k = cn % 4;
        src += i * cn;

        // Handle the cn % 4 leftover channels (1, 2 or 3) with dedicated
        // loops; full groups of 4 channels are handled by the loop below.
        if( k == 1 )
        {
            ST s0 = dst[0];

#if CV_ENABLE_UNROLLED
            // 4x manual unroll of the single-channel accumulation.
            for(; i <= len - 4; i += 4, src += cn*4 )
                s0 += src[0] + src[cn] + src[cn*2] + src[cn*3];
#endif
            for( ; i < len; i++, src += cn )
                s0 += src[0];
            dst[0] = s0;
        }
        else if( k == 2 )
        {
            ST s0 = dst[0], s1 = dst[1];
            for( ; i < len; i++, src += cn )
            {
                s0 += src[0];
                s1 += src[1];
            }
            dst[0] = s0;
            dst[1] = s1;
        }
        else if( k == 3 )
        {
            ST s0 = dst[0], s1 = dst[1], s2 = dst[2];
            for( ; i < len; i++, src += cn )
            {
                s0 += src[0];
                s1 += src[1];
                s2 += src[2];
            }
            dst[0] = s0;
            dst[1] = s1;
            dst[2] = s2;
        }

        // Remaining full groups of 4 channels; for the cn <= 4 used by the
        // callers this runs at most once (only when cn == 4, i.e. k == 0).
        for( ; k < cn; k += 4 )
        {
            src = src0 + i*cn + k;
            ST s0 = dst[k], s1 = dst[k+1], s2 = dst[k+2], s3 = dst[k+3];
            for( ; i < len; i++, src += cn )
            {
                s0 += src[0]; s1 += src[1];
                s2 += src[2]; s3 += src[3];
            }
            dst[k] = s0;
            dst[k+1] = s1;
            dst[k+2] = s2;
            dst[k+3] = s3;
        }
        return len;
    }

    // Masked path: nzm counts the pixels that actually contributed.
    int i, nzm = 0;
    if( cn == 1 )
    {
        ST s = dst[0];
        for( i = 0; i < len; i++ )
            if( mask[i] )
            {
                s += src[i];
                nzm++;
            }
        dst[0] = s;
    }
    else if( cn == 3 )
    {
        ST s0 = dst[0], s1 = dst[1], s2 = dst[2];
        for( i = 0; i < len; i++, src += 3 )
            if( mask[i] )
            {
                s0 += src[0];
                s1 += src[1];
                s2 += src[2];
                nzm++;
            }
        dst[0] = s0;
        dst[1] = s1;
        dst[2] = s2;
    }
    else
    {
        // Arbitrary channel count: add every channel of each unmasked pixel.
        for( i = 0; i < len; i++, src += cn )
            if( mask[i] )
            {
                int k = 0;
#if CV_ENABLE_UNROLLED
                // 4-channel-at-a-time unroll of the inner channel loop.
                for( ; k <= cn - 4; k += 4 )
                {
                    ST s0, s1;
                    s0 = dst[k] + src[k];
                    s1 = dst[k+1] + src[k+1];
                    dst[k] = s0; dst[k+1] = s1;
                    s0 = dst[k+2] + src[k+2];
                    s1 = dst[k+3] + src[k+3];
                    dst[k+2] = s0; dst[k+3] = s1;
                }
#endif
                for( ; k < cn; k++ )
                    dst[k] += src[k];
                nzm++;
            }
    }
    return nzm;
}
||||
|
||||
|
||||
// Depth-specific entry points dispatched through getSumFunc(); each simply
// instantiates the generic sum_ kernel for one element type. 8/16-bit
// inputs accumulate into int, 32s/32f/64f inputs accumulate into double.
static int sum8u( const uchar* src, const uchar* mask, int* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }

static int sum8s( const schar* src, const uchar* mask, int* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }

static int sum16u( const ushort* src, const uchar* mask, int* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }

static int sum16s( const short* src, const uchar* mask, int* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }

static int sum32s( const int* src, const uchar* mask, double* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }

static int sum32f( const float* src, const uchar* mask, double* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }

static int sum64f( const double* src, const uchar* mask, double* dst, int len, int cn )
{ return sum_(src, mask, dst, len, cn); }
||||
|
||||
// Returns the low-level sum implementation for the given element depth.
// The table is indexed by the CV_8U..CV_64F depth codes, with a trailing
// 0 sentinel for the unsupported slot (callers must check for NULL).
SumFunc getSumFunc(int depth)
{
    static SumFunc sumTab[] =
    {
        (SumFunc)GET_OPTIMIZED(sum8u), (SumFunc)sum8s,
        (SumFunc)sum16u, (SumFunc)sum16s,
        (SumFunc)sum32s,
        (SumFunc)GET_OPTIMIZED(sum32f), (SumFunc)sum64f,
        0
    };

    return sumTab[depth];
}
||||
|
||||
#ifdef HAVE_OPENCL |
||||
|
||||
// OpenCL reduction shared by sum(), sum-of-absolute-values and
// sum-of-squares (sum_op selects the operation). When calc2 is true a
// second reduction over _src2 is computed in the same kernel launch and
// written back through res2. Returns false when the device/type
// combination is unsupported or the kernel fails, so the caller can fall
// back to the CPU implementation.
bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask,
              InputArray _src2, bool calc2, const Scalar & res2 )
{
    CV_Assert(sum_op == OCL_OP_SUM || sum_op == OCL_OP_SUM_ABS || sum_op == OCL_OP_SUM_SQR);

    const ocl::Device & dev = ocl::Device::getDefault();
    bool doubleSupport = dev.doubleFPConfig() > 0,
        haveMask = _mask.kind() != _InputArray::NONE,
        haveSrc2 = _src2.kind() != _InputArray::NONE;
    // kercn: elements processed per work item (widened only for the
    // unmasked single-channel case); mcn: vector width used in the kernel.
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
        kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(_src, _src2) : 1,
        mcn = std::max(cn, kercn);
    CV_Assert(!haveSrc2 || _src2.type() == type);
    int convert_cn = haveSrc2 ? mcn : cn;

    // Bail out when the device cannot express the computation.
    if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
        return false;

    // One partial result per compute unit (doubled for the second sum).
    int ngroups = dev.maxComputeUnits(), dbsize = ngroups * (calc2 ? 2 : 1);
    size_t wgs = dev.maxWorkGroupSize();

    // Accumulator depth is widened: at least CV_32S (CV_32F for squares).
    int ddepth = std::max(sum_op == OCL_OP_SUM_SQR ? CV_32F : CV_32S, depth),
        dtype = CV_MAKE_TYPE(ddepth, cn);
    CV_Assert(!haveMask || _mask.type() == CV_8UC1);

    // Largest power of two strictly below (or equal to half of) wgs.
    int wgs2_aligned = 1;
    while (wgs2_aligned < (int)wgs)
        wgs2_aligned <<= 1;
    wgs2_aligned >>= 1;

    // Build-options string drives all type/feature specialization of the
    // generic "reduce" kernel; every -D token must match reduce.cl.
    static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" };
    char cvt[2][40];
    String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D dstT1=%s -D ddepth=%d -D cn=%d"
                         " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d%s%s%s -D convertFromU=%s",
                         ocl::typeToStr(CV_MAKE_TYPE(depth, mcn)), ocl::typeToStr(depth),
                         ocl::typeToStr(dtype), ocl::typeToStr(CV_MAKE_TYPE(ddepth, mcn)),
                         ocl::typeToStr(ddepth), ddepth, cn,
                         ocl::convertTypeStr(depth, ddepth, mcn, cvt[0]),
                         opMap[sum_op], (int)wgs, wgs2_aligned,
                         doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                         haveMask ? " -D HAVE_MASK" : "",
                         _src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
                         haveMask && _mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn,
                         haveSrc2 ? " -D HAVE_SRC2" : "", calc2 ? " -D OP_CALC2" : "",
                         haveSrc2 && _src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "",
                         depth <= CV_32S && ddepth == CV_32S ? ocl::convertTypeStr(CV_8U, ddepth, convert_cn, cvt[1]) : "noconvert");

    ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts);
    if (k.empty())
        return false;

    // db holds the per-group partial sums produced by the kernel.
    UMat src = _src.getUMat(), src2 = _src2.getUMat(),
        db(1, dbsize, dtype), mask = _mask.getUMat();

    ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
            dbarg = ocl::KernelArg::PtrWriteOnly(db),
            maskarg = ocl::KernelArg::ReadOnlyNoSize(mask),
            src2arg = ocl::KernelArg::ReadOnlyNoSize(src2);

    // The kernel's argument list depends on which optional inputs exist.
    if (haveMask)
    {
        if (haveSrc2)
            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg, src2arg);
        else
            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, maskarg);
    }
    else
    {
        if (haveSrc2)
            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg, src2arg);
        else
            k.args(srcarg, src.cols, (int)src.total(), ngroups, dbarg);
    }

    size_t globalsize = ngroups * wgs;
    if (k.run(1, &globalsize, &wgs, false))
    {
        // Final reduction of the per-group partial results on the host,
        // selected by the widened accumulator depth.
        typedef Scalar (*part_sum)(Mat m);
        part_sum funcs[3] = { ocl_part_sum<int>, ocl_part_sum<float>, ocl_part_sum<double> },
                func = funcs[ddepth - CV_32S];

        Mat mres = db.getMat(ACCESS_READ);
        if (calc2)
            // NOTE(review): res2 is declared const& but is intentionally
            // written through const_cast here — confirm callers pass a
            // mutable Scalar when calc2 is set.
            const_cast<Scalar &>(res2) = func(mres.colRange(ngroups, dbsize));

        res = func(mres.colRange(0, ngroups));
        return true;
    }
    return false;
}
||||
|
||||
#endif |
||||
|
||||
#ifdef HAVE_IPP |
||||
// IPP-accelerated per-channel sum for 2D (or continuous, 2D-reshapable)
// matrices of the supported 8u/16u/16s/32f types with 1, 3 or 4 channels.
// Writes the channel sums into _res and returns true on success; returns
// false so the caller falls back to the generic implementation otherwise.
static bool ipp_sum(Mat &src, Scalar &_res)
{
    CV_INSTRUMENT_REGION_IPP()

#if IPP_VERSION_X100 >= 700
    int cn = src.channels();
    if (cn > 4)
        return false;
    size_t total_size = src.total();
    int rows = src.size[0], cols = rows ? (int)(total_size/rows) : 0;
    // IPP operates on a 2D ROI: accept true 2D matrices, or any continuous
    // array that can be viewed as a rows x cols rectangle.
    if( src.dims == 2 || (src.isContinuous() && cols > 0 && (size_t)rows*cols == total_size) )
    {
        IppiSize sz = { cols, rows };
        int type = src.type();
        typedef IppStatus (CV_STDCALL* ippiSumFuncHint)(const void*, int, IppiSize, double *, IppHintAlgorithm);
        typedef IppStatus (CV_STDCALL* ippiSumFuncNoHint)(const void*, int, IppiSize, double *);
        // 32f variants take an accuracy-hint parameter; the integer
        // variants do not, hence the two function-pointer flavors.
        ippiSumFuncHint ippiSumHint =
            type == CV_32FC1 ? (ippiSumFuncHint)ippiSum_32f_C1R :
            type == CV_32FC3 ? (ippiSumFuncHint)ippiSum_32f_C3R :
            type == CV_32FC4 ? (ippiSumFuncHint)ippiSum_32f_C4R :
            0;
        ippiSumFuncNoHint ippiSum =
            type == CV_8UC1 ? (ippiSumFuncNoHint)ippiSum_8u_C1R :
            type == CV_8UC3 ? (ippiSumFuncNoHint)ippiSum_8u_C3R :
            type == CV_8UC4 ? (ippiSumFuncNoHint)ippiSum_8u_C4R :
            type == CV_16UC1 ? (ippiSumFuncNoHint)ippiSum_16u_C1R :
            type == CV_16UC3 ? (ippiSumFuncNoHint)ippiSum_16u_C3R :
            type == CV_16UC4 ? (ippiSumFuncNoHint)ippiSum_16u_C4R :
            type == CV_16SC1 ? (ippiSumFuncNoHint)ippiSum_16s_C1R :
            type == CV_16SC3 ? (ippiSumFuncNoHint)ippiSum_16s_C3R :
            type == CV_16SC4 ? (ippiSumFuncNoHint)ippiSum_16s_C4R :
            0;
        // At most one of the two selectors may match any given type.
        CV_Assert(!ippiSumHint || !ippiSum);
        if( ippiSumHint || ippiSum )
        {
            Ipp64f res[4];
            IppStatus ret = ippiSumHint ?
                            CV_INSTRUMENT_FUN_IPP(ippiSumHint, src.ptr(), (int)src.step[0], sz, res, ippAlgHintAccurate) :
                            CV_INSTRUMENT_FUN_IPP(ippiSum, src.ptr(), (int)src.step[0], sz, res);
            if( ret >= 0 )
            {
                for( int i = 0; i < cn; i++ )
                    _res[i] = res[i];
                return true;
            }
        }
    }
#else
    CV_UNUSED(src); CV_UNUSED(_res);
#endif
    return false;
}
||||
#endif |
||||
|
||||
} // cv::
|
||||
|
||||
// Computes the per-channel sum of all elements of the array, trying the
// OpenCL and IPP accelerated paths first and falling back to the generic
// block-wise CPU implementation.
cv::Scalar cv::sum( InputArray _src )
{
    CV_INSTRUMENT_REGION()

#if defined HAVE_OPENCL || defined HAVE_IPP
    Scalar _res;
#endif

#ifdef HAVE_OPENCL
    CV_OCL_RUN_(OCL_PERFORMANCE_CHECK(_src.isUMat()) && _src.dims() <= 2,
                ocl_sum(_src, _res, OCL_OP_SUM),
                _res)
#endif

    Mat src = _src.getMat();
    CV_IPP_RUN(IPP_VERSION_X100 >= 700, ipp_sum(src, _res), _res);

    int k, cn = src.channels(), depth = src.depth();
    SumFunc func = getSumFunc(depth);
    CV_Assert( cn <= 4 && func != 0 );

    const Mat* arrays[] = {&src, 0};
    uchar* ptrs[1];
    NAryMatIterator it(arrays, ptrs);
    Scalar s;
    int total = (int)it.size, blockSize = total, intSumBlockSize = 0;
    int j, count = 0;
    AutoBuffer<int> _buf;
    // By default the int accumulator aliases the double Scalar's storage;
    // small integer depths get a dedicated int buffer below instead.
    int* buf = (int*)&s[0];
    size_t esz = 0;
    bool blockSum = depth < CV_32S;

    if( blockSum )
    {
        // Sub-32-bit depths are summed in int over bounded blocks so the
        // accumulator cannot overflow (2^23 * 255 and 2^15 * 65535 both
        // fit in a 32-bit int); results are flushed into the double
        // Scalar between blocks.
        intSumBlockSize = depth <= CV_8S ? (1 << 23) : (1 << 15);
        blockSize = std::min(blockSize, intSumBlockSize);
        _buf.allocate(cn);
        buf = _buf;

        for( k = 0; k < cn; k++ )
            buf[k] = 0;
        esz = src.elemSize();
    }

    for( size_t i = 0; i < it.nplanes; i++, ++it )
    {
        for( j = 0; j < total; j += blockSize )
        {
            int bsz = std::min(total - j, blockSize);
            func( ptrs[0], 0, (uchar*)buf, bsz, cn );
            count += bsz;
            // Flush the int accumulators into the double Scalar before the
            // next block could overflow them, and once more at the very end.
            if( blockSum && (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total)) )
            {
                for( k = 0; k < cn; k++ )
                {
                    s[k] += buf[k];
                    buf[k] = 0;
                }
                count = 0;
            }
            ptrs[0] += bsz*esz;
        }
    }
    return s;
}
Loading…
Reference in new issue