@@ -50,6 +50,99 @@ static IppStatus sts = ippInit();
 namespace cv
 {
 
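+// Generic no-op implementation: returning false makes integral_() fall back
+// to the scalar code path below.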
+template <typename T, typename ST, typename QT>
+struct Integral_SIMD
+{
+    bool operator()(const T *, size_t,
+                    ST *, size_t,
+                    QT *, size_t,
+                    ST *, size_t,
+                    Size, int) const
+    {
+        return false;
+    }
+};
+
+#if CV_SSE2
+
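+// SSE2 specialization: 8-bit input, 32-bit sums, a single channel, and no
+// squared or tilted sums requested.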
+template <>
+struct Integral_SIMD<uchar, int, double>
+{
+    Integral_SIMD()
+    {
+        haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
+    }
+
+    bool operator()(const uchar * src, size_t _srcstep,
+                    int * sum, size_t _sumstep,
+                    double * sqsum, size_t,
+                    int * tilted, size_t,
+                    Size size, int cn) const
+    {
+        if (sqsum || tilted || cn != 1 || !haveSSE2)
+            return false;
+
+        // the first row of sums is all zeros
+        memset(sum, 0, (size.width + 1) * sizeof(int));
+
+        __m128i v_zero = _mm_setzero_si128(), prev = v_zero;
+        int j = 0;
+
+        // the remaining rows
+        for (int i = 0; i < size.height; ++i)
+        {
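+            // The sum image has one extra zero row and column: source row i
+            // maps to sum row i + 1, and the + 1 below skips the zero column.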
+            const uchar * src_row = src + _srcstep * i;
+            int * prev_sum_row = (int *)((uchar *)sum + _sumstep * i) + 1;
+            int * sum_row = (int *)((uchar *)sum + _sumstep * (i + 1)) + 1;
+
+            sum_row[-1] = 0;
+
+            prev = v_zero;
+            j = 0;
+
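+            // Vector loop: 8 pixels per step. 'prev' holds the running total
+            // of this row's pixels processed so far, replicated in all four
+            // 32-bit lanes.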
+            for ( ; j + 7 < size.width; j += 8)
+            {
+                __m128i vsuml = _mm_loadu_si128((const __m128i *)(prev_sum_row + j));
+                __m128i vsumh = _mm_loadu_si128((const __m128i *)(prev_sum_row + j + 4));
+
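+                // Byte-shifted copies of the 8 source pixels; summing them
+                // aligns each pixel with its 1, 2 and 3 predecessors.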
+                __m128i el8shr0 = _mm_loadl_epi64((const __m128i *)(src_row + j));
+                __m128i el8shr1 = _mm_slli_si128(el8shr0, 1);
+                __m128i el8shr2 = _mm_slli_si128(el8shr0, 2);
+                __m128i el8shr3 = _mm_slli_si128(el8shr0, 3);
+
+                vsuml = _mm_add_epi32(vsuml, prev);
+                vsumh = _mm_add_epi32(vsumh, prev);
+
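+                // Zero-extend to 16 bits and add the four shifted copies:
+                // el8[i] = x[i] + x[i-1] + x[i-2] + x[i-3] (missing terms are
+                // zero), i.e. a 4-element sliding sum.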
+                __m128i el8shr12 = _mm_add_epi16(_mm_unpacklo_epi8(el8shr1, v_zero),
+                                                 _mm_unpacklo_epi8(el8shr2, v_zero));
+                __m128i el8shr03 = _mm_add_epi16(_mm_unpacklo_epi8(el8shr0, v_zero),
+                                                 _mm_unpacklo_epi8(el8shr3, v_zero));
+                __m128i el8 = _mm_add_epi16(el8shr12, el8shr03);
+
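+                // el4h[i] = el8[i] + el8[i + 4] completes the prefix sums for
+                // pixels 4..7; values stay below 8 * 255, so the 16-bit adds
+                // cannot overflow.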
+                __m128i el4h = _mm_add_epi16(_mm_unpackhi_epi16(el8, v_zero),
+                                             _mm_unpacklo_epi16(el8, v_zero));
+
+                vsuml = _mm_add_epi32(vsuml, _mm_unpacklo_epi16(el8, v_zero));
+                vsumh = _mm_add_epi32(vsumh, el4h);
+
+                _mm_storeu_si128((__m128i *)(sum_row + j), vsuml);
+                _mm_storeu_si128((__m128i *)(sum_row + j + 4), vsumh);
+
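+                // Lane 3 of el4h holds the sum of all 8 pixels; broadcast it
+                // and fold it into the running row total.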
+                prev = _mm_add_epi32(prev, _mm_shuffle_epi32(el4h, _MM_SHUFFLE(3, 3, 3, 3)));
+            }
+
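+            // Scalar tail for the remaining width % 8 pixels; v is seeded
+            // with the running row sum recovered from the sums already
+            // written at column j - 1.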
+            for (int v = sum_row[j - 1] - prev_sum_row[j - 1]; j < size.width; ++j)
+                sum_row[j] = (v += src_row[j]) + prev_sum_row[j];
+        }
+
+        return true;
+    }
+
+    bool haveSSE2;
+};
+
+#endif
+
 template<typename T, typename ST, typename QT>
 void integral_( const T* src, size_t _srcstep, ST* sum, size_t _sumstep,
                 QT* sqsum, size_t _sqsumstep, ST* tilted, size_t _tiltedstep,
@@ -57,6 +150,13 @@ void integral_( const T* src, size_t _srcstep, ST* sum, size_t _sumstep,
 {
     int x, y, k;
 
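+    // Try the SIMD fast path first; it reports false when it cannot handle
+    // the request.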
+    if (Integral_SIMD<T, ST, QT>()(src, _srcstep,
+                                   sum, _sumstep,
+                                   sqsum, _sqsumstep,
+                                   tilted, _tiltedstep,
+                                   size, cn))
+        return;
+
     int srcstep = (int)(_srcstep/sizeof(T));
     int sumstep = (int)(_sumstep/sizeof(ST));
     int tiltedstep = (int)(_tiltedstep/sizeof(ST));