mirror of https://github.com/opencv/opencv.git
Open Source Computer Vision Library
https://opencv.org/
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
442 lines
15 KiB
442 lines
15 KiB
/* |
|
* By downloading, copying, installing or using the software you agree to this license. |
|
* If you do not agree to this license, do not download, install, |
|
* copy or use the software. |
|
* |
|
* |
|
* License Agreement |
|
* For Open Source Computer Vision Library |
|
* (3-clause BSD License) |
|
* |
|
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved. |
|
* Third party copyrights are property of their respective owners. |
|
* |
|
* Redistribution and use in source and binary forms, with or without modification, |
|
* are permitted provided that the following conditions are met: |
|
* |
|
* * Redistributions of source code must retain the above copyright notice, |
|
* this list of conditions and the following disclaimer. |
|
* |
|
* * Redistributions in binary form must reproduce the above copyright notice, |
|
* this list of conditions and the following disclaimer in the documentation |
|
* and/or other materials provided with the distribution. |
|
* |
|
* * Neither the names of the copyright holders nor the names of the contributors |
|
* may be used to endorse or promote products derived from this software |
|
* without specific prior written permission. |
|
* |
|
* This software is provided by the copyright holders and contributors "as is" and |
|
* any express or implied warranties, including, but not limited to, the implied |
|
* warranties of merchantability and fitness for a particular purpose are disclaimed. |
|
* In no event shall copyright holders or contributors be liable for any direct, |
|
* indirect, incidental, special, exemplary, or consequential damages |
|
* (including, but not limited to, procurement of substitute goods or services; |
|
* loss of use, data, or profits; or business interruption) however caused |
|
* and on any theory of liability, whether in contract, strict liability, |
|
* or tort (including negligence or otherwise) arising in any way out of |
|
* the use of this software, even if advised of the possibility of such damage. |
|
*/ |
|
|
|
#include "common.hpp" |
|
|
|
namespace CAROTENE_NS { |
|
|
|
#ifdef CAROTENE_NEON |
|
|
|
namespace { |
|
|
|
template <typename T> |
|
void process(const T * src, size_t j0, size_t j1, size_t i, |
|
T minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, |
|
T maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity) |
|
{ |
|
for (size_t j = j0; j < j1; ++j) |
|
{ |
|
T val = src[j]; |
|
|
|
if (val == maxVal) |
|
{ |
|
if (maxLocCount < maxLocCapacity) |
|
{ |
|
maxLocPtr[maxLocCount] = j; |
|
maxLocPtr[maxLocCount + 1] = i; |
|
} |
|
maxLocCount += 2; |
|
} |
|
|
|
if (val == minVal) |
|
{ |
|
if (minLocCount < minLocCapacity) |
|
{ |
|
minLocPtr[minLocCount] = j; |
|
minLocPtr[minLocCount + 1] = i; |
|
} |
|
minLocCount += 2; |
|
} |
|
} |
|
} |
|
|
|
} // namespace |
|
|
|
#endif |
|
|
|
void fillMinMaxLocs(const Size2D & size, |
|
const u8 * srcBase, ptrdiff_t srcStride, |
|
u8 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, |
|
u8 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity) |
|
{ |
|
internal::assertSupportedConfiguration(); |
|
#ifdef CAROTENE_NEON |
|
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; |
|
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; |
|
|
|
uint8x16_t v_maxval16 = vdupq_n_u8(maxVal), v_minval16 = vdupq_n_u8(minVal); |
|
uint8x8_t v_maxval8 = vdup_n_u8(maxVal), v_minval8 = vdup_n_u8(minVal); |
|
|
|
u64 mask[2] = { 0ul }; |
|
|
|
minLocCapacity <<= 1; |
|
maxLocCapacity <<= 1; |
|
|
|
for (size_t i = 0; i < size.height; ++i) |
|
{ |
|
const u8 * src = internal::getRowPtr(srcBase, srcStride, i); |
|
size_t j = 0; |
|
|
|
for ( ; j < roiw16; j += 16) |
|
{ |
|
internal::prefetch(src + j); |
|
uint8x16_t v_src = vld1q_u8(src + j); |
|
|
|
uint8x16_t v_maxmask = vceqq_u8(v_src, v_maxval16); |
|
uint8x16_t v_minmask = vceqq_u8(v_src, v_minval16); |
|
uint8x16_t v_mask = vorrq_u8(v_maxmask, v_minmask); |
|
|
|
vst1q_u8((u8 *)&mask[0], v_mask); |
|
|
|
if (mask[0]) |
|
process(src, j, j + 8, i, |
|
minVal, minLocPtr, minLocCount, minLocCapacity, |
|
maxVal, maxLocPtr, maxLocCount, maxLocCapacity); |
|
if (mask[1]) |
|
process(src, j + 8, j + 16, i, |
|
minVal, minLocPtr, minLocCount, minLocCapacity, |
|
maxVal, maxLocPtr, maxLocCount, maxLocCapacity); |
|
} |
|
for ( ; j < roiw8; j += 8) |
|
{ |
|
uint8x8_t v_src = vld1_u8(src + j); |
|
|
|
uint8x8_t v_maxmask = vceq_u8(v_src, v_maxval8); |
|
uint8x8_t v_minmask = vceq_u8(v_src, v_minval8); |
|
uint8x8_t v_mask = vorr_u8(v_maxmask, v_minmask); |
|
|
|
vst1_u8((u8 *)&mask[0], v_mask); |
|
|
|
if (mask[0]) |
|
process(src, j, j + 8, i, |
|
minVal, minLocPtr, minLocCount, minLocCapacity, |
|
maxVal, maxLocPtr, maxLocCount, maxLocCapacity); |
|
} |
|
|
|
process(src, j, size.width, i, |
|
minVal, minLocPtr, minLocCount, minLocCapacity, |
|
maxVal, maxLocPtr, maxLocCount, maxLocCapacity); |
|
} |
|
|
|
minLocCount >>= 1; |
|
maxLocCount >>= 1; |
|
#else |
|
(void)size; |
|
(void)srcBase; |
|
(void)srcStride; |
|
(void)minVal; |
|
(void)minLocPtr; |
|
(void)minLocCount; |
|
(void)minLocCapacity; |
|
(void)maxVal; |
|
(void)maxLocPtr; |
|
(void)maxLocCount; |
|
(void)maxLocCapacity; |
|
#endif |
|
} |
|
|
|
void fillMinMaxLocs(const Size2D & size, |
|
const u16 * srcBase, ptrdiff_t srcStride, |
|
u16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, |
|
u16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity) |
|
{ |
|
internal::assertSupportedConfiguration(); |
|
#ifdef CAROTENE_NEON |
|
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; |
|
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; |
|
|
|
uint16x8_t v_maxval8 = vdupq_n_u16(maxVal), |
|
v_minval8 = vdupq_n_u16(minVal); |
|
u64 mask[2] = { 0ul }; |
|
|
|
minLocCapacity <<= 1; |
|
maxLocCapacity <<= 1; |
|
|
|
for (size_t i = 0; i < size.height; ++i) |
|
{ |
|
const u16 * src = internal::getRowPtr(srcBase, srcStride, i); |
|
size_t j = 0; |
|
|
|
for ( ; j < roiw16; j += 16) |
|
{ |
|
internal::prefetch(src + j); |
|
uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8); |
|
|
|
uint16x8_t v_mask0 = vorrq_u16(vceqq_u16(v_src0, v_maxval8), vceqq_u16(v_src0, v_minval8)); |
|
uint16x8_t v_mask1 = vorrq_u16(vceqq_u16(v_src1, v_maxval8), vceqq_u16(v_src1, v_minval8)); |
|
|
|
vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1))); |
|
|
|
if (mask[0]) |
|
process(src, j, j + 8, i, |
|
minVal, minLocPtr, minLocCount, minLocCapacity, |
|
maxVal, maxLocPtr, maxLocCount, maxLocCapacity); |
|
if (mask[1]) |
|
process(src, j + 8, j + 16, i, |
|
minVal, minLocPtr, minLocCount, minLocCapacity, |
|
maxVal, maxLocPtr, maxLocCount, maxLocCapacity); |
|
} |
|
for ( ; j < roiw8; j += 8) |
|
{ |
|
internal::prefetch(src + j); |
|
uint16x8_t v_src = vld1q_u16(src + j); |
|
|
|
uint16x8_t v_maxmask = vceqq_u16(v_src, v_maxval8); |
|
uint16x8_t v_minmask = vceqq_u16(v_src, v_minval8); |
|
uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask); |
|
|
|
vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask)); |
|
|
|
if (mask[0]) |
|
process(src, j, j + 8, i, |
|
minVal, minLocPtr, minLocCount, minLocCapacity, |
|
maxVal, maxLocPtr, maxLocCount, maxLocCapacity); |
|
} |
|
|
|
process(src, j, size.width, i, |
|
minVal, minLocPtr, minLocCount, minLocCapacity, |
|
maxVal, maxLocPtr, maxLocCount, maxLocCapacity); |
|
} |
|
|
|
minLocCount >>= 1; |
|
maxLocCount >>= 1; |
|
#else |
|
(void)size; |
|
(void)srcBase; |
|
(void)srcStride; |
|
(void)minVal; |
|
(void)minLocPtr; |
|
(void)minLocCount; |
|
(void)minLocCapacity; |
|
(void)maxVal; |
|
(void)maxLocPtr; |
|
(void)maxLocCount; |
|
(void)maxLocCapacity; |
|
#endif |
|
} |
|
|
|
void fillMinMaxLocs(const Size2D & size, |
|
const s16 * srcBase, ptrdiff_t srcStride, |
|
s16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, |
|
s16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity) |
|
{ |
|
internal::assertSupportedConfiguration(); |
|
#ifdef CAROTENE_NEON |
|
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; |
|
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; |
|
|
|
int16x8_t v_maxval8 = vdupq_n_s16(maxVal), |
|
v_minval8 = vdupq_n_s16(minVal); |
|
u64 mask[2] = { 0ul }; |
|
|
|
minLocCapacity <<= 1; |
|
maxLocCapacity <<= 1; |
|
|
|
for (size_t i = 0; i < size.height; ++i) |
|
{ |
|
const s16 * src = internal::getRowPtr(srcBase, srcStride, i); |
|
size_t j = 0; |
|
|
|
for ( ; j < roiw16; j += 16) |
|
{ |
|
internal::prefetch(src + j); |
|
int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8); |
|
|
|
uint16x8_t v_mask0 = vorrq_u16(vceqq_s16(v_src0, v_maxval8), vceqq_s16(v_src0, v_minval8)); |
|
uint16x8_t v_mask1 = vorrq_u16(vceqq_s16(v_src1, v_maxval8), vceqq_s16(v_src1, v_minval8)); |
|
|
|
vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1))); |
|
|
|
if (mask[0]) |
|
process(src, j, j + 8, i, |
|
minVal, minLocPtr, minLocCount, minLocCapacity, |
|
maxVal, maxLocPtr, maxLocCount, maxLocCapacity); |
|
if (mask[1]) |
|
process(src, j + 8, j + 16, i, |
|
minVal, minLocPtr, minLocCount, minLocCapacity, |
|
maxVal, maxLocPtr, maxLocCount, maxLocCapacity); |
|
} |
|
for ( ; j < roiw8; j += 8) |
|
{ |
|
internal::prefetch(src + j); |
|
int16x8_t v_src = vld1q_s16(src + j); |
|
|
|
uint16x8_t v_maxmask = vceqq_s16(v_src, v_maxval8); |
|
uint16x8_t v_minmask = vceqq_s16(v_src, v_minval8); |
|
uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask); |
|
|
|
vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask)); |
|
|
|
if (mask[0]) |
|
process(src, j, j + 8, i, |
|
minVal, minLocPtr, minLocCount, minLocCapacity, |
|
maxVal, maxLocPtr, maxLocCount, maxLocCapacity); |
|
} |
|
|
|
process(src, j, size.width, i, |
|
minVal, minLocPtr, minLocCount, minLocCapacity, |
|
maxVal, maxLocPtr, maxLocCount, maxLocCapacity); |
|
} |
|
|
|
minLocCount >>= 1; |
|
maxLocCount >>= 1; |
|
#else |
|
(void)size; |
|
(void)srcBase; |
|
(void)srcStride; |
|
(void)minVal; |
|
(void)minLocPtr; |
|
(void)minLocCount; |
|
(void)minLocCapacity; |
|
(void)maxVal; |
|
(void)maxLocPtr; |
|
(void)maxLocCount; |
|
(void)maxLocCapacity; |
|
#endif |
|
} |
|
|
|
void fillMinMaxLocs(const Size2D & size, |
|
const s32 * srcBase, ptrdiff_t srcStride, |
|
s32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, |
|
s32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity) |
|
{ |
|
internal::assertSupportedConfiguration(); |
|
#ifdef CAROTENE_NEON |
|
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; |
|
|
|
int32x4_t v_maxval4 = vdupq_n_s32(maxVal), |
|
v_minval4 = vdupq_n_s32(minVal); |
|
u64 mask = 0ul; |
|
|
|
minLocCapacity <<= 1; |
|
maxLocCapacity <<= 1; |
|
|
|
for (size_t i = 0; i < size.height; ++i) |
|
{ |
|
const s32 * src = internal::getRowPtr(srcBase, srcStride, i); |
|
size_t j = 0; |
|
|
|
for ( ; j < roiw8; j += 8) |
|
{ |
|
internal::prefetch(src + j); |
|
int32x4_t v_src0 = vld1q_s32(src + j), v_src1 = vld1q_s32(src + j + 4); |
|
|
|
uint32x4_t v_mask0 = vorrq_u32(vceqq_s32(v_src0, v_maxval4), vceqq_s32(v_src0, v_minval4)); |
|
uint32x4_t v_mask1 = vorrq_u32(vceqq_s32(v_src1, v_maxval4), vceqq_s32(v_src1, v_minval4)); |
|
|
|
vst1_u8((u8 *)&mask, vmovn_u16(vcombine_u16(vmovn_u32(v_mask0), vmovn_u32(v_mask1)))); |
|
|
|
if (mask) |
|
process(src, j, j + 8, i, |
|
minVal, minLocPtr, minLocCount, minLocCapacity, |
|
maxVal, maxLocPtr, maxLocCount, maxLocCapacity); |
|
} |
|
|
|
process(src, j, size.width, i, |
|
minVal, minLocPtr, minLocCount, minLocCapacity, |
|
maxVal, maxLocPtr, maxLocCount, maxLocCapacity); |
|
} |
|
|
|
minLocCount >>= 1; |
|
maxLocCount >>= 1; |
|
#else |
|
(void)size; |
|
(void)srcBase; |
|
(void)srcStride; |
|
(void)minVal; |
|
(void)minLocPtr; |
|
(void)minLocCount; |
|
(void)minLocCapacity; |
|
(void)maxVal; |
|
(void)maxLocPtr; |
|
(void)maxLocCount; |
|
(void)maxLocCapacity; |
|
#endif |
|
} |
|
|
|
void fillMinMaxLocs(const Size2D & size, |
|
const u32 * srcBase, ptrdiff_t srcStride, |
|
u32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, |
|
u32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity) |
|
{ |
|
internal::assertSupportedConfiguration(); |
|
#ifdef CAROTENE_NEON |
|
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; |
|
|
|
uint32x4_t v_maxval4 = vdupq_n_u32(maxVal), |
|
v_minval4 = vdupq_n_u32(minVal); |
|
u64 mask = 0ul; |
|
|
|
minLocCapacity <<= 1; |
|
maxLocCapacity <<= 1; |
|
|
|
for (size_t i = 0; i < size.height; ++i) |
|
{ |
|
const u32 * src = internal::getRowPtr(srcBase, srcStride, i); |
|
size_t j = 0; |
|
|
|
for ( ; j < roiw8; j += 8) |
|
{ |
|
internal::prefetch(src + j); |
|
uint32x4_t v_src0 = vld1q_u32(src + j), v_src1 = vld1q_u32(src + j + 4); |
|
|
|
uint32x4_t v_mask0 = vorrq_u32(vceqq_u32(v_src0, v_maxval4), vceqq_u32(v_src0, v_minval4)); |
|
uint32x4_t v_mask1 = vorrq_u32(vceqq_u32(v_src1, v_maxval4), vceqq_u32(v_src1, v_minval4)); |
|
|
|
vst1_u8((u8 *)&mask, vmovn_u16(vcombine_u16(vmovn_u32(v_mask0), vmovn_u32(v_mask1)))); |
|
|
|
if (mask) |
|
process(src, j, j + 8, i, |
|
minVal, minLocPtr, minLocCount, minLocCapacity, |
|
maxVal, maxLocPtr, maxLocCount, maxLocCapacity); |
|
} |
|
|
|
process(src, j, size.width, i, |
|
minVal, minLocPtr, minLocCount, minLocCapacity, |
|
maxVal, maxLocPtr, maxLocCount, maxLocCapacity); |
|
} |
|
|
|
minLocCount >>= 1; |
|
maxLocCount >>= 1; |
|
#else |
|
(void)size; |
|
(void)srcBase; |
|
(void)srcStride; |
|
(void)minVal; |
|
(void)minLocPtr; |
|
(void)minLocCount; |
|
(void)minLocCapacity; |
|
(void)maxVal; |
|
(void)maxLocPtr; |
|
(void)maxLocCount; |
|
(void)maxLocCapacity; |
|
#endif |
|
} |
|
|
|
} // namespace CAROTENE_NS
|
|
|