mirror of https://github.com/opencv/opencv.git
Open Source Computer Vision Library
https://opencv.org/
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
460 lines
14 KiB
460 lines
14 KiB
/* |
|
* By downloading, copying, installing or using the software you agree to this license. |
|
* If you do not agree to this license, do not download, install, |
|
* copy or use the software. |
|
* |
|
* |
|
* License Agreement |
|
* For Open Source Computer Vision Library |
|
* (3-clause BSD License) |
|
* |
|
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. |
|
* Third party copyrights are property of their respective owners. |
|
* |
|
* Redistribution and use in source and binary forms, with or without modification, |
|
* are permitted provided that the following conditions are met: |
|
* |
|
* * Redistributions of source code must retain the above copyright notice, |
|
* this list of conditions and the following disclaimer. |
|
* |
|
* * Redistributions in binary form must reproduce the above copyright notice, |
|
* this list of conditions and the following disclaimer in the documentation |
|
* and/or other materials provided with the distribution. |
|
* |
|
* * Neither the names of the copyright holders nor the names of the contributors |
|
* may be used to endorse or promote products derived from this software |
|
* without specific prior written permission. |
|
* |
|
* This software is provided by the copyright holders and contributors "as is" and |
|
* any express or implied warranties, including, but not limited to, the implied |
|
* warranties of merchantability and fitness for a particular purpose are disclaimed. |
|
* In no event shall copyright holders or contributors be liable for any direct, |
|
* indirect, incidental, special, exemplary, or consequential damages |
|
* (including, but not limited to, procurement of substitute goods or services; |
|
* loss of use, data, or profits; or business interruption) however caused |
|
* and on any theory of liability, whether in contract, strict liability, |
|
* or tort (including negligence or otherwise) arising in any way out of |
|
* the use of this software, even if advised of the possibility of such damage. |
|
*/ |
|
|
|
#include "common.hpp" |
|
|
|
#include <cstring> |
|
|
|
namespace CAROTENE_NS { |
|
|
|
void reduceColSum(const Size2D &size, |
|
const u8 * srcBase, ptrdiff_t srcStride, |
|
s32 * dstBase) |
|
{ |
|
internal::assertSupportedConfiguration(); |
|
#ifdef CAROTENE_NEON |
|
memset(dstBase, 0, size.width*sizeof(s32)); |
|
size_t i = 0; |
|
for (; i + 16 <= size.width; i += 16) |
|
{ |
|
const u8* src_address = srcBase + i; |
|
|
|
int32x4_t sll = vmovq_n_s32(0); |
|
int32x4_t slh = vmovq_n_s32(0); |
|
int32x4_t shl = vmovq_n_s32(0); |
|
int32x4_t shh = vmovq_n_s32(0); |
|
|
|
for (size_t h = 0; h < size.height; h += 256) |
|
{ |
|
size_t lim = std::min(h + 256, size.height); |
|
|
|
uint16x8_t sl = vmovq_n_u16(0); |
|
uint16x8_t sh = vmovq_n_u16(0); |
|
|
|
for (size_t k = h; k < lim; ++k, src_address += srcStride) |
|
{ |
|
internal::prefetch(src_address + srcStride, 0); |
|
|
|
uint8x16_t v = vld1q_u8(src_address); |
|
|
|
sl = vaddw_u8(sl, vget_low_u8(v)); |
|
sh = vaddw_u8(sh, vget_high_u8(v)); |
|
} |
|
|
|
int32x4_t vsll = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sl))); |
|
int32x4_t vslh = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sl))); |
|
int32x4_t vshl = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sh))); |
|
int32x4_t vshh = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sh))); |
|
|
|
sll = vqaddq_s32(sll, vsll); |
|
slh = vqaddq_s32(slh, vslh); |
|
shl = vqaddq_s32(shl, vshl); |
|
shh = vqaddq_s32(shh, vshh); |
|
} |
|
|
|
vst1q_s32(dstBase + i + 0, sll); |
|
vst1q_s32(dstBase + i + 4, slh); |
|
vst1q_s32(dstBase + i + 8, shl); |
|
vst1q_s32(dstBase + i + 12, shh); |
|
} |
|
|
|
for(size_t h = 0; h < size.height; ++h) |
|
{ |
|
for(size_t j = i ; j < size.width; j++ ) |
|
{ |
|
if (((u32)(dstBase[j] += srcBase[j + srcStride * h])) > 0x7fFFffFFu) |
|
dstBase[j] = 0x7fFFffFF; |
|
} |
|
} |
|
#else |
|
(void)size; |
|
(void)srcBase; |
|
(void)srcStride; |
|
(void)dstBase; |
|
#endif |
|
} |
|
|
|
void reduceColMax(const Size2D &size, |
|
const u8 * srcBase, ptrdiff_t srcStride, |
|
u8 * dstBase) |
|
{ |
|
internal::assertSupportedConfiguration(); |
|
#ifdef CAROTENE_NEON |
|
memcpy(dstBase, srcBase, size.width); |
|
size_t i = 0; |
|
for (; i + 16*4 <= size.width; i += 16*4) |
|
{ |
|
const u8* src_address = srcBase + i; |
|
|
|
uint8x16_t s1 = vld1q_u8(src_address + 0); |
|
uint8x16_t s2 = vld1q_u8(src_address + 16); |
|
uint8x16_t s3 = vld1q_u8(src_address + 32); |
|
uint8x16_t s4 = vld1q_u8(src_address + 48); |
|
|
|
src_address += srcStride; |
|
|
|
for(size_t h = 1; h < size.height; ++h, src_address += srcStride) |
|
{ |
|
internal::prefetch(src_address + srcStride, 0); |
|
internal::prefetch(src_address + srcStride, 32); |
|
|
|
uint8x16_t v1 = vld1q_u8(src_address + 0); |
|
uint8x16_t v2 = vld1q_u8(src_address + 16); |
|
uint8x16_t v3 = vld1q_u8(src_address + 32); |
|
uint8x16_t v4 = vld1q_u8(src_address + 48); |
|
|
|
s1 = vmaxq_u8(s1, v1); |
|
s2 = vmaxq_u8(s2, v2); |
|
s3 = vmaxq_u8(s3, v3); |
|
s4 = vmaxq_u8(s4, v4); |
|
} |
|
|
|
vst1q_u8(dstBase + i + 0, s1); |
|
vst1q_u8(dstBase + i + 16, s2); |
|
vst1q_u8(dstBase + i + 32, s3); |
|
vst1q_u8(dstBase + i + 48, s4); |
|
} |
|
|
|
for (; i + 16 <= size.width; i += 16) |
|
{ |
|
const u8* src_address = srcBase + i; |
|
uint8x16_t s1 = vld1q_u8(src_address); |
|
src_address += srcStride; |
|
for(size_t h = 1; h < size.height; ++h, src_address += srcStride) |
|
{ |
|
internal::prefetch(src_address + srcStride, 0); |
|
|
|
uint8x16_t v1 = vld1q_u8(src_address); |
|
s1 = vmaxq_u8(s1, v1); |
|
} |
|
vst1q_u8(dstBase + i, s1); |
|
} |
|
|
|
if (i < size.width) |
|
for(size_t h = 1; h < size.height; ++h) |
|
for(size_t j = i ; j < size.width; j++ ) |
|
dstBase[j] = std::max(dstBase[j], srcBase[j + srcStride * h]); |
|
#else |
|
(void)size; |
|
(void)srcBase; |
|
(void)srcStride; |
|
(void)dstBase; |
|
#endif |
|
} |
|
|
|
void reduceColMin(const Size2D &size, |
|
const u8 * srcBase, ptrdiff_t srcStride, |
|
u8 * dstBase) |
|
{ |
|
internal::assertSupportedConfiguration(); |
|
#ifdef CAROTENE_NEON |
|
memcpy(dstBase, srcBase, size.width); |
|
size_t i = 0; |
|
for (; i + 16*4 <= size.width; i += 16*4) |
|
{ |
|
const u8* src_address = srcBase + i; |
|
|
|
uint8x16_t s1 = vld1q_u8(src_address + 0); |
|
uint8x16_t s2 = vld1q_u8(src_address + 16); |
|
uint8x16_t s3 = vld1q_u8(src_address + 32); |
|
uint8x16_t s4 = vld1q_u8(src_address + 48); |
|
|
|
src_address += srcStride; |
|
|
|
for(size_t h = 1; h < size.height; ++h, src_address += srcStride) |
|
{ |
|
internal::prefetch(src_address + srcStride, 0); |
|
internal::prefetch(src_address + srcStride, 32); |
|
|
|
uint8x16_t v1 = vld1q_u8(src_address + 0); |
|
uint8x16_t v2 = vld1q_u8(src_address + 16); |
|
uint8x16_t v3 = vld1q_u8(src_address + 32); |
|
uint8x16_t v4 = vld1q_u8(src_address + 48); |
|
|
|
s1 = vminq_u8(s1, v1); |
|
s2 = vminq_u8(s2, v2); |
|
s3 = vminq_u8(s3, v3); |
|
s4 = vminq_u8(s4, v4); |
|
} |
|
|
|
vst1q_u8(dstBase + i + 0, s1); |
|
vst1q_u8(dstBase + i + 16, s2); |
|
vst1q_u8(dstBase + i + 32, s3); |
|
vst1q_u8(dstBase + i + 48, s4); |
|
} |
|
|
|
for (; i + 16 <= size.width; i += 16) |
|
{ |
|
const u8* src_address = srcBase + i; |
|
uint8x16_t s1 = vld1q_u8(src_address); |
|
src_address += srcStride; |
|
for(size_t h = 1; h < size.height; ++h, src_address += srcStride) |
|
{ |
|
internal::prefetch(src_address + srcStride, 0); |
|
|
|
uint8x16_t v1 = vld1q_u8(src_address); |
|
s1 = vminq_u8(s1, v1); |
|
} |
|
vst1q_u8(dstBase + i, s1); |
|
} |
|
|
|
if (i < size.width) |
|
for(size_t h = 1; h < size.height; ++h) |
|
for(size_t j = i ; j < size.width; j++ ) |
|
dstBase[j] = std::min(dstBase[j], srcBase[j + srcStride * h]); |
|
#else |
|
(void)size; |
|
(void)srcBase; |
|
(void)srcStride; |
|
(void)dstBase; |
|
#endif |
|
} |
|
|
|
void reduceColSum(const Size2D &size, |
|
const f32 * srcBase, ptrdiff_t srcStride, |
|
f32 * dstBase) |
|
{ |
|
internal::assertSupportedConfiguration(); |
|
#ifdef CAROTENE_NEON |
|
memcpy(dstBase, srcBase, size.width*sizeof(f32)); |
|
size_t srcstep = srcStride/sizeof(f32); |
|
size_t i = 0; |
|
for (; i + 16 <= size.width; i += 16) |
|
{ |
|
const f32* src_address = srcBase + i; |
|
|
|
float32x4_t s1 = vld1q_f32(src_address + 0); |
|
float32x4_t s2 = vld1q_f32(src_address + 4); |
|
float32x4_t s3 = vld1q_f32(src_address + 8); |
|
float32x4_t s4 = vld1q_f32(src_address + 12); |
|
|
|
src_address += srcstep; |
|
|
|
for(size_t h = 1; h < size.height; ++h, src_address += srcstep) |
|
{ |
|
internal::prefetch(src_address + srcstep, 0); |
|
internal::prefetch(src_address + srcstep, 32); |
|
|
|
float32x4_t v1 = vld1q_f32(src_address + 0); |
|
float32x4_t v2 = vld1q_f32(src_address + 4); |
|
float32x4_t v3 = vld1q_f32(src_address + 8); |
|
float32x4_t v4 = vld1q_f32(src_address + 12); |
|
|
|
s1 = vaddq_f32(s1, v1); |
|
s2 = vaddq_f32(s2, v2); |
|
s3 = vaddq_f32(s3, v3); |
|
s4 = vaddq_f32(s4, v4); |
|
} |
|
|
|
vst1q_f32(dstBase + i + 0, s1); |
|
vst1q_f32(dstBase + i + 4, s2); |
|
vst1q_f32(dstBase + i + 8, s3); |
|
vst1q_f32(dstBase + i + 12, s4); |
|
} |
|
|
|
for (; i + 4 <= size.width; i += 4) |
|
{ |
|
const f32* src_address = srcBase + i; |
|
float32x4_t s1 = vld1q_f32(src_address); |
|
src_address += srcstep; |
|
for(size_t h = 1; h < size.height; ++h, src_address += srcstep) |
|
{ |
|
internal::prefetch(src_address + srcstep, 0); |
|
|
|
float32x4_t v1 = vld1q_f32(src_address); |
|
s1 = vaddq_f32(s1, v1); |
|
} |
|
vst1q_f32(dstBase + i, s1); |
|
} |
|
|
|
if (i < size.width) |
|
for(size_t h = 1; h < size.height; ++h) |
|
{ |
|
for(size_t j = i ; j < size.width; j++ ) |
|
{ |
|
dstBase[j] += srcBase[j + srcstep * h]; |
|
} |
|
} |
|
#else |
|
(void)size; |
|
(void)srcBase; |
|
(void)srcStride; |
|
(void)dstBase; |
|
#endif |
|
} |
|
|
|
void reduceColMax(const Size2D &size, |
|
const f32 * srcBase, ptrdiff_t srcStride, |
|
f32 * dstBase) |
|
{ |
|
internal::assertSupportedConfiguration(); |
|
#ifdef CAROTENE_NEON |
|
memcpy(dstBase, srcBase, size.width*sizeof(f32)); |
|
size_t srcstep = srcStride/sizeof(f32); |
|
size_t i = 0; |
|
for (; i + 16 <= size.width; i += 16) |
|
{ |
|
const f32* src_address = srcBase + i; |
|
|
|
float32x4_t s1 = vld1q_f32(src_address + 0); |
|
float32x4_t s2 = vld1q_f32(src_address + 4); |
|
float32x4_t s3 = vld1q_f32(src_address + 8); |
|
float32x4_t s4 = vld1q_f32(src_address + 12); |
|
|
|
src_address += srcstep; |
|
|
|
for(size_t h = 1; h < size.height; ++h, src_address += srcstep) |
|
{ |
|
internal::prefetch(src_address + srcstep, 0); |
|
internal::prefetch(src_address + srcstep, 32); |
|
|
|
float32x4_t v1 = vld1q_f32(src_address + 0); |
|
float32x4_t v2 = vld1q_f32(src_address + 4); |
|
float32x4_t v3 = vld1q_f32(src_address + 8); |
|
float32x4_t v4 = vld1q_f32(src_address + 12); |
|
|
|
s1 = vmaxq_f32(s1, v1); |
|
s2 = vmaxq_f32(s2, v2); |
|
s3 = vmaxq_f32(s3, v3); |
|
s4 = vmaxq_f32(s4, v4); |
|
} |
|
|
|
vst1q_f32(dstBase + i + 0, s1); |
|
vst1q_f32(dstBase + i + 4, s2); |
|
vst1q_f32(dstBase + i + 8, s3); |
|
vst1q_f32(dstBase + i + 12, s4); |
|
} |
|
|
|
for (; i + 4 <= size.width; i += 4) |
|
{ |
|
const f32* src_address = srcBase + i; |
|
float32x4_t s1 = vld1q_f32(src_address); |
|
src_address += srcstep; |
|
for(size_t h = 1; h < size.height; ++h, src_address += srcstep) |
|
{ |
|
internal::prefetch(src_address + srcstep, 0); |
|
|
|
float32x4_t v1 = vld1q_f32(src_address); |
|
s1 = vmaxq_f32(s1, v1); |
|
} |
|
vst1q_f32(dstBase + i, s1); |
|
} |
|
|
|
if (i < size.width) |
|
for(size_t h = 1; h < size.height; ++h) |
|
for(size_t j = i ; j < size.width; j++ ) |
|
dstBase[j] = std::max(dstBase[j], srcBase[j + srcstep * h]); |
|
#else |
|
(void)size; |
|
(void)srcBase; |
|
(void)srcStride; |
|
(void)dstBase; |
|
#endif |
|
} |
|
|
|
void reduceColMin(const Size2D &size, |
|
const f32 * srcBase, ptrdiff_t srcStride, |
|
f32 * dstBase) |
|
{ |
|
internal::assertSupportedConfiguration(); |
|
#ifdef CAROTENE_NEON |
|
memcpy(dstBase, srcBase, size.width*sizeof(f32)); |
|
size_t srcstep = srcStride/sizeof(f32); |
|
size_t i = 0; |
|
for (; i + 16 <= size.width; i += 16) |
|
{ |
|
const f32* src_address = srcBase + i; |
|
|
|
float32x4_t s1 = vld1q_f32(src_address + 0); |
|
float32x4_t s2 = vld1q_f32(src_address + 4); |
|
float32x4_t s3 = vld1q_f32(src_address + 8); |
|
float32x4_t s4 = vld1q_f32(src_address + 12); |
|
|
|
src_address += srcstep; |
|
|
|
for(size_t h = 1; h < size.height; ++h, src_address += srcstep) |
|
{ |
|
internal::prefetch(src_address + srcstep, 0); |
|
internal::prefetch(src_address + srcstep, 32); |
|
|
|
float32x4_t v1 = vld1q_f32(src_address + 0); |
|
float32x4_t v2 = vld1q_f32(src_address + 4); |
|
float32x4_t v3 = vld1q_f32(src_address + 8); |
|
float32x4_t v4 = vld1q_f32(src_address + 12); |
|
|
|
s1 = vminq_f32(s1, v1); |
|
s2 = vminq_f32(s2, v2); |
|
s3 = vminq_f32(s3, v3); |
|
s4 = vminq_f32(s4, v4); |
|
} |
|
|
|
vst1q_f32(dstBase + i + 0, s1); |
|
vst1q_f32(dstBase + i + 4, s2); |
|
vst1q_f32(dstBase + i + 8, s3); |
|
vst1q_f32(dstBase + i + 12, s4); |
|
} |
|
|
|
for (; i + 4 <= size.width; i += 4) |
|
{ |
|
const f32* src_address = srcBase + i; |
|
float32x4_t s1 = vld1q_f32(src_address); |
|
src_address += srcstep; |
|
for(size_t h = 1; h < size.height; ++h, src_address += srcstep) |
|
{ |
|
internal::prefetch(src_address + srcstep, 0); |
|
|
|
float32x4_t v1 = vld1q_f32(src_address); |
|
s1 = vminq_f32(s1, v1); |
|
} |
|
vst1q_f32(dstBase + i, s1); |
|
} |
|
|
|
if (i < size.width) |
|
for(size_t h = 1; h < size.height; ++h) |
|
for(size_t j = i ; j < size.width; j++ ) |
|
dstBase[j] = std::min(dstBase[j], srcBase[j + srcstep * h]); |
|
#else |
|
(void)size; |
|
(void)srcBase; |
|
(void)srcStride; |
|
(void)dstBase; |
|
#endif |
|
} |
|
|
|
} // namespace CAROTENE_NS
|
|
|