mirror of https://github.com/opencv/opencv.git
Open Source Computer Vision Library
https://opencv.org/
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
399 lines
13 KiB
399 lines
13 KiB
/* |
|
* By downloading, copying, installing or using the software you agree to this license. |
|
* If you do not agree to this license, do not download, install, |
|
* copy or use the software. |
|
* |
|
* |
|
* License Agreement |
|
* For Open Source Computer Vision Library |
|
* (3-clause BSD License) |
|
* |
|
* Copyright (C) 2014, NVIDIA Corporation, all rights reserved. |
|
* Third party copyrights are property of their respective owners. |
|
* |
|
* Redistribution and use in source and binary forms, with or without modification, |
|
* are permitted provided that the following conditions are met: |
|
* |
|
* * Redistributions of source code must retain the above copyright notice, |
|
* this list of conditions and the following disclaimer. |
|
* |
|
* * Redistributions in binary form must reproduce the above copyright notice, |
|
* this list of conditions and the following disclaimer in the documentation |
|
* and/or other materials provided with the distribution. |
|
* |
|
* * Neither the names of the copyright holders nor the names of the contributors |
|
* may be used to endorse or promote products derived from this software |
|
* without specific prior written permission. |
|
* |
|
* This software is provided by the copyright holders and contributors "as is" and |
|
* any express or implied warranties, including, but not limited to, the implied |
|
* warranties of merchantability and fitness for a particular purpose are disclaimed. |
|
* In no event shall copyright holders or contributors be liable for any direct, |
|
* indirect, incidental, special, exemplary, or consequential damages |
|
* (including, but not limited to, procurement of substitute goods or services; |
|
* loss of use, data, or profits; or business interruption) however caused |
|
* and on any theory of liability, whether in contract, strict liability, |
|
* or tort (including negligence or otherwise) arising in any way out of |
|
* the use of this software, even if advised of the possibility of such damage. |
|
*/ |
|
|
|
#include "common.hpp" |
|
|
|
#include <cstring> |
|
|
|
namespace CAROTENE_NS { |
|
|
|
#ifdef CAROTENE_NEON |
|
|
|
namespace { |
|
|
|
template <int shift> |
|
void lshiftConst(const Size2D &size, |
|
const u8 * srcBase, ptrdiff_t srcStride, |
|
s16 * dstBase, ptrdiff_t dstStride) |
|
{ |
|
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; |
|
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; |
|
|
|
for (size_t i = 0; i < size.height; ++i) |
|
{ |
|
const u8 * src = internal::getRowPtr(srcBase, srcStride, i); |
|
s16 * dst = internal::getRowPtr(dstBase, dstStride, i); |
|
size_t j = 0; |
|
|
|
for (; j < roiw16; j += 16) |
|
{ |
|
internal::prefetch(src + j); |
|
uint8x16_t v_src = vld1q_u8(src + j); |
|
int16x8_t v_dst0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src))); |
|
int16x8_t v_dst1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src))); |
|
|
|
vst1q_s16(dst + j, vshlq_n_s16(v_dst0, shift)); |
|
vst1q_s16(dst + j + 8, vshlq_n_s16(v_dst1, shift)); |
|
} |
|
for (; j < roiw8; j += 8) |
|
{ |
|
int16x8_t v_dst = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j))); |
|
vst1q_s16(dst + j, vshlq_n_s16(v_dst, shift)); |
|
} |
|
|
|
for (; j < size.width; j++) |
|
{ |
|
dst[j] = ((s16)src[j] << shift); |
|
} |
|
} |
|
} |
|
|
|
template <> |
|
void lshiftConst<0>(const Size2D &size, |
|
const u8 * srcBase, ptrdiff_t srcStride, |
|
s16 * dstBase, ptrdiff_t dstStride) |
|
{ |
|
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; |
|
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; |
|
|
|
for (size_t i = 0; i < size.height; ++i) |
|
{ |
|
const u8 * src = internal::getRowPtr(srcBase, srcStride, i); |
|
s16 * dst = internal::getRowPtr(dstBase, dstStride, i); |
|
size_t j = 0; |
|
|
|
for (; j < roiw16; j += 16) |
|
{ |
|
internal::prefetch(src + j); |
|
uint8x16_t v_src = vld1q_u8(src + j); |
|
int16x8_t v_dst0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src))); |
|
int16x8_t v_dst1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src))); |
|
|
|
vst1q_s16(dst + j, v_dst0); |
|
vst1q_s16(dst + j + 8, v_dst1); |
|
} |
|
for (; j < roiw8; j += 8) |
|
{ |
|
int16x8_t v_dst = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j))); |
|
vst1q_s16(dst + j, v_dst); |
|
} |
|
|
|
for (; j < size.width; j++) |
|
{ |
|
dst[j] = (s16)src[j]; |
|
} |
|
} |
|
} |
|
|
|
template <int shift> |
|
void rshiftConst(const Size2D &size, |
|
const s16 * srcBase, ptrdiff_t srcStride, |
|
u8 * dstBase, ptrdiff_t dstStride, |
|
CONVERT_POLICY cpolicy) |
|
{ |
|
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; |
|
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; |
|
|
|
for (size_t i = 0; i < size.height; ++i) |
|
{ |
|
const s16 * src = internal::getRowPtr(srcBase, srcStride, i); |
|
u8 * dst = internal::getRowPtr(dstBase, dstStride, i); |
|
size_t j = 0; |
|
|
|
if (cpolicy == CONVERT_POLICY_SATURATE) |
|
{ |
|
for (; j < roiw16; j += 16) |
|
{ |
|
internal::prefetch(src + j); |
|
int16x8_t v_src0 = vshrq_n_s16(vld1q_s16(src + j), shift), |
|
v_src1 = vshrq_n_s16(vld1q_s16(src + j + 8), shift); |
|
uint8x16_t v_dst = vcombine_u8(vqmovun_s16(v_src0), |
|
vqmovun_s16(v_src1)); |
|
vst1q_u8(dst + j, v_dst); |
|
} |
|
for (; j < roiw8; j += 8) |
|
{ |
|
int16x8_t v_src = vshrq_n_s16(vld1q_s16(src + j), shift); |
|
vst1_u8(dst + j, vqmovun_s16(v_src)); |
|
} |
|
|
|
for (; j < size.width; j++) |
|
{ |
|
dst[j] = internal::saturate_cast<u8>((src[j] >> shift)); |
|
} |
|
} |
|
else // CONVERT_POLICY_WRAP |
|
{ |
|
for (; j < roiw16; j += 16) |
|
{ |
|
internal::prefetch(src + j); |
|
int16x8_t v_src0 = vshrq_n_s16(vld1q_s16(src + j), shift), |
|
v_src1 = vshrq_n_s16(vld1q_s16(src + j + 8), shift); |
|
int8x16_t v_dst = vcombine_s8(vmovn_s16(v_src0), |
|
vmovn_s16(v_src1)); |
|
vst1q_u8(dst + j, vreinterpretq_u8_s8(v_dst)); |
|
} |
|
for (; j < roiw8; j += 8) |
|
{ |
|
int16x8_t v_src = vshrq_n_s16(vld1q_s16(src + j), shift); |
|
vst1_u8(dst + j, vreinterpret_u8_s8(vmovn_s16(v_src))); |
|
} |
|
|
|
for (; j < size.width; j++) |
|
{ |
|
dst[j] = (u8)((src[j] >> shift)); |
|
} |
|
} |
|
} |
|
} |
|
|
|
template <> |
|
void rshiftConst<0>(const Size2D &size, |
|
const s16 * srcBase, ptrdiff_t srcStride, |
|
u8 * dstBase, ptrdiff_t dstStride, |
|
CONVERT_POLICY cpolicy) |
|
{ |
|
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; |
|
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; |
|
|
|
for (size_t i = 0; i < size.height; ++i) |
|
{ |
|
const s16 * src = internal::getRowPtr(srcBase, srcStride, i); |
|
u8 * dst = internal::getRowPtr(dstBase, dstStride, i); |
|
size_t j = 0; |
|
|
|
if (cpolicy == CONVERT_POLICY_SATURATE) |
|
{ |
|
for (; j < roiw16; j += 16) |
|
{ |
|
internal::prefetch(src + j); |
|
int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8); |
|
uint8x16_t v_dst = vcombine_u8(vqmovun_s16(v_src0), vqmovun_s16(v_src1)); |
|
vst1q_u8(dst + j, v_dst); |
|
} |
|
for (; j < roiw8; j += 8) |
|
{ |
|
int16x8_t v_src = vld1q_s16(src + j); |
|
vst1_u8(dst + j, vqmovun_s16(v_src)); |
|
} |
|
|
|
for (; j < size.width; j++) |
|
{ |
|
dst[j] = internal::saturate_cast<u8>(src[j]); |
|
} |
|
} |
|
else // CONVERT_POLICY_WRAP |
|
{ |
|
for (; j < roiw16; j += 16) |
|
{ |
|
internal::prefetch(src + j); |
|
int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8); |
|
int8x16_t v_dst = vcombine_s8(vmovn_s16(v_src0), vmovn_s16(v_src1)); |
|
vst1q_u8(dst + j, vreinterpretq_u8_s8(v_dst)); |
|
} |
|
for (; j < roiw8; j += 8) |
|
{ |
|
int16x8_t v_src = vld1q_s16(src + j); |
|
vst1_u8(dst + j, vreinterpret_u8_s8(vmovn_s16(v_src))); |
|
} |
|
|
|
for (; j < size.width; j++) |
|
{ |
|
dst[j] = (u8)src[j]; |
|
} |
|
} |
|
} |
|
} |
|
|
|
typedef void (* lshiftConstFunc)(const Size2D &size, |
|
const u8 * srcBase, ptrdiff_t srcStride, |
|
s16 * dstBase, ptrdiff_t dstStride); |
|
|
|
typedef void (* rshiftConstFunc)(const Size2D &size, |
|
const s16 * srcBase, ptrdiff_t srcStride, |
|
u8 * dstBase, ptrdiff_t dstStride, |
|
CONVERT_POLICY cpolicy); |
|
|
|
} // namespace |
|
|
|
#endif |
|
|
|
void lshift(const Size2D &size, |
|
const u8 * srcBase, ptrdiff_t srcStride, |
|
s16 * dstBase, ptrdiff_t dstStride, |
|
u32 shift) |
|
{ |
|
internal::assertSupportedConfiguration(); |
|
|
|
#ifdef CAROTENE_NEON |
|
if (shift >= 16u) |
|
{ |
|
for (size_t i = 0; i < size.height; ++i) |
|
{ |
|
s16 * dst = internal::getRowPtr(dstBase, dstStride, i); |
|
std::memset(dst, 0, sizeof(s16) * size.width); |
|
} |
|
return; |
|
} |
|
|
|
// this ugly contruction is needed to avoid: |
|
// /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant |
|
// return (int16x8_t)__builtin_neon_vshl_nv8hi (__a, __b, 1); |
|
|
|
lshiftConstFunc funcs[16] = |
|
{ |
|
lshiftConst<0>, |
|
lshiftConst<1>, |
|
lshiftConst<2>, |
|
lshiftConst<3>, |
|
lshiftConst<4>, |
|
lshiftConst<5>, |
|
lshiftConst<6>, |
|
lshiftConst<7>, |
|
lshiftConst<8>, |
|
lshiftConst<9>, |
|
lshiftConst<10>, |
|
lshiftConst<11>, |
|
lshiftConst<12>, |
|
lshiftConst<13>, |
|
lshiftConst<14>, |
|
lshiftConst<15> |
|
}, func = funcs[shift]; |
|
|
|
func(size, srcBase, srcStride, dstBase, dstStride); |
|
#else |
|
(void)size; |
|
(void)srcBase; |
|
(void)srcStride; |
|
(void)dstBase; |
|
(void)dstStride; |
|
(void)shift; |
|
#endif |
|
} |
|
|
|
void rshift(const Size2D &size, |
|
const s16 * srcBase, ptrdiff_t srcStride, |
|
u8 * dstBase, ptrdiff_t dstStride, |
|
u32 shift, CONVERT_POLICY cpolicy) |
|
{ |
|
internal::assertSupportedConfiguration(); |
|
|
|
#ifdef CAROTENE_NEON |
|
if (shift >= 16) |
|
{ |
|
if (cpolicy == CONVERT_POLICY_WRAP) |
|
{ |
|
size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; |
|
size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; |
|
int16x8_t v_zero = vdupq_n_s16(0); |
|
|
|
for (size_t i = 0; i < size.height; ++i) |
|
{ |
|
const s16 * src = internal::getRowPtr(srcBase, srcStride, i); |
|
u8 * dst = internal::getRowPtr(dstBase, dstStride, i); |
|
size_t j = 0; |
|
|
|
for (; j < roiw16; j += 16) |
|
{ |
|
internal::prefetch(src + j); |
|
int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8); |
|
uint8x16_t v_dst = vcombine_u8(vmovn_u16(vcltq_s16(v_src0, v_zero)), |
|
vmovn_u16(vcltq_s16(v_src1, v_zero))); |
|
vst1q_u8(dst + j, v_dst); |
|
} |
|
for (; j < roiw8; j += 8) |
|
{ |
|
int16x8_t v_src = vld1q_s16(src + j); |
|
vst1_u8(dst + j, vmovn_u16(vcltq_s16(v_src, v_zero))); |
|
} |
|
|
|
for (; j < size.width; j++) |
|
{ |
|
dst[j] = src[j] >= 0 ? 0 : 255; |
|
} |
|
} |
|
} |
|
else |
|
{ |
|
for (size_t i = 0; i < size.height; ++i) |
|
{ |
|
u8 * dst = internal::getRowPtr(dstBase, dstStride, i); |
|
std::memset(dst, 0, sizeof(u8) * size.width); |
|
} |
|
} |
|
return; |
|
} |
|
|
|
// this ugly contruction is needed to avoid: |
|
// /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant |
|
// return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1); |
|
|
|
rshiftConstFunc funcs[16] = |
|
{ |
|
rshiftConst<0>, |
|
rshiftConst<1>, |
|
rshiftConst<2>, |
|
rshiftConst<3>, |
|
rshiftConst<4>, |
|
rshiftConst<5>, |
|
rshiftConst<6>, |
|
rshiftConst<7>, |
|
rshiftConst<8>, |
|
rshiftConst<9>, |
|
rshiftConst<10>, |
|
rshiftConst<11>, |
|
rshiftConst<12>, |
|
rshiftConst<13>, |
|
rshiftConst<14>, |
|
rshiftConst<15> |
|
}, func = funcs[shift]; |
|
|
|
func(size, srcBase, srcStride, dstBase, dstStride, cpolicy); |
|
#else |
|
(void)size; |
|
(void)srcBase; |
|
(void)srcStride; |
|
(void)dstBase; |
|
(void)dstStride; |
|
(void)shift; |
|
(void)cpolicy; |
|
#endif |
|
} |
|
|
|
} // namespace CAROTENE_NS
|
|
|