/*
* By downloading, copying, installing or using the software you agree to this license.
* If you do not agree to this license, do not download, install,
* copy or use the software.
*
*
* License Agreement
* For Open Source Computer Vision Library
* (3-clause BSD License)
*
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
* Third party copyrights are property of their respective owners.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the names of the copyright holders nor the names of the contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* This software is provided by the copyright holders and contributors "as is" and
* any express or implied warranties, including, but not limited to, the implied
* warranties of merchantability and fitness for a particular purpose are disclaimed.
* In no event shall copyright holders or contributors be liable for any direct,
* indirect, incidental, special, exemplary, or consequential damages
* (including, but not limited to, procurement of substitute goods or services;
* loss of use, data, or profits; or business interruption) however caused
* and on any theory of liability, whether in contract, strict liability,
* or tort (including negligence or otherwise) arising in any way out of
* the use of this software, even if advised of the possibility of such damage.
*/
#include "common.hpp"
#include <vector>
namespace CAROTENE_NS {
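/*
* Gaussian pyramid primitives (NEON-accelerated).
*
* Both pyrDown and pyrUp use the separable 5-tap binomial kernel [1 4 6 4 1],
* applied vertically into a temporary "lane" buffer and then horizontally into
* the destination row. Out-of-range samples are taken with BORDER_REFLECT_101
* indexing (see borderInterpolate101 below).
*/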
bool isGaussianPyramidDownRTZSupported(const Size2D &srcSize, const Size2D &dstSize, BORDER_MODE border_mode)
{
if (!isSupportedConfiguration())
return false;
// Need at least 8 pixels for vectorization.
// Need to make sure dst width is half the src width.
// Don't care about dst height.
if ( dstSize.width < 8 || std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2 )
return false;
// Current implementation only supports Reflect101 (i.e. BORDER_MODE_UNDEFINED)
if (border_mode != BORDER_MODE_UNDEFINED)
return false;
return true;
}
bool isGaussianPyramidDownU8Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn)
{
if (!isSupportedConfiguration())
return false;
if ( (dstSize.width * cn) < 8 ||
(cn != 1 && cn != 3 && cn != 4) ||
std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2 ||
std::abs((ptrdiff_t)dstSize.height*2 - (ptrdiff_t)srcSize.height) > 2 )
return false;
return true;
}
bool isGaussianPyramidDownS16Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn)
{
if (!isSupportedConfiguration())
return false;
if ( (dstSize.width * cn) < 4 ||
(cn != 1 && cn != 3 && cn != 4) ||
std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2 ||
std::abs((ptrdiff_t)dstSize.height*2 - (ptrdiff_t)srcSize.height) > 2 )
return false;
return true;
}
bool isGaussianPyramidDownF32Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn)
{
if (!isSupportedConfiguration())
return false;
if ( (dstSize.width * cn) < 4 ||
(cn != 1 && cn != 3 && cn != 4) ||
std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2 ||
std::abs((ptrdiff_t)dstSize.height*2 - (ptrdiff_t)srcSize.height) > 2 )
return false;
return true;
}
bool isGaussianPyramidUpU8Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn)
{
if (!isSupportedConfiguration())
return false;
if ( (srcSize.width * cn) < 8 ||
(cn != 1 && cn != 3 && cn != 4) ||
std::abs((ptrdiff_t)dstSize.width - (ptrdiff_t)srcSize.width*2) != (ptrdiff_t)dstSize.width % 2 ||
std::abs((ptrdiff_t)dstSize.height - (ptrdiff_t)srcSize.height*2) != (ptrdiff_t)dstSize.height % 2 )
return false;
return true;
}
bool isGaussianPyramidUpS16Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn)
{
if (!isSupportedConfiguration())
return false;
if ( (srcSize.width * cn) < 12 ||
(cn != 1 && cn != 3 && cn != 4) ||
std::abs((ptrdiff_t)dstSize.width - (ptrdiff_t)srcSize.width*2) != (ptrdiff_t)dstSize.width % 2 ||
std::abs((ptrdiff_t)dstSize.height - (ptrdiff_t)srcSize.height*2) != (ptrdiff_t)dstSize.height % 2 )
return false;
return true;
}
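/*
* Typical call pattern (illustrative sketch only; assumes Size2D carries
* width/height as declared in the carotene headers):
*
*   Size2D srcSize(w, h), dstSize((w + 1) / 2, (h + 1) / 2);
*   if (isGaussianPyramidDownU8Supported(srcSize, dstSize, 1))
*       gaussianPyramidDown(srcSize, src, srcStride, dstSize, dst, dstStride, 1);
*/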
#ifdef CAROTENE_NEON
namespace {
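// Map coordinate p into [0, len) with BORDER_REFLECT_101 semantics
// (reflection that does not repeat the edge pixel), e.g. for len == 5:
// p = -2 -> 2, p = 5 -> 3, p = 6 -> 2.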
ptrdiff_t borderInterpolate101(ptrdiff_t p, ptrdiff_t len)
{
if (len == 1)
return 0;
else
{
while ((unsigned)p >= (unsigned)len)
{
if (p < 0)
p = -p;
else
p = (len - 1)*2 - p;
}
}
return p;
}
} // namespace
#endif
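/*
* pyrDown variant that rounds towards zero (RTZ): the final division by 256
* (the sum of the separable [1 4 6 4 1] x [1 4 6 4 1] weights) is a plain
* truncating shift, with no rounding constant added. Single channel,
* BORDER_MODE_UNDEFINED (Reflect101) only.
*/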
void gaussianPyramidDownRTZ(const Size2D &srcSize,
const u8 *srcBase, ptrdiff_t srcStride,
const Size2D &dstSize,
u8 *dstBase, ptrdiff_t dstStride,
BORDER_MODE border, u8 borderValue)
{
internal::assertSupportedConfiguration(isGaussianPyramidDownRTZSupported(srcSize, dstSize, border));
#ifdef CAROTENE_NEON
// Single-core NEON code
const size_t dwidth = dstSize.width;
const size_t dheight = dstSize.height;
const size_t swidth = srcSize.width;
const size_t sheight = srcSize.height;
ptrdiff_t idx_l1 = borderInterpolate101(-1, swidth);
ptrdiff_t idx_l2 = borderInterpolate101(-2, swidth);
ptrdiff_t idx_r1 = borderInterpolate101(swidth + 0, swidth);
ptrdiff_t idx_r2 = borderInterpolate101(swidth + 1, swidth);
//1-line buffer
std::vector<u16> _buf((swidth + 4) + 32/sizeof(u16));
u16* lane = internal::alignPtr(&_buf[2], 32);
uint8x8_t vc6u8 = vmov_n_u8(6);
uint16x8_t vc6u16 = vmovq_n_u16(6);
uint16x8_t vc4u16 = vmovq_n_u16(4);
u8* dst = dstBase;
for (size_t i = 0; i < dheight; ++i, dst += dstStride)
{
//vertical convolution
const u8* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, sheight));
const u8* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, sheight));
const u8* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, sheight));
const u8* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, sheight));
const u8* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, sheight));
size_t x = 0;
for (; x <= swidth - 8; x += 8)
{
internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, (ptrdiff_t)x % 5 - 2));
uint8x8_t v0 = vld1_u8(ln0+x);
uint8x8_t v1 = vld1_u8(ln1+x);
uint8x8_t v2 = vld1_u8(ln2+x);
uint8x8_t v3 = vld1_u8(ln3+x);
uint8x8_t v4 = vld1_u8(ln4+x);
uint16x8_t v = vaddl_u8(v0, v4);
uint16x8_t v13 = vaddl_u8(v1, v3);
v = vmlal_u8(v, v2, vc6u8);
v = vmlaq_u16(v, v13, vc4u16);
vst1q_u16(lane + x, v);
}
for (; x < swidth; ++x)
{
lane[x] = ln0[x] + ln4[x] + 4u * (ln1[x] + ln3[x]) + 6u * ln2[x];
}
//left&right borders
lane[-1] = lane[idx_l1];
lane[-2] = lane[idx_l2];
lane[swidth] = lane[idx_r1];
lane[swidth+1] = lane[idx_r2];
//horizontal convolution
x = 0;
size_t vw = (swidth/2) - 7; // Using 7 instead of 8 allows swidth of 14 or 15.
for (; x < vw; x += 8)
{
internal::prefetch(lane + 2 * x);
uint16x8x2_t vLane0 = vld2q_u16(lane + 2*x-2); // L0[0] = x0 x2 x4 x6 x8 x10 x12 x14 L0[1] = x1 x3 x5 x7 x9 x11 x13 x15
uint16x8x2_t vLane1 = vld2q_u16(lane + 2*x-1); // L1[0] = x1 x3 x5 x7 x9 x11 x13 x15 L1[1] = x2 x4 x6 x8 x10 x12 x14 x16
uint16x8x2_t vLane2 = vld2q_u16(lane + 2*x+0); // L2[0] = x2 x4 x6 x8 x10 x12 x14 x16 L2[1] = x3 x5 x7 x9 x11 x13 x15 x17
uint16x8x2_t vLane3 = vld2q_u16(lane + 2*x+1); // L3[0] = x3 x5 x7 x9 x11 x13 x15 x17 L3[1] = x4 x6 x8 x10 x12 x14 x16 x18
uint16x8x2_t vLane4 = vld2q_u16(lane + 2*x+2); // L4[0] = x4 x6 x8 x10 x12 x14 x16 x18 L4[1] = x5 x7 x9 x11 x13 x15 x17 x19
uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]);
uint16x8_t vSum_1_3 = vaddq_u16(vLane1.val[0], vLane3.val[0]);
vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16);
vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_1_3, vc4u16);
uint8x8_t vRes = vshrn_n_u16(vSum_0_4, 8);
vst1_u8(dst + x, vRes);
}
for (; x < dwidth; x++)
{
dst[x] = u8((lane[2*x-2] + lane[2*x+2] + 4u * (lane[2*x-1] + lane[2*x+1]) + 6u * lane[2*x]) >> 8);
}
}
#else
// Remove 'unused parameter' warnings.
(void)srcSize;
(void)srcBase;
(void)srcStride;
(void)dstSize;
(void)dstBase;
(void)dstStride;
(void)border;
#endif
(void)borderValue;
}
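/*
* Rounding pyrDown for u8 data with 1, 3 or 4 channels: the same separable
* [1 4 6 4 1] kernel, but the final >> 8 uses round-to-nearest
* (vrshrn / +(1 << 7)). A hand-written inline-assembly path is kept for
* 32-bit ARM builds with GCC older than 4.7.
*/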
void gaussianPyramidDown(const Size2D &srcSize,
const u8 *srcBase, ptrdiff_t srcStride,
const Size2D &dstSize,
u8 *dstBase, ptrdiff_t dstStride, u8 cn)
{
internal::assertSupportedConfiguration(isGaussianPyramidDownU8Supported(srcSize, dstSize, cn));
#ifdef CAROTENE_NEON
size_t dcolcn = dstSize.width*cn;
size_t scolcn = srcSize.width*cn;
size_t roiw8 = dcolcn - 7;
size_t idx_l1 = borderInterpolate101(-1, srcSize.width) * cn;
size_t idx_l2 = borderInterpolate101(-2, srcSize.width) * cn;
size_t idx_r1 = borderInterpolate101(srcSize.width + 0, srcSize.width) * cn;
size_t idx_r2 = borderInterpolate101(srcSize.width + 1, srcSize.width) * cn;
//1-line buffer
std::vector<u16> _buf(cn*(srcSize.width + 4) + 32/sizeof(u16));
u16* lane = internal::alignPtr(&_buf[2*cn], 32);
uint8x8_t vc6u8 = vmov_n_u8(6);
uint16x8_t vc6u16 = vmovq_n_u16(6);
uint16x8_t vc4u16 = vmovq_n_u16(4);
for (size_t i = 0; i < dstSize.height; ++i)
{
u8* dst = internal::getRowPtr(dstBase, dstStride, i);
//vertical convolution
const u8* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, srcSize.height));
const u8* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, srcSize.height));
const u8* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, srcSize.height));
const u8* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, srcSize.height));
const u8* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, srcSize.height));
size_t x = 0;
for (; x <= scolcn - 8; x += 8)
{
internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, (ptrdiff_t)x % 5 - 2));
uint8x8_t v0 = vld1_u8(ln0+x);
uint8x8_t v1 = vld1_u8(ln1+x);
uint8x8_t v2 = vld1_u8(ln2+x);
uint8x8_t v3 = vld1_u8(ln3+x);
uint8x8_t v4 = vld1_u8(ln4+x);
uint16x8_t v = vaddl_u8(v0, v4);
uint16x8_t v13 = vaddl_u8(v1, v3);
v = vmlal_u8(v, v2, vc6u8);
v = vmlaq_u16(v, v13, vc4u16);
vst1q_u16(lane + x, v);
}
for (; x < scolcn; ++x)
{
lane[x] = ln0[x] + ln4[x] + 4u * (ln1[x] + ln3[x]) + 6u * ln2[x];
}
//left&right borders
for (u32 k = 0; k < cn; ++k)
{
lane[(s32)(-cn+k)] = lane[idx_l1 + k];
lane[(s32)(-cn-cn+k)] = lane[idx_l2 + k];
lane[scolcn+k] = lane[idx_r1 + k];
lane[scolcn+cn+k] = lane[idx_r2 + k];
}
//horizontal convolution
x = 0;
switch(cn)
{
case 1:
for (; x < roiw8; x += 8)
{
internal::prefetch(lane + 2 * x);
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
__asm__ (
"vld2.16 {d0-d3}, [%[in0]] \n\t"
"vld2.16 {d4-d7}, [%[in4]] \n\t"
"vld2.16 {d12-d15}, [%[in1]] \n\t"
"vld2.16 {d16-d19}, [%[in3]] \n\t"
"vld2.16 {d8-d11}, [%[in2],:256] \n\t"
"vadd.i16 q0, q2 /*q0 = v0 + v4*/ \n\t"
"vadd.i16 q6, q8 /*q6 = v1 + v3*/ \n\t"
"vmla.i16 q0, q4, %q[c6] /*q0 += v2 * 6*/ \n\t"
"vmla.i16 q0, q6, %q[c4] /*q1 += (v1+v3) * 4*/ \n\t"
"vrshrn.u16 d8, q0, #8 \n\t"
"vst1.8 {d8}, [%[out]] \n\t"
: /*no output*/
: [out] "r" (dst + x),
[in0] "r" (lane + 2*x-2),
[in1] "r" (lane + 2*x-1),
[in2] "r" (lane + 2*x+0),
[in3] "r" (lane + 2*x+1),
[in4] "r" (lane + 2*x+2),
[c4] "w" (vc4u16), [c6] "w" (vc6u16)
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
);
#else
uint16x8x2_t vLane0 = vld2q_u16(lane + 2*x-2);
uint16x8x2_t vLane1 = vld2q_u16(lane + 2*x-1);
uint16x8x2_t vLane2 = vld2q_u16(lane + 2*x+0);
uint16x8x2_t vLane3 = vld2q_u16(lane + 2*x+1);
uint16x8x2_t vLane4 = vld2q_u16(lane + 2*x+2);
uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]);
uint16x8_t vSum_1_3 = vaddq_u16(vLane1.val[0], vLane3.val[0]);
vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16);
vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_1_3, vc4u16);
uint8x8_t vRes = vrshrn_n_u16(vSum_0_4, 8);
vst1_u8(dst + x, vRes);
#endif
}
break;
case 3:
{
uint16x4_t vx1 = vld1_u16(lane - 2*3);
uint16x4_t vx2 = vld1_u16(lane - 1*3);
uint16x4_t vx3 = vld1_u16(lane + 0*3);
uint16x8_t v0 = vcombine_u16(vx1, vx3);
uint8x8_t map = vreinterpret_u8_u64(vmov_n_u64(0xFFFF060504020100ULL));
for (; x < roiw8; x += 6)
{
internal::prefetch(lane + 2 * x + 12);
uint16x4_t vx_ = vld1_u16(lane + 2*x-1*3 + 6);
uint16x4_t vx4 = vld1_u16(lane + 2*x+0*3 + 6);
uint16x4_t vx5 = vld1_u16(lane + 2*x+1*3 + 6);
uint16x4_t vx6 = vld1_u16(lane + 2*x+2*3 + 6);
uint16x8_t v1 = vcombine_u16(vx2, vx_);
uint16x8_t v2 = vcombine_u16(vget_high_u16(v0), vx4);
uint16x8_t v3 = vcombine_u16(vx_, vx5);
uint16x8_t v4 = vcombine_u16(vx4, vx6);
vx2 = vx5;
uint16x8_t v = vaddq_u16(v0, v4);
uint16x8_t v13 = vaddq_u16(v1, v3);
v = vmlaq_u16(v, v2, vc6u16);
v = vmlaq_u16(v, v13, vc4u16);
uint8x8_t v8 = vrshrn_n_u16(v, 8);
v0 = v4;
vst1_u8(dst + x, vtbl1_u8(v8, map));
}
}
break;
case 4:
{
uint16x4_t vx1 = vld1_u16(lane - 2*4);
uint16x4_t vx2 = vld1_u16(lane - 1*4);
uint16x4_t vx3 = vld1_u16(lane + 0*4);
uint16x8_t v0 = vcombine_u16(vx1, vx3);
for (; x < roiw8; x += 8)
{
internal::prefetch(lane + 2 * x + 16);
uint16x4_t vx_ = vld1_u16(lane + 2 * x - 1*4 + 8);
uint16x4_t vx4 = vld1_u16(lane + 2 * x + 0*4 + 8);
uint16x4_t vx5 = vld1_u16(lane + 2 * x + 1*4 + 8);
uint16x4_t vx6 = vld1_u16(lane + 2 * x + 2*4 + 8);
uint16x8_t v1 = vcombine_u16(vx2, vx_);
uint16x8_t v2 = vcombine_u16(vget_high_u16(v0), vx4);
uint16x8_t v3 = vcombine_u16(vx_, vx5);
uint16x8_t v4 = vcombine_u16(vx4, vx6);
vx2 = vx5;
uint16x8_t v = vaddq_u16(v0, v4);
uint16x8_t v13 = vaddq_u16(v1, v3);
v = vmlaq_u16(v, v2, vc6u16);
v = vmlaq_u16(v, v13, vc4u16);
uint8x8_t v8 = vrshrn_n_u16(v, 8);
v0 = v4;
vst1_u8(dst + x, v8);
}
}
break;
}
for (u32 h = 0; h < cn; ++h)
{
u16* ln = lane + h;
u8* dt = dst + h;
for (size_t k = x; k < dcolcn; k += cn)
dt[k] = u8((ln[2*k-2*cn] + ln[2*k+2*cn] + 4u * (ln[2*k-cn] + ln[2*k+cn]) + 6u * ln[2*k] + (1 << 7)) >> 8);
}
}
#else
// Remove 'unused parameter' warnings.
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
#endif
}
void gaussianPyramidDown(const Size2D &srcSize,
const s16 *srcBase, ptrdiff_t srcStride,
const Size2D &dstSize,
s16 *dstBase, ptrdiff_t dstStride, u8 cn)
{
internal::assertSupportedConfiguration(isGaussianPyramidDownS16Supported(srcSize, dstSize, cn));
#ifdef CAROTENE_NEON
size_t dcolcn = dstSize.width*cn;
size_t scolcn = srcSize.width*cn;
size_t roiw4 = dcolcn - 3;
size_t idx_l1 = borderInterpolate101(-1, srcSize.width) * cn;
size_t idx_l2 = borderInterpolate101(-2, srcSize.width) * cn;
size_t idx_r1 = borderInterpolate101(srcSize.width + 0, srcSize.width) * cn;
size_t idx_r2 = borderInterpolate101(srcSize.width + 1, srcSize.width) * cn;
//1-line buffer
std::vector<s32> _buf(cn*(srcSize.width + 4) + 32/sizeof(s32));
s32* lane = internal::alignPtr(&_buf[2*cn], 32);
int16x4_t vc6s16 = vmov_n_s16(6);
int32x4_t vc6s32 = vmovq_n_s32(6);
int32x4_t vc4s32 = vmovq_n_s32(4);
for (size_t i = 0; i < dstSize.height; ++i)
{
s16* dst = internal::getRowPtr(dstBase, dstStride, i);
//vertical convolution
const s16* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, srcSize.height));
const s16* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, srcSize.height));
const s16* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, srcSize.height));
const s16* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, srcSize.height));
const s16* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, srcSize.height));
size_t x = 0;
for (; x <= scolcn - 4; x += 4)
{
internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, (ptrdiff_t)x % 5 - 2));
int16x4_t v0 = vld1_s16(ln0 + x);
int16x4_t v1 = vld1_s16(ln1 + x);
int16x4_t v2 = vld1_s16(ln2 + x);
int16x4_t v3 = vld1_s16(ln3 + x);
int16x4_t v4 = vld1_s16(ln4 + x);
int32x4_t v = vaddl_s16(v0, v4);
int32x4_t v13 = vaddl_s16(v1, v3);
v = vmlal_s16(v, v2, vc6s16);
v = vmlaq_s32(v, v13, vc4s32);
vst1q_s32(lane + x, v);
}
for (; x < scolcn; ++x)
{
lane[x] = ln0[x] + ln4[x] + 4 * (ln1[x] + ln3[x]) + 6 * ln2[x];
}
//left&right borders
for (u32 k = 0; k < cn; ++k)
{
lane[(s32)(-cn+k)] = lane[idx_l1 + k];
lane[(s32)(-cn-cn+k)] = lane[idx_l2 + k];
lane[scolcn+k] = lane[idx_r1 + k];
lane[scolcn+cn+k] = lane[idx_r2 + k];
}
//horizontal convolution
x = 0;
switch(cn)
{
case 1:
for (; x < roiw4; x += 4)
{
internal::prefetch(lane + 2 * x);
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
__asm__ (
"vld2.32 {d0-d3}, [%[in0]] \n\t"
"vld2.32 {d4-d7}, [%[in4]] \n\t"
"vld2.32 {d12-d15}, [%[in1]] \n\t"
"vld2.32 {d16-d19}, [%[in3]] \n\t"
"vld2.32 {d8-d11}, [%[in2],:256] \n\t"
"vadd.i32 q0, q2 \n\t"
"vadd.i32 q6, q8 \n\t"
"vmla.i32 q0, q4, %q[c6] \n\t"
"vmla.i32 q0, q6, %q[c4] \n\t"
"vrshrn.s32 d8, q0, #8 \n\t"
"vst1.16 {d8}, [%[out]] \n\t"
: /*no output*/
: [out] "r" (dst + x),
[in0] "r" (lane + 2*x-2),
[in1] "r" (lane + 2*x-1),
[in2] "r" (lane + 2*x+0),
[in3] "r" (lane + 2*x+1),
[in4] "r" (lane + 2*x+2),
[c4] "w" (vc4s32), [c6] "w" (vc6s32)
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19"
);
#else
int32x4x2_t vLane0 = vld2q_s32(lane + 2*x-2);
int32x4x2_t vLane1 = vld2q_s32(lane + 2*x-1);
int32x4x2_t vLane2 = vld2q_s32(lane + 2*x+0);
int32x4x2_t vLane3 = vld2q_s32(lane + 2*x+1);
int32x4x2_t vLane4 = vld2q_s32(lane + 2*x+2);
int32x4_t vSum_0_4 = vaddq_s32(vLane0.val[0], vLane4.val[0]);
int32x4_t vSum_1_3 = vaddq_s32(vLane1.val[0], vLane3.val[0]);
vSum_0_4 = vmlaq_s32(vSum_0_4, vLane2.val[0], vc6s32);
vSum_0_4 = vmlaq_s32(vSum_0_4, vSum_1_3, vc4s32);
int16x4_t vRes = vrshrn_n_s32(vSum_0_4, 8);
vst1_s16(dst + x, vRes);
#endif
}
break;
case 3:
{
int32x4_t v0 = vld1q_s32(lane - 2*3);
int32x4_t v1 = vld1q_s32(lane - 1*3);
int32x4_t v2 = vld1q_s32(lane + 0*3);
for (; x < roiw4; x += 3)
{
internal::prefetch(lane + 2 * x);
int32x4_t v3 = vld1q_s32(lane + 2 * x + 1*3);
int32x4_t v4 = vld1q_s32(lane + 2 * x + 2*3);
int32x4_t v = vaddq_s32(v0, v4);
int32x4_t v13 = vaddq_s32(v1, v3);
v = vmlaq_s32(v, v2, vc6s32);
v = vmlaq_s32(v, v13, vc4s32);
int16x4_t vv = vrshrn_n_s32(v, 8);
v0 = v2;
v1 = v3;
v2 = v4;
vst1_s16(dst + x, vv);
}
}
break;
case 4:
{
int32x4_t v0 = vld1q_s32(lane - 2*4);
int32x4_t v1 = vld1q_s32(lane - 1*4);
int32x4_t v2 = vld1q_s32(lane + 0*4);
for (; x < roiw4; x += 4)
{
internal::prefetch(lane + 2 * x + 8);
int32x4_t v3 = vld1q_s32(lane + 2 * x + 1*4);
int32x4_t v4 = vld1q_s32(lane + 2 * x + 2*4);
int32x4_t v = vaddq_s32(v0, v4);
int32x4_t v13 = vaddq_s32(v1, v3);
v = vmlaq_s32(v, v2, vc6s32);
v = vmlaq_s32(v, v13, vc4s32);
int16x4_t vv = vrshrn_n_s32(v, 8);
v0 = v2;
v1 = v3;
v2 = v4;
vst1_s16(dst + x, vv);
}
}
break;
}
for (u32 h = 0; h < cn; ++h)
{
s32* ln = lane + h;
s16* dt = dst + h;
for (size_t k = x; k < dcolcn; k += cn)
dt[k] = s16((ln[2*k-2*cn] + ln[2*k+2*cn] + 4 * (ln[2*k-cn] + ln[2*k+cn]) + 6 * ln[2*k] + (1 << 7)) >> 8);
}
}
#else
// Remove 'unused parameter' warnings.
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
#endif
}
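/*
* f32 pyrDown: the integer kernel weights are folded into the float constants
* below (6/4 and 1/4 for the vertical pass, 1/64, 4/64 and 6/64 for the
* horizontal pass), so the overall scaling matches the 1/256 of the integer
* paths without an explicit final divide.
*/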
void gaussianPyramidDown(const Size2D &srcSize,
const f32 *srcBase, ptrdiff_t srcStride,
const Size2D &dstSize,
f32 *dstBase, ptrdiff_t dstStride, u8 cn)
{
internal::assertSupportedConfiguration(isGaussianPyramidDownF32Supported(srcSize, dstSize, cn));
#ifdef CAROTENE_NEON
size_t dcolcn = dstSize.width*cn;
size_t scolcn = srcSize.width*cn;
size_t roiw4 = dcolcn - 3;
size_t idx_l1 = borderInterpolate101(-1, srcSize.width) * cn;
size_t idx_l2 = borderInterpolate101(-2, srcSize.width) * cn;
size_t idx_r1 = borderInterpolate101(srcSize.width + 0, srcSize.width) * cn;
size_t idx_r2 = borderInterpolate101(srcSize.width + 1, srcSize.width) * cn;
//1-line buffer
std::vector<f32> _buf(cn*(srcSize.width + 4) + 32/sizeof(f32));
f32* lane = internal::alignPtr(&_buf[2*cn], 32);
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
register float32x4_t vc6d4f32 asm ("q11") = vmovq_n_f32(1.5f); // 6/4
register float32x4_t vc1d4f32 asm ("q12") = vmovq_n_f32(0.25f); // 1/4
register float32x4_t vc1d64f32 asm ("q13") = vmovq_n_f32(0.015625f); //1/4/16
register float32x4_t vc4d64f32 asm ("q14") = vmovq_n_f32(0.0625f); //4/4/16
register float32x4_t vc6d64f32 asm ("q15") = vmovq_n_f32(0.09375f); //6/4/16
#else
float32x4_t vc6d4f32 = vmovq_n_f32(1.5f); // 6/4
float32x4_t vc1d4f32 = vmovq_n_f32(0.25f); // 1/4
float32x4_t vc1d64f32 = vmovq_n_f32(0.015625f); //1/4/16
float32x4_t vc4d64f32 = vmovq_n_f32(0.0625f); //4/4/16
float32x4_t vc6d64f32 = vmovq_n_f32(0.09375f); //6/4/16
#endif
for (size_t i = 0; i < dstSize.height; ++i)
{
f32* dst = internal::getRowPtr(dstBase, dstStride, i);
//vertical convolution
const f32* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, srcSize.height));
const f32* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, srcSize.height));
const f32* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, srcSize.height));
const f32* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, srcSize.height));
const f32* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, srcSize.height));
size_t x = 0;
for (; x <= scolcn - 4; x += 4)
{
internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, (ptrdiff_t)x % 5 - 2));
float32x4_t v0 = vld1q_f32((const float32_t*)ln0 + x);
float32x4_t v1 = vld1q_f32((const float32_t*)ln1 + x);
float32x4_t v2 = vld1q_f32((const float32_t*)ln2 + x);
float32x4_t v3 = vld1q_f32((const float32_t*)ln3 + x);
float32x4_t v4 = vld1q_f32((const float32_t*)ln4 + x);
float32x4_t v = vaddq_f32(v1, v3);
float32x4_t v04 = vaddq_f32(v0, v4);
v = vmlaq_f32(v, v2, vc6d4f32);
v = vmlaq_f32(v, v04, vc1d4f32);
vst1q_f32(lane + x, v);
}
for (; x < scolcn; ++x)
{
lane[x] = 0.25f*(ln0[x] + ln4[x]) + (ln1[x] + ln3[x]) + 1.5f * ln2[x];
}
//left&right borders
for (u32 k = 0; k < cn; ++k)
{
lane[(s32)(-cn+k)] = lane[idx_l1 + k];
lane[(s32)(-cn-cn+k)] = lane[idx_l2 + k];
lane[scolcn+k] = lane[idx_r1 + k];
lane[scolcn+cn+k] = lane[idx_r2 + k];
}
//horizontal convolution
x = 0;
switch(cn)
{
case 1:
for (; x < roiw4; x += 4)
{
internal::prefetch(lane + 2 * x);
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
__asm__ __volatile__ (
"vld2.32 {d0-d3}, [%[in0]] \n\t"
"vld2.32 {d8-d11}, [%[in4]] \n\t"
"vld2.32 {d14-d17}, [%[in2],:256] \n\t"
"vld2.32 {d10-d13}, [%[in1]] \n\t"
"vld2.32 {d16-d19}, [%[in3]] \n\t"
"vmul.f32 q7, %q[c6d64] \n\t"
"vadd.f32 q0, q4 @v04 \n\t"
"vadd.f32 q5, q8 @v13 \n\t"
"vmla.f32 q7, q0, %q[c1d64] \n\t"
"vmla.f32 q7, q5, %q[c4d64] \n\t"
"vst1.32 {d14-d15}, [%[out]] \n\t"
:
: [out] "r" (dst + x),
[in0] "r" (lane + 2*x-2),
[in1] "r" (lane + 2*x-1),
[in2] "r" (lane + 2*x+0),
[in3] "r" (lane + 2*x+1),
[in4] "r" (lane + 2*x+2),
[c4d64] "w" (vc4d64f32), [c6d64] "w" (vc6d64f32), [c1d64] "w" (vc1d64f32)
: "d0","d1","d2","d3","d4",/*"d5","d6","d7",*/"d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" //ugly compiler "bug" - can't touch d5-d7
);
#else
float32x4x2_t vLane0 = vld2q_f32(lane + 2*x-2);
float32x4x2_t vLane1 = vld2q_f32(lane + 2*x-1);
float32x4x2_t vLane2 = vld2q_f32(lane + 2*x+0);
float32x4x2_t vLane3 = vld2q_f32(lane + 2*x+1);
float32x4x2_t vLane4 = vld2q_f32(lane + 2*x+2);
float32x4_t vSum_0_4 = vaddq_f32(vLane0.val[0], vLane4.val[0]);
float32x4_t vSum_1_3 = vaddq_f32(vLane1.val[0], vLane3.val[0]);
float32x4_t vRes = vmulq_f32(vLane2.val[0], vc6d64f32);
vRes = vmlaq_f32(vRes, vSum_0_4, vc1d64f32);
vRes = vmlaq_f32(vRes, vSum_1_3, vc4d64f32);
vst1q_f32(dst + x, vRes);
#endif
}
break;
case 3:
{
float32x4_t v0 = vld1q_f32((const float32_t*)lane - 2*3);
float32x4_t v1 = vld1q_f32((const float32_t*)lane - 1*3);
float32x4_t v2 = vld1q_f32((const float32_t*)lane + 0*3);
for (; x < roiw4; x += 3)
{
internal::prefetch(lane + 2 * x);
float32x4_t v3 = vld1q_f32((const float32_t*)lane + 2 * x + 1*3);
float32x4_t v4 = vld1q_f32((const float32_t*)lane + 2 * x + 2*3);
float32x4_t v04 = vaddq_f32(v0, v4);
float32x4_t v13 = vaddq_f32(v1, v3);
float32x4_t v = vmulq_f32(v2, vc6d64f32);
v = vmlaq_f32(v, v04, vc1d64f32);
v = vmlaq_f32(v, v13, vc4d64f32);
v0 = v2;
v1 = v3;
v2 = v4;
vst1q_f32(dst + x, v);
}
}
break;
case 4:
{
float32x4_t v0 = vld1q_f32((const float32_t*)lane - 2*4);
float32x4_t v1 = vld1q_f32((const float32_t*)lane - 1*4);
float32x4_t v2 = vld1q_f32((const float32_t*)lane + 0*4);
for (; x < roiw4; x += 4)
{
internal::prefetch(lane + 2 * x + 8);
float32x4_t v3 = vld1q_f32((const float32_t*)lane + 2 * x + 1*4);
float32x4_t v4 = vld1q_f32((const float32_t*)lane + 2 * x + 2*4);
float32x4_t v04 = vaddq_f32(v0, v4);
float32x4_t v13 = vaddq_f32(v1, v3);
float32x4_t v = vmulq_f32(v2, vc6d64f32);
v = vmlaq_f32(v, v04, vc1d64f32);
v = vmlaq_f32(v, v13, vc4d64f32);
v0 = v2;
v1 = v3;
v2 = v4;
vst1q_f32(dst + x, v);
}
}
break;
}
for (u32 h = 0; h < cn; ++h)
{
f32* ln = lane + h;
f32* dt = dst + h;
for (size_t k = x; k < dcolcn; k += cn)
dt[k] = 0.015625f * (ln[2*k-2*cn] + ln[2*k+2*cn]) + 0.0625f * (ln[2*k-cn] + ln[2*k+cn]) + 0.09375f * ln[2*k];
}
}
#else
// Remove 'unused parameter' warnings.
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
#endif
}
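/*
* pyrUp for u8 data: each source row yields two destination rows. The vertical
* pass fills two lane buffers - lane0 with row taps (1, 6, 1) for the even
* output row, lane1 with row taps (4, 4) for the odd one - and the horizontal
* pass applies the matching column taps with round-to-nearest narrowing shifts.
*/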
void gaussianPyramidUp(const Size2D &srcSize,
const u8 *srcBase, ptrdiff_t srcStride,
const Size2D &dstSize,
u8 *dstBase, ptrdiff_t dstStride, u8 cn)
{
internal::assertSupportedConfiguration(isGaussianPyramidUpU8Supported(srcSize, dstSize, cn));
#ifdef CAROTENE_NEON
size_t dcolshn = (dstSize.width/2) * cn;
size_t dcolshw = ((dstSize.width+1)/2) * cn;
size_t scolsn = srcSize.width*cn;
size_t idx_l = (borderInterpolate101(-2, 2 * srcSize.width)/2) * cn;
size_t idx_r1 = (borderInterpolate101(2 * srcSize.width + 0, 2 * srcSize.width)/2) * cn;
size_t idx_r2 = (borderInterpolate101(2 * srcSize.width + 2, 2 * srcSize.width + 2)/2) * cn;
//2-lines buffer
std::vector<u16> _buf(2*(cn*(srcSize.width + 3) + 32/sizeof(u16)));
u16* lane0 = internal::alignPtr(&_buf[cn], 32);
u16* lane1 = internal::alignPtr(lane0 + (3 + srcSize.width)*cn, 32);
uint8x8_t vc6u8 = vmov_n_u8(6);
uint16x8_t vc6u16 = vmovq_n_u16(6);
for (size_t i = 0; i < (dstSize.height + 1)/2; ++i)
{
u8* dst = internal::getRowPtr(dstBase, dstStride, 2*i);
//vertical convolution
const u8* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 - 2, srcSize.height * 2)/2);
const u8* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 0, srcSize.height * 2)/2);
const u8* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 2, srcSize.height * 2)/2);
size_t x = 0;
for (; x <= scolsn - 8; x += 8)
{
internal::prefetch(internal::getRowPtr(ln1 + x, srcStride, (ptrdiff_t)x % 3 - 1));
uint8x8_t v0 = vld1_u8(ln0+x);
uint8x8_t v2 = vld1_u8(ln2+x);
uint8x8_t v1 = vld1_u8(ln1+x);
uint16x8_t vl0 = vaddl_u8(v0, v2);
uint16x8_t vl1 = vaddl_u8(v1, v2);
vl0 = vmlal_u8(vl0, v1, vc6u8);
vl1 = vshlq_n_u16(vl1, 2);
vst1q_u16(lane0 + x, vl0);
vst1q_u16(lane1 + x, vl1);
}
for (; x < scolsn; ++x)
{
lane0[x] = ln0[x] + ln2[x] + 6u * ln1[x];
lane1[x] = 4u * (ln1[x] + ln2[x]);
}
//left&right borders
for (u32 k = 0; k < cn; ++k)
{
lane0[(s32)(-cn+k)] = lane0[idx_l + k];
lane1[(s32)(-cn+k)] = lane1[idx_l + k];
lane0[scolsn+k] = lane0[idx_r1 + k];
lane0[scolsn+cn+k] = lane0[idx_r2 + k];
lane1[scolsn+k] = lane1[idx_r1 + k];
lane1[scolsn+cn+k] = lane1[idx_r2 + k];
}
//horizontal convolution
const u16* lane = lane0;
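// The horizontal pass below runs twice per source row: first for lane0
// (even destination row), then, via the goto at the end of the loop body,
// for lane1 (odd destination row).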
pyrUp8uHorizontalConvolution:
x = 0;
size_t lim;
switch(cn)
{
case 1:
lim = dcolshn > 7 ? dcolshn - 7 : 0;
for (; x < lim; x += 8)
{
internal::prefetch(lane + x);
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
__asm__ (
"vld1.16 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t"
"vld1.16 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t"
"vld1.16 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t"
"vadd.i16 q0, q1 /*q0 = v0 + v2*/ \n\t"
"vadd.i16 q3, q1, q2 /*q3 = v1 + v2*/ \n\t"
"vmla.i16 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t"
"vrshrn.u16 d9, q3, #4 \n\t"
"vrshrn.u16 d8, q0, #6 \n\t"
"vst2.8 {d8-d9}, [%[out]] \n\t"
: /*no output*/
: [out] "r" (dst + x*2),
[in0] "r" (lane + x - 1),
[in1] "r" (lane + x + 0),
[in2] "r" (lane + x + 1),
[c6] "w" (vc6u16)
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9"
);
#else
uint16x8_t vLane0 = vld1q_u16(lane + x - 1);
uint16x8_t vLane1 = vld1q_u16(lane + x + 0);
uint16x8_t vLane2 = vld1q_u16(lane + x + 1);
vLane0 = vaddq_u16(vLane0, vLane2);
vLane2 = vaddq_u16(vLane2, vLane1);
vLane0 = vmlaq_u16(vLane0, vLane1, vc6u16);
uint8x8x2_t vRes;
vRes.val[0] = vrshrn_n_u16(vLane0, 6);
vRes.val[1] = vrshrn_n_u16(vLane2, 4);
vst2_u8(dst + x*2, vRes);
#endif
}
break;
case 3:
{
lim = dcolshn > 23 ? dcolshn - 23 : 0;
for (; x < lim; x += 24)
{
internal::prefetch(lane + x);
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
__asm__ (
"vmov.u16 q9, #6 \n\t"
"vld3.16 {d0, d2, d4}, [%[in0]] /*v0*/ \n\t"
"vld3.16 {d1, d3, d5}, [%[in02]] \n\t"
"vld3.16 {d6, d8, d10}, [%[in2]] /*v2*/ \n\t"
"vld3.16 {d7, d9, d11}, [%[in22]] \n\t"
"vld3.16 {d12, d14, d16}, [%[in1]] /*v1*/ \n\t"
"vld3.16 {d13, d15, d17}, [%[in12]] \n\t"
"vadd.i16 q0, q3 /*v0 + v2*/ \n\t"
"vadd.i16 q1, q4 /*v0 + v2*/ \n\t"
"vadd.i16 q2, q5 /*v0 + v2*/ \n\t"
"vadd.i16 q3, q6 /*v1 + v2*/ \n\t"
"vadd.i16 q4, q7 /*v1 + v2*/ \n\t"
"vadd.i16 q5, q8 /*v1 + v2*/ \n\t"
"vmla.i16 q0, q6, q9 /*v0 + v2 + v1*6 */ \n\t"
"vmla.i16 q1, q7, q9 /*v0 + v2 + v1*6 */ \n\t"
"vmla.i16 q2, q8, q9 /*v0 + v2 + v1*6 */ \n\t"
"vrshrn.u16 d19, q3, #4 \n\t"
"vrshrn.u16 d21, q4, #4 \n\t"
"vrshrn.u16 d23, q5, #4 \n\t"
"vrshrn.u16 d18, q0, #6 \n\t"
"vrshrn.u16 d20, q1, #6 \n\t"
"vrshrn.u16 d22, q2, #6 \n\t"
"vzip.8 d18, d19 \n\t"
"vzip.8 d20, d21 \n\t"
"vzip.8 d22, d23 \n\t"
"vst3.8 {d18, d20, d22}, [%[out1]] \n\t"
"vst3.8 {d19, d21, d23}, [%[out2]] \n\t"
: /*no output*/
: [out1] "r" (dst + 2 * x),
[out2] "r" (dst + 2 * x + 24),
[in0] "r" (lane + x - 3),
[in02] "r" (lane + x + 9),
[in1] "r" (lane + x),
[in12] "r" (lane + x + 12),
[in2] "r" (lane + x + 3),
[in22] "r" (lane + x + 15)
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
);
#else
uint16x8_t vc6 = vmovq_n_u16(6);
uint16x8x3_t vLane0 = vld3q_u16(lane + x - 3);
uint16x8x3_t vLane1 = vld3q_u16(lane + x + 0);
uint16x8x3_t vLane2 = vld3q_u16(lane + x + 3);
uint16x8_t vSum_0_3 = vaddq_u16(vLane0.val[0], vLane2.val[0]);
uint16x8_t vSum_1_4 = vaddq_u16(vLane0.val[1], vLane2.val[1]);
uint16x8_t vSum_2_5 = vaddq_u16(vLane0.val[2], vLane2.val[2]);
uint16x8_t vSum_3_6 = vaddq_u16(vLane2.val[0], vLane1.val[0]);
uint16x8_t vSum_4_7 = vaddq_u16(vLane2.val[1], vLane1.val[1]);
uint16x8_t vSum_5_8 = vaddq_u16(vLane2.val[2], vLane1.val[2]);
vSum_0_3 = vmlaq_u16(vSum_0_3, vLane1.val[0], vc6);
vSum_1_4 = vmlaq_u16(vSum_1_4, vLane1.val[1], vc6);
vSum_2_5 = vmlaq_u16(vSum_2_5, vLane1.val[2], vc6);
uint8x8x2_t vSumShr3;
vSumShr3.val[0] = vrshrn_n_u16(vSum_3_6, 4);
vSumShr3.val[1] = vrshrn_n_u16(vSum_0_3, 6);
uint8x8x2_t vSumShr4;
vSumShr4.val[0] = vrshrn_n_u16(vSum_4_7, 4);
vSumShr4.val[1] = vrshrn_n_u16(vSum_1_4, 6);
uint8x8x2_t vSumShr5;
vSumShr5.val[0] = vrshrn_n_u16(vSum_5_8, 4);
vSumShr5.val[1] = vrshrn_n_u16(vSum_2_5, 6);
vSumShr3 = vzip_u8(vSumShr3.val[1], vSumShr3.val[0]);
vSumShr4 = vzip_u8(vSumShr4.val[1], vSumShr4.val[0]);
vSumShr5 = vzip_u8(vSumShr5.val[1], vSumShr5.val[0]);
uint8x8x3_t vRes1;
vRes1.val[0] = vSumShr3.val[0];
vRes1.val[1] = vSumShr4.val[0];
vRes1.val[2] = vSumShr5.val[0];
vst3_u8(dst + 2 * x, vRes1);
uint8x8x3_t vRes2;
vRes2.val[0] = vSumShr3.val[1];
vRes2.val[1] = vSumShr4.val[1];
vRes2.val[2] = vSumShr5.val[1];
vst3_u8(dst + 2 * x + 24, vRes2);
#endif
}
}
break;
case 4:
lim = dcolshn > 7 ? dcolshn - 7 : 0;
for (; x < lim; x += 8)
{
internal::prefetch(lane + x);
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
__asm__ (
"vld1.16 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t"
"vld1.16 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t"
"vld1.16 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t"
"vadd.i16 q0, q1 /*q0 = v0 + v2*/ \n\t"
"vadd.i16 q3, q1, q2 /*q3 = v1 + v2*/ \n\t"
"vmla.i16 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t"
"vrshrn.u16 d9, q3, #4 \n\t"
"vrshrn.u16 d8, q0, #6 \n\t"
"vst2.32 {d8-d9}, [%[out]] \n\t"
: /*no output*/
: [out] "r" (dst + x*2),
[in0] "r" (lane + x-4),
[in1] "r" (lane + x),
[in2] "r" (lane + x+4),
[c6] "w" (vc6u16)
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9"
);
#else
uint16x8_t vLane0 = vld1q_u16(lane + x-4);
uint16x8_t vLane1 = vld1q_u16(lane + x+0);
uint16x8_t vLane2 = vld1q_u16(lane + x+4);
vLane0 = vaddq_u16(vLane0, vLane2);
vLane2 = vaddq_u16(vLane2, vLane1);
vLane0 = vmlaq_u16(vLane0, vLane1, vc6u16);
uint32x2x2_t vRes;
vRes.val[1] = vreinterpret_u32_u8(vrshrn_n_u16(vLane2, 4));
vRes.val[0] = vreinterpret_u32_u8(vrshrn_n_u16(vLane0, 6));
vst2_u32((uint32_t*)(dst + x*2), vRes);
#endif
}
break;
};
for (u32 h = 0; h < cn; ++h)
{
const u16* ln = lane + h;
u8* dt = dst + h;
size_t k = x;
for (; k < dcolshn; k += cn)
{
dt[2*k+0] = u8((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6u * ln[k] + (1 << 5)) >> 6);
dt[2*k+cn] = u8(((ln[k] + ln[k+cn]) * 4u + (1 << 5)) >> 6);
}
for (; k < dcolshw; k += cn)
dt[2*k] = u8((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6u * ln[k] + (1 << 5)) >> 6);
}
dst = internal::getRowPtr(dstBase, dstStride, 2*i+1);
//second row
if (lane == lane0 && 2*i+1 < dstSize.height)
{
lane = lane1;
goto pyrUp8uHorizontalConvolution;
}
}
#else
// Remove 'unused parameter' warnings.
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
#endif
}
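/*
* s16 counterpart of the u8 pyrUp above: identical structure, but the lane
* buffers hold s32 intermediates and the vector width drops to four elements
* per step.
*/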
void gaussianPyramidUp(const Size2D &srcSize,
const s16 *srcBase, ptrdiff_t srcStride,
const Size2D &dstSize,
s16 *dstBase, ptrdiff_t dstStride, u8 cn)
{
internal::assertSupportedConfiguration(isGaussianPyramidUpS16Supported(srcSize, dstSize, cn));
#ifdef CAROTENE_NEON
size_t dcolshn = (dstSize.width/2) * cn;
size_t dcolshw = ((dstSize.width+1)/2) * cn;
size_t scolsn = srcSize.width*cn;
size_t idx_l = (borderInterpolate101(-2, 2 * srcSize.width)/2) * cn;
size_t idx_r1 = (borderInterpolate101(2 * srcSize.width + 0, 2 * srcSize.width)/2) * cn;
size_t idx_r2 = (borderInterpolate101(2 * srcSize.width + 2, 2 * srcSize.width + 2)/2) * cn;
//2-lines buffer
std::vector<s32> _buf(2*(cn*(srcSize.width + 3) + 32/sizeof(s32)));
s32* lane0 = internal::alignPtr(&_buf[cn], 32);
s32* lane1 = internal::alignPtr(lane0 + (3 + srcSize.width)*cn, 32);
int16x4_t vc6s16 = vmov_n_s16(6);
int32x4_t vc6s32 = vmovq_n_s32(6);
for (size_t i = 0; i < (dstSize.height + 1)/2; ++i)
{
s16* dst = internal::getRowPtr(dstBase, dstStride, 2*i);
//vertical convolution
const s16* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 - 2, srcSize.height * 2)/2);
const s16* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 0, srcSize.height * 2)/2);
const s16* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 2, srcSize.height * 2)/2);
size_t x = 0;
for (; x <= scolsn - 4; x += 4)
{
internal::prefetch(internal::getRowPtr(ln1 + x, srcStride, (ptrdiff_t)x % 3 - 1));
int16x4_t v0 = vld1_s16(ln0 + x);
int16x4_t v2 = vld1_s16(ln2 + x);
int16x4_t v1 = vld1_s16(ln1 + x);
int32x4_t vl0 = vaddl_s16(v0, v2);
int32x4_t vl1 = vaddl_s16(v1, v2);
vl0 = vmlal_s16(vl0, v1, vc6s16);
vl1 = vshlq_n_s32(vl1, 2);
vst1q_s32(lane0 + x, vl0);
vst1q_s32(lane1 + x, vl1);
}
for (; x < scolsn; ++x)
{
lane0[x] = ln0[x] + ln2[x] + 6 * ln1[x];
lane1[x] = 4 * (ln1[x] + ln2[x]);
}
//left&right borders
for (u32 k = 0; k < cn; ++k)
{
lane0[(s32)(-cn+k)] = lane0[idx_l + k];
lane1[(s32)(-cn+k)] = lane1[idx_l + k];
lane0[scolsn+k] = lane0[idx_r1 + k];
lane0[scolsn+cn+k] = lane0[idx_r2 + k];
lane1[scolsn+k] = lane1[idx_r1 + k];
lane1[scolsn+cn+k] = lane1[idx_r2 + k];
}
//horizontal convolution
const s32* lane = lane0;
pyrUp16sHorizontalConvolution:
x = 0;
size_t lim;
switch(cn)
{
case 1:
lim = dcolshn > 3 ? dcolshn - 3 : 0;
for (; x < lim; x += 4)
{
internal::prefetch(lane + x);
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
__asm__ (
"vld1.32 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t"
"vld1.32 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t"
"vld1.32 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t"
"vadd.i32 q0, q0, q1 /*q0 = v0 + v2*/ \n\t"
"vadd.i32 q3, q1, q2 /*q3 = v1 + v2*/ \n\t"
"vmla.i32 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t"
"vrshrn.s32 d9, q3, #4 \n\t"
"vrshrn.s32 d8, q0, #6 \n\t"
"vst2.16 {d8-d9}, [%[out]] \n\t"
: /*no output*/
: [out] "r" (dst + x * 2),
[in0] "r" (lane + x - 1),
[in1] "r" (lane + x),
[in2] "r" (lane + x + 1),
[c6] "w" (vc6s32)
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9"
);
#else
int32x4_t vLane0 = vld1q_s32(lane + x - 1);
int32x4_t vLane1 = vld1q_s32(lane + x);
int32x4_t vLane2 = vld1q_s32(lane + x + 1);
vLane0 = vaddq_s32(vLane0, vLane2);
vLane2 = vaddq_s32(vLane2, vLane1);
vLane0 = vmlaq_s32(vLane0, vLane1, vc6s32);
int16x4x2_t vRes;
vRes.val[0] = vrshrn_n_s32(vLane0, 6);
vRes.val[1] = vrshrn_n_s32(vLane2, 4);
vst2_s16(dst + x * 2, vRes);
#endif
}
break;
case 3:
{
lim = dcolshn > 11 ? dcolshn - 11 : 0;
for (; x < lim; x += 12)
{
internal::prefetch(lane + x + 3);
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
__asm__ (
"vmov.s32 q9, #6 \n\t"
"vld3.32 {d0, d2, d4}, [%[in0]] /*v0*/ \n\t"
"vld3.32 {d1, d3, d5}, [%[in2]] \n\t"
"vld3.32 {d6, d8, d10}, [%[in2]] /*v2*/ \n\t"
"vld3.32 {d7, d9, d11}, [%[in22]] \n\t"
"vld3.32 {d12, d14, d16}, [%[in1]] /*v1*/ \n\t"
"vld3.32 {d13, d15, d17}, [%[in12]] \n\t"
"vadd.i32 q0, q3 /*v0 + v2*/ \n\t"
"vadd.i32 q1, q4 /*v0 + v2*/ \n\t"
"vadd.i32 q2, q5 /*v0 + v2*/ \n\t"
"vadd.i32 q3, q6 /*v1 + v2*/ \n\t"
"vadd.i32 q4, q7 /*v1 + v2*/ \n\t"
"vadd.i32 q5, q8 /*v1 + v2*/ \n\t"
"vmla.i32 q0, q6, q9 /*v0 + v2 + v1*6 */ \n\t"
"vmla.i32 q1, q7, q9 /*v0 + v2 + v1*6 */ \n\t"
"vmla.i32 q2, q8, q9 /*v0 + v2 + v1*6 */ \n\t"
"vrshrn.s32 d19, q3, #4 \n\t"
"vrshrn.s32 d21, q4, #4 \n\t"
"vrshrn.s32 d23, q5, #4 \n\t"
"vrshrn.s32 d18, q0, #6 \n\t"
"vrshrn.s32 d20, q1, #6 \n\t"
"vrshrn.s32 d22, q2, #6 \n\t"
"vzip.16 d18, d19 \n\t"
"vzip.16 d20, d21 \n\t"
"vzip.16 d22, d23 \n\t"
"vst3.16 {d18, d20, d22}, [%[out1]] \n\t"
"vst3.16 {d19, d21, d23}, [%[out2]] \n\t"
: /*no output*/
: [out1] "r" (dst + 2*x),
[out2] "r" (dst + 2*x + 12),
[in0] "r" (lane + x - 3),
[in1] "r" (lane + x),
[in12] "r" (lane + x + 6),
[in2] "r" (lane + x + 3),
[in22] "r" (lane + x + 9)
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23"
);
#else
int32x4_t vc6 = vmovq_n_s32(6);
int32x4x3_t vLane0 = vld3q_s32(lane + x - 3);
int32x4x3_t vLane1 = vld3q_s32(lane + x);
int32x4x3_t vLane2 = vld3q_s32(lane + x + 3);
int32x4_t vSum_0_3 = vaddq_s32(vLane0.val[0], vLane2.val[0]);
int32x4_t vSum_1_4 = vaddq_s32(vLane0.val[1], vLane2.val[1]);
int32x4_t vSum_2_5 = vaddq_s32(vLane0.val[2], vLane2.val[2]);
int32x4_t vSum_3_6 = vaddq_s32(vLane2.val[0], vLane1.val[0]);
int32x4_t vSum_4_7 = vaddq_s32(vLane2.val[1], vLane1.val[1]);
int32x4_t vSum_5_8 = vaddq_s32(vLane2.val[2], vLane1.val[2]);
vSum_0_3 = vmlaq_s32(vSum_0_3, vLane1.val[0], vc6);
vSum_1_4 = vmlaq_s32(vSum_1_4, vLane1.val[1], vc6);
vSum_2_5 = vmlaq_s32(vSum_2_5, vLane1.val[2], vc6);
int16x4x2_t vSumShr1;
vSumShr1.val[1] = vrshrn_n_s32(vSum_3_6, 4);
vSumShr1.val[0] = vrshrn_n_s32(vSum_0_3, 6);
int16x4x2_t vSumShr2;
vSumShr2.val[1] = vrshrn_n_s32(vSum_4_7, 4);
vSumShr2.val[0] = vrshrn_n_s32(vSum_1_4, 6);
int16x4x2_t vSumShr3;
vSumShr3.val[1] = vrshrn_n_s32(vSum_5_8, 4);
vSumShr3.val[0] = vrshrn_n_s32(vSum_2_5, 6);
vSumShr1 = vzip_s16(vSumShr1.val[0], vSumShr1.val[1]);
vSumShr2 = vzip_s16(vSumShr2.val[0], vSumShr2.val[1]);
vSumShr3 = vzip_s16(vSumShr3.val[0], vSumShr3.val[1]);
int16x4x3_t vRes1;
vRes1.val[0] = vSumShr1.val[0];
vRes1.val[1] = vSumShr2.val[0];
vRes1.val[2] = vSumShr3.val[0];
vst3_s16((int16_t*)(dst + 2 * x), vRes1);
int16x4x3_t vRes2;
vRes2.val[0] = vSumShr1.val[1];
vRes2.val[1] = vSumShr2.val[1];
vRes2.val[2] = vSumShr3.val[1];
vst3_s16(dst + 2 * x + 12, vRes2);
#endif
}
}
break;
case 4:
lim = dcolshn > 3 ? dcolshn - 3 : 0;
for (; x < lim; x += 4)
{
internal::prefetch(lane + x);
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
__asm__ (
"vld1.32 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t"
"vld1.32 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t"
"vld1.32 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t"
"vadd.i32 q0, q1 /*q0 = v0 + v2*/ \n\t"
"vadd.i32 q3, q1, q2 /*q3 = v1 + v2*/ \n\t"
"vmla.i32 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t"
"vrshrn.s32 d9, q3, #4 \n\t"
"vrshrn.s32 d8, q0, #6 \n\t"
"vst1.16 {d8-d9}, [%[out]] \n\t"
: /*no output*/
: [out] "r" (dst + x * 2),
[in0] "r" (lane + x - 4),
[in1] "r" (lane + x),
[in2] "r" (lane + x + 4),
[c6] "w" (vc6s32)
: "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9"
);
#else
int32x4_t vLane0 = vld1q_s32(lane + x - 4);
int32x4_t vLane1 = vld1q_s32(lane + x);
int32x4_t vLane2 = vld1q_s32(lane + x + 4);
vLane0 = vaddq_s32(vLane0, vLane2);
vLane2 = vaddq_s32(vLane2, vLane1);
vLane0 = vmlaq_s32(vLane0, vLane1, vc6s32);
int16x4x2_t vRes;
vRes.val[0] = vrshrn_n_s32(vLane0, 6);
vRes.val[1] = vrshrn_n_s32(vLane2, 4);
vst1q_s16(dst + x * 2, vcombine_s16(vRes.val[0], vRes.val[1]));
#endif
}
break;
};
for (u32 h = 0; h < cn; ++h)
{
const s32* ln = lane + h;
s16* dt = dst + h;
size_t k = x;
for (; k < dcolshn; k += cn)
{
dt[2*k+0] = s16((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6 * ln[k] + (1 << 5)) >> 6);
dt[2*k+cn] = s16(((ln[k] + ln[k+cn]) * 4 + (1 << 5)) >> 6);
}
for (; k < dcolshw; k += cn)
dt[2*k] = s16((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6 * ln[k] + (1 << 5)) >> 6);
}
dst = internal::getRowPtr(dstBase, dstStride, 2*i+1);
//second row
if (lane == lane0 && 2*i+1 < dstSize.height)
{
lane = lane1;
goto pyrUp16sHorizontalConvolution;
}
}
#else
// Remove 'unused parameter' warnings.
(void)srcBase;
(void)srcStride;
(void)dstBase;
(void)dstStride;
#endif
}
} // namespace CAROTENE_NS