/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                          License Agreement
 *               For Open Source Computer Vision Library
 *                       (3-clause BSD License)
 *
 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */

#include "common.hpp"

#include <vector>

namespace CAROTENE_NS {

bool isGaussianPyramidDownRTZSupported(const Size2D &srcSize, const Size2D &dstSize, BORDER_MODE border_mode)
{
    if (!isSupportedConfiguration())
        return false;

    // Need at least 8 pixels for vectorization.
    // Need to make sure dst width is half the src width.
    // Don't care about dst height.
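    // For reference: callers typically derive the downsampled size as
    // ((w + 1) / 2, (h + 1) / 2), which the tolerance in the check below accepts
    // (|2*dstWidth - srcWidth| <= 2). A hypothetical helper (not part of this
    // library, and assuming Size2D is constructible from a width/height pair)
    // might look like:
    //
    //     Size2D pyrDownSize(const Size2D &s)
    //     {
    //         return Size2D((s.width + 1) / 2, (s.height + 1) / 2);
    //     }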
    if ( dstSize.width < 8
         || std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2 )
        return false;

    // Current implementation only supports Reflect101 (ie: UNDEFINED mode)
    if (border_mode != BORDER_MODE_UNDEFINED)
        return false;

    return true;
}

bool isGaussianPyramidDownU8Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn)
{
    if (!isSupportedConfiguration())
        return false;

    if ( (dstSize.width * cn) < 8
         || (cn != 1 && cn != 3 && cn != 4)
         || std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2
         || std::abs((ptrdiff_t)dstSize.height*2 - (ptrdiff_t)srcSize.height) > 2 )
        return false;

    return true;
}

bool isGaussianPyramidDownS16Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn)
{
    if (!isSupportedConfiguration())
        return false;

    if ( (dstSize.width * cn) < 4
         || (cn != 1 && cn != 3 && cn != 4)
         || std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2
         || std::abs((ptrdiff_t)dstSize.height*2 - (ptrdiff_t)srcSize.height) > 2 )
        return false;

    return true;
}

bool isGaussianPyramidDownF32Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn)
{
    if (!isSupportedConfiguration())
        return false;

    if ( (dstSize.width * cn) < 4
         || (cn != 1 && cn != 3 && cn != 4)
         || std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2
         || std::abs((ptrdiff_t)dstSize.height*2 - (ptrdiff_t)srcSize.height) > 2 )
        return false;

    return true;
}

bool isGaussianPyramidUpU8Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn)
{
    if (!isSupportedConfiguration())
        return false;

    if ( (srcSize.width * cn) < 8
         || (cn != 1 && cn != 3 && cn != 4)
         || std::abs((ptrdiff_t)dstSize.width - (ptrdiff_t)srcSize.width*2) != (ptrdiff_t)dstSize.width % 2
         || std::abs((ptrdiff_t)dstSize.height - (ptrdiff_t)srcSize.height*2) != (ptrdiff_t)dstSize.height % 2 )
        return false;

    return true;
}

bool isGaussianPyramidUpS16Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn)
{
    if (!isSupportedConfiguration())
        return false;

    if ( (srcSize.width * cn) < 12
         || (cn != 1 && cn != 3 && cn != 4)
         || std::abs((ptrdiff_t)dstSize.width - (ptrdiff_t)srcSize.width*2) != (ptrdiff_t)dstSize.width % 2
         || std::abs((ptrdiff_t)dstSize.height - (ptrdiff_t)srcSize.height*2) != (ptrdiff_t)dstSize.height % 2 )
        return false;

    return true;
}

#ifdef CAROTENE_NEON

namespace {

ptrdiff_t borderInterpolate101(ptrdiff_t p, ptrdiff_t len)
{
    if (len == 1)
        return 0;
    else
    {
        while ((unsigned)p >= (unsigned)len)
        {
            if (p < 0)
                p = -p;
            else
                p = (len - 1)*2 - p;
        }
    }
    return p;
}

} // namespace

#endif

void gaussianPyramidDownRTZ(const Size2D &srcSize, const u8 *srcBase, ptrdiff_t srcStride,
                            const Size2D &dstSize, u8 *dstBase, ptrdiff_t dstStride,
                            BORDER_MODE border, u8 borderValue)
{
    internal::assertSupportedConfiguration(isGaussianPyramidDownRTZSupported(srcSize, dstSize, border));
#ifdef CAROTENE_NEON
    // Single-core NEON code
    const size_t dwidth = dstSize.width;
    const size_t dheight = dstSize.height;
    const size_t swidth = srcSize.width;
    const size_t sheight = srcSize.height;

    ptrdiff_t idx_l1 = borderInterpolate101(-1, swidth);
    ptrdiff_t idx_l2 = borderInterpolate101(-2, swidth);
    ptrdiff_t idx_r1 = borderInterpolate101(swidth + 0, swidth);
    ptrdiff_t idx_r2 = borderInterpolate101(swidth + 1, swidth);

    //1-line buffer
    std::vector<u16> _buf((swidth + 4) + 32/sizeof(u16));
    u16* lane = internal::alignPtr(&_buf[2], 32);

    uint8x8_t vc6u8 = vmov_n_u8(6);
    uint16x8_t vc6u16 = vmovq_n_u16(6);
    uint16x8_t vc4u16 = vmovq_n_u16(4);

    u8* dst = dstBase;
    for (size_t i = 0; i < dheight; ++i, dst += dstStride)
    {
        //vertical convolution
        const u8* ln0 =
internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, sheight)); const u8* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, sheight)); const u8* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, sheight)); const u8* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, sheight)); const u8* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, sheight)); size_t x = 0; for (; x <= swidth - 8; x += 8) { internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2)); uint8x8_t v0 = vld1_u8(ln0+x); uint8x8_t v1 = vld1_u8(ln1+x); uint8x8_t v2 = vld1_u8(ln2+x); uint8x8_t v3 = vld1_u8(ln3+x); uint8x8_t v4 = vld1_u8(ln4+x); uint16x8_t v = vaddl_u8(v0, v4); uint16x8_t v13 = vaddl_u8(v1, v3); v = vmlal_u8(v, v2, vc6u8); v = vmlaq_u16(v, v13, vc4u16); vst1q_u16(lane + x, v); } for (; x < swidth; ++x) { lane[x] = ln0[x] + ln4[x] + 4u * (ln1[x] + ln3[x]) + 6u * ln2[x]; } //left&right borders lane[-1] = lane[idx_l1]; lane[-2] = lane[idx_l2]; lane[swidth] = lane[idx_r1]; lane[swidth+1] = lane[idx_r2]; //horizontal convolution x = 0; size_t vw = (swidth/2) - 7; // Using 7 instead of 8 allows swidth of 14 or 15. for (; x < vw; x += 8) { internal::prefetch(lane + 2 * x); uint16x8x2_t vLane0 = vld2q_u16(lane + 2*x-2); // L0[0] = x0 x2 x4 x6 x8 x10 x12 x14 L0[1] = x1 x3 x5 x7 x9 x11 x13 x15 uint16x8x2_t vLane1 = vld2q_u16(lane + 2*x-1); // L1[0] = x1 x3 x5 x7 x9 x11 x13 x15 L1[1] = x2 x4 x6 x8 x10 x12 x14 x16 uint16x8x2_t vLane2 = vld2q_u16(lane + 2*x+0); // L2[0] = x2 x4 x6 x8 x10 x12 x14 x16 L2[1] = x3 x5 x7 x9 x11 x13 x15 x17 uint16x8x2_t vLane3 = vld2q_u16(lane + 2*x+1); // L3[0] = x3 x5 x7 x9 x11 x13 x15 x17 L3[1] = x4 x6 x8 x10 x12 x14 x16 x18 uint16x8x2_t vLane4 = vld2q_u16(lane + 2*x+2); // L4[0] = x4 x6 x8 x10 x12 x14 x16 x18 L4[1] = x5 x7 x9 x11 x13 x15 x17 x19 uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]); uint16x8_t vSum_1_3 = vaddq_u16(vLane1.val[0], vLane3.val[0]); vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16); vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_1_3, vc4u16); uint8x8_t vRes = vshrn_n_u16(vSum_0_4, 8); vst1_u8(dst + x, vRes); } for (; x < dwidth; x++) { dst[x] = u8((lane[2*x-2] + lane[2*x+2] + 4u * (lane[2*x-1] + lane[2*x+1]) + 6u * lane[2*x]) >> 8); } } #else // Remove 'unused parameter' warnings. 
(void)srcSize; (void)srcBase; (void)srcStride; (void)dstSize; (void)dstBase; (void)dstStride; (void)border; #endif (void)borderValue; } void gaussianPyramidDown(const Size2D &srcSize, const u8 *srcBase, ptrdiff_t srcStride, const Size2D &dstSize, u8 *dstBase, ptrdiff_t dstStride, u8 cn) { internal::assertSupportedConfiguration(isGaussianPyramidDownU8Supported(srcSize, dstSize, cn)); #ifdef CAROTENE_NEON size_t dcolcn = dstSize.width*cn; size_t scolcn = srcSize.width*cn; size_t roiw8 = dcolcn - 7; size_t idx_l1 = borderInterpolate101(-1, srcSize.width) * cn; size_t idx_l2 = borderInterpolate101(-2, srcSize.width) * cn; size_t idx_r1 = borderInterpolate101(srcSize.width + 0, srcSize.width) * cn; size_t idx_r2 = borderInterpolate101(srcSize.width + 1, srcSize.width) * cn; //1-line buffer std::vector _buf(cn*(srcSize.width + 4) + 32/sizeof(u16)); u16* lane = internal::alignPtr(&_buf[2*cn], 32); uint8x8_t vc6u8 = vmov_n_u8(6); uint16x8_t vc6u16 = vmovq_n_u16(6); uint16x8_t vc4u16 = vmovq_n_u16(4); for (size_t i = 0; i < dstSize.height; ++i) { u8* dst = internal::getRowPtr(dstBase, dstStride, i); //vertical convolution const u8* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, srcSize.height)); const u8* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, srcSize.height)); const u8* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, srcSize.height)); const u8* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, srcSize.height)); const u8* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, srcSize.height)); size_t x = 0; for (; x <= scolcn - 8; x += 8) { internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, (ptrdiff_t)x % 5 - 2)); uint8x8_t v0 = vld1_u8(ln0+x); uint8x8_t v1 = vld1_u8(ln1+x); uint8x8_t v2 = vld1_u8(ln2+x); uint8x8_t v3 = vld1_u8(ln3+x); uint8x8_t v4 = vld1_u8(ln4+x); uint16x8_t v = vaddl_u8(v0, v4); uint16x8_t v13 = vaddl_u8(v1, v3); v = vmlal_u8(v, v2, vc6u8); v = vmlaq_u16(v, v13, vc4u16); vst1q_u16(lane + x, v); } for (; x < scolcn; ++x) { lane[x] = ln0[x] + ln4[x] + 4u * (ln1[x] + ln3[x]) + 6u * ln2[x]; } //left&right borders for (u32 k = 0; k < cn; ++k) { lane[(s32)(-cn+k)] = lane[idx_l1 + k]; lane[(s32)(-cn-cn+k)] = lane[idx_l2 + k]; lane[scolcn+k] = lane[idx_r1 + k]; lane[scolcn+cn+k] = lane[idx_r2 + k]; } //horizontal convolution x = 0; switch(cn) { case 1: for (; x < roiw8; x += 8) { internal::prefetch(lane + 2 * x); #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__) __asm__ ( "vld2.16 {d0-d3}, [%[in0]] \n\t" "vld2.16 {d4-d7}, [%[in4]] \n\t" "vld2.16 {d12-d15}, [%[in1]] \n\t" "vld2.16 {d16-d19}, [%[in3]] \n\t" "vld2.16 {d8-d11}, [%[in2],:256] \n\t" "vadd.i16 q0, q2 /*q0 = v0 + v4*/ \n\t" "vadd.i16 q6, q8 /*q6 = v1 + v3*/ \n\t" "vmla.i16 q0, q4, %q[c6] /*q0 += v2 * 6*/ \n\t" "vmla.i16 q0, q6, %q[c4] /*q1 += (v1+v3) * 4*/ \n\t" "vrshrn.u16 d8, q0, #8 \n\t" "vst1.8 {d8}, [%[out]] \n\t" : /*no output*/ : [out] "r" (dst + x), [in0] "r" (lane + 2*x-2), [in1] "r" (lane + 2*x-1), [in2] "r" (lane + 2*x+0), [in3] "r" (lane + 2*x+1), [in4] "r" (lane + 2*x+2), [c4] "w" (vc4u16), [c6] "w" (vc6u16) : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" ); #else uint16x8x2_t vLane0 = vld2q_u16(lane + 2*x-2); uint16x8x2_t vLane1 = vld2q_u16(lane + 2*x-1); uint16x8x2_t vLane2 = vld2q_u16(lane + 2*x+0); uint16x8x2_t vLane3 = vld2q_u16(lane + 2*x+1); 
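                    // Each vld2q_u16 above (and vLane4 just below) de-interleaves even and odd
                    // elements, so vLaneN.val[0] holds lane[2*x + N - 2], lane[2*x + N], ...
                    // i.e. one of the five 1-4-6-4-1 taps for eight consecutive output pixels.
                    // The vrshrn by 8 divides by 16*16, since the vertical pass already
                    // carries a factor of 16.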
uint16x8x2_t vLane4 = vld2q_u16(lane + 2*x+2); uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]); uint16x8_t vSum_1_3 = vaddq_u16(vLane1.val[0], vLane3.val[0]); vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16); vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_1_3, vc4u16); uint8x8_t vRes = vrshrn_n_u16(vSum_0_4, 8); vst1_u8(dst + x, vRes); #endif } break; case 3: { uint16x4_t vx1 = vld1_u16(lane - 2*3); uint16x4_t vx2 = vld1_u16(lane - 1*3); uint16x4_t vx3 = vld1_u16(lane + 0*3); uint16x8_t v0 = vcombine_u16(vx1, vx3); uint8x8_t map = vreinterpret_u8_u64(vmov_n_u64(0xFFFF060504020100ULL)); for (; x < roiw8; x += 6) { internal::prefetch(lane + 2 * x + 12); uint16x4_t vx_ = vld1_u16(lane + 2*x-1*3 + 6); uint16x4_t vx4 = vld1_u16(lane + 2*x+0*3 + 6); uint16x4_t vx5 = vld1_u16(lane + 2*x+1*3 + 6); uint16x4_t vx6 = vld1_u16(lane + 2*x+2*3 + 6); uint16x8_t v1 = vcombine_u16(vx2, vx_); uint16x8_t v2 = vcombine_u16(vget_high_u16(v0), vx4); uint16x8_t v3 = vcombine_u16(vx_, vx5); uint16x8_t v4 = vcombine_u16(vx4, vx6); vx2 = vx5; uint16x8_t v = vaddq_u16(v0, v4); uint16x8_t v13 = vaddq_u16(v1, v3); v = vmlaq_u16(v, v2, vc6u16); v = vmlaq_u16(v, v13, vc4u16); uint8x8_t v8 = vrshrn_n_u16(v, 8); v0 = v4; vst1_u8(dst + x, vtbl1_u8(v8, map)); } } break; case 4: { uint16x4_t vx1 = vld1_u16(lane - 2*4); uint16x4_t vx2 = vld1_u16(lane - 1*4); uint16x4_t vx3 = vld1_u16(lane + 0*4); uint16x8_t v0 = vcombine_u16(vx1, vx3); for (; x < roiw8; x += 8) { internal::prefetch(lane + 2 * x + 16); uint16x4_t vx_ = vld1_u16(lane + 2 * x - 1*4 + 8); uint16x4_t vx4 = vld1_u16(lane + 2 * x + 0*4 + 8); uint16x4_t vx5 = vld1_u16(lane + 2 * x + 1*4 + 8); uint16x4_t vx6 = vld1_u16(lane + 2 * x + 2*4 + 8); uint16x8_t v1 = vcombine_u16(vx2, vx_); uint16x8_t v2 = vcombine_u16(vget_high_u16(v0), vx4); uint16x8_t v3 = vcombine_u16(vx_, vx5); uint16x8_t v4 = vcombine_u16(vx4, vx6); vx2 = vx5; uint16x8_t v = vaddq_u16(v0, v4); uint16x8_t v13 = vaddq_u16(v1, v3); v = vmlaq_u16(v, v2, vc6u16); v = vmlaq_u16(v, v13, vc4u16); uint8x8_t v8 = vrshrn_n_u16(v, 8); v0 = v4; vst1_u8(dst + x, v8); } } break; } for (u32 h = 0; h < cn; ++h) { u16* ln = lane + h; u8* dt = dst + h; for (size_t k = x; k < dcolcn; k += cn) dt[k] = u8((ln[2*k-2*cn] + ln[2*k+2*cn] + 4u * (ln[2*k-cn] + ln[2*k+cn]) + 6u * ln[2*k] + (1 << 7)) >> 8); } } #else // Remove 'unused parameter' warnings. 
(void)srcBase; (void)srcStride; (void)dstBase; (void)dstStride; #endif } void gaussianPyramidDown(const Size2D &srcSize, const s16 *srcBase, ptrdiff_t srcStride, const Size2D &dstSize, s16 *dstBase, ptrdiff_t dstStride, u8 cn) { internal::assertSupportedConfiguration(isGaussianPyramidDownS16Supported(srcSize, dstSize, cn)); #ifdef CAROTENE_NEON size_t dcolcn = dstSize.width*cn; size_t scolcn = srcSize.width*cn; size_t roiw4 = dcolcn - 3; size_t idx_l1 = borderInterpolate101(-1, srcSize.width) * cn; size_t idx_l2 = borderInterpolate101(-2, srcSize.width) * cn; size_t idx_r1 = borderInterpolate101(srcSize.width + 0, srcSize.width) * cn; size_t idx_r2 = borderInterpolate101(srcSize.width + 1, srcSize.width) * cn; //1-line buffer std::vector _buf(cn*(srcSize.width + 4) + 32/sizeof(s32)); s32* lane = internal::alignPtr(&_buf[2*cn], 32); int16x4_t vc6s16 = vmov_n_s16(6); int32x4_t vc6s32 = vmovq_n_s32(6); int32x4_t vc4s32 = vmovq_n_s32(4); for (size_t i = 0; i < dstSize.height; ++i) { s16* dst = internal::getRowPtr(dstBase, dstStride, i); //vertical convolution const s16* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, srcSize.height)); const s16* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, srcSize.height)); const s16* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, srcSize.height)); const s16* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, srcSize.height)); const s16* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, srcSize.height)); size_t x = 0; for (; x <= scolcn - 4; x += 4) { internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, (ptrdiff_t)x % 5 - 2)); int16x4_t v0 = vld1_s16(ln0 + x); int16x4_t v1 = vld1_s16(ln1 + x); int16x4_t v2 = vld1_s16(ln2 + x); int16x4_t v3 = vld1_s16(ln3 + x); int16x4_t v4 = vld1_s16(ln4 + x); int32x4_t v = vaddl_s16(v0, v4); int32x4_t v13 = vaddl_s16(v1, v3); v = vmlal_s16(v, v2, vc6s16); v = vmlaq_s32(v, v13, vc4s32); vst1q_s32(lane + x, v); } for (; x < scolcn; ++x) { lane[x] = ln0[x] + ln4[x] + 4 * (ln1[x] + ln3[x]) + 6 * ln2[x]; } //left&right borders for (u32 k = 0; k < cn; ++k) { lane[(s32)(-cn+k)] = lane[idx_l1 + k]; lane[(s32)(-cn-cn+k)] = lane[idx_l2 + k]; lane[scolcn+k] = lane[idx_r1 + k]; lane[scolcn+cn+k] = lane[idx_r2 + k]; } //horizontal convolution x = 0; switch(cn) { case 1: for (; x < roiw4; x += 4) { internal::prefetch(lane + 2 * x); #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__) __asm__ ( "vld2.32 {d0-d3}, [%[in0]] \n\t" "vld2.32 {d4-d7}, [%[in4]] \n\t" "vld2.32 {d12-d15}, [%[in1]] \n\t" "vld2.32 {d16-d19}, [%[in3]] \n\t" "vld2.32 {d8-d11}, [%[in2],:256] \n\t" "vadd.i32 q0, q2 \n\t" "vadd.i32 q6, q8 \n\t" "vmla.i32 q0, q4, %q[c6] \n\t" "vmla.i32 q0, q6, %q[c4] \n\t" "vrshrn.s32 d8, q0, #8 \n\t" "vst1.16 {d8}, [%[out]] \n\t" : /*no output*/ : [out] "r" (dst + x), [in0] "r" (lane + 2*x-2), [in1] "r" (lane + 2*x-1), [in2] "r" (lane + 2*x+0), [in3] "r" (lane + 2*x+1), [in4] "r" (lane + 2*x+2), [c4] "w" (vc4s32), [c6] "w" (vc6s32) : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" ); #else int32x4x2_t vLane0 = vld2q_s32(lane + 2*x-2); int32x4x2_t vLane1 = vld2q_s32(lane + 2*x-1); int32x4x2_t vLane2 = vld2q_s32(lane + 2*x+0); int32x4x2_t vLane3 = vld2q_s32(lane + 2*x+1); int32x4x2_t vLane4 = vld2q_s32(lane + 2*x+2); int32x4_t vSum_0_4 = vaddq_s32(vLane0.val[0], vLane4.val[0]); int32x4_t 
vSum_1_3 = vaddq_s32(vLane1.val[0], vLane3.val[0]); vSum_0_4 = vmlaq_s32(vSum_0_4, vLane2.val[0], vc6s32); vSum_0_4 = vmlaq_s32(vSum_0_4, vSum_1_3, vc4s32); int16x4_t vRes = vrshrn_n_s32(vSum_0_4, 8); vst1_s16(dst + x, vRes); #endif } break; case 3: { int32x4_t v0 = vld1q_s32(lane - 2*3); int32x4_t v1 = vld1q_s32(lane - 1*3); int32x4_t v2 = vld1q_s32(lane + 0*3); for (; x < roiw4; x += 3) { internal::prefetch(lane + 2 * x); int32x4_t v3 = vld1q_s32(lane + 2 * x + 1*3); int32x4_t v4 = vld1q_s32(lane + 2 * x + 2*3); int32x4_t v = vaddq_s32(v0, v4); int32x4_t v13 = vaddq_s32(v1, v3); v = vmlaq_s32(v, v2, vc6s32); v = vmlaq_s32(v, v13, vc4s32); int16x4_t vv = vrshrn_n_s32(v, 8); v0 = v2; v1 = v3; v2 = v4; vst1_s16(dst + x, vv); } } break; case 4: { int32x4_t v0 = vld1q_s32(lane - 2*4); int32x4_t v1 = vld1q_s32(lane - 1*4); int32x4_t v2 = vld1q_s32(lane + 0*4); for (; x < roiw4; x += 4) { internal::prefetch(lane + 2 * x + 8); int32x4_t v3 = vld1q_s32(lane + 2 * x + 1*4); int32x4_t v4 = vld1q_s32(lane + 2 * x + 2*4); int32x4_t v = vaddq_s32(v0, v4); int32x4_t v13 = vaddq_s32(v1, v3); v = vmlaq_s32(v, v2, vc6s32); v = vmlaq_s32(v, v13, vc4s32); int16x4_t vv = vrshrn_n_s32(v, 8); v0 = v2; v1 = v3; v2 = v4; vst1_s16(dst + x, vv); } } break; } for (u32 h = 0; h < cn; ++h) { s32* ln = lane + h; s16* dt = dst + h; for (size_t k = x; k < dcolcn; k += cn) dt[k] = s16((ln[2*k-2*cn] + ln[2*k+2*cn] + 4 * (ln[2*k-cn] + ln[2*k+cn]) + 6 * ln[2*k] + (1 << 7)) >> 8); } } #else // Remove 'unused parameter' warnings. (void)srcBase; (void)srcStride; (void)dstBase; (void)dstStride; #endif } void gaussianPyramidDown(const Size2D &srcSize, const f32 *srcBase, ptrdiff_t srcStride, const Size2D &dstSize, f32 *dstBase, ptrdiff_t dstStride, u8 cn) { internal::assertSupportedConfiguration(isGaussianPyramidDownF32Supported(srcSize, dstSize, cn)); #ifdef CAROTENE_NEON size_t dcolcn = dstSize.width*cn; size_t scolcn = srcSize.width*cn; size_t roiw4 = dcolcn - 3; size_t idx_l1 = borderInterpolate101(-1, srcSize.width) * cn; size_t idx_l2 = borderInterpolate101(-2, srcSize.width) * cn; size_t idx_r1 = borderInterpolate101(srcSize.width + 0, srcSize.width) * cn; size_t idx_r2 = borderInterpolate101(srcSize.width + 1, srcSize.width) * cn; //1-line buffer std::vector _buf(cn*(srcSize.width + 4) + 32/sizeof(f32)); f32* lane = internal::alignPtr(&_buf[2*cn], 32); #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__) register float32x4_t vc6d4f32 asm ("q11") = vmovq_n_f32(1.5f); // 6/4 register float32x4_t vc1d4f32 asm ("q12") = vmovq_n_f32(0.25f); // 1/4 register float32x4_t vc1d64f32 asm ("q13") = vmovq_n_f32(0.015625f); //1/4/16 register float32x4_t vc4d64f32 asm ("q14") = vmovq_n_f32(0.0625f); //4/4/16 register float32x4_t vc6d64f32 asm ("q15") = vmovq_n_f32(0.09375f); //6/4/16 #else float32x4_t vc6d4f32 = vmovq_n_f32(1.5f); // 6/4 float32x4_t vc1d4f32 = vmovq_n_f32(0.25f); // 1/4 float32x4_t vc1d64f32 = vmovq_n_f32(0.015625f); //1/4/16 float32x4_t vc4d64f32 = vmovq_n_f32(0.0625f); //4/4/16 float32x4_t vc6d64f32 = vmovq_n_f32(0.09375f); //6/4/16 #endif for (size_t i = 0; i < dstSize.height; ++i) { f32* dst = internal::getRowPtr(dstBase, dstStride, i); //vertical convolution const f32* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, srcSize.height)); const f32* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, srcSize.height)); const f32* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, 
srcSize.height)); const f32* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, srcSize.height)); const f32* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, srcSize.height)); size_t x = 0; for (; x <= scolcn - 4; x += 4) { internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, (ptrdiff_t)x % 5 - 2)); float32x4_t v0 = vld1q_f32((const float32_t*)ln0 + x); float32x4_t v1 = vld1q_f32((const float32_t*)ln1 + x); float32x4_t v2 = vld1q_f32((const float32_t*)ln2 + x); float32x4_t v3 = vld1q_f32((const float32_t*)ln3 + x); float32x4_t v4 = vld1q_f32((const float32_t*)ln4 + x); float32x4_t v = vaddq_f32(v1, v3); float32x4_t v04 = vaddq_f32(v0, v4); v = vmlaq_f32(v, v2, vc6d4f32); v = vmlaq_f32(v, v04, vc1d4f32); vst1q_f32(lane + x, v); } for (; x < scolcn; ++x) { lane[x] = 0.25f*(ln0[x] + ln4[x]) + (ln1[x] + ln3[x]) + 1.5f * ln2[x]; } //left&right borders for (u32 k = 0; k < cn; ++k) { lane[(s32)(-cn+k)] = lane[idx_l1 + k]; lane[(s32)(-cn-cn+k)] = lane[idx_l2 + k]; lane[scolcn+k] = lane[idx_r1 + k]; lane[scolcn+cn+k] = lane[idx_r2 + k]; } //horizontal convolution x = 0; switch(cn) { case 1: for (; x < roiw4; x += 4) { internal::prefetch(lane + 2 * x); #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__) __asm__ __volatile__ ( "vld2.32 {d0-d3}, [%[in0]] \n\t" "vld2.32 {d8-d11}, [%[in4]] \n\t" "vld2.32 {d14-d17}, [%[in2],:256] \n\t" "vld2.32 {d10-d13}, [%[in1]] \n\t" "vld2.32 {d16-d19}, [%[in3]] \n\t" "vmul.f32 q7, %q[c6d64] \n\t" "vadd.f32 q0, q4 @v04 \n\t" "vadd.f32 q5, q8 @v13 \n\t" "vmla.f32 q7, q0, %q[c1d64] \n\t" "vmla.f32 q7, q5, %q[c4d64] \n\t" "vst1.32 {d14-d15}, [%[out]] \n\t" : : [out] "r" (dst + x), [in0] "r" (lane + 2*x-2), [in1] "r" (lane + 2*x-1), [in2] "r" (lane + 2*x+0), [in3] "r" (lane + 2*x+1), [in4] "r" (lane + 2*x+2), [c4d64] "w" (vc4d64f32), [c6d64] "w" (vc6d64f32), [c1d64] "w" (vc1d64f32) : "d0","d1","d2","d3","d4",/*"d5","d6","d7",*/"d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" //ugly compiler "bug" - can't touch d5-d7 ); #else float32x4x2_t vLane0 = vld2q_f32(lane + 2*x-2); float32x4x2_t vLane1 = vld2q_f32(lane + 2*x-1); float32x4x2_t vLane2 = vld2q_f32(lane + 2*x+0); float32x4x2_t vLane3 = vld2q_f32(lane + 2*x+1); float32x4x2_t vLane4 = vld2q_f32(lane + 2*x+2); float32x4_t vSum_0_4 = vaddq_f32(vLane0.val[0], vLane4.val[0]); float32x4_t vSum_1_3 = vaddq_f32(vLane1.val[0], vLane3.val[0]); float32x4_t vRes = vmulq_f32(vLane2.val[0], vc6d64f32); vRes = vmlaq_f32(vRes, vSum_0_4, vc1d64f32); vRes = vmlaq_f32(vRes, vSum_1_3, vc4d64f32); vst1q_f32(dst + x, vRes); #endif } break; case 3: { float32x4_t v0 = vld1q_f32((const float32_t*)lane - 2*3); float32x4_t v1 = vld1q_f32((const float32_t*)lane - 1*3); float32x4_t v2 = vld1q_f32((const float32_t*)lane + 0*3); for (; x < roiw4; x += 3) { internal::prefetch(lane + 2 * x); float32x4_t v3 = vld1q_f32((const float32_t*)lane + 2 * x + 1*3); float32x4_t v4 = vld1q_f32((const float32_t*)lane + 2 * x + 2*3); float32x4_t v04 = vaddq_f32(v0, v4); float32x4_t v13 = vaddq_f32(v1, v3); float32x4_t v = vmulq_f32(v2, vc6d64f32); v = vmlaq_f32(v, v04, vc1d64f32); v = vmlaq_f32(v, v13, vc4d64f32); v0 = v2; v1 = v3; v2 = v4; vst1q_f32(dst + x, v); } } break; case 4: { float32x4_t v0 = vld1q_f32((const float32_t*)lane - 2*4); float32x4_t v1 = vld1q_f32((const float32_t*)lane - 1*4); float32x4_t v2 = vld1q_f32((const float32_t*)lane + 0*4); for (; x < roiw4; x += 4) { internal::prefetch(lane + 2 * x + 8); float32x4_t v3 = 
vld1q_f32((const float32_t*)lane + 2 * x + 1*4); float32x4_t v4 = vld1q_f32((const float32_t*)lane + 2 * x + 2*4); float32x4_t v04 = vaddq_f32(v0, v4); float32x4_t v13 = vaddq_f32(v1, v3); float32x4_t v = vmulq_f32(v2, vc6d64f32); v = vmlaq_f32(v, v04, vc1d64f32); v = vmlaq_f32(v, v13, vc4d64f32); v0 = v2; v1 = v3; v2 = v4; vst1q_f32(dst + x, v); } } break; } for (u32 h = 0; h < cn; ++h) { f32* ln = lane + h; f32* dt = dst + h; for (size_t k = x; k < dcolcn; k += cn) dt[k] = 0.015625f * (ln[2*k-2*cn] + ln[2*k+2*cn]) + 0.0625f * (ln[2*k-cn] + ln[2*k+cn]) + 0.09375f * ln[2*k]; } } #else // Remove 'unused parameter' warnings. (void)srcBase; (void)srcStride; (void)dstBase; (void)dstStride; #endif } void gaussianPyramidUp(const Size2D &srcSize, const u8 *srcBase, ptrdiff_t srcStride, const Size2D &dstSize, u8 *dstBase, ptrdiff_t dstStride, u8 cn) { internal::assertSupportedConfiguration(isGaussianPyramidUpU8Supported(srcSize, dstSize, cn)); #ifdef CAROTENE_NEON size_t dcolshn = (dstSize.width/2) * cn; size_t dcolshw = ((dstSize.width+1)/2) * cn; size_t scolsn = srcSize.width*cn; size_t idx_l = (borderInterpolate101(-2, 2 * srcSize.width)/2) * cn; size_t idx_r1 = (borderInterpolate101(2 * srcSize.width + 0, 2 * srcSize.width)/2) * cn; size_t idx_r2 = (borderInterpolate101(2 * srcSize.width + 2, 2 * srcSize.width + 2)/2) * cn; //2-lines buffer std::vector _buf(2*(cn*(srcSize.width + 3) + 32/sizeof(u16))); u16* lane0 = internal::alignPtr(&_buf[cn], 32); u16* lane1 = internal::alignPtr(lane0 + (3 + srcSize.width)*cn, 32); uint8x8_t vc6u8 = vmov_n_u8(6); uint16x8_t vc6u16 = vmovq_n_u16(6); for (size_t i = 0; i < (dstSize.height + 1)/2; ++i) { u8* dst = internal::getRowPtr(dstBase, dstStride, 2*i); //vertical convolution const u8* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 - 2, srcSize.height * 2)/2); const u8* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 0, srcSize.height * 2)/2); const u8* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 2, srcSize.height * 2)/2); size_t x = 0; for (; x <= scolsn - 8; x += 8) { internal::prefetch(internal::getRowPtr(ln1 + x, srcStride, (ptrdiff_t)x % 3 - 1)); uint8x8_t v0 = vld1_u8(ln0+x); uint8x8_t v2 = vld1_u8(ln2+x); uint8x8_t v1 = vld1_u8(ln1+x); uint16x8_t vl0 = vaddl_u8(v0, v2); uint16x8_t vl1 = vaddl_u8(v1, v2); vl0 = vmlal_u8(vl0, v1, vc6u8); vl1 = vshlq_n_u16(vl1, 2); vst1q_u16(lane0 + x, vl0); vst1q_u16(lane1 + x, vl1); } for (; x < scolsn; ++x) { lane0[x] = ln0[x] + ln2[x] + 6u * ln1[x]; lane1[x] = 4u * (ln1[x] + ln2[x]); } //left&right borders for (u32 k = 0; k < cn; ++k) { lane0[(s32)(-cn+k)] = lane0[idx_l + k]; lane1[(s32)(-cn+k)] = lane1[idx_l + k]; lane0[scolsn+k] = lane0[idx_r1 + k]; lane0[scolsn+cn+k] = lane0[idx_r2 + k]; lane1[scolsn+k] = lane1[idx_r1 + k]; lane1[scolsn+cn+k] = lane1[idx_r2 + k]; } //horizontal convolution const u16* lane = lane0; pyrUp8uHorizontalConvolution: x = 0; size_t lim; switch(cn) { case 1: lim = dcolshn > 7 ? 
dcolshn - 7 : 0; for (; x < lim; x += 8) { internal::prefetch(lane + x); #if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__) __asm__ ( "vld1.16 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t" "vld1.16 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t" "vld1.16 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t" "vadd.i16 q0, q1 /*q0 = v0 + v2*/ \n\t" "vadd.i16 q3, q1, q2 /*q3 = v1 + v2*/ \n\t" "vmla.i16 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t" "vrshrn.u16 d9, q3, #4 \n\t" "vrshrn.u16 d8, q0, #6 \n\t" "vst2.8 {d8-d9}, [%[out]] \n\t" : /*no output*/ : [out] "r" (dst + x*2), [in0] "r" (lane + x - 1), [in1] "r" (lane + x + 0), [in2] "r" (lane + x + 1), [c6] "w" (vc6u16) : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9" ); #else uint16x8_t vLane0 = vld1q_u16(lane + x - 1); uint16x8_t vLane1 = vld1q_u16(lane + x + 0); uint16x8_t vLane2 = vld1q_u16(lane + x + 1); vLane0 = vaddq_u16(vLane0, vLane2); vLane2 = vaddq_u16(vLane2, vLane1); vLane0 = vmlaq_u16(vLane0, vLane1, vc6u16); uint8x8x2_t vRes; vRes.val[0] = vrshrn_n_u16(vLane0, 6); vRes.val[1] = vrshrn_n_u16(vLane2, 4); vst2_u8(dst + x*2, vRes); #endif } break; case 3: { lim = dcolshn > 23 ? dcolshn - 23 : 0; for (; x < lim; x += 24) { internal::prefetch(lane + x); #if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__) __asm__ ( "vmov.u16 q9, #6 \n\t" "vld3.16 {d0, d2, d4}, [%[in0]] /*v0*/ \n\t" "vld3.16 {d1, d3, d5}, [%[in02]] \n\t" "vld3.16 {d6, d8, d10}, [%[in2]] /*v2*/ \n\t" "vld3.16 {d7, d9, d11}, [%[in22]] \n\t" "vld3.16 {d12, d14, d16}, [%[in1]] /*v1*/ \n\t" "vld3.16 {d13, d15, d17}, [%[in12]] \n\t" "vadd.i16 q0, q3 /*v0 + v2*/ \n\t" "vadd.i16 q1, q4 /*v0 + v2*/ \n\t" "vadd.i16 q2, q5 /*v0 + v2*/ \n\t" "vadd.i16 q3, q6 /*v1 + v2*/ \n\t" "vadd.i16 q4, q7 /*v1 + v2*/ \n\t" "vadd.i16 q5, q8 /*v1 + v2*/ \n\t" "vmla.i16 q0, q6, q9 /*v0 + v2 + v1*6 */ \n\t" "vmla.i16 q1, q7, q9 /*v0 + v2 + v1*6 */ \n\t" "vmla.i16 q2, q8, q9 /*v0 + v2 + v1*6 */ \n\t" "vrshrn.u16 d19, q3, #4 \n\t" "vrshrn.u16 d21, q4, #4 \n\t" "vrshrn.u16 d23, q5, #4 \n\t" "vrshrn.u16 d18, q0, #6 \n\t" "vrshrn.u16 d20, q1, #6 \n\t" "vrshrn.u16 d22, q2, #6 \n\t" "vzip.8 d18, d19 \n\t" "vzip.8 d20, d21 \n\t" "vzip.8 d22, d23 \n\t" "vst3.8 {d18, d20, d22}, [%[out1]] \n\t" "vst3.8 {d19, d21, d23}, [%[out2]] \n\t" : /*no output*/ : [out1] "r" (dst + 2 * x), [out2] "r" (dst + 2 * x + 24), [in0] "r" (lane + x - 3), [in02] "r" (lane + x + 9), [in1] "r" (lane + x), [in12] "r" (lane + x + 12), [in2] "r" (lane + x + 3), [in22] "r" (lane + x + 15) : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23" ); #else uint16x8_t vc6 = vmovq_n_u16(6); uint16x8x3_t vLane0 = vld3q_u16(lane + x - 3); uint16x8x3_t vLane1 = vld3q_u16(lane + x + 0); uint16x8x3_t vLane2 = vld3q_u16(lane + x + 3); uint16x8_t vSum_0_3 = vaddq_u16(vLane0.val[0], vLane2.val[0]); uint16x8_t vSum_1_4 = vaddq_u16(vLane0.val[1], vLane2.val[1]); uint16x8_t vSum_2_5 = vaddq_u16(vLane0.val[2], vLane2.val[2]); uint16x8_t vSum_3_6 = vaddq_u16(vLane2.val[0], vLane1.val[0]); uint16x8_t vSum_4_7 = vaddq_u16(vLane2.val[1], vLane1.val[1]); uint16x8_t vSum_5_8 = vaddq_u16(vLane2.val[2], vLane1.val[2]); vSum_0_3 = vmlaq_u16(vSum_0_3, vLane1.val[0], vc6); vSum_1_4 = vmlaq_u16(vSum_1_4, vLane1.val[1], vc6); vSum_2_5 = vmlaq_u16(vSum_2_5, vLane1.val[2], vc6); uint8x8x2_t vSumShr3; vSumShr3.val[0] = vrshrn_n_u16(vSum_3_6, 4); vSumShr3.val[1] = vrshrn_n_u16(vSum_0_3, 6);; uint8x8x2_t vSumShr4; vSumShr4.val[0] = vrshrn_n_u16(vSum_4_7, 4); vSumShr4.val[1] = vrshrn_n_u16(vSum_1_4, 6); 
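                    // Note on the vSumShr pairs: the >>6 narrowings are the even ("on-grid")
                    // destination pixels, (left + right + 6*center + 32) / 64 of the vertically
                    // filtered rows, while the >>4 narrowings are the odd (interpolated) pixels,
                    // (center + right + 8) / 16. The vzip_u8 calls below interleave each pair so
                    // even and odd pixels of one channel alternate, matching the two vst3_u8 stores.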
uint8x8x2_t vSumShr5; vSumShr5.val[0] = vrshrn_n_u16(vSum_5_8, 4); vSumShr5.val[1] = vrshrn_n_u16(vSum_2_5, 6); vSumShr3 = vzip_u8(vSumShr3.val[1], vSumShr3.val[0]); vSumShr4 = vzip_u8(vSumShr4.val[1], vSumShr4.val[0]); vSumShr5 = vzip_u8(vSumShr5.val[1], vSumShr5.val[0]); uint8x8x3_t vRes1; vRes1.val[0] = vSumShr3.val[0]; vRes1.val[1] = vSumShr4.val[0]; vRes1.val[2] = vSumShr5.val[0]; vst3_u8(dst + 2 * x, vRes1); uint8x8x3_t vRes2; vRes2.val[0] = vSumShr3.val[1]; vRes2.val[1] = vSumShr4.val[1]; vRes2.val[2] = vSumShr5.val[1]; vst3_u8(dst + 2 * x + 24, vRes2); #endif } } break; case 4: lim = dcolshn > 7 ? dcolshn - 7 : 0; for (; x < lim; x += 8) { internal::prefetch(lane + x); #if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__) __asm__ ( "vld1.16 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t" "vld1.16 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t" "vld1.16 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t" "vadd.i16 q0, q1 /*q0 = v0 + v2*/ \n\t" "vadd.i16 q3, q1, q2 /*q3 = v1 + v2*/ \n\t" "vmla.i16 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t" "vrshrn.u16 d9, q3, #4 \n\t" "vrshrn.u16 d8, q0, #6 \n\t" "vst2.32 {d8-d9}, [%[out]] \n\t" : /*no output*/ : [out] "r" (dst + x*2), [in0] "r" (lane + x-4), [in1] "r" (lane + x), [in2] "r" (lane + x+4), [c6] "w" (vc6u16) : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9" ); #else uint16x8_t vLane0 = vld1q_u16(lane + x-4); uint16x8_t vLane1 = vld1q_u16(lane + x+0); uint16x8_t vLane2 = vld1q_u16(lane + x+4); vLane0 = vaddq_u16(vLane0, vLane2); vLane2 = vaddq_u16(vLane2, vLane1); vLane0 = vmlaq_u16(vLane0, vLane1, vc6u16); uint32x2x2_t vRes; vRes.val[1] = vreinterpret_u32_u8(vrshrn_n_u16(vLane2, 4)); vRes.val[0] = vreinterpret_u32_u8(vrshrn_n_u16(vLane0, 6)); vst2_u32((uint32_t*)(dst + x*2), vRes); #endif } break; }; for (u32 h = 0; h < cn; ++h) { const u16* ln = lane + h; u8* dt = dst + h; size_t k = x; for (; k < dcolshn; k += cn) { dt[2*k+0] = u8((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6u * ln[k] + (1 << 5)) >> 6); dt[2*k+cn] = u8(((ln[k] + ln[k+cn]) * 4u + (1 << 5)) >> 6); } for (; k < dcolshw; k += cn) dt[2*k] = u8((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6u * ln[k] + (1 << 5)) >> 6); } dst = internal::getRowPtr(dstBase, dstStride, 2*i+1); //second row if (lane == lane0 && 2*i+1 < dstSize.height) { lane = lane1; goto pyrUp8uHorizontalConvolution; } } #else // Remove 'unused parameter' warnings. 
(void)srcBase; (void)srcStride; (void)dstBase; (void)dstStride; #endif } void gaussianPyramidUp(const Size2D &srcSize, const s16 *srcBase, ptrdiff_t srcStride, const Size2D &dstSize, s16 *dstBase, ptrdiff_t dstStride, u8 cn) { internal::assertSupportedConfiguration(isGaussianPyramidUpS16Supported(srcSize, dstSize, cn)); #ifdef CAROTENE_NEON size_t dcolshn = (dstSize.width/2) * cn; size_t dcolshw = ((dstSize.width+1)/2) * cn; size_t scolsn = srcSize.width*cn; size_t idx_l = (borderInterpolate101(-2, 2 * srcSize.width)/2) * cn; size_t idx_r1 = (borderInterpolate101(2 * srcSize.width + 0, 2 * srcSize.width)/2) * cn; size_t idx_r2 = (borderInterpolate101(2 * srcSize.width + 2, 2 * srcSize.width + 2)/2) * cn; //2-lines buffer std::vector _buf(2*(cn*(srcSize.width + 3) + 32/sizeof(s32))); s32* lane0 = internal::alignPtr(&_buf[cn], 32); s32* lane1 = internal::alignPtr(lane0 + (3 + srcSize.width)*cn, 32); int16x4_t vc6s16 = vmov_n_s16(6); int32x4_t vc6s32 = vmovq_n_s32(6); for (size_t i = 0; i < (dstSize.height + 1)/2; ++i) { s16* dst = internal::getRowPtr(dstBase, dstStride, 2*i); //vertical convolution const s16* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 - 2, srcSize.height * 2)/2); const s16* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 0, srcSize.height * 2)/2); const s16* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 2, srcSize.height * 2)/2); size_t x = 0; for (; x <= scolsn - 4; x += 4) { internal::prefetch(internal::getRowPtr(ln1 + x, srcStride, (ptrdiff_t)x % 3 - 1)); int16x4_t v0 = vld1_s16(ln0 + x); int16x4_t v2 = vld1_s16(ln2 + x); int16x4_t v1 = vld1_s16(ln1 + x); int32x4_t vl0 = vaddl_s16(v0, v2); int32x4_t vl1 = vaddl_s16(v1, v2); vl0 = vmlal_s16(vl0, v1, vc6s16); vl1 = vshlq_n_s32(vl1, 2); vst1q_s32(lane0 + x, vl0); vst1q_s32(lane1 + x, vl1); } for (; x < scolsn; ++x) { lane0[x] = ln0[x] + ln2[x] + 6 * ln1[x]; lane1[x] = 4 * (ln1[x] + ln2[x]); } //left&right borders for (u32 k = 0; k < cn; ++k) { lane0[(s32)(-cn+k)] = lane0[idx_l + k]; lane1[(s32)(-cn+k)] = lane1[idx_l + k]; lane0[scolsn+k] = lane0[idx_r1 + k]; lane0[scolsn+cn+k] = lane0[idx_r2 + k]; lane1[scolsn+k] = lane1[idx_r1 + k]; lane1[scolsn+cn+k] = lane1[idx_r2 + k]; } //horizontal convolution const s32* lane = lane0; pyrUp16sHorizontalConvolution: x = 0; size_t lim; switch(cn) { case 1: lim = dcolshn > 3 ? dcolshn - 3 : 0; for (; x < lim; x += 4) { internal::prefetch(lane + x); #if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__) __asm__ ( "vld1.32 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t" "vld1.32 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t" "vld1.32 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t" "vadd.i32 q0, q0, q1 /*q0 = v0 + v2*/ \n\t" "vadd.i32 q3, q1, q2 /*q3 = v1 + v2*/ \n\t" "vmla.i32 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t" "vrshrn.s32 d9, q3, #4 \n\t" "vrshrn.s32 d8, q0, #6 \n\t" "vst2.16 {d8-d9}, [%[out]] \n\t" : /*no output*/ : [out] "r" (dst + x * 2), [in0] "r" (lane + x - 1), [in1] "r" (lane + x), [in2] "r" (lane + x + 1), [c6] "w" (vc6s32) : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9" ); #else int32x4_t vLane0 = vld1q_s32(lane + x - 1); int32x4_t vLane1 = vld1q_s32(lane + x); int32x4_t vLane2 = vld1q_s32(lane + x + 1); vLane0 = vaddq_s32(vLane0, vLane2); vLane2 = vaddq_s32(vLane2, vLane1); vLane0 = vmlaq_s32(vLane0, vLane1, vc6s32); int16x4x2_t vRes; vRes.val[0] = vrshrn_n_s32(vLane0, 6); vRes.val[1] = vrshrn_n_s32(vLane2, 4); vst2_s16(dst + x * 2, vRes); #endif } break; case 3: { lim = dcolshn > 11 ? 
dcolshn - 11 : 0; for (; x < lim; x += 12) { internal::prefetch(lane + x + 3); #if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__) __asm__ ( "vmov.s32 q9, #6 \n\t" "vld3.32 {d0, d2, d4}, [%[in0]] /*v0*/ \n\t" "vld3.32 {d1, d3, d5}, [%[in2]] \n\t" "vld3.32 {d6, d8, d10}, [%[in2]] /*v2*/ \n\t" "vld3.32 {d7, d9, d11}, [%[in22]] \n\t" "vld3.32 {d12, d14, d16}, [%[in1]] /*v1*/ \n\t" "vld3.32 {d13, d15, d17}, [%[in12]] \n\t" "vadd.i32 q0, q3 /*v0 + v2*/ \n\t" "vadd.i32 q1, q4 /*v0 + v2*/ \n\t" "vadd.i32 q2, q5 /*v0 + v2*/ \n\t" "vadd.i32 q3, q6 /*v1 + v2*/ \n\t" "vadd.i32 q4, q7 /*v1 + v2*/ \n\t" "vadd.i32 q5, q8 /*v1 + v2*/ \n\t" "vmla.i32 q0, q6, q9 /*v0 + v2 + v1*6 */ \n\t" "vmla.i32 q1, q7, q9 /*v0 + v2 + v1*6 */ \n\t" "vmla.i32 q2, q8, q9 /*v0 + v2 + v1*6 */ \n\t" "vrshrn.s32 d19, q3, #4 \n\t" "vrshrn.s32 d21, q4, #4 \n\t" "vrshrn.s32 d23, q5, #4 \n\t" "vrshrn.s32 d18, q0, #6 \n\t" "vrshrn.s32 d20, q1, #6 \n\t" "vrshrn.s32 d22, q2, #6 \n\t" "vzip.16 d18, d19 \n\t" "vzip.16 d20, d21 \n\t" "vzip.16 d22, d23 \n\t" "vst3.16 {d18, d20, d22}, [%[out1]] \n\t" "vst3.16 {d19, d21, d23}, [%[out2]] \n\t" : /*no output*/ : [out1] "r" (dst + 2*x), [out2] "r" (dst + 2*x + 12), [in0] "r" (lane + x - 3), [in1] "r" (lane + x), [in12] "r" (lane + x + 6), [in2] "r" (lane + x + 3), [in22] "r" (lane + x + 9) : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23" ); #else int32x4_t vc6 = vmovq_n_s32(6); int32x4x3_t vLane0 = vld3q_s32(lane + x - 3); int32x4x3_t vLane1 = vld3q_s32(lane + x); int32x4x3_t vLane2 = vld3q_s32(lane + x + 3); int32x4_t vSum_0_3 = vaddq_s32(vLane0.val[0], vLane2.val[0]); int32x4_t vSum_1_4 = vaddq_s32(vLane0.val[1], vLane2.val[1]); int32x4_t vSum_2_5 = vaddq_s32(vLane0.val[2], vLane2.val[2]); int32x4_t vSum_3_6 = vaddq_s32(vLane2.val[0], vLane1.val[0]); int32x4_t vSum_4_7 = vaddq_s32(vLane2.val[1], vLane1.val[1]); int32x4_t vSum_5_8 = vaddq_s32(vLane2.val[2], vLane1.val[2]); vSum_0_3 = vmlaq_s32(vSum_0_3, vLane1.val[0], vc6); vSum_1_4 = vmlaq_s32(vSum_1_4, vLane1.val[1], vc6); vSum_2_5 = vmlaq_s32(vSum_2_5, vLane1.val[2], vc6); int16x4x2_t vSumShr1; vSumShr1.val[1] = vrshrn_n_s32(vSum_3_6, 4); vSumShr1.val[0] = vrshrn_n_s32(vSum_0_3, 6); int16x4x2_t vSumShr2; vSumShr2.val[1] = vrshrn_n_s32(vSum_4_7, 4); vSumShr2.val[0] = vrshrn_n_s32(vSum_1_4, 6); int16x4x2_t vSumShr3; vSumShr3.val[1] = vrshrn_n_s32(vSum_5_8, 4); vSumShr3.val[0] = vrshrn_n_s32(vSum_2_5, 6); vSumShr1 = vzip_s16(vSumShr1.val[0], vSumShr1.val[1]); vSumShr2 = vzip_s16(vSumShr2.val[0], vSumShr2.val[1]); vSumShr3 = vzip_s16(vSumShr3.val[0], vSumShr3.val[1]); int16x4x3_t vRes1; vRes1.val[0] = vSumShr1.val[0]; vRes1.val[1] = vSumShr2.val[0]; vRes1.val[2] = vSumShr3.val[0]; vst3_s16((int16_t*)(dst + 2 * x), vRes1); int16x4x3_t vRes2; vRes2.val[0] = vSumShr1.val[1]; vRes2.val[1] = vSumShr2.val[1]; vRes2.val[2] = vSumShr3.val[1]; vst3_s16(dst + 2 * x + 12, vRes2); #endif } } break; case 4: lim = dcolshn > 3 ? 
dcolshn - 3 : 0; for (; x < lim; x += 4) { internal::prefetch(lane + x); #if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__) __asm__ ( "vld1.32 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t" "vld1.32 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t" "vld1.32 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t" "vadd.i32 q0, q1 /*q0 = v0 + v2*/ \n\t" "vadd.i32 q3, q1, q2 /*q3 = v1 + v2*/ \n\t" "vmla.i32 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t" "vrshrn.s32 d9, q3, #4 \n\t" "vrshrn.s32 d8, q0, #6 \n\t" "vst1.16 {d8-d9}, [%[out]] \n\t" : /*no output*/ : [out] "r" (dst + x * 2), [in0] "r" (lane + x - 4), [in1] "r" (lane + x), [in2] "r" (lane + x + 4), [c6] "w" (vc6s32) : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9" ); #else int32x4_t vLane0 = vld1q_s32(lane + x - 4); int32x4_t vLane1 = vld1q_s32(lane + x); int32x4_t vLane2 = vld1q_s32(lane + x + 4); vLane0 = vaddq_s32(vLane0, vLane2); vLane2 = vaddq_s32(vLane2, vLane1); vLane0 = vmlaq_s32(vLane0, vLane1, vc6s32); int16x4x2_t vRes; vRes.val[0] = vrshrn_n_s32(vLane0, 6); vRes.val[1] = vrshrn_n_s32(vLane2, 4); vst1q_s16(dst + x * 2, vcombine_s16(vRes.val[0], vRes.val[1])); #endif } break; }; for (u32 h = 0; h < cn; ++h) { const s32* ln = lane + h; s16* dt = dst + h; size_t k = x; for (; k < dcolshn; k += cn) { dt[2*k+0] = s16((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6 * ln[k] + (1 << 5)) >> 6); dt[2*k+cn] = s16(((ln[k] + ln[k+cn]) * 4 + (1 << 5)) >> 6); } for (; k < dcolshw; k += cn) dt[2*k] = s16((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6 * ln[k] + (1 << 5)) >> 6); } dst = internal::getRowPtr(dstBase, dstStride, 2*i+1); //second row if (lane == lane0 && 2*i+1 < dstSize.height) { lane = lane1; goto pyrUp16sHorizontalConvolution; } } #else // Remove 'unused parameter' warnings. (void)srcBase; (void)srcStride; (void)dstBase; (void)dstStride; #endif } } // namespace CAROTENE_NS
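
/*
 * Minimal usage sketch (illustration only, not part of the library): build one
 * level of a single-channel u8 Gaussian pyramid. Assumptions: the caller owns
 * both buffers, Size2D is constructible from a (width, height) pair, and a
 * scalar fallback exists elsewhere for unsupported configurations.
 *
 *   using namespace CAROTENE_NS;
 *
 *   void pyrDownLevel(const Size2D &srcSize, const u8 *src, ptrdiff_t srcStride,
 *                     u8 *dst, ptrdiff_t dstStride)
 *   {
 *       Size2D dstSize((srcSize.width + 1) / 2, (srcSize.height + 1) / 2);
 *       if (isGaussianPyramidDownU8Supported(srcSize, dstSize, 1))
 *           gaussianPyramidDown(srcSize, src, srcStride, dstSize, dst, dstStride, 1);
 *       // else: fall back to a reference implementation
 *   }
 */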