From 5d3a128cd34b8fb4a39c064d4a49d91aff31dfd4 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 31 Aug 2014 10:54:12 +0000 Subject: [PATCH] NEON impl on cv::convertScaleAbs CV_32f --- CMakeLists.txt | 2 +- modules/core/src/convert.cpp | 47 +++++++++++++++++++++++++++++------- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ef88820043..96b3f8e02d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -130,7 +130,7 @@ OCV_OPTION(WITH_GSTREAMER "Include Gstreamer support" ON OCV_OPTION(WITH_GSTREAMER_0_10 "Enable Gstreamer 0.10 support (instead of 1.x)" OFF ) OCV_OPTION(WITH_GTK "Include GTK support" ON IF (UNIX AND NOT APPLE AND NOT ANDROID) ) OCV_OPTION(WITH_GTK_2_X "Use GTK version 2" OFF IF (UNIX AND NOT APPLE AND NOT ANDROID) ) -OCV_OPTION(WITH_IPP "Include Intel IPP support" ON IF (NOT IOS) ) +OCV_OPTION(WITH_IPP "Include Intel IPP support" ON IF (X86_64 OR X86) ) OCV_OPTION(WITH_JASPER "Include JPEG2K support" ON IF (NOT IOS) ) OCV_OPTION(WITH_JPEG "Include JPEG support" ON) OCV_OPTION(WITH_WEBP "Include WebP support" ON IF (NOT IOS) ) diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index 49bfb7da83..f835e1b4c6 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -173,7 +173,7 @@ split_( const T* src, T** dst, int len, int cn ) int inc_j = 3 * inc_i; VSplit3 vsplit; - for( ; i < len - inc_i; i += inc_i, j += inc_j) + for( ; i <= len - inc_i; i += inc_i, j += inc_j) vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); } #endif @@ -196,7 +196,7 @@ split_( const T* src, T** dst, int len, int cn ) int inc_j = 4 * inc_i; VSplit4 vsplit; - for( ; i < len - inc_i; i += inc_i, j += inc_j) + for( ; i <= len - inc_i; i += inc_i, j += inc_j) vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); } #endif @@ -1076,7 +1076,7 @@ namespace cv { template -struct cvtScaleAbs_SSE2 +struct cvtScaleAbs_SIMD { int operator () (const T *, DT *, int, WT, WT) const { @@ -1087,7 
+1087,7 @@ struct cvtScaleAbs_SSE2 #if CV_SSE2 template <> -struct cvtScaleAbs_SSE2 +struct cvtScaleAbs_SIMD { int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const @@ -1124,7 +1124,7 @@ struct cvtScaleAbs_SSE2 }; template <> -struct cvtScaleAbs_SSE2 +struct cvtScaleAbs_SIMD { int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const @@ -1155,7 +1155,7 @@ struct cvtScaleAbs_SSE2 }; template <> -struct cvtScaleAbs_SSE2 +struct cvtScaleAbs_SIMD { int operator () (const short * src, uchar * dst, int width, float scale, float shift) const @@ -1186,7 +1186,7 @@ struct cvtScaleAbs_SSE2 }; template <> -struct cvtScaleAbs_SSE2 +struct cvtScaleAbs_SIMD { int operator () (const int * src, uchar * dst, int width, float scale, float shift) const @@ -1215,7 +1215,7 @@ struct cvtScaleAbs_SSE2 }; template <> -struct cvtScaleAbs_SSE2 +struct cvtScaleAbs_SIMD { int operator () (const float * src, uchar * dst, int width, float scale, float shift) const @@ -1242,6 +1242,35 @@ struct cvtScaleAbs_SSE2 } }; +#elif CV_NEON + +template <> +struct cvtScaleAbs_SIMD +{ + int operator () (const float * src, uchar * dst, int width, + float scale, float shift) const + { + int x = 0; + float32x4_t v_shift = vdupq_n_f32(shift); + + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst_0 = vmulq_n_f32(vld1q_f32(src + x), scale); + v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift)); + uint16x4_t v_dsti_0 = vqmovun_s32(vcvtq_s32_f32(v_dst_0)); + + float32x4_t v_dst_1 = vmulq_n_f32(vld1q_f32(src + x + 4), scale); + v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift)); + uint16x4_t v_dsti_1 = vqmovun_s32(vcvtq_s32_f32(v_dst_1)); + + uint16x8_t v_dst = vcombine_u16(v_dsti_0, v_dsti_1); + vst1_u8(dst + x, vqmovn_u16(v_dst)); + } + + return x; + } +}; + #endif template static void @@ -1251,7 +1280,7 @@ cvtScaleAbs_( const T* src, size_t sstep, { sstep /= sizeof(src[0]); dstep /= sizeof(dst[0]); - cvtScaleAbs_SSE2 vop; + cvtScaleAbs_SIMD 
vop; for( ; size.height--; src += sstep, dst += dstep ) {