diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
index babefd3182..4f1453a2ff 100644
--- a/.github/ISSUE_TEMPLATE.md
+++ b/.github/ISSUE_TEMPLATE.md
@@ -1,37 +1,30 @@
-This is a template helping you to create an issue which can be processes as quickly as possible. Feel free to add additional information or remove not relevant points if you do not need them.
-
+
-### Expected behaviour
+##### System information (version)
+
-### Actual behaviour
+- OpenCV => :grey_question:
+- Operating System / Platform => :grey_question:
+- Compiler => :grey_question:
-### Additional description
+##### Detailed description
-### Code example to reproduce the issue / Steps to reproduce the issue
-Please try to give a full example which will compile as is.
-```
-#include "opencv2/core.hpp"
-#include <iostream>
-using namespace std;
-using namespace cv;
+
-int main()
-{
-    double d[] = { 546,2435,7,4534,23423,3 };
-    cout << "d = 0x" << reinterpret_cast<void*>(d) << endl;
+##### Steps to reproduce
-    return 0;
-}
-```
+
\ No newline at end of file
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 496d748731..210a253113 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -1,4 +1,9 @@
-resolves #XXXX
+
-### What does this PR change?
-Please add your changes here.
+### This pull request changes
+
+
diff --git a/3rdparty/carotene/.gitignore b/3rdparty/carotene/.gitignore
new file mode 100644
index 0000000000..062445879b
--- /dev/null
+++ b/3rdparty/carotene/.gitignore
@@ -0,0 +1,8 @@
+# Gedit temp files
+*~
+
+# Qt Creator file
+*.user
+
+# MacOS-specific (Desktop Services Store)
+.DS_Store
diff --git a/3rdparty/carotene/CMakeLists.txt b/3rdparty/carotene/CMakeLists.txt
new file mode 100644
index 0000000000..4dd7807c61
--- /dev/null
+++ b/3rdparty/carotene/CMakeLists.txt
@@ -0,0 +1,42 @@
+cmake_minimum_required(VERSION 2.8.11 FATAL_ERROR)
+
+project(Carotene)
+
+set(CAROTENE_NS "carotene" CACHE STRING "Namespace for Carotene definitions")
+
+set(CAROTENE_INCLUDE_DIR include)
+set(CAROTENE_SOURCE_DIR src)
+
+file(GLOB_RECURSE carotene_headers RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${CAROTENE_INCLUDE_DIR}/*.hpp")
+file(GLOB_RECURSE carotene_sources RELATIVE "${CMAKE_CURRENT_LIST_DIR}" "${CAROTENE_SOURCE_DIR}/*.cpp"
+                                            "${CAROTENE_SOURCE_DIR}/*.hpp")
+
+include_directories(${CAROTENE_INCLUDE_DIR})
+
+if(CMAKE_COMPILER_IS_GNUCC)
+    set(CMAKE_CXX_FLAGS "-fvisibility=hidden ${CMAKE_CXX_FLAGS}")
+
+    # allow more inlines - these parameters improve performance for:
+    # - matchTemplate about 5-10%
+    # - goodFeaturesToTrack 10-20%
+    # - cornerHarris 30% for some cases
+
+    set_source_files_properties(${carotene_sources} COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000")
+endif()
+
+add_library(carotene_objs OBJECT
+    ${carotene_headers}
+    ${carotene_sources}
+)
+
+if(NOT CAROTENE_NS STREQUAL "carotene")
+    target_compile_definitions(carotene_objs PUBLIC "-DCAROTENE_NS=${CAROTENE_NS}")
+endif()
+
+if(WITH_NEON)
+    target_compile_definitions(carotene_objs PRIVATE "-DWITH_NEON")
+endif()
+
+set_target_properties(carotene_objs PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
+
+add_library(carotene STATIC EXCLUDE_FROM_ALL "$<TARGET_OBJECTS:carotene_objs>")
diff --git a/3rdparty/carotene/README.md b/3rdparty/carotene/README.md
new file mode 100644
index 0000000000..fbaae5e970
--- /dev/null
+++ b/3rdparty/carotene/README.md
@@ -0,0 +1,2 @@
+This is Carotene, a low-level library containing optimized CPU routines
+that are useful for computer vision algorithms.
diff --git a/3rdparty/carotene/hal/CMakeLists.txt b/3rdparty/carotene/hal/CMakeLists.txt
new file mode 100644
index 0000000000..2fb92b907b
--- /dev/null
+++ b/3rdparty/carotene/hal/CMakeLists.txt
@@ -0,0 +1,112 @@
+cmake_minimum_required(VERSION 2.8.8 FATAL_ERROR)
+
+include(CheckCCompilerFlag)
+include(CheckCXXCompilerFlag)
+
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+set(TEGRA_HAL_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
+set(CAROTENE_DIR "${TEGRA_HAL_DIR}/../")
+
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
+    set(ARM TRUE)
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64.*|AARCH64.*")
+    set(AARCH64 TRUE)
+endif()
+
+set(TEGRA_COMPILER_FLAGS "")
+
+if(CMAKE_COMPILER_IS_GNUCXX)
+    # Generate unwind information even for functions that can't throw/propagate exceptions.
+    # This lets debuggers and such get non-broken backtraces for such functions, even without debugging symbols.
+    list(APPEND TEGRA_COMPILER_FLAGS -funwind-tables)
+endif()
+
+if(CMAKE_COMPILER_IS_GNUCXX)
+    if(X86 OR ARMEABI_V6 OR (MIPS AND ANDROID_COMPILER_VERSION VERSION_LESS "4.6"))
+        list(APPEND TEGRA_COMPILER_FLAGS -fweb -fwrapv -frename-registers -fsched-stalled-insns-dep=100 -fsched-stalled-insns=2)
+    else()
+        list(APPEND TEGRA_COMPILER_FLAGS -fweb -fwrapv -frename-registers -fsched2-use-superblocks -fsched2-use-traces
+                                         -fsched-stalled-insns-dep=100 -fsched-stalled-insns=2)
+    endif()
+    if((ANDROID_COMPILER_IS_CLANG OR NOT ANDROID_COMPILER_VERSION VERSION_LESS "4.7") AND ANDROID_NDK_RELEASE STRGREATER "r8d" )
+        list(APPEND TEGRA_COMPILER_FLAGS -fgraphite -fgraphite-identity -floop-block -floop-flatten -floop-interchange
+                                         -floop-strip-mine -floop-parallelize-all -ftree-loop-linear)
+    endif()
+endif()
+
+string(REPLACE ";" " " TEGRA_COMPILER_FLAGS "${TEGRA_COMPILER_FLAGS}")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${TEGRA_COMPILER_FLAGS}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TEGRA_COMPILER_FLAGS}")
+
+if(ARMEABI_V7A)
+    if (CMAKE_COMPILER_IS_GNUCXX)
+        set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-tree-vectorize" )
+        set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-tree-vectorize" )
+    endif()
+endif()
+
+if(WITH_LOGS)
+    add_definitions(-DHAVE_LOGS)
+endif()
+
+set(CAROTENE_NS "carotene_o4t" CACHE STRING "" FORCE)
+
+function(compile_carotene)
+    if(ENABLE_NEON)
+        set(WITH_NEON ON)
+    endif()
+
+    add_subdirectory("${CAROTENE_DIR}" "${CMAKE_CURRENT_BINARY_DIR}/carotene")
+
+    if(ARM OR AARCH64)
+        if(CMAKE_BUILD_TYPE)
+            set(CMAKE_TRY_COMPILE_CONFIGURATION ${CMAKE_BUILD_TYPE})
+        endif()
+        check_cxx_compiler_flag("-mfpu=neon" CXX_HAS_MFPU_NEON)
+        check_c_compiler_flag("-mfpu=neon" C_HAS_MFPU_NEON)
+        if(${CXX_HAS_MFPU_NEON} AND ${C_HAS_MFPU_NEON})
+            get_target_property(old_flags "carotene_objs" COMPILE_FLAGS)
+            if(old_flags)
+                set_target_properties("carotene_objs" PROPERTIES COMPILE_FLAGS "${old_flags} -mfpu=neon")
+            else()
+                set_target_properties("carotene_objs" PROPERTIES COMPILE_FLAGS "-mfpu=neon")
+            endif()
+        endif()
+    endif()
+endfunction()
+
+compile_carotene()
+
+include_directories("${CAROTENE_DIR}/include")
+
+get_target_property(carotene_defs carotene_objs INTERFACE_COMPILE_DEFINITIONS)
+set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS ${carotene_defs})
+
+  if (CMAKE_COMPILER_IS_GNUCXX)
+    # allow more inlines - these parameters improve performance for:
+    # matchTemplate about 5-10%
+    # goodFeaturesToTrack 10-20%
+    # cornerHarris 30% for some cases
+    set_source_files_properties(impl.cpp $<TARGET_OBJECTS:carotene_objs> COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param
large-stack-frame-growth=5000") +# set_source_files_properties(impl.cpp $ COMPILE_FLAGS "--param ipcp-unit-growth=100000 --param inline-unit-growth=100000 --param large-stack-frame-growth=5000") + endif() + +add_library(tegra_hal STATIC $) +set_target_properties(tegra_hal PROPERTIES POSITION_INDEPENDENT_CODE TRUE) +set_target_properties(tegra_hal PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH}) +set(OPENCV_SRC_DIR "${CMAKE_SOURCE_DIR}") +if(NOT BUILD_SHARED_LIBS) + ocv_install_target(tegra_hal EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev) +endif() +target_include_directories(tegra_hal PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${OPENCV_SRC_DIR}/modules/core/include) + +set(CAROTENE_HAL_VERSION "0.0.1" PARENT_SCOPE) +set(CAROTENE_HAL_LIBRARIES "tegra_hal" PARENT_SCOPE) +set(CAROTENE_HAL_HEADERS "carotene/tegra_hal.hpp" PARENT_SCOPE) +set(CAROTENE_HAL_INCLUDE_DIRS "${CMAKE_BINARY_DIR}" PARENT_SCOPE) + +configure_file("tegra_hal.hpp" "${CMAKE_BINARY_DIR}/carotene/tegra_hal.hpp" COPYONLY) +configure_file("${CAROTENE_DIR}/include/carotene/definitions.hpp" "${CMAKE_BINARY_DIR}/carotene/definitions.hpp" COPYONLY) +configure_file("${CAROTENE_DIR}/include/carotene/functions.hpp" "${CMAKE_BINARY_DIR}/carotene/functions.hpp" COPYONLY) +configure_file("${CAROTENE_DIR}/include/carotene/types.hpp" "${CMAKE_BINARY_DIR}/carotene/types.hpp" COPYONLY) diff --git a/3rdparty/carotene/hal/tegra_hal.hpp b/3rdparty/carotene/hal/tegra_hal.hpp new file mode 100644 index 0000000000..401f521a64 --- /dev/null +++ b/3rdparty/carotene/hal/tegra_hal.hpp @@ -0,0 +1,1851 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2016, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#ifndef _tegra_hal_H_INCLUDED_ +#define _tegra_hal_H_INCLUDED_ + +#define CAROTENE_NS carotene_o4t + +#include "carotene/functions.hpp" +#include +#include +#include +#include + +#define RANGE_DATA(type, base, step) reinterpret_cast(const_cast(reinterpret_cast(base)) + static_cast(range.start) * step) + +#define PARALLEL_CORE 0 +#if PARALLEL_CORE + +#define SRC_ARG1 ST * src1_data_, size_t src1_step_, +#define SRC_STORE1 src1_data(src1_data_), src1_step(src1_step_), +#define SRC_VAR1 ST * src1_data; \ + size_t src1_step; +#define SRC_ARG2 ST * src1_data_, size_t src1_step_, \ + ST * src2_data_, size_t src2_step_, +#define SRC_STORE2 src1_data(src1_data_), src1_step(src1_step_), \ + src2_data(src2_data_), src2_step(src2_step_), +#define SRC_VAR2 ST * src1_data; \ + size_t src1_step; \ + ST * src2_data; \ + size_t src2_step; + +#define DST_ARG1 DT * dst1_data_, size_t dst1_step_, +#define DST_STORE1 dst1_data(dst1_data_), dst1_step(dst1_step_), +#define DST_VAR1 DT * dst1_data; \ + size_t dst1_step; + +#define SCALE_ARG0 +#define SCALE_STORE0 +#define SCALE_VAR0 +#define SCALE_ARG1 , double scale_ +#define SCALE_STORE1 , scale(scale_) +#define SCALE_VAR1 double scale; +#define SCALE_ARG3 , const double *scales_ +#define SCALE_STORE3 , scales(scales_, scales_ + 3) +#define SCALE_VAR3 std::vector scales; + +#define TegraGenOp_Invoker(name, func, src_cnt, dst_cnt, scale_cnt, ...) \ +template \ +class TegraGenOp_##name##_Invoker : public cv::ParallelLoopBody \ +{ \ +public: \ + TegraGenOp_##name##_Invoker(SRC_ARG##src_cnt \ + DST_ARG##dst_cnt \ + int width_, int height_ \ + SCALE_ARG##scale_cnt) : \ + cv::ParallelLoopBody(), SRC_STORE##src_cnt \ + DST_STORE##dst_cnt \ + width(width_), height(height_) \ + SCALE_STORE##scale_cnt {} \ + virtual void operator()(const cv::Range& range) const \ + { \ + CAROTENE_NS::func(CAROTENE_NS::Size2D(width, range.end-range.start), __VA_ARGS__); \ + } \ +private: \ + SRC_VAR##src_cnt \ + DST_VAR##dst_cnt \ + int width, height; \ + SCALE_VAR##scale_cnt \ + const TegraGenOp_##name##_Invoker& operator= (const TegraGenOp_##name##_Invoker&); \ +}; + +#define TegraBinaryOp_Invoker(name, func) TegraGenOp_Invoker(name, func, 2, 1, 0, \ + RANGE_DATA(ST, src1_data, src1_step), src1_step, \ + RANGE_DATA(ST, src2_data, src2_step), src2_step, \ + RANGE_DATA(DT, dst1_data, dst1_step), dst1_step ) + +#define TegraBinaryOp_InvokerVAArg(name, func, ...) TegraGenOp_Invoker(name, func, 2, 1, 0, \ + RANGE_DATA(ST, src1_data, src1_step), src1_step, \ + RANGE_DATA(ST, src2_data, src2_step), src2_step, \ + RANGE_DATA(DT, dst1_data, dst1_step), dst1_step, __VA_ARGS__) + +#define TEGRA_BINARYOP(type, op, src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? 
\ + parallel_for_(Range(0, h), \ + TegraGenOp_##op##_Invoker(src1, sz1, src2, sz2, dst, sz, w, h), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +TegraBinaryOp_InvokerVAArg(add, add, CAROTENE_NS::CONVERT_POLICY_SATURATE) /*Original addition use saturated operator, so use the same from CAROTENE*/ + +TegraBinaryOp_Invoker(addf, add) + +TegraBinaryOp_InvokerVAArg(sub, sub, CAROTENE_NS::CONVERT_POLICY_SATURATE) /*Original addition use saturated operator, so use the same from CAROTENE*/ + +TegraBinaryOp_Invoker(subf, sub) + +TegraBinaryOp_Invoker(max, max) + +TegraBinaryOp_Invoker(min, min) + +TegraBinaryOp_Invoker(absDiff, absDiff) + +TegraBinaryOp_Invoker(bitwiseAnd, bitwiseAnd) + +TegraBinaryOp_Invoker(bitwiseOr, bitwiseOr) + +TegraBinaryOp_Invoker(bitwiseXor, bitwiseXor) + +#define TegraUnaryOp_Invoker(name, func) TegraGenOp_Invoker(name, func, 1, 1, 0, \ + RANGE_DATA(ST, src1_data, src1_step), src1_step, \ + RANGE_DATA(DT, dst1_data, dst1_step), dst1_step ) + +TegraUnaryOp_Invoker(bitwiseNot, bitwiseNot) +#define TEGRA_UNARYOP(type, op, src1, sz1, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_##op##_Invoker(src1, sz1, dst, sz, w, h), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_add8u +#define cv_hal_add8u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u8, add, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_add8s +#define cv_hal_add8s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s8, add, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_add16u +#define cv_hal_add16u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u16, add, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_add16s +#define cv_hal_add16s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s16, add, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_add32s +#define cv_hal_add32s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s32, add, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_add32f +#define cv_hal_add32f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f32, addf, src1, sz1, src2, sz2, dst, sz, w, h) +//#undef cv_hal_add64f +//#define cv_hal_add64f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f64, addf, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_sub8u +#define cv_hal_sub8u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u8, sub, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_sub8s +#define cv_hal_sub8s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s8, sub, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_sub16u +#define cv_hal_sub16u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u16, sub, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_sub16s +#define cv_hal_sub16s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s16, sub, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_sub32s +#define cv_hal_sub32s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s32, sub, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_sub32f +#define cv_hal_sub32f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f32, subf, src1, sz1, src2, sz2, dst, sz, w, h) +//#undef cv_hal_sub64f +//#define cv_hal_sub64f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f64, subf, src1, sz1, src2, sz2, dst, 
sz, w, h) +#undef cv_hal_max8u +#define cv_hal_max8u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u8, max, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_max8s +#define cv_hal_max8s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s8, max, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_max16u +#define cv_hal_max16u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u16, max, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_max16s +#define cv_hal_max16s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s16, max, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_max32s +#define cv_hal_max32s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s32, max, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_max32f +#define cv_hal_max32f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f32, max, src1, sz1, src2, sz2, dst, sz, w, h) +//#undef cv_hal_max64f +//#define cv_hal_max64f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f64, max, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_min8u +#define cv_hal_min8u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u8, min, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_min8s +#define cv_hal_min8s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s8, min, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_min16u +#define cv_hal_min16u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u16, min, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_min16s +#define cv_hal_min16s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s16, min, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_min32s +#define cv_hal_min32s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s32, min, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_min32f +#define cv_hal_min32f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f32, min, src1, sz1, src2, sz2, dst, sz, w, h) +//#undef cv_hal_min64f +//#define cv_hal_min64f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f64, min, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_absdiff8u +#define cv_hal_absdiff8u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u8, absDiff, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_absdiff8s +#define cv_hal_absdiff8s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s8, absDiff, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_absdiff16u +#define cv_hal_absdiff16u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u16, absDiff, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_absdiff16s +#define cv_hal_absdiff16s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s16, absDiff, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_absdiff32s +#define cv_hal_absdiff32s(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::s32, absDiff, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_absdiff32f +#define cv_hal_absdiff32f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f32, absDiff, src1, sz1, src2, sz2, dst, sz, w, h) +//#undef cv_hal_absdiff64f +//#define cv_hal_absdiff64f(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::f64, absDiff, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_and8u +#define cv_hal_and8u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u8, bitwiseAnd, src1, sz1, src2, sz2, dst, sz, w, h) +#undef 
cv_hal_or8u +#define cv_hal_or8u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u8, bitwiseOr, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_xor8u +#define cv_hal_xor8u(src1, sz1, src2, sz2, dst, sz, w, h) TEGRA_BINARYOP(CAROTENE_NS::u8, bitwiseXor, src1, sz1, src2, sz2, dst, sz, w, h) +#undef cv_hal_not8u +#define cv_hal_not8u(src1, sz1, dst, sz, w, h) TEGRA_UNARYOP(CAROTENE_NS::u8, bitwiseNot, src1, sz1, dst, sz, w, h) + +TegraBinaryOp_Invoker(cmpEQ, cmpEQ) +TegraBinaryOp_Invoker(cmpNE, cmpNE) +TegraBinaryOp_Invoker(cmpGT, cmpGT) +TegraBinaryOp_Invoker(cmpGE, cmpGE) +TegraGenOp_Invoker(cmpLT, cmpGT, 2, 1, 0, RANGE_DATA(ST, src2_data, src2_step), src2_step, \ + RANGE_DATA(ST, src1_data, src1_step), src1_step, \ + RANGE_DATA(DT, dst1_data, dst1_step), dst1_step) +TegraGenOp_Invoker(cmpLE, cmpGE, 2, 1, 0, RANGE_DATA(ST, src2_data, src2_step), src2_step, \ + RANGE_DATA(ST, src1_data, src1_step), src1_step, \ + RANGE_DATA(DT, dst1_data, dst1_step), dst1_step) +#define TEGRA_CMP(type, src1, sz1, src2, sz2, dst, sz, w, h, op) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + ((op) == cv::CMP_EQ) ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_cmpEQ_Invoker(src1, sz1, src2, sz2, dst, sz, w, h), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_NE) ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_cmpNE_Invoker(src1, sz1, src2, sz2, dst, sz, w, h), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_GT) ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_cmpGT_Invoker(src1, sz1, src2, sz2, dst, sz, w, h), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_GE) ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_cmpGE_Invoker(src1, sz1, src2, sz2, dst, sz, w, h), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_LT) ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_cmpLT_Invoker(src1, sz1, src2, sz2, dst, sz, w, h), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_LE) ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_cmpLE_Invoker(src1, sz1, src2, sz2, dst, sz, w, h), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_cmp8u +#define cv_hal_cmp8u(src1, sz1, src2, sz2, dst, sz, w, h, op) TEGRA_CMP(CAROTENE_NS::u8, src1, sz1, src2, sz2, dst, sz, w, h, op) +#undef cv_hal_cmp8s +#define cv_hal_cmp8s(src1, sz1, src2, sz2, dst, sz, w, h, op) TEGRA_CMP(CAROTENE_NS::s8, src1, sz1, src2, sz2, dst, sz, w, h, op) +#undef cv_hal_cmp16u +#define cv_hal_cmp16u(src1, sz1, src2, sz2, dst, sz, w, h, op) TEGRA_CMP(CAROTENE_NS::u16, src1, sz1, src2, sz2, dst, sz, w, h, op) +#undef cv_hal_cmp16s +#define cv_hal_cmp16s(src1, sz1, src2, sz2, dst, sz, w, h, op) TEGRA_CMP(CAROTENE_NS::s16, src1, sz1, src2, sz2, dst, sz, w, h, op) +#undef cv_hal_cmp32s +#define cv_hal_cmp32s(src1, sz1, src2, sz2, dst, sz, w, h, op) TEGRA_CMP(CAROTENE_NS::s32, src1, sz1, src2, sz2, dst, sz, w, h, op) +#undef cv_hal_cmp32f +#define cv_hal_cmp32f(src1, sz1, src2, sz2, dst, sz, w, h, op) TEGRA_CMP(CAROTENE_NS::f32, src1, sz1, src2, sz2, dst, sz, w, h, op) +//#undef cv_hal_cmp64f +//#define cv_hal_cmp64f(src1, sz1, src2, sz2, dst, sz, w, h, op) TEGRA_CMP(CAROTENE_NS::f64, src1, sz1, src2, sz2, dst, sz, w, h, op) + +#define TegraBinaryOpScale_Invoker(name, func, scale_cnt, ...) 
TegraGenOp_Invoker(name, func, 2, 1, scale_cnt, \ + RANGE_DATA(ST, src1_data, src1_step), src1_step, \ + RANGE_DATA(ST, src2_data, src2_step), src2_step, \ + RANGE_DATA(DT, dst1_data, dst1_step), dst1_step, __VA_ARGS__) + +#define TEGRA_BINARYOPSCALE(type, op, src1, sz1, src2, sz2, dst, sz, w, h, scales) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_##op##_Invoker(src1, sz1, src2, sz2, dst, sz, w, h, scales), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +TegraBinaryOpScale_Invoker(mul, mul, 1, scale, CAROTENE_NS::CONVERT_POLICY_SATURATE) + +TegraBinaryOpScale_Invoker(mulf, mul, 1, scale) + +TegraBinaryOpScale_Invoker(div, div, 1, scale, CAROTENE_NS::CONVERT_POLICY_SATURATE) + +TegraBinaryOpScale_Invoker(divf, div, 1, scale) + +#define TegraUnaryOpScale_Invoker(name, func, scale_cnt, ...) TegraGenOp_Invoker(name, func, 1, 1, scale_cnt, \ + RANGE_DATA(ST, src1_data, src1_step), src1_step, \ + RANGE_DATA(DT, dst1_data, dst1_step), dst1_step, __VA_ARGS__) + +#define TEGRA_UNARYOPSCALE(type, op, src1, sz1, dst, sz, w, h, scales) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + parallel_for_(Range(0, h), \ + TegraGenOp_##op##_Invoker(src1, sz1, dst, sz, w, h, scales), \ + (w * h) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +TegraUnaryOpScale_Invoker(recip, reciprocal, 1, scale, CAROTENE_NS::CONVERT_POLICY_SATURATE) + +TegraUnaryOpScale_Invoker(recipf, reciprocal, 1, scale) + +#undef cv_hal_mul8u +#define cv_hal_mul8u(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::u8, mul, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_mul8s +#define cv_hal_mul8s(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s8, mul, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_mul16u +#define cv_hal_mul16u(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::u16, mul, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_mul16s +#define cv_hal_mul16s(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s16, mul, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_mul32s +#define cv_hal_mul32s(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s32, mul, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_mul32f +#define cv_hal_mul32f(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::f32, mulf, src1, sz1, src2, sz2, dst, sz, w, h, scales) +//#undef cv_hal_mul64f +//#define cv_hal_mul64f(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::f64, mulf, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_div8u +#define cv_hal_div8u(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::u8, div, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_div8s +#define cv_hal_div8s(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s8, div, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_div16u +#define cv_hal_div16u(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::u16, div, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_div16s +#define cv_hal_div16s(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s16, div, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_div32s +#define cv_hal_div32s(src1, sz1, 
src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s32, div, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_div32f +#define cv_hal_div32f(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::f32, divf, src1, sz1, src2, sz2, dst, sz, w, h, scales) +//#undef cv_hal_div64f +//#define cv_hal_div64f(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::f64, divf, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_recip8u +#define cv_hal_recip8u(src1, sz1, dst, sz, w, h, scales) TEGRA_UNARYOPSCALE(CAROTENE_NS::u8, recip, src1, sz1, dst, sz, w, h, scales) +#undef cv_hal_recip8s +#define cv_hal_recip8s(src1, sz1, dst, sz, w, h, scales) TEGRA_UNARYOPSCALE(CAROTENE_NS::s8, recip, src1, sz1, dst, sz, w, h, scales) +#undef cv_hal_recip16u +#define cv_hal_recip16u(src1, sz1, dst, sz, w, h, scales) TEGRA_UNARYOPSCALE(CAROTENE_NS::u16, recip, src1, sz1, dst, sz, w, h, scales) +#undef cv_hal_recip16s +#define cv_hal_recip16s(src1, sz1, dst, sz, w, h, scales) TEGRA_UNARYOPSCALE(CAROTENE_NS::s16, recip, src1, sz1, dst, sz, w, h, scales) +#undef cv_hal_recip32s +#define cv_hal_recip32s(src1, sz1, dst, sz, w, h, scales) TEGRA_UNARYOPSCALE(CAROTENE_NS::s32, recip, src1, sz1, dst, sz, w, h, scales) +#undef cv_hal_recip32f +#define cv_hal_recip32f(src1, sz1, dst, sz, w, h, scales) TEGRA_UNARYOPSCALE(CAROTENE_NS::f32, recipf, src1, sz1, dst, sz, w, h, scales) +//#undef cv_hal_recip64f +//#define cv_hal_recip64f(src1, sz1, dst, sz, w, h, scales) TEGRA_UNARYOPSCALE(CAROTENE_NS::f64, recipf, src1, sz1, dst, sz, w, h, scales) + +TegraBinaryOpScale_Invoker(addWeighted, addWeighted, 3, scales[0], scales[1], scales[2]) + +#undef cv_hal_addWeighted8u +#define cv_hal_addWeighted8u(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::u8, addWeighted, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_addWeighted8s +#define cv_hal_addWeighted8s(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s8, addWeighted, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_addWeighted16u +#define cv_hal_addWeighted16u(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::u16, addWeighted, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_addWeighted16s +#define cv_hal_addWeighted16s(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s16, addWeighted, src1, sz1, src2, sz2, dst, sz, w, h, scales) +#undef cv_hal_addWeighted32s +#define cv_hal_addWeighted32s(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::s32, addWeighted, src1, sz1, src2, sz2, dst, sz, w, h, scales) +//#undef cv_hal_addWeighted32f +//#define cv_hal_addWeighted32f(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::f32, addWeighted, src1, sz1, src2, sz2, dst, sz, w, h, scales) +//#undef cv_hal_addWeighted64f +//#define cv_hal_addWeighted64f(src1, sz1, src2, sz2, dst, sz, w, h, scales) TEGRA_BINARYOPSCALE(CAROTENE_NS::f64, addWeighted, src1, sz1, src2, sz2, dst, sz, w, h, scales) + +#else + +#define TEGRA_ADD(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? 
\ + CAROTENE_NS::add(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz, \ + CAROTENE_NS::CONVERT_POLICY_SATURATE), /*Original addition use saturated operator*/ \ + /*so use the same from CAROTENE*/ \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_ADDF(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::add(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_SUB(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::sub(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz, \ + CAROTENE_NS::CONVERT_POLICY_SATURATE), /*Original addition use saturated operator*/ \ + /*so use the same from CAROTENE*/ \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_SUBF(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::sub(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_MAX(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::max(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_MIN(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::min(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_ABSDIFF(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::absDiff(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_AND(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::bitwiseAnd(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) +#define TEGRA_OR(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::bitwiseOr(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_XOR(src1, sz1, src2, sz2, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::bitwiseXor(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_NOT(src1, sz1, dst, sz, w, h) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? 
\ + CAROTENE_NS::bitwiseNot(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + dst, sz), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_add8u +#define cv_hal_add8u TEGRA_ADD +#undef cv_hal_add8s +#define cv_hal_add8s TEGRA_ADD +#undef cv_hal_add16u +#define cv_hal_add16u TEGRA_ADD +#undef cv_hal_add16s +#define cv_hal_add16s TEGRA_ADD +#undef cv_hal_add32s +#define cv_hal_add32s TEGRA_ADD +#undef cv_hal_add32f +#define cv_hal_add32f TEGRA_ADDF +//#undef cv_hal_add64f +//#define cv_hal_add64f TEGRA_ADDF +#undef cv_hal_sub8u +#define cv_hal_sub8u TEGRA_SUB +#undef cv_hal_sub8s +#define cv_hal_sub8s TEGRA_SUB +#undef cv_hal_sub16u +#define cv_hal_sub16u TEGRA_SUB +#undef cv_hal_sub16s +#define cv_hal_sub16s TEGRA_SUB +#undef cv_hal_sub32s +#define cv_hal_sub32s TEGRA_SUB +#undef cv_hal_sub32f +#define cv_hal_sub32f TEGRA_SUBF +//#undef cv_hal_sub64f +//#define cv_hal_sub64f TEGRA_SUBF +#undef cv_hal_max8u +#define cv_hal_max8u TEGRA_MAX +#undef cv_hal_max8s +#define cv_hal_max8s TEGRA_MAX +#undef cv_hal_max16u +#define cv_hal_max16u TEGRA_MAX +#undef cv_hal_max16s +#define cv_hal_max16s TEGRA_MAX +#undef cv_hal_max32s +#define cv_hal_max32s TEGRA_MAX +#undef cv_hal_max32f +#define cv_hal_max32f TEGRA_MAX +//#undef cv_hal_max64f +//#define cv_hal_max64f TEGRA_MAX +#undef cv_hal_min8u +#define cv_hal_min8u TEGRA_MIN +#undef cv_hal_min8s +#define cv_hal_min8s TEGRA_MIN +#undef cv_hal_min16u +#define cv_hal_min16u TEGRA_MIN +#undef cv_hal_min16s +#define cv_hal_min16s TEGRA_MIN +#undef cv_hal_min32s +#define cv_hal_min32s TEGRA_MIN +#undef cv_hal_min32f +#define cv_hal_min32f TEGRA_MIN +//#undef cv_hal_min64f +//#define cv_hal_min64f TEGRA_MIN +#undef cv_hal_absdiff8u +#define cv_hal_absdiff8u TEGRA_ABSDIFF +#undef cv_hal_absdiff8s +#define cv_hal_absdiff8s TEGRA_ABSDIFF +#undef cv_hal_absdiff16u +#define cv_hal_absdiff16u TEGRA_ABSDIFF +#undef cv_hal_absdiff16s +#define cv_hal_absdiff16s TEGRA_ABSDIFF +#undef cv_hal_absdiff32s +#define cv_hal_absdiff32s TEGRA_ABSDIFF +#undef cv_hal_absdiff32f +#define cv_hal_absdiff32f TEGRA_ABSDIFF +//#undef cv_hal_absdiff64f +//#define cv_hal_absdiff64f TEGRA_ABSDIFF +#undef cv_hal_and8u +#define cv_hal_and8u TEGRA_AND +#undef cv_hal_or8u +#define cv_hal_or8u TEGRA_OR +#undef cv_hal_xor8u +#define cv_hal_xor8u TEGRA_XOR +#undef cv_hal_not8u +#define cv_hal_not8u TEGRA_NOT + +#define TEGRA_CMP(src1, sz1, src2, sz2, dst, sz, w, h, op) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + ((op) == cv::CMP_EQ) ? \ + CAROTENE_NS::cmpEQ(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_NE) ? \ + CAROTENE_NS::cmpNE(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_GT) ? \ + CAROTENE_NS::cmpGT(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_GE) ? \ + CAROTENE_NS::cmpGE(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_LT) ? \ + CAROTENE_NS::cmpGT(CAROTENE_NS::Size2D(w, h), \ + src2, sz2, \ + src1, sz1, \ + dst, sz), \ + CV_HAL_ERROR_OK : \ + ((op) == cv::CMP_LE) ? 
\ + CAROTENE_NS::cmpGE(CAROTENE_NS::Size2D(w, h), \ + src2, sz2, \ + src1, sz1, \ + dst, sz), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_cmp8u +#define cv_hal_cmp8u TEGRA_CMP +#undef cv_hal_cmp8s +#define cv_hal_cmp8s TEGRA_CMP +#undef cv_hal_cmp16u +#define cv_hal_cmp16u TEGRA_CMP +#undef cv_hal_cmp16s +#define cv_hal_cmp16s TEGRA_CMP +#undef cv_hal_cmp32s +#define cv_hal_cmp32s TEGRA_CMP +#undef cv_hal_cmp32f +#define cv_hal_cmp32f TEGRA_CMP +//#undef cv_hal_cmp64f +//#define cv_hal_cmp64f TEGRA_CMP + +#define TEGRA_MUL(src1, sz1, src2, sz2, dst, sz, w, h, scale) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::mul(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz, \ + scale, \ + CAROTENE_NS::CONVERT_POLICY_SATURATE), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_MULF(src1, sz1, src2, sz2, dst, sz, w, h, scale) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::mul(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz, \ + (float)scale), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_DIV(src1, sz1, src2, sz2, dst, sz, w, h, scale) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::div(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz, \ + scale, \ + CAROTENE_NS::CONVERT_POLICY_SATURATE), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_DIVF(src1, sz1, src2, sz2, dst, sz, w, h, scale) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::div(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz, \ + (float)scale), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_RECIP(src2, sz2, dst, sz, w, h, scale) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::reciprocal(CAROTENE_NS::Size2D(w, h), \ + src2, sz2, \ + dst, sz, \ + scale, \ + CAROTENE_NS::CONVERT_POLICY_SATURATE), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_RECIPF(src2, sz2, dst, sz, w, h, scale) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? 
\ + CAROTENE_NS::reciprocal(CAROTENE_NS::Size2D(w, h), \ + src2, sz2, \ + dst, sz, \ + (float)scale), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_mul8u +#define cv_hal_mul8u TEGRA_MUL +#undef cv_hal_mul8s +#define cv_hal_mul8s TEGRA_MUL +#undef cv_hal_mul16u +#define cv_hal_mul16u TEGRA_MUL +#undef cv_hal_mul16s +#define cv_hal_mul16s TEGRA_MUL +#undef cv_hal_mul32s +#define cv_hal_mul32s TEGRA_MUL +#undef cv_hal_mul32f +#define cv_hal_mul32f TEGRA_MULF +//#undef cv_hal_mul64f +//#define cv_hal_mul64f TEGRA_MULF +#undef cv_hal_div8u +#define cv_hal_div8u TEGRA_DIV +#undef cv_hal_div8s +#define cv_hal_div8s TEGRA_DIV +#undef cv_hal_div16u +#define cv_hal_div16u TEGRA_DIV +#undef cv_hal_div16s +#define cv_hal_div16s TEGRA_DIV +#undef cv_hal_div32s +#define cv_hal_div32s TEGRA_DIV +#undef cv_hal_div32f +#define cv_hal_div32f TEGRA_DIVF +//#undef cv_hal_div64f +//#define cv_hal_div64f TEGRA_DIVF +#undef cv_hal_recip8u +#define cv_hal_recip8u TEGRA_RECIP +#undef cv_hal_recip8s +#define cv_hal_recip8s TEGRA_RECIP +#undef cv_hal_recip16u +#define cv_hal_recip16u TEGRA_RECIP +#undef cv_hal_recip16s +#define cv_hal_recip16s TEGRA_RECIP +#undef cv_hal_recip32s +#define cv_hal_recip32s TEGRA_RECIP +#undef cv_hal_recip32f +#define cv_hal_recip32f TEGRA_RECIPF +//#undef cv_hal_recip64f +//#define cv_hal_recip64f TEGRA_RECIPF + +#define TEGRA_ADDWEIGHTED(src1, sz1, src2, sz2, dst, sz, w, h, scales) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + CAROTENE_NS::addWeighted(CAROTENE_NS::Size2D(w, h), \ + src1, sz1, \ + src2, sz2, \ + dst, sz, \ + ((double *)scales)[0], ((double *)scales)[1], ((double *)scales)[2]), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_addWeighted8u +#define cv_hal_addWeighted8u TEGRA_ADDWEIGHTED +#undef cv_hal_addWeighted8s +#define cv_hal_addWeighted8s TEGRA_ADDWEIGHTED +#undef cv_hal_addWeighted16u +#define cv_hal_addWeighted16u TEGRA_ADDWEIGHTED +#undef cv_hal_addWeighted16s +#define cv_hal_addWeighted16s TEGRA_ADDWEIGHTED +#undef cv_hal_addWeighted32s +#define cv_hal_addWeighted32s TEGRA_ADDWEIGHTED +//#undef cv_hal_addWeighted32f +//#define cv_hal_addWeighted32f TEGRA_ADDWEIGHTED +//#undef cv_hal_addWeighted64f +//#define cv_hal_addWeighted64f TEGRA_ADDWEIGHTED + +#endif //PARALLEL_CORE + +#define ROW_SRC_ARG1 const ST * src1_data_ +#define ROW_SRC_STORE1 , src1_data(src1_data_) +#define ROW_SRC_VAR1 const ST * src1_data; +#define ROW_SRC_ARG2 ROW_SRC_ARG1 \ + , const ST * src2_data_ +#define ROW_SRC_STORE2 ROW_SRC_STORE1 \ + , src2_data(src2_data_) +#define ROW_SRC_VAR2 ROW_SRC_VAR1 \ + const ST * src2_data; +#define ROW_SRC_ARG3 ROW_SRC_ARG2 \ + , const ST * src3_data_ +#define ROW_SRC_STORE3 ROW_SRC_STORE2 \ + , src3_data(src3_data_) +#define ROW_SRC_VAR3 ROW_SRC_VAR2 \ + const ST * src3_data; +#define ROW_SRC_ARG4 ROW_SRC_ARG3 \ + , const ST * src4_data_ +#define ROW_SRC_STORE4 ROW_SRC_STORE3 \ + , src4_data(src4_data_) +#define ROW_SRC_VAR4 ROW_SRC_VAR3 \ + const ST * src4_data; + +#define ROW_DST_ARG1 , DT * dst1_data_ +#define ROW_DST_STORE1 , dst1_data(dst1_data_) +#define ROW_DST_VAR1 DT * dst1_data; +#define ROW_DST_ARG2 ROW_DST_ARG1 \ + , DT * dst2_data_ +#define ROW_DST_STORE2 ROW_DST_STORE1 \ + , dst2_data(dst2_data_) +#define ROW_DST_VAR2 ROW_DST_VAR1 \ + DT * dst2_data; +#define ROW_DST_ARG3 ROW_DST_ARG2 \ + , DT * dst3_data_ +#define ROW_DST_STORE3 ROW_DST_STORE2 \ + , dst3_data(dst3_data_) +#define ROW_DST_VAR3 ROW_DST_VAR2 \ + DT * dst3_data; +#define ROW_DST_ARG4 ROW_DST_ARG3 \ + , DT * 
dst4_data_ +#define ROW_DST_STORE4 ROW_DST_STORE3 \ + , dst4_data(dst4_data_) +#define ROW_DST_VAR4 ROW_DST_VAR3 \ + DT * dst4_data; + +#define ROW_VAL_ARG0 +#define ROW_VAL_STORE0 +#define ROW_VAL_VAR0 +#define ROW_VAL_ARG1 , double val_ +#define ROW_VAL_STORE1 , val(val_) +#define ROW_VAL_VAR1 double val; + +#define TegraRowOp_Invoker(name, func, src_cnt, dst_cnt, val_cnt, ...) \ +template \ +class TegraRowOp_##name##_Invoker : public cv::ParallelLoopBody \ +{ \ +public: \ + TegraRowOp_##name##_Invoker(ROW_SRC_ARG##src_cnt \ + ROW_DST_ARG##dst_cnt \ + ROW_VAL_ARG##val_cnt) : \ + cv::ParallelLoopBody() ROW_SRC_STORE##src_cnt \ + ROW_DST_STORE##dst_cnt \ + ROW_VAL_STORE##val_cnt {} \ + virtual void operator()(const cv::Range& range) const \ + { \ + CAROTENE_NS::func(CAROTENE_NS::Size2D(range.end-range.start, 1), __VA_ARGS__); \ + } \ +private: \ + ROW_SRC_VAR##src_cnt \ + ROW_DST_VAR##dst_cnt \ + ROW_VAL_VAR##val_cnt \ + const TegraRowOp_##name##_Invoker& operator= (const TegraRowOp_##name##_Invoker&); \ +}; + + +#define TEGRA_SPLIT(src, dst, len, cn) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + cn == 2 ? \ + CAROTENE_NS::split2(CAROTENE_NS::Size2D(len, 1), \ + src, len, \ + dst[0], len, \ + dst[1], len), \ + CV_HAL_ERROR_OK : \ + cn == 3 ? \ + CAROTENE_NS::split3(CAROTENE_NS::Size2D(len, 1), \ + src, len, \ + dst[0], len, \ + dst[1], len, \ + dst[2], len), \ + CV_HAL_ERROR_OK : \ + cn == 4 ? \ + CAROTENE_NS::split4(CAROTENE_NS::Size2D(len, 1), \ + src, len, \ + dst[0], len, \ + dst[1], len, \ + dst[2], len, \ + dst[3], len), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +TegraRowOp_Invoker(split2, split2, 1, 2, 0, RANGE_DATA(ST, src1_data, 2*sizeof(ST)), range.end-range.start, + RANGE_DATA(DT, dst1_data, sizeof(DT)), range.end-range.start, + RANGE_DATA(DT, dst2_data, sizeof(DT)), range.end-range.start) +TegraRowOp_Invoker(split3, split3, 1, 3, 0, RANGE_DATA(ST, src1_data, 3*sizeof(ST)), range.end-range.start, + RANGE_DATA(DT, dst1_data, sizeof(DT)), range.end-range.start, + RANGE_DATA(DT, dst2_data, sizeof(DT)), range.end-range.start, + RANGE_DATA(DT, dst3_data, sizeof(DT)), range.end-range.start) +TegraRowOp_Invoker(split4, split4, 1, 4, 0, RANGE_DATA(ST, src1_data, 4*sizeof(ST)), range.end-range.start, + RANGE_DATA(DT, dst1_data, sizeof(DT)), range.end-range.start, + RANGE_DATA(DT, dst2_data, sizeof(DT)), range.end-range.start, + RANGE_DATA(DT, dst3_data, sizeof(DT)), range.end-range.start, + RANGE_DATA(DT, dst4_data, sizeof(DT)), range.end-range.start) +#define TEGRA_SPLIT64S(type, src, dst, len, cn) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + cn == 2 ? \ + parallel_for_(Range(0, len), \ + TegraRowOp_split2_Invoker(src, dst[0], dst[1]), \ + (len) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + cn == 3 ? \ + parallel_for_(Range(0, len), \ + TegraRowOp_split3_Invoker(src, dst[0], dst[1], dst[2]), \ + (len) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + cn == 4 ? \ + parallel_for_(Range(0, len), \ + TegraRowOp_split4_Invoker(src, dst[0], dst[1], dst[2], dst[3]), \ + (len) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_MERGE(src, dst, len, cn) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + cn == 2 ? \ + CAROTENE_NS::combine2(CAROTENE_NS::Size2D(len, 1), \ + src[0], len, \ + src[1], len, \ + dst, len), \ + CV_HAL_ERROR_OK : \ + cn == 3 ? 
\ + CAROTENE_NS::combine3(CAROTENE_NS::Size2D(len, 1), \ + src[0], len, \ + src[1], len, \ + src[2], len, \ + dst, len), \ + CV_HAL_ERROR_OK : \ + cn == 4 ? \ + CAROTENE_NS::combine4(CAROTENE_NS::Size2D(len, 1), \ + src[0], len, \ + src[1], len, \ + src[2], len, \ + src[3], len, \ + dst, len), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +TegraRowOp_Invoker(combine2, combine2, 2, 1, 0, RANGE_DATA(ST, src1_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(ST, src2_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(DT, dst1_data, 2*sizeof(DT)), range.end-range.start) +TegraRowOp_Invoker(combine3, combine3, 3, 1, 0, RANGE_DATA(ST, src1_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(ST, src2_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(ST, src3_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(DT, dst1_data, 3*sizeof(DT)), range.end-range.start) +TegraRowOp_Invoker(combine4, combine4, 4, 1, 0, RANGE_DATA(ST, src1_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(ST, src2_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(ST, src3_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(ST, src4_data, sizeof(ST)), range.end-range.start, + RANGE_DATA(DT, dst1_data, 4*sizeof(DT)), range.end-range.start) +#define TEGRA_MERGE64S(type, src, dst, len, cn) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + cn == 2 ? \ + parallel_for_(Range(0, len), \ + TegraRowOp_combine2_Invoker(src[0], src[1], dst), \ + (len) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + cn == 3 ? \ + parallel_for_(Range(0, len), \ + TegraRowOp_combine3_Invoker(src[0], src[1], src[2], dst), \ + (len) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + cn == 4 ? \ + parallel_for_(Range(0, len), \ + TegraRowOp_combine4_Invoker(src[0], src[1], src[2], src[3], dst), \ + (len) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK : \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_split8u +#define cv_hal_split8u TEGRA_SPLIT +#undef cv_hal_split16u +#define cv_hal_split16u TEGRA_SPLIT +#undef cv_hal_split32s +#define cv_hal_split32s TEGRA_SPLIT +#undef cv_hal_split64s +#define cv_hal_split64s(src, dst, len, cn) TEGRA_SPLIT64S(CAROTENE_NS::s64, src, dst, len, cn) + +#undef cv_hal_merge8u +#define cv_hal_merge8u TEGRA_MERGE +#undef cv_hal_merge16u +#define cv_hal_merge16u TEGRA_MERGE +#undef cv_hal_merge32s +#define cv_hal_merge32s TEGRA_MERGE +#undef cv_hal_merge64s +#define cv_hal_merge64s(src, dst, len, cn) TEGRA_MERGE64S(CAROTENE_NS::s64, src, dst, len, cn) + + +TegraRowOp_Invoker(phase, phase, 2, 1, 1, RANGE_DATA(ST, src1_data, sizeof(CAROTENE_NS::f32)), range.end-range.start, + RANGE_DATA(ST, src2_data, sizeof(CAROTENE_NS::f32)), range.end-range.start, + RANGE_DATA(DT, dst1_data, sizeof(CAROTENE_NS::f32)), range.end-range.start, val) +#define TEGRA_FASTATAN(y, x, dst, len, angleInDegrees) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + parallel_for_(Range(0, len), \ + TegraRowOp_phase_Invoker(x, y, dst, angleInDegrees ? 
1.0f : M_PI/180), \ + (len) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_fastAtan32f +#define cv_hal_fastAtan32f TEGRA_FASTATAN + +TegraRowOp_Invoker(magnitude, magnitude, 2, 1, 0, RANGE_DATA(ST, src1_data, sizeof(CAROTENE_NS::f32)), range.end-range.start, + RANGE_DATA(ST, src2_data, sizeof(CAROTENE_NS::f32)), range.end-range.start, + RANGE_DATA(DT, dst1_data, sizeof(CAROTENE_NS::f32)), range.end-range.start) +#define TEGRA_MAGNITUDE(x, y, dst, len) \ +( \ + CAROTENE_NS::isSupportedConfiguration() ? \ + parallel_for_(Range(0, len), \ + TegraRowOp_magnitude_Invoker(x, y, dst), \ + (len) / static_cast(1<<16)), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_magnitude32f +#define cv_hal_magnitude32f TEGRA_MAGNITUDE + + +#if defined OPENCV_IMGPROC_HAL_INTERFACE_H + +struct cvhalFilter2D; + +struct FilterCtx +{ + CAROTENE_NS::Size2D ksize; + CAROTENE_NS::s16* kernel_data; + CAROTENE_NS::BORDER_MODE border; +}; +inline int TEGRA_FILTERINIT(cvhalFilter2D **context, uchar *kernel_data, size_t kernel_step, int kernel_type, int kernel_width, int kernel_height, + int max_width, int max_height, int src_type, int dst_type, int borderType, double delta, int anchor_x, int anchor_y, bool allowSubmatrix, bool allowInplace) +{ + if(!context || !kernel_data || allowSubmatrix || allowInplace || + src_type != CV_8UC1 || dst_type != CV_8UC1 || + delta != 0 || anchor_x != kernel_width / 2 || anchor_y != kernel_height / 2 ) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + FilterCtx* ctx = new FilterCtx; + if(!ctx) + return CV_HAL_ERROR_UNKNOWN; + ctx->ksize.width = kernel_width; + ctx->ksize.height = kernel_height; + switch(borderType) + { + case CV_HAL_BORDER_CONSTANT: + ctx->border = CAROTENE_NS::BORDER_MODE_CONSTANT; + break; + case CV_HAL_BORDER_REPLICATE: + ctx->border = CAROTENE_NS::BORDER_MODE_REPLICATE; + break; + case CV_HAL_BORDER_REFLECT: + ctx->border = CAROTENE_NS::BORDER_MODE_REFLECT; + break; + case CV_HAL_BORDER_WRAP: + ctx->border = CAROTENE_NS::BORDER_MODE_WRAP; + break; + case CV_HAL_BORDER_REFLECT_101: + ctx->border = CAROTENE_NS::BORDER_MODE_REFLECT101; + break; + default: + delete ctx; + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + if(!CAROTENE_NS::isConvolutionSupported(CAROTENE_NS::Size2D(max_width, max_height), ctx->ksize, ctx->border)) + { + delete ctx; + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + ctx->kernel_data = new CAROTENE_NS::s16[kernel_width*kernel_height]; + if(!ctx->kernel_data) + return CV_HAL_ERROR_UNKNOWN; + switch(kernel_type) + { + case CV_8UC1: + convert(ctx->ksize, (CAROTENE_NS::u8*)kernel_data, kernel_step, ctx->kernel_data, kernel_width); + break; + case CV_8SC1: + convert(ctx->ksize, (CAROTENE_NS::s8*)kernel_data, kernel_step, ctx->kernel_data, kernel_width); + break; + case CV_16UC1: + for(int j = 0; j < kernel_height; ++j) + { + std::memcpy(ctx->kernel_data + kernel_width * j, kernel_data + kernel_step * j, kernel_width * sizeof(int16_t)); + } + default: + delete[] ctx->kernel_data; + delete ctx; + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + *context = (cvhalFilter2D*)(ctx); + return CV_HAL_ERROR_OK; +} +inline int TEGRA_FILTERFREE(cvhalFilter2D *context) +{ + if(context) + { + if(((FilterCtx*)context)->kernel_data) + delete[] ((FilterCtx*)context)->kernel_data; + delete (FilterCtx*)context; + return CV_HAL_ERROR_OK; + } + else + { + return CV_HAL_ERROR_UNKNOWN; + } +} +#define TEGRA_FILTERIMPL(context, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, 
offset_x, offset_y) \ +( \ + (void)full_width, (void)full_height, (void)offset_x, (void)offset_y, \ + context && CAROTENE_NS::isConvolutionSupported(CAROTENE_NS::Size2D(width, height), ((FilterCtx*)context)->ksize, ((FilterCtx*)context)->border) ? \ + CAROTENE_NS::convolution(CAROTENE_NS::Size2D(width, height), \ + src_data, src_step, \ + dst_data, dst_step, \ + ((FilterCtx*)context)->border, 0, \ + ((FilterCtx*)context)->ksize, ((FilterCtx*)context)->kernel_data, 1), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_filterInit +#define cv_hal_filterInit TEGRA_FILTERINIT +#undef cv_hal_filter +#define cv_hal_filter TEGRA_FILTERIMPL +#undef cv_hal_filterFree +#define cv_hal_filterFree TEGRA_FILTERFREE + + +struct SepFilterCtx +{ + int16_t kernelx_data[3]; + int16_t kernely_data[3]; + CAROTENE_NS::BORDER_MODE border; +}; +inline int TEGRA_SEPFILTERINIT(cvhalFilter2D **context, int src_type, int dst_type, int kernel_type, + uchar *kernelx_data, size_t , int kernelx_width, int kernelx_height, + uchar *kernely_data, size_t kernely_step, int kernely_width, int kernely_height, + int anchor_x, int anchor_y, double delta, int borderType) +{ + if(!context || !kernelx_data || !kernely_data || src_type != CV_8UC1 || dst_type != CV_16SC1 || + !(kernelx_width == 3 && kernelx_height == 1) || !(kernely_width == 1 && kernely_height == 3) || + delta != 0 || anchor_x != 1 || anchor_y != 1) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + SepFilterCtx* ctx = new SepFilterCtx; + if(!ctx) + return CV_HAL_ERROR_UNKNOWN; + switch(borderType) + { + case CV_HAL_BORDER_CONSTANT: + ctx->border = CAROTENE_NS::BORDER_MODE_CONSTANT; + break; + case CV_HAL_BORDER_REPLICATE: + ctx->border = CAROTENE_NS::BORDER_MODE_REPLICATE; + break; + case CV_HAL_BORDER_REFLECT: + ctx->border = CAROTENE_NS::BORDER_MODE_REFLECT; + break; + case CV_HAL_BORDER_WRAP: + ctx->border = CAROTENE_NS::BORDER_MODE_WRAP; + break; + case CV_HAL_BORDER_REFLECT_101: + ctx->border = CAROTENE_NS::BORDER_MODE_REFLECT101; + break; + default: + delete ctx; + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + if(!CAROTENE_NS::isSeparableFilter3x3Supported(CAROTENE_NS::Size2D(16, 16), ctx->border, 3, 3)) + { + delete ctx; + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + switch(kernel_type) + { + case CV_8UC1: + ctx->kernelx_data[0]=kernelx_data[0]; + ctx->kernelx_data[1]=kernelx_data[1]; + ctx->kernelx_data[2]=kernelx_data[2]; + ctx->kernely_data[0]=kernely_data[0]; + ctx->kernely_data[1]=kernely_data[kernely_step]; + ctx->kernely_data[2]=kernely_data[2*kernely_step]; + break; + case CV_8SC1: + ctx->kernelx_data[0]=((char*)kernelx_data)[0]; + ctx->kernelx_data[1]=((char*)kernelx_data)[1]; + ctx->kernelx_data[2]=((char*)kernelx_data)[2]; + ctx->kernely_data[0]=((char*)kernely_data)[0]; + ctx->kernely_data[1]=((char*)(kernely_data+kernely_step))[0]; + ctx->kernely_data[2]=((char*)(kernely_data+2*kernely_step))[0]; + break; + case CV_16UC1: + ctx->kernelx_data[0]=((int16_t*)kernelx_data)[0]; + ctx->kernelx_data[1]=((int16_t*)kernelx_data)[1]; + ctx->kernelx_data[2]=((int16_t*)kernelx_data)[2]; + ctx->kernely_data[0]=((int16_t*)kernely_data)[0]; + ctx->kernely_data[1]=((int16_t*)(kernely_data+kernely_step))[0]; + ctx->kernely_data[2]=((int16_t*)(kernely_data+2*kernely_step))[0]; + default: + delete ctx; + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + *context = (cvhalFilter2D*)(ctx); + return CV_HAL_ERROR_OK; +} +inline int TEGRA_SEPFILTERFREE(cvhalFilter2D *context) +{ + if(context) + { + delete (SepFilterCtx*)context; + return CV_HAL_ERROR_OK; + } 
+ else + { + return CV_HAL_ERROR_UNKNOWN; + } +} +#define TEGRA_SEPFILTERIMPL(context, src_data, src_step, dst_data, dst_step, width, height, full_width, full_height, offset_x, offset_y) \ +( \ + context && CAROTENE_NS::isSeparableFilter3x3Supported(CAROTENE_NS::Size2D(width, height), ((SepFilterCtx*)context)->border, 3, 3, \ + CAROTENE_NS::Margin(offset_x, full_width - width - offset_x, offset_y, full_height - height - offset_y)) ? \ + CAROTENE_NS::SeparableFilter3x3(CAROTENE_NS::Size2D(width, height), \ + src_data, src_step, \ + (CAROTENE_NS::s16*)dst_data, dst_step, \ + 3, 3, ((SepFilterCtx*)context)->kernelx_data, ((SepFilterCtx*)context)->kernely_data, \ + ((SepFilterCtx*)context)->border, 0, \ + CAROTENE_NS::Margin(offset_x, full_width - width - offset_x, offset_y, full_height - height - offset_y)), \ + CV_HAL_ERROR_OK \ + : CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#undef cv_hal_sepFilterInit +#define cv_hal_sepFilterInit TEGRA_SEPFILTERINIT +#undef cv_hal_sepFilter +#define cv_hal_sepFilter TEGRA_SEPFILTERIMPL +#undef cv_hal_sepFilterFree +#define cv_hal_sepFilterFree TEGRA_SEPFILTERFREE + + +struct MorphCtx +{ + int operation; + int channels; + CAROTENE_NS::Size2D ksize; + int anchor_x, anchor_y; + CAROTENE_NS::BORDER_MODE border; + uchar borderValues[4]; +}; +inline int TEGRA_MORPHINIT(cvhalFilter2D **context, int operation, int src_type, int dst_type, int, int, + int kernel_type, uchar *kernel_data, size_t kernel_step, int kernel_width, int kernel_height, int anchor_x, int anchor_y, + int borderType, const double borderValue[4], int iterations, bool allowSubmatrix, bool allowInplace) +{ + if(!context || !kernel_data || src_type != dst_type || + CV_MAT_DEPTH(src_type) != CV_8U || src_type < 0 || (src_type >> CV_CN_SHIFT) > 3 || + + allowSubmatrix || allowInplace || iterations != 1 || + !CAROTENE_NS::isSupportedConfiguration()) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + + switch(CV_MAT_DEPTH(kernel_type)) + { + case CV_8U: + if(CAROTENE_NS::countNonZero(CAROTENE_NS::Size2D(kernel_width, kernel_height), kernel_data, kernel_step) != kernel_width * kernel_height) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + break; + case CV_16U: + if(CAROTENE_NS::countNonZero(CAROTENE_NS::Size2D(kernel_width, kernel_height), (uint16_t*)kernel_data, kernel_step) != kernel_width * kernel_height) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + break; + case CV_32S: + if(CAROTENE_NS::countNonZero(CAROTENE_NS::Size2D(kernel_width, kernel_height), (int32_t*)kernel_data, kernel_step) != kernel_width * kernel_height) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + break; + case CV_32F: + if(CAROTENE_NS::countNonZero(CAROTENE_NS::Size2D(kernel_width, kernel_height), (float*)kernel_data, kernel_step) != kernel_width * kernel_height) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + break; + case CV_64F: + if(CAROTENE_NS::countNonZero(CAROTENE_NS::Size2D(kernel_width, kernel_height), (double*)kernel_data, kernel_step) != kernel_width * kernel_height) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + break; + default: + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + + MorphCtx* ctx = new MorphCtx; + if(!ctx) + return CV_HAL_ERROR_UNKNOWN; + ctx->channels = (src_type >> CV_CN_SHIFT) + 1; + ctx->ksize.width = kernel_width; + ctx->ksize.height = kernel_height; + ctx->anchor_x = anchor_x; + ctx->anchor_y = anchor_y; + switch(operation) + { + case MORPH_ERODE: + case MORPH_DILATE: + ctx->operation = operation; + break; + default: + delete ctx; + return CV_HAL_ERROR_NOT_IMPLEMENTED; + } + switch(borderType) + { + case CV_HAL_BORDER_CONSTANT: + ctx->border = 
CAROTENE_NS::BORDER_MODE_CONSTANT;
+        if( borderValue[0] == DBL_MAX && borderValue[1] == DBL_MAX && borderValue[2] == DBL_MAX && borderValue[3] == DBL_MAX )
+        {
+            if( operation == MORPH_ERODE )
+                for(int i = 0; i < ctx->channels; ++i)
+                    ctx->borderValues[i] = (CAROTENE_NS::u8)UCHAR_MAX;
+            else
+                for(int i = 0; i < ctx->channels; ++i)
+                    ctx->borderValues[i] = 0;
+        }
+        else
+        {
+            for(int i = 0; i < ctx->channels; ++i)
+                ctx->borderValues[i] = (CAROTENE_NS::u8)cv::saturate_cast<CAROTENE_NS::u8>(borderValue[i]);
+        }
+        break;
+    case CV_HAL_BORDER_REPLICATE:
+        ctx->border = CAROTENE_NS::BORDER_MODE_REPLICATE;
+        break;
+    case CV_HAL_BORDER_REFLECT:
+        ctx->border = CAROTENE_NS::BORDER_MODE_REFLECT;
+        break;
+    case CV_HAL_BORDER_WRAP:
+        ctx->border = CAROTENE_NS::BORDER_MODE_WRAP;
+        break;
+    case CV_HAL_BORDER_REFLECT_101:
+        ctx->border = CAROTENE_NS::BORDER_MODE_REFLECT101;
+        break;
+    default:
+        delete ctx;
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    }
+
+    *context = (cvhalFilter2D*)(ctx);
+    return CV_HAL_ERROR_OK;
+}
+inline int TEGRA_MORPHFREE(cvhalFilter2D *context)
+{
+    if(context)
+    {
+        delete (MorphCtx*)context;
+        return CV_HAL_ERROR_OK;
+    }
+    else
+    {
+        return CV_HAL_ERROR_UNKNOWN;
+    }
+}
+#define TEGRA_MORPHIMPL(context, src_data, src_step, dst_data, dst_step, width, height, src_full_width, src_full_height, src_roi_x, src_roi_y, dst_full_width, dst_full_height, dst_roi_x, dst_roi_y) \
+( \
+    (void)dst_full_width, (void)dst_full_height, (void)dst_roi_x, (void)dst_roi_y, \
+    context && CAROTENE_NS::isSupportedConfiguration() ? \
+    ((MorphCtx*)context)->operation == MORPH_ERODE ? \
+    CAROTENE_NS::erode(CAROTENE_NS::Size2D(width, height), ((MorphCtx*)context)->channels, \
+                       src_data, src_step, dst_data, dst_step, \
+                       ((MorphCtx*)context)->ksize, ((MorphCtx*)context)->anchor_x, ((MorphCtx*)context)->anchor_y, \
+                       ((MorphCtx*)context)->border, ((MorphCtx*)context)->border, ((MorphCtx*)context)->borderValues, \
+                       CAROTENE_NS::Margin(src_roi_x, src_full_width - width - src_roi_x, src_roi_y, src_full_height - height - src_roi_y)), \
+    CV_HAL_ERROR_OK : \
+    ((MorphCtx*)context)->operation == MORPH_DILATE ? \
+    CAROTENE_NS::dilate(CAROTENE_NS::Size2D(width, height), ((MorphCtx*)context)->channels, \
+                        src_data, src_step, dst_data, dst_step, \
+                        ((MorphCtx*)context)->ksize, ((MorphCtx*)context)->anchor_x, ((MorphCtx*)context)->anchor_y, \
+                        ((MorphCtx*)context)->border, ((MorphCtx*)context)->border, ((MorphCtx*)context)->borderValues, \
+                        CAROTENE_NS::Margin(src_roi_x, src_full_width - width - src_roi_x, src_roi_y, src_full_height - height - src_roi_y)), \
+    CV_HAL_ERROR_OK : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED \
+    : CV_HAL_ERROR_NOT_IMPLEMENTED \
+)
+
+#undef cv_hal_morphInit
+#define cv_hal_morphInit TEGRA_MORPHINIT
+#undef cv_hal_morph
+#define cv_hal_morph TEGRA_MORPHIMPL
+#undef cv_hal_morphFree
+#define cv_hal_morphFree TEGRA_MORPHFREE
+
+
+
+#define TEGRA_RESIZE(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, inv_scale_x, inv_scale_y, interpolation) \
+( \
+    interpolation == CV_HAL_INTER_LINEAR ?
\ + CV_MAT_DEPTH(src_type) == CV_8U && CAROTENE_NS::isResizeLinearOpenCVSupported(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), ((src_type >> CV_CN_SHIFT) + 1)) && \ + inv_scale_x > 0 && inv_scale_y > 0 && \ + (dst_width - 0.5)/inv_scale_x - 0.5 < src_width && (dst_height - 0.5)/inv_scale_y - 0.5 < src_height && \ + (dst_width + 0.5)/inv_scale_x + 0.5 >= src_width && (dst_height + 0.5)/inv_scale_y + 0.5 >= src_height && \ + std::abs(dst_width / inv_scale_x - src_width) < 0.1 && std::abs(dst_height / inv_scale_y - src_height) < 0.1 ? \ + CAROTENE_NS::resizeLinearOpenCV(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \ + src_data, src_step, dst_data, dst_step, 1.0/inv_scale_x, 1.0/inv_scale_y, ((src_type >> CV_CN_SHIFT) + 1)), \ + CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED : \ + interpolation == CV_HAL_INTER_AREA ? \ + CV_MAT_DEPTH(src_type) == CV_8U && CAROTENE_NS::isResizeAreaSupported(1.0/inv_scale_x, 1.0/inv_scale_y, ((src_type >> CV_CN_SHIFT) + 1)) && \ + std::abs(dst_width / inv_scale_x - src_width) < 0.1 && std::abs(dst_height / inv_scale_y - src_height) < 0.1 ? \ + CAROTENE_NS::resizeAreaOpenCV(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \ + src_data, src_step, dst_data, dst_step, 1.0/inv_scale_x, 1.0/inv_scale_y, ((src_type >> CV_CN_SHIFT) + 1)), \ + CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED : \ + /*nearest neighbour interpolation disabled due to rounding accuracy issues*/ \ + /*interpolation == CV_HAL_INTER_NEAREST ? \ + (src_type == CV_8UC1 || src_type == CV_8SC1) && CAROTENE_NS::isResizeNearestNeighborSupported(CAROTENE_NS::Size2D(src_width, src_height), 1) ? \ + CAROTENE_NS::resizeNearestNeighbor(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \ + src_data, src_step, dst_data, dst_step, 1.0/inv_scale_x, 1.0/inv_scale_y, 1), \ + CV_HAL_ERROR_OK : \ + (src_type == CV_8UC3 || src_type == CV_8SC3) && CAROTENE_NS::isResizeNearestNeighborSupported(CAROTENE_NS::Size2D(src_width, src_height), 3) ? \ + CAROTENE_NS::resizeNearestNeighbor(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \ + src_data, src_step, dst_data, dst_step, 1.0/inv_scale_x, 1.0/inv_scale_y, 3), \ + CV_HAL_ERROR_OK : \ + (src_type == CV_8UC4 || src_type == CV_8SC4 || src_type == CV_16UC2 || src_type == CV_16SC2 || src_type == CV_32SC1) && \ + CAROTENE_NS::isResizeNearestNeighborSupported(CAROTENE_NS::Size2D(src_width, src_height), 4) ? \ + CAROTENE_NS::resizeNearestNeighbor(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \ + src_data, src_step, dst_data, dst_step, 1.0/inv_scale_x, 1.0/inv_scale_y, 4), \ + CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED :*/ \ + CV_HAL_ERROR_NOT_IMPLEMENTED \ +) + +#define TEGRA_WARPAFFINE(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, M, interpolation, borderType, borderValue) \ +( \ + interpolation == CV_HAL_INTER_NEAREST ? \ + (src_type == CV_8UC1 || src_type == CV_8SC1) && (borderType == CV_HAL_BORDER_REPLICATE || borderType == CV_HAL_BORDER_CONSTANT) && \ + CAROTENE_NS::isWarpAffineNearestNeighborSupported(CAROTENE_NS::Size2D(src_width, src_height)) ? 
\
+    CAROTENE_NS::warpAffineNearestNeighbor(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \
+                                           src_data, src_step, \
+                                           std::vector<float>(M+0,M+6).data(), \
+                                           dst_data, dst_step, \
+                                           borderType == CV_HAL_BORDER_REPLICATE ? CAROTENE_NS::BORDER_MODE_REPLICATE : CAROTENE_NS::BORDER_MODE_CONSTANT, \
+                                           (CAROTENE_NS::u8)borderValue[0]), \
+    CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED : \
+    interpolation == CV_HAL_INTER_LINEAR ? \
+    (src_type == CV_8UC1 || src_type == CV_8SC1) && (borderType == CV_HAL_BORDER_REPLICATE || borderType == CV_HAL_BORDER_CONSTANT) && \
+    CAROTENE_NS::isWarpAffineLinearSupported(CAROTENE_NS::Size2D(src_width, src_height)) ? \
+    CAROTENE_NS::warpAffineLinear(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \
+                                  src_data, src_step, \
+                                  std::vector<float>(M+0,M+6).data(), \
+                                  dst_data, dst_step, \
+                                  borderType == CV_HAL_BORDER_REPLICATE ? CAROTENE_NS::BORDER_MODE_REPLICATE : CAROTENE_NS::BORDER_MODE_CONSTANT, \
+                                  (CAROTENE_NS::u8)borderValue[0]), \
+    CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED \
+)
+
+#define TEGRA_WARPPERSPECTIVE(src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, M, interpolation, borderType, borderValue) \
+( \
+    interpolation == CV_HAL_INTER_NEAREST ? \
+    (src_type == CV_8UC1 || src_type == CV_8SC1) && (borderType == CV_HAL_BORDER_REPLICATE || borderType == CV_HAL_BORDER_CONSTANT) && \
+    CAROTENE_NS::isWarpPerspectiveNearestNeighborSupported(CAROTENE_NS::Size2D(src_width, src_height)) ? \
+    CAROTENE_NS::warpPerspectiveNearestNeighbor(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \
+                                                src_data, src_step, \
+                                                std::vector<float>(M+0,M+9).data(), \
+                                                dst_data, dst_step, \
+                                                borderType == CV_HAL_BORDER_REPLICATE ? CAROTENE_NS::BORDER_MODE_REPLICATE : CAROTENE_NS::BORDER_MODE_CONSTANT, \
+                                                (CAROTENE_NS::u8)borderValue[0]), \
+    CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED : \
+    interpolation == CV_HAL_INTER_LINEAR ? \
+    (src_type == CV_8UC1 || src_type == CV_8SC1) && (borderType == CV_HAL_BORDER_REPLICATE || borderType == CV_HAL_BORDER_CONSTANT) && \
+    CAROTENE_NS::isWarpPerspectiveLinearSupported(CAROTENE_NS::Size2D(src_width, src_height)) ? \
+    CAROTENE_NS::warpPerspectiveLinear(CAROTENE_NS::Size2D(src_width, src_height), CAROTENE_NS::Size2D(dst_width, dst_height), \
+                                       src_data, src_step, \
+                                       std::vector<float>(M+0,M+9).data(), \
+                                       dst_data, dst_step, \
+                                       borderType == CV_HAL_BORDER_REPLICATE ? CAROTENE_NS::BORDER_MODE_REPLICATE : CAROTENE_NS::BORDER_MODE_CONSTANT, \
+                                       (CAROTENE_NS::u8)borderValue[0]), \
+    CV_HAL_ERROR_OK : CV_HAL_ERROR_NOT_IMPLEMENTED : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED \
+)
+
+#undef cv_hal_resize
+#define cv_hal_resize TEGRA_RESIZE
+//warpAffine/warpPerspective disabled due to rounding accuracy issue
+//#undef cv_hal_warpAffine
+//#define cv_hal_warpAffine TEGRA_WARPAFFINE
+//#undef cv_hal_warpPerspective
+//#define cv_hal_warpPerspective TEGRA_WARPPERSPECTIVE
+
+
+#define TegraCvtColor_Invoker(name, func, ...) \
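+/* Expands to a cv::ParallelLoopBody whose operator() hands the row range   \
+   [range.start, range.end) to the named CAROTENE_NS row function, so the   \
+   cvtColor paths below can be striped across threads by parallel_for_. */  \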
+class TegraCvtColor_##name##_Invoker : public cv::ParallelLoopBody \
+{ \
+public: \
+    TegraCvtColor_##name##_Invoker(const uchar * src_data_, size_t src_step_, uchar * dst_data_, size_t dst_step_, int width_, int height_) : \
+        cv::ParallelLoopBody(), src_data(src_data_), src_step(src_step_), dst_data(dst_data_), dst_step(dst_step_), width(width_), height(height_) {} \
+    virtual void operator()(const cv::Range& range) const \
+    { \
+        CAROTENE_NS::func(CAROTENE_NS::Size2D(width, range.end-range.start), __VA_ARGS__); \
+    } \
+private: \
+    const uchar * src_data; \
+    size_t src_step; \
+    uchar * dst_data; \
+    size_t dst_step; \
+    int width, height; \
+    const TegraCvtColor_##name##_Invoker& operator= (const TegraCvtColor_##name##_Invoker&); \
+};
+
+TegraCvtColor_Invoker(rgb2bgr, rgb2bgr, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(rgb2bgrx, rgb2bgrx, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(rgb2rgbx, rgb2rgbx, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(rgbx2bgr, rgbx2bgr, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(rgbx2rgb, rgbx2rgb, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(rgbx2bgrx, rgbx2bgrx, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+#define TEGRA_CVTBGRTOBGR(src_data, src_step, dst_data, dst_step, width, height, depth, scn, dcn, swapBlue) \
+( \
+    depth == CV_8U && CAROTENE_NS::isSupportedConfiguration() ? \
+    scn == 3 ? \
+    dcn == 3 ? \
+    swapBlue ? \
+    parallel_for_(Range(0, height), \
+                  TegraCvtColor_rgb2bgr_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                  (width * height) / static_cast<double>(1<<16)), \
+    CV_HAL_ERROR_OK : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED : \
+    dcn == 4 ? \
+    (swapBlue ? \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgb2bgrx_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgb2rgbx_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) ), \
+    CV_HAL_ERROR_OK : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED : \
+    scn == 4 ? \
+    dcn == 3 ? \
+    (swapBlue ? \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgbx2bgr_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgbx2rgb_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) ), \
+    CV_HAL_ERROR_OK : \
+    dcn == 4 ? \
+    swapBlue ? \
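+    /* 4-channel to 4-channel with R and B swapped (e.g. RGBA -> BGRA) */ \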
+    parallel_for_(Range(0, height), \
+                  TegraCvtColor_rgbx2bgrx_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                  (width * height) / static_cast<double>(1<<16)), \
+    CV_HAL_ERROR_OK : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED \
+    : CV_HAL_ERROR_NOT_IMPLEMENTED \
+)
+
+TegraCvtColor_Invoker(rgb2bgr565, rgb2bgr565, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(rgb2rgb565, rgb2rgb565, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(rgbx2bgr565, rgbx2bgr565, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(rgbx2rgb565, rgbx2rgb565, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+#define TEGRA_CVTBGRTOBGR565(src_data, src_step, dst_data, dst_step, width, height, scn, swapBlue, greenBits) \
+( \
+    greenBits == 6 && CAROTENE_NS::isSupportedConfiguration() ? \
+    scn == 3 ? \
+    (swapBlue ? \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgb2bgr565_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgb2rgb565_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) ), \
+    CV_HAL_ERROR_OK : \
+    scn == 4 ? \
+    (swapBlue ? \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgbx2bgr565_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgbx2rgb565_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) ), \
+    CV_HAL_ERROR_OK : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED \
+    : CV_HAL_ERROR_NOT_IMPLEMENTED \
+)
+
+TegraCvtColor_Invoker(rgb2gray, rgb2gray, CAROTENE_NS::COLOR_SPACE_BT601, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(bgr2gray, bgr2gray, CAROTENE_NS::COLOR_SPACE_BT601, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(rgbx2gray, rgbx2gray, CAROTENE_NS::COLOR_SPACE_BT601, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(bgrx2gray, bgrx2gray, CAROTENE_NS::COLOR_SPACE_BT601, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+#define TEGRA_CVTBGRTOGRAY(src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue) \
+( \
+    depth == CV_8U && CAROTENE_NS::isSupportedConfiguration() ? \
+    scn == 3 ? \
+    (swapBlue ? \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgb2gray_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_bgr2gray_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) ), \
+    CV_HAL_ERROR_OK : \
+    scn == 4 ? \
+    (swapBlue ? \
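+     /* the *2gray invokers above fix COLOR_SPACE_BT601, i.e. BT.601 luma weights */ \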
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgbx2gray_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_bgrx2gray_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) ), \
+    CV_HAL_ERROR_OK : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED \
+    : CV_HAL_ERROR_NOT_IMPLEMENTED \
+)
+
+TegraCvtColor_Invoker(gray2rgb, gray2rgb, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(gray2rgbx, gray2rgbx, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+#define TEGRA_CVTGRAYTOBGR(src_data, src_step, dst_data, dst_step, width, height, depth, dcn) \
+( \
+    depth == CV_8U && CAROTENE_NS::isSupportedConfiguration() ? \
+    dcn == 3 ? \
+    parallel_for_(Range(0, height), \
+                  TegraCvtColor_gray2rgb_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                  (width * height) / static_cast<double>(1<<16)), \
+    CV_HAL_ERROR_OK : \
+    dcn == 4 ? \
+    parallel_for_(Range(0, height), \
+                  TegraCvtColor_gray2rgbx_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                  (width * height) / static_cast<double>(1<<16)), \
+    CV_HAL_ERROR_OK : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED \
+    : CV_HAL_ERROR_NOT_IMPLEMENTED \
+)
+
+TegraCvtColor_Invoker(rgb2ycrcb, rgb2ycrcb, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(bgr2ycrcb, bgr2ycrcb, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(rgbx2ycrcb, rgbx2ycrcb, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+TegraCvtColor_Invoker(bgrx2ycrcb, bgrx2ycrcb, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step)
+#define TEGRA_CVTBGRTOYUV(src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isCbCr) \
+( \
+    isCbCr && depth == CV_8U && CAROTENE_NS::isSupportedConfiguration() ? \
+    scn == 3 ? \
+    (swapBlue ? \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgb2ycrcb_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_bgr2ycrcb_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) ), \
+    CV_HAL_ERROR_OK : \
+    scn == 4 ? \
+    (swapBlue ? \
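+     /* reached only when isCbCr holds: carotene accelerates the YCrCb variant only */ \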
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgbx2ycrcb_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_bgrx2ycrcb_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) ), \
+    CV_HAL_ERROR_OK : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED \
+    : CV_HAL_ERROR_NOT_IMPLEMENTED \
+)
+
+TegraCvtColor_Invoker(rgb2hsv, rgb2hsv, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step, 180)
+TegraCvtColor_Invoker(bgr2hsv, bgr2hsv, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step, 180)
+TegraCvtColor_Invoker(rgbx2hsv, rgbx2hsv, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step, 180)
+TegraCvtColor_Invoker(bgrx2hsv, bgrx2hsv, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step, 180)
+TegraCvtColor_Invoker(rgb2hsvf, rgb2hsv, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step, 256)
+TegraCvtColor_Invoker(bgr2hsvf, bgr2hsv, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step, 256)
+TegraCvtColor_Invoker(rgbx2hsvf, rgbx2hsv, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step, 256)
+TegraCvtColor_Invoker(bgrx2hsvf, bgrx2hsv, src_data + static_cast<size_t>(range.start) * src_step, src_step, \
+                      dst_data + static_cast<size_t>(range.start) * dst_step, dst_step, 256)
+#define TEGRA_CVTBGRTOHSV(src_data, src_step, dst_data, dst_step, width, height, depth, scn, swapBlue, isFullRange, isHSV) \
+( \
+    isHSV && depth == CV_8U && CAROTENE_NS::isSupportedConfiguration() ? \
+    scn == 3 ? \
+    (swapBlue ? \
+     isFullRange ? \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgb2hsvf_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgb2hsv_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     isFullRange ? \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_bgr2hsvf_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_bgr2hsv_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) ), \
+    CV_HAL_ERROR_OK : \
+    scn == 4 ? \
+    (swapBlue ? \
+     isFullRange ? \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgbx2hsvf_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_rgbx2hsv_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     isFullRange ? \
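+     /* the *hsvf invokers quantize hue to 0..255 (full range), the others to 0..179 */ \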
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_bgrx2hsvf_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) : \
+     parallel_for_(Range(0, height), \
+                   TegraCvtColor_bgrx2hsv_Invoker(src_data, src_step, dst_data, dst_step, width, height), \
+                   (width * height) / static_cast<double>(1<<16)) ), \
+    CV_HAL_ERROR_OK : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED \
+    : CV_HAL_ERROR_NOT_IMPLEMENTED \
+)
+
+#define TEGRA_CVT2PYUVTOBGR(src_data, src_step, dst_data, dst_step, dst_width, dst_height, dcn, swapBlue, uIdx) \
+( \
+    CAROTENE_NS::isSupportedConfiguration() ? \
+    dcn == 3 ? \
+    uIdx == 0 ? \
+    (swapBlue ? \
+     CAROTENE_NS::yuv420i2rgb(CAROTENE_NS::Size2D(dst_width, dst_height), \
+                              src_data, src_step, \
+                              src_data + src_step * dst_height, src_step, \
+                              dst_data, dst_step) : \
+     CAROTENE_NS::yuv420i2bgr(CAROTENE_NS::Size2D(dst_width, dst_height), \
+                              src_data, src_step, \
+                              src_data + src_step * dst_height, src_step, \
+                              dst_data, dst_step)), \
+    CV_HAL_ERROR_OK : \
+    uIdx == 1 ? \
+    (swapBlue ? \
+     CAROTENE_NS::yuv420sp2rgb(CAROTENE_NS::Size2D(dst_width, dst_height), \
+                               src_data, src_step, \
+                               src_data + src_step * dst_height, src_step, \
+                               dst_data, dst_step) : \
+     CAROTENE_NS::yuv420sp2bgr(CAROTENE_NS::Size2D(dst_width, dst_height), \
+                               src_data, src_step, \
+                               src_data + src_step * dst_height, src_step, \
+                               dst_data, dst_step)), \
+    CV_HAL_ERROR_OK : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED : \
+    dcn == 4 ? \
+    uIdx == 0 ? \
+    (swapBlue ? \
+     CAROTENE_NS::yuv420i2rgbx(CAROTENE_NS::Size2D(dst_width, dst_height), \
+                               src_data, src_step, \
+                               src_data + src_step * dst_height, src_step, \
+                               dst_data, dst_step) : \
+     CAROTENE_NS::yuv420i2bgrx(CAROTENE_NS::Size2D(dst_width, dst_height), \
+                               src_data, src_step, \
+                               src_data + src_step * dst_height, src_step, \
+                               dst_data, dst_step)), \
+    CV_HAL_ERROR_OK : \
+    uIdx == 1 ? \
+    (swapBlue ? \
+     CAROTENE_NS::yuv420sp2rgbx(CAROTENE_NS::Size2D(dst_width, dst_height), \
+                                src_data, src_step, \
+                                src_data + src_step * dst_height, src_step, \
+                                dst_data, dst_step) : \
+     CAROTENE_NS::yuv420sp2bgrx(CAROTENE_NS::Size2D(dst_width, dst_height), \
+                                src_data, src_step, \
+                                src_data + src_step * dst_height, src_step, \
+                                dst_data, dst_step)), \
+    CV_HAL_ERROR_OK : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED : \
+    CV_HAL_ERROR_NOT_IMPLEMENTED \
+    : CV_HAL_ERROR_NOT_IMPLEMENTED \
+)
+
+#undef cv_hal_cvtBGRtoBGR
+#define cv_hal_cvtBGRtoBGR TEGRA_CVTBGRTOBGR
+#undef cv_hal_cvtBGRtoBGR5x5
+#define cv_hal_cvtBGRtoBGR5x5 TEGRA_CVTBGRTOBGR565
+#undef cv_hal_cvtBGRtoGray
+#define cv_hal_cvtBGRtoGray TEGRA_CVTBGRTOGRAY
+#undef cv_hal_cvtGraytoBGR
+#define cv_hal_cvtGraytoBGR TEGRA_CVTGRAYTOBGR
+#undef cv_hal_cvtBGRtoYUV
+#define cv_hal_cvtBGRtoYUV TEGRA_CVTBGRTOYUV
+#undef cv_hal_cvtBGRtoHSV
+#define cv_hal_cvtBGRtoHSV TEGRA_CVTBGRTOHSV
+#undef cv_hal_cvtTwoPlaneYUVtoBGR
+#define cv_hal_cvtTwoPlaneYUVtoBGR TEGRA_CVT2PYUVTOBGR
+
+#endif // OPENCV_IMGPROC_HAL_INTERFACE_H
+
+#endif
diff --git a/3rdparty/carotene/include/carotene/definitions.hpp b/3rdparty/carotene/include/carotene/definitions.hpp
new file mode 100644
index 0000000000..124a674d61
--- /dev/null
+++ b/3rdparty/carotene/include/carotene/definitions.hpp
@@ -0,0 +1,47 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#ifndef CAROTENE_DEFINITIONS_HPP +#define CAROTENE_DEFINITIONS_HPP + +#ifndef CAROTENE_NS +#define CAROTENE_NS carotene +#endif + +#endif diff --git a/3rdparty/carotene/include/carotene/functions.hpp b/3rdparty/carotene/include/carotene/functions.hpp new file mode 100644 index 0000000000..76d1328194 --- /dev/null +++ b/3rdparty/carotene/include/carotene/functions.hpp @@ -0,0 +1,2492 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#ifndef CAROTENE_FUNCTIONS_HPP
+#define CAROTENE_FUNCTIONS_HPP
+
+#include <carotene/definitions.hpp>
+#include <carotene/types.hpp>
+
+namespace CAROTENE_NS {
+    /* If this returns false, none of the functions will work. */
+    bool isSupportedConfiguration();
+
+    /*
+        For each point `p` within `size`, do:
+        dst[p] = src0[p] + src1[p]
+    */
+    void add(const Size2D &size,
+             const u8 *src0Base, ptrdiff_t src0Stride,
+             const u8 *src1Base, ptrdiff_t src1Stride,
+             u8 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void add(const Size2D &size,
+             const u8 *src0Base, ptrdiff_t src0Stride,
+             const u8 *src1Base, ptrdiff_t src1Stride,
+             s16 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void add(const Size2D &size,
+             const u8 *src0Base, ptrdiff_t src0Stride,
+             const s16 *src1Base, ptrdiff_t src1Stride,
+             s16 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void add(const Size2D &size,
+             const s8 *src0Base, ptrdiff_t src0Stride,
+             const s8 *src1Base, ptrdiff_t src1Stride,
+             s8 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void add(const Size2D &size,
+             const s16 *src0Base, ptrdiff_t src0Stride,
+             const s16 *src1Base, ptrdiff_t src1Stride,
+             s16 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void add(const Size2D &size,
+             const u16 * src0Base, ptrdiff_t src0Stride,
+             const u16 * src1Base, ptrdiff_t src1Stride,
+             u16 * dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void add(const Size2D &size,
+             const s32 * src0Base, ptrdiff_t src0Stride,
+             const s32 * src1Base, ptrdiff_t src1Stride,
+             s32 * dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void add(const Size2D &size,
+             const u32 * src0Base, ptrdiff_t src0Stride,
+             const u32 * src1Base, ptrdiff_t src1Stride,
+             u32 * dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void add(const Size2D &size,
+             const f32 * src0Base, ptrdiff_t src0Stride,
+             const f32 * src1Base, ptrdiff_t src1Stride,
+             f32 * dstBase, ptrdiff_t dstStride);
+
+    /*
+        For each point `p` within `size`, do:
+        dst[p] = src0[p] - src1[p]
+    */
+    void sub(const Size2D &size,
+             const u8 *src0Base, ptrdiff_t src0Stride,
+             const u8 *src1Base, ptrdiff_t src1Stride,
+             u8 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void sub(const Size2D &size,
+             const u8 *src0Base, ptrdiff_t src0Stride,
+             const u8 *src1Base, ptrdiff_t src1Stride,
+             s16 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void sub(const Size2D &size,
+             const u8 *src0Base, ptrdiff_t src0Stride,
+             const u8 *src1Base, ptrdiff_t src1Stride,
+             f32 *dstBase, ptrdiff_t dstStride);
+
+    void sub(const Size2D &size,
+             const u8 *src0Base, ptrdiff_t src0Stride,
+             const s16 *src1Base, ptrdiff_t src1Stride,
+             s16 *dstBase, ptrdiff_t dstStride,
+             CONVERT_POLICY policy);
+
+    void
sub(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy); + + void sub(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy); + + void sub(const Size2D &size, + const s8 *src0Base, ptrdiff_t src0Stride, + const s8 *src1Base, ptrdiff_t src1Stride, + s8 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy); + + void sub(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy); + + void sub(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy); + + void sub(const Size2D &size, + const u32 * src0Base, ptrdiff_t src0Stride, + const u32 * src1Base, ptrdiff_t src1Stride, + u32 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy); + + void sub(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] * alpha + src1[p] * beta + gamma + */ + void addWeighted(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + u8 * dstBase, ptrdiff_t dstStride, + f32 alpha, f32 beta, f32 gamma); + + void addWeighted(const Size2D &size, + const s8 * src0Base, ptrdiff_t src0Stride, + const s8 * src1Base, ptrdiff_t src1Stride, + s8 * dstBase, ptrdiff_t dstStride, + f32 alpha, f32 beta, f32 gamma); + + void addWeighted(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 * dstBase, ptrdiff_t dstStride, + f32 alpha, f32 beta, f32 gamma); + + void addWeighted(const Size2D &size, + const s16 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 alpha, f32 beta, f32 gamma); + + void addWeighted(const Size2D &size, + const u32 * src0Base, ptrdiff_t src0Stride, + const u32 * src1Base, ptrdiff_t src1Stride, + u32 * dstBase, ptrdiff_t dstStride, + f32 alpha, f32 beta, f32 gamma); + + void addWeighted(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride, + f32 alpha, f32 beta, f32 gamma); + + void addWeighted(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride, + f32 alpha, f32 beta, f32 gamma); + + /* + For each point `p` within `size`, do: + dst[p] = min(src0[p], src1[p]) + */ + void min(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void min(const Size2D &size, + const s8 *src0Base, ptrdiff_t src0Stride, + const s8 *src1Base, ptrdiff_t src1Stride, + s8 *dstBase, ptrdiff_t dstStride); + + void min(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 * dstBase, ptrdiff_t dstStride); + + void min(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride); + + void 
min(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride); + + void min(const Size2D &size, + const u32 * src0Base, ptrdiff_t src0Stride, + const u32 * src1Base, ptrdiff_t src1Stride, + u32 * dstBase, ptrdiff_t dstStride); + + void min(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = max(src0[p], src1[p]) + */ + void max(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void max(const Size2D &size, + const s8 *src0Base, ptrdiff_t src0Stride, + const s8 *src1Base, ptrdiff_t src1Stride, + s8 *dstBase, ptrdiff_t dstStride); + + void max(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 * dstBase, ptrdiff_t dstStride); + + void max(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride); + + void max(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride); + + void max(const Size2D &size, + const u32 * src0Base, ptrdiff_t src0Stride, + const u32 * src1Base, ptrdiff_t src1Stride, + u32 * dstBase, ptrdiff_t dstStride); + + void max(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] * src1[p] * scale + + NOTE: ROUND_TO_ZERO convert policy is used + */ + void mul(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + u8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void mul(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void mul(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void mul(const Size2D &size, + const s8 * src0Base, ptrdiff_t src0Stride, + const s8 * src1Base, ptrdiff_t src1Stride, + s8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void mul(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void mul(const Size2D &size, + const s16 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void mul(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride, + f64 scale, + CONVERT_POLICY cpolicy); + + void mul(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride, + f32 scale); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] * scale / src1[p] + + NOTE: ROUND_TO_ZERO convert policy is used + 
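+
+        e.g. a saturating per-pixel ratio of two u8 images -- a sketch with
+        placeholder buffer names, assuming the saturating CONVERT_POLICY
+        enumerator from types.hpp:
+
+          CAROTENE_NS::div(CAROTENE_NS::Size2D(width, height),
+                           numerator, numStride,
+                           denominator, denStride,
+                           ratio, ratioStride,
+                           255.0f, CAROTENE_NS::CONVERT_POLICY_SATURATE);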
*/ + void div(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + u8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void div(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void div(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void div(const Size2D &size, + const s8 * src0Base, ptrdiff_t src0Stride, + const s8 * src1Base, ptrdiff_t src1Stride, + s8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void div(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void div(const Size2D &size, + const s16 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void div(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void div(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride, + f32 scale); + + /* + For each point `p` within `size`, do: + dst[p] = scale / src[p] + + NOTE: ROUND_TO_ZERO convert policy is used + */ + void reciprocal(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void reciprocal(const Size2D &size, + const s8 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void reciprocal(const Size2D &size, + const u16 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void reciprocal(const Size2D &size, + const s16 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void reciprocal(const Size2D &size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy); + + void reciprocal(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + f32 scale); + + /* + For each point `p` within `size`, set `dst[p]` to the median + of `src[p]` and the 8 points around it. If `srcMargin` is + zero on any side, get the neighbors on that side by replicating + the edge. + */ + bool isMedianFilter3x3Supported(const Size2D &size, u32 numChannels); + void medianFilter3x3(const Size2D &size, u32 numChannels, + const u8 *srcBase, ptrdiff_t srcStride, + const Margin &srcMargin, + u8 *dstBase, ptrdiff_t dstStride); + + /* + Apply a half Gaussian filter + half Scale, as one level of a Gaussian + pyramid. For all `p` within `dstSize`, set `dst[p]` to `f[2 * p]`, where + `f` is an image of size srcSize obtained by filtering src with the 5x5 + Gaussian kernel ([1 4 6 4 1]'*[1 4 6 4 1]/256) using the border mode + passed in, and round-to-zero rounding. + dstSize must be (srcSize.width / 2, srcSize.height / 2), rounded by any method. 
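+        E.g. for a 640x480 source the destination is 320x240, while a
+        99x99 source may map to either 49x49 or 50x50.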
+ */ + bool isGaussianPyramidDownRTZSupported(const Size2D &srcSize, const Size2D &dstSize, BORDER_MODE border); + void gaussianPyramidDownRTZ(const Size2D &srcSize, + const u8 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + u8 *dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + + /* Same as above, but uses round-half-up rounding. */ + + bool isGaussianPyramidDownU8Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn); + void gaussianPyramidDown(const Size2D &srcSize, + const u8 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + u8 *dstBase, ptrdiff_t dstStride, u8 cn); + + + bool isGaussianPyramidDownS16Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn); + void gaussianPyramidDown(const Size2D &srcSize, + const s16 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + s16 *dstBase, ptrdiff_t dstStride, u8 cn); + + bool isGaussianPyramidDownF32Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn); + void gaussianPyramidDown(const Size2D &srcSize, + const f32 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + f32 *dstBase, ptrdiff_t dstStride, u8 cn); + + bool isGaussianPyramidUpU8Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn); + void gaussianPyramidUp(const Size2D &srcSize, + const u8 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + u8 *dstBase, ptrdiff_t dstStride, u8 cn); + + bool isGaussianPyramidUpS16Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn); + void gaussianPyramidUp(const Size2D &srcSize, + const s16 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + s16 *dstBase, ptrdiff_t dstStride, u8 cn); + + /* + For each point `p` within `size`, do: + dst[p] = src[p] > threshold ? trueValue : falseValue + */ + void thresholdBinary(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold, u8 trueValue = 255, u8 falseValue = 0); + + /* + For each point `p` within `size`, do: + dst[p] = lower <= src[p] && src[p] <= upper ? trueValue : falseValue + */ + void thresholdRange(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 lowerThreshold, u8 upperThreshold, + u8 trueValue = 255, u8 falseValue = 0); + + /* + For each point `p` within `size`, do: + dst[p] = src[p] > threshold ? value : 0 + */ + void thresholdBinary(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold, u8 value); + + void thresholdBinary(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold, s8 value); + + void thresholdBinary(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold, u16 value); + + void thresholdBinary(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold, s16 value); + + void thresholdBinary(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold, s32 value); + + void thresholdBinary(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold, f32 value); + + /* + For each point `p` within `size`, do: + dst[p] = src[p] > threshold ? 
0 : value + */ + void thresholdBinaryInv(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold, u8 value); + + void thresholdBinaryInv(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold, s8 value); + + void thresholdBinaryInv(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold, u16 value); + + void thresholdBinaryInv(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold, s16 value); + + void thresholdBinaryInv(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold, s32 value); + + void thresholdBinaryInv(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold, f32 value); + + /* + For each point `p` within `size`, do: + dst[p] = src[p] > threshold ? threshold : src[p] + */ + void thresholdTruncate(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold); + + void thresholdTruncate(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold); + + void thresholdTruncate(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold); + + void thresholdTruncate(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold); + + void thresholdTruncate(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold); + + void thresholdTruncate(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold); + + /* + For each point `p` within `size`, do: + dst[p] = src[p] > threshold ? src[p] : 0 + */ + void thresholdToZero(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold); + + void thresholdToZero(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold); + + void thresholdToZero(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold); + + void thresholdToZero(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold); + + void thresholdToZero(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold); + + void thresholdToZero(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold); + + /* + For each point `p` within `size`, do: + dst[p] = src[p] > threshold ? 
0 : src[p] + */ + void thresholdToZeroInv(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold); + + void thresholdToZeroInv(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold); + + void thresholdToZeroInv(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold); + + void thresholdToZeroInv(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold); + + void thresholdToZeroInv(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold); + + void thresholdToZeroInv(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold); + + /* + For each point `p` within `size`, do: + dst[p] = abs(src0[p] - src1[p]) + */ + void absDiff(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void absDiff(const Size2D &size, + const u16 *src0Base, ptrdiff_t src0Stride, + const u16 *src1Base, ptrdiff_t src1Stride, + u16 *dstBase, ptrdiff_t dstStride); + + void absDiff(const Size2D &size, + const s8 *src0Base, ptrdiff_t src0Stride, + const s8 *src1Base, ptrdiff_t src1Stride, + s8 *dstBase, ptrdiff_t dstStride); + + void absDiff(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride); + + void absDiff(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride); + + void absDiff(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = ~src[p] + */ + void bitwiseNot(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] & src1[p] + */ + void bitwiseAnd(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] | src1[p] + */ + void bitwiseOr(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] ^ src1[p] + */ + void bitwiseXor(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] == src1[p] ? 
255 : 0 + */ + void cmpEQ(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpEQ(const Size2D &size, + const s8 *src0Base, ptrdiff_t src0Stride, + const s8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpEQ(const Size2D &size, + const u16 *src0Base, ptrdiff_t src0Stride, + const u16 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpEQ(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpEQ(const Size2D &size, + const u32 *src0Base, ptrdiff_t src0Stride, + const u32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpEQ(const Size2D &size, + const s32 *src0Base, ptrdiff_t src0Stride, + const s32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpEQ(const Size2D &size, + const f32 *src0Base, ptrdiff_t src0Stride, + const f32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] != src1[p] ? 255 : 0 + */ + void cmpNE(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpNE(const Size2D &size, + const s8 *src0Base, ptrdiff_t src0Stride, + const s8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpNE(const Size2D &size, + const u16 *src0Base, ptrdiff_t src0Stride, + const u16 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpNE(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpNE(const Size2D &size, + const u32 *src0Base, ptrdiff_t src0Stride, + const u32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpNE(const Size2D &size, + const s32 *src0Base, ptrdiff_t src0Stride, + const s32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpNE(const Size2D &size, + const f32 *src0Base, ptrdiff_t src0Stride, + const f32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] > src1[p] ? 
255 : 0 + */ + void cmpGT(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGT(const Size2D &size, + const s8 *src0Base, ptrdiff_t src0Stride, + const s8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGT(const Size2D &size, + const u16 *src0Base, ptrdiff_t src0Stride, + const u16 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGT(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGT(const Size2D &size, + const u32 *src0Base, ptrdiff_t src0Stride, + const u32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGT(const Size2D &size, + const s32 *src0Base, ptrdiff_t src0Stride, + const s32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGT(const Size2D &size, + const f32 *src0Base, ptrdiff_t src0Stride, + const f32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src0[p] >= src1[p] ? 255 : 0 + */ + void cmpGE(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGE(const Size2D &size, + const s8 *src0Base, ptrdiff_t src0Stride, + const s8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGE(const Size2D &size, + const u16 *src0Base, ptrdiff_t src0Stride, + const u16 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGE(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGE(const Size2D &size, + const u32 *src0Base, ptrdiff_t src0Stride, + const u32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGE(const Size2D &size, + const s32 *src0Base, ptrdiff_t src0Stride, + const s32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + void cmpGE(const Size2D &size, + const f32 *src0Base, ptrdiff_t src0Stride, + const f32 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride); + + /* + Calculates dot product + */ + f64 dotProduct(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride); + + f64 dotProduct(const Size2D &size, + const s8 * src0Base, ptrdiff_t src0Stride, + const s8 * src1Base, ptrdiff_t src1Stride); + + f64 dotProduct(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride); + + /* + Calculates mean and stddev + */ + void meanStdDev(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + f32 * pMean, f32 * pStdDev); + + void meanStdDev(const Size2D &size, + const u16 * srcBase, ptrdiff_t srcStride, + f32 * pMean, f32 * pStdDev); + + /* + For each point `p` within `size`, do: + dst[p] = sqrt(src0[p] ^ 2 + src1[p] ^ 2) + */ + void magnitude(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride); + + void magnitude(const Size2D &size, + const f32 *src0Base, ptrdiff_t src0Stride, + const f32 *src1Base, ptrdiff_t src1Stride, + f32 *dstBase, ptrdiff_t dstStride); + + /* + Compute an integral image + */ + void integral(const Size2D 
&size, + const u8 * srcBase, ptrdiff_t srcStride, + u32 * sumBase, ptrdiff_t sumStride); + + /* + Compute an integral of squared image values + */ + void sqrIntegral(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + f64 * sqsumBase, ptrdiff_t sqsumStride); + + /* + Find the min and max values among all pixels `p` within `src` + */ + void minMaxVals(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 * minVal, u8 * maxVal); + + void minMaxVals(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 * minVal, s16 * maxVal); + + void minMaxVals(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 * minVal, u16 * maxVal); + + void minMaxVals(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 * minVal, s32 * maxVal); + + void minMaxVals(const Size2D &size, + const u32 *srcBase, ptrdiff_t srcStride, + u32 * minVal, u32 * maxVal); + + /* + Fill the arrays `minLocPtr`, `maxLocPtr` with the locations of + the given values `minVal`, `maxVal` + */ + void fillMinMaxLocs(const Size2D & size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + u8 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity); + + void fillMinMaxLocs(const Size2D & size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + u16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity); + + void fillMinMaxLocs(const Size2D & size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + s16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity); + + void fillMinMaxLocs(const Size2D & size, + const u32 *srcBase, ptrdiff_t srcStride, + u32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + u32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity); + + void fillMinMaxLocs(const Size2D & size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + s32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity); + + /* + Find the min and max values among all pixels `p` within `src`, and their first occurrences + */ + void minMaxLoc(const Size2D &size, + const s8 * srcBase, ptrdiff_t srcStride, + s8 &minVal, size_t &minCol, size_t &minRow, + s8 &maxVal, size_t &maxCol, size_t &maxRow); + + void minMaxLoc(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 &minVal, size_t &minCol, size_t &minRow, + u8 &maxVal, size_t &maxCol, size_t &maxRow); + + void minMaxLoc(const Size2D &size, + const s16 * srcBase, ptrdiff_t srcStride, + s16 &minVal, size_t &minCol, size_t &minRow, + s16 &maxVal, size_t &maxCol, size_t &maxRow); + + void minMaxLoc(const Size2D &size, + const u16 * srcBase, ptrdiff_t srcStride, + u16 &minVal, size_t &minCol, size_t &minRow, + u16 &maxVal, size_t &maxCol, size_t &maxRow); + + void minMaxLoc(const Size2D &size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 &minVal, size_t &minCol, size_t &minRow, + s32 &maxVal, size_t &maxCol, size_t &maxRow); + + void minMaxLoc(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 &minVal, size_t &minCol, size_t &minRow, + f32 &maxVal, size_t &maxCol, size_t &maxRow); + + void minMaxLoc(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + const u8 * maskBase, ptrdiff_t maskStride, + f32 &minVal, size_t &minCol, size_t &minRow, + f32 &maxVal, size_t &maxCol, 
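+
+    /*
+        minMaxVals and fillMinMaxLocs above compose into a two-pass search:
+        the first pass finds the extreme values, the second gathers where
+        they occur, up to the given capacities. A sketch (buffer sizes and
+        names are illustrative, not part of this interface):
+
+            u8 lo = 0, hi = 0;
+            minMaxVals(roi, img, imgStride, &lo, &hi);
+
+            size_t minLocs[64], maxLocs[64];
+            s32 nMin = 0, nMax = 0;
+            fillMinMaxLocs(roi, img, imgStride,
+                           lo, minLocs, nMin, 64,
+                           hi, maxLocs, nMax, 64);
+    */
+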
size_t &maxRow); + + /* + For each point `p` within `size`, do: + dst[p] += src[p] + */ + void accumulate(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = (dst[p] + ((src[p] ^ 2) >> shift)) + */ + void accumulateSquare(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + u32 shift); + + /* + For each point `p` within `size`, do: + dst[p] = (1 - alpha) * dst[p] + alpha * src[p] + */ + void accumulateWeighted(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + f32 alpha); + + /* + orient[p] = atan2(src0[p], src1[p]) + */ + void phase(const Size2D &size, + const s16 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + u8 * orientBase, ptrdiff_t orientStride); + + void phase(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * orientBase, ptrdiff_t orientStride, + f32 scale); + + /* + Combine 2 planes to a single one + */ + void combine2(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + u8 * dstBase, ptrdiff_t dstStride); + + void combine2(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 * dstBase, ptrdiff_t dstStride); + + void combine2(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride); + + void combine2(const Size2D &size, + const s64 * src0Base, ptrdiff_t src0Stride, + const s64 * src1Base, ptrdiff_t src1Stride, + s64 * dstBase, ptrdiff_t dstStride); + + /* + Combine 3 planes to a single one + */ + void combine3(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + const u8 * src2Base, ptrdiff_t src2Stride, + u8 * dstBase, ptrdiff_t dstStride); + + void combine3(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + const u16 * src2Base, ptrdiff_t src2Stride, + u16 * dstBase, ptrdiff_t dstStride); + + void combine3(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + const s32 * src2Base, ptrdiff_t src2Stride, + s32 * dstBase, ptrdiff_t dstStride); + + void combine3(const Size2D &size, + const s64 * src0Base, ptrdiff_t src0Stride, + const s64 * src1Base, ptrdiff_t src1Stride, + const s64 * src2Base, ptrdiff_t src2Stride, + s64 * dstBase, ptrdiff_t dstStride); + + /* + Combine 4 planes to a single one + */ + void combine4(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + const u8 * src2Base, ptrdiff_t src2Stride, + const u8 * src3Base, ptrdiff_t src3Stride, + u8 * dstBase, ptrdiff_t dstStride); + + void combine4(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + const u16 * src2Base, ptrdiff_t src2Stride, + const u16 * src3Base, ptrdiff_t src3Stride, + u16 * dstBase, ptrdiff_t dstStride); + + void combine4(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + const s32 * src2Base, ptrdiff_t src2Stride, + const s32 * src3Base, ptrdiff_t src3Stride, + s32 * dstBase, ptrdiff_t dstStride); + + void combine4(const 
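+
+    /*
+        accumulateWeighted is a running exponential average, the usual
+        building block for background models: with alpha = 0.05f each new
+        frame contributes 5% to the accumulator. Sketch (grabFrame and the
+        buffers are hypothetical):
+
+            while (grabFrame(frame))
+                accumulateWeighted(roi, frame, frameStride,
+                                   background, backgroundStride, 0.05f);
+    */
+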
Size2D &size, + const s64 * src0Base, ptrdiff_t src0Stride, + const s64 * src1Base, ptrdiff_t src1Stride, + const s64 * src2Base, ptrdiff_t src2Stride, + const s64 * src3Base, ptrdiff_t src3Stride, + s64 * dstBase, ptrdiff_t dstStride); + + /* + Combine 3 planes to YUYV one + */ + void combineYUYV(const Size2D &size, + const u8 * srcyBase, ptrdiff_t srcyStride, + const u8 * srcuBase, ptrdiff_t srcuStride, + const u8 * srcvBase, ptrdiff_t srcvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Combine 3 planes to UYVY one + */ + void combineUYVY(const Size2D &size, + const u8 * srcyBase, ptrdiff_t srcyStride, + const u8 * srcuBase, ptrdiff_t srcuStride, + const u8 * srcvBase, ptrdiff_t srcvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGB image to grayscale one + */ + void rgb2gray(const Size2D &size, COLOR_SPACE color_space, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGBX image to grayscale one + */ + void rgbx2gray(const Size2D &size, COLOR_SPACE color_space, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert BGR image to grayscale one + */ + void bgr2gray(const Size2D &size, COLOR_SPACE color_space, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert BGRX image to grayscale one + */ + void bgrx2gray(const Size2D &size, COLOR_SPACE color_space, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert grayscale image to RGB one + */ + void gray2rgb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert grayscale image to RGBX one + */ + void gray2rgbx(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGB image to RGBX + */ + void rgb2rgbx(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGBX image to RGB + */ + void rgbx2rgb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGB image to BGR + */ + void rgb2bgr(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGBX image to BGRX + */ + void rgbx2bgrx(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGBX image to BGR + */ + void rgbx2bgr(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGB image to BGRX + */ + void rgb2bgrx(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGB image to HSV + */ + void rgb2hsv(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + s32 hrange); + + /* + Convert RGBX image to HSV + */ + void rgbx2hsv(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + s32 hrange); + + /* + Convert BGR image to HSV + */ + void bgr2hsv(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + s32 hrange); + + /* + Convert BGRX image to HSV + */ + void bgrx2hsv(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + s32 hrange); + + /* + Convert RGBX image to BGR565 + RRRRrrrr GGGGgggg BBBBbbbb XXXXxxxx -> 
GggBBBBb RRRRrGGG + */ + void rgbx2bgr565(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGB image to BGR565 + RRRRrrrr GGGGgggg BBBBbbbb -> GggBBBBb RRRRrGGG + */ + void rgb2bgr565(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGBX image to RGB565 + RRRRrrrr GGGGgggg BBBBbbbb XXXXxxxx -> GggRRRRr BBBBbGGG + */ + void rgbx2rgb565(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGB image to RGB565 + RRRRrrrr GGGGgggg BBBBbbbb -> GggRRRRr BBBBbGGG + */ + void rgb2rgb565(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGB image to YCrCb + */ + void rgb2ycrcb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert RGBX image to YCrCb + */ + void rgbx2ycrcb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert BGR image to YCrCb + */ + void bgr2ycrcb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert BGRX image to YCrCb + */ + void bgrx2ycrcb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert YUV420sp image to RGB + */ + void yuv420sp2rgb(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert YUV420sp image to RGBX + */ + void yuv420sp2rgbx(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert YUV420i image to RGB + */ + void yuv420i2rgb(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert YUV420i image to RGBX + */ + void yuv420i2rgbx(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert YUV420sp image to BGR + */ + void yuv420sp2bgr(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert YUV420sp image to BGRX + */ + void yuv420sp2bgrx(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert YUV420i image to BGR + */ + void yuv420i2bgr(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Convert YUV420i image to BGRX + */ + void yuv420i2bgrx(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + For each point `p` within `size`, do: + dst[p] = src[p] << shift + */ + void lshift(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + u32 shift); + + /* + For each point `p` within `size`, do sign-extending shift: + dst[p] = src[p] >> shift + */ + void rshift(const Size2D &size, + const s16 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + u32 shift, CONVERT_POLICY cpolicy); + + /* + For each point `p` within `size`, set 
`dst[p]` to the average + of `src[p]` and the 8 (or 24 for blur5x5) points around it + NOTE: the function cannot operate inplace + */ + bool isBlur3x3Supported(const Size2D &size, BORDER_MODE border); + void blur3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + + bool isBlurU8Supported(const Size2D &size, s32 cn, BORDER_MODE border); + void blur3x3(const Size2D &size, s32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, u8 borderValue); + + void blur5x5(const Size2D &size, s32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, u8 borderValue); + + /* + For each point `p` within `size`, set `dst[p]` to the average + of `src[p]` and the 8 points around it + NOTE: the function can operate inplace + */ + bool isBlurF32Supported(const Size2D &size, s32 cn, BORDER_MODE border); + void blur3x3(const Size2D &size, s32 cn, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, f32 borderValue, Margin borderMargin); + + bool isBlurS32Supported(const Size2D &size, s32 cn, BORDER_MODE border); + void blur3x3(const Size2D &size, s32 cn, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, s32 borderValue, Margin borderMargin); + + /* + For each point `p` within `size`, set `dst[p]` to the Gaussian smoothing + of `src[p]` and the 8 (24 for the 5x5 version) points around it + NOTE: the function cannot operate inplace + */ + bool isGaussianBlur3x3Supported(const Size2D &size, BORDER_MODE border); + void gaussianBlur3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + bool isGaussianBlur3x3MarginSupported(const Size2D &size, BORDER_MODE border, Margin borderMargin = Margin()); + void gaussianBlur3x3Margin(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue, Margin borderMargin = Margin()); + + bool isGaussianBlur5x5Supported(const Size2D &size, s32 cn, BORDER_MODE border); + void gaussianBlur5x5(const Size2D &size, s32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, u8 borderValue, Margin borderMargin); + + void gaussianBlur5x5(const Size2D &size, s32 cn, + const u16 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, u16 borderValue, Margin borderMargin); + + void gaussianBlur5x5(const Size2D &size, s32 cn, + const s16 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, s16 borderValue, Margin borderMargin); + + void gaussianBlur5x5(const Size2D &size, s32 cn, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, s32 borderValue, Margin borderMargin); + + /* + Calculation of Sobel operator + NOTE: the function cannot operate inplace + */ + bool isSobel3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin = Margin()); + void Sobel3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + s32 dx, s32 dy, + BORDER_MODE border, u8 borderValue, Margin borderMargin = Margin()); + + /* + Calculation of Sobel operator for f32 data + NOTE: the function 
can operate inplace + */ + bool isSobel3x3f32Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy); + void Sobel3x3(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + s32 dx, s32 dy, + BORDER_MODE borderType, f32 borderValue); + + /* + Calculation of Scharr operator + NOTE: the function cannot operate inplace + */ + bool isScharr3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin = Margin()); + void Scharr3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + s32 dx, s32 dy, + BORDER_MODE borderType, u8 borderValue, Margin borderMargin = Margin()); + + void ScharrDeriv(const Size2D &size, s32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride); + + /* + Calculation of generic separable filtering operator + rowFilter/colFilter define filter weights + 0 - predefined 1 2 1 + 1 - predefined -1 0 1 + 2 - predefined 1 -2 1 + 3 - weights provided as xw/yw + */ + bool isSeparableFilter3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin = Margin()); + void SeparableFilter3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + const u8 rowFilter, const u8 colFilter, const s16 *xw, const s16 *yw, + BORDER_MODE border, u8 borderValue, Margin borderMargin = Margin()); + + /* + Extract a single plane from 2 channel image + */ + void extract2(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + u32 coi); + + /* + Extract a single plane from 3 channel image + */ + void extract3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + u32 coi); + + /* + Extract a single plane from 4 channel image + */ + void extract4(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + u32 coi); + + /* + Split 2 channel image to separate planes + */ + void split2(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dst0Base, ptrdiff_t dst0Stride, + u8 * dst1Base, ptrdiff_t dst1Stride); + + void split2(const Size2D &size, + const u16* srcBase, ptrdiff_t srcStride, + u16 * dst0Base, ptrdiff_t dst0Stride, + u16 * dst1Base, ptrdiff_t dst1Stride); + + void split2(const Size2D &size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dst0Base, ptrdiff_t dst0Stride, + s32 * dst1Base, ptrdiff_t dst1Stride); + + void split2(const Size2D &size, + const s64 * srcBase, ptrdiff_t srcStride, + s64 * dst0Base, ptrdiff_t dst0Stride, + s64 * dst1Base, ptrdiff_t dst1Stride); + + /* + Split 3 channel image to separate planes + */ + void split3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dst0Base, ptrdiff_t dst0Stride, + u8 * dst1Base, ptrdiff_t dst1Stride, + u8 * dst2Base, ptrdiff_t dst2Stride); + + void split3(const Size2D &size, + const u16* srcBase, ptrdiff_t srcStride, + u16 * dst0Base, ptrdiff_t dst0Stride, + u16 * dst1Base, ptrdiff_t dst1Stride, + u16 * dst2Base, ptrdiff_t dst2Stride); + + void split3(const Size2D &size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dst0Base, ptrdiff_t dst0Stride, + s32 * dst1Base, ptrdiff_t dst1Stride, + s32 * dst2Base, ptrdiff_t dst2Stride); + + void split3(const Size2D &size, + const s64 * srcBase, ptrdiff_t srcStride, + s64 * dst0Base, ptrdiff_t dst0Stride, + s64 * dst1Base, ptrdiff_t dst1Stride, + s64 * dst2Base, ptrdiff_t dst2Stride); + 
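+
+    /*
+        Most non-trivial kernels in this header pair an is*Supported
+        predicate with the kernel itself; callers are expected to test the
+        predicate first and fall back to a generic implementation when it
+        returns false. Sketch for the u8 Sobel above (src is u8, dst is s16;
+        fallbackSobel is hypothetical):
+
+            if (isSobel3x3Supported(roi, BORDER_MODE_REPLICATE, 1, 0))
+                Sobel3x3(roi, src, srcStride, dst, dstStride,
+                         1, 0, BORDER_MODE_REPLICATE, 0, Margin());
+            else
+                fallbackSobel(roi, src, srcStride, dst, dstStride);
+    */
+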
+ /* + Split 4 channel image to separate planes + */ + void split4(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dst0Base, ptrdiff_t dst0Stride, + u8 * dst1Base, ptrdiff_t dst1Stride, + u8 * dst2Base, ptrdiff_t dst2Stride, + u8 * dst3Base, ptrdiff_t dst3Stride); + + void split4(const Size2D &size, + const u16* srcBase, ptrdiff_t srcStride, + u16 * dst0Base, ptrdiff_t dst0Stride, + u16 * dst1Base, ptrdiff_t dst1Stride, + u16 * dst2Base, ptrdiff_t dst2Stride, + u16 * dst3Base, ptrdiff_t dst3Stride); + + void split4(const Size2D &size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dst0Base, ptrdiff_t dst0Stride, + s32 * dst1Base, ptrdiff_t dst1Stride, + s32 * dst2Base, ptrdiff_t dst2Stride, + s32 * dst3Base, ptrdiff_t dst3Stride); + + void split4(const Size2D &size, + const s64 * srcBase, ptrdiff_t srcStride, + s64 * dst0Base, ptrdiff_t dst0Stride, + s64 * dst1Base, ptrdiff_t dst1Stride, + s64 * dst2Base, ptrdiff_t dst2Stride, + s64 * dst3Base, ptrdiff_t dst3Stride); + + /* + Split 4 channel image to 3 channel image and 1 channel image + */ + void split4(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dst3Base, ptrdiff_t dst3Stride, + u8 * dst1Base, ptrdiff_t dst1Stride); + + /* + Flip image using specified flip mode + */ + bool isFlipSupported(FLIP_MODE flipMode, u32 elemSize); + void flip(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + FLIP_MODE flipMode, u32 elemSize); + + /* + For each point `p` within `size`, set `dst[p]` to the minimum (erode) + or maximum (dilate) of `src[p]` and the 8 points around it + NOTE: the function cannot operate inplace + */ + bool isMorph3x3Supported(const Size2D &size, BORDER_MODE border); + + void erode3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + + void dilate3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + + void erode(const Size2D &ssize, u32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + const Size2D &ksize, + size_t anchorX, size_t anchorY, + BORDER_MODE rowBorderType, BORDER_MODE columnBorderType, + const u8 * borderValues, Margin borderMargin); + + void dilate(const Size2D &ssize, u32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + const Size2D &ksize, + size_t anchorX, size_t anchorY, + BORDER_MODE rowBorderType, BORDER_MODE columnBorderType, + const u8 * borderValues, Margin borderMargin); + + /* + Resize a source image using "nearest neighbor" interpolation type + + wr = src_width / dst_width + hr = src_height / dst_height + */ + bool isResizeNearestNeighborSupported(const Size2D &ssize, u32 elemSize); + void resizeNearestNeighbor(const Size2D &ssize, const Size2D &dsize, + const void * srcBase, ptrdiff_t srcStride, + void * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, u32 elemSize); + + /* + Resize a source image using "area" interpolation type + + wr = src_width / dst_width + hr = src_height / dst_height + */ + bool isResizeAreaSupported(f32 wr, f32 hr, u32 channels); + void resizeAreaOpenCV(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, u32 channels); + void resizeArea(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, 
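+
+    /*
+        The wr/hr arguments duplicate information already present in
+        ssize/dsize: they are the precomputed scale ratios defined above,
+        e.g.
+
+            f32 wr = (f32)ssize.width  / dsize.width;
+            f32 hr = (f32)ssize.height / dsize.height;
+
+        so a 2x downscale in both axes uses wr = hr = 2.0f.
+    */
+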
u32 channels); + + /* + Resize a source image using "linear" interpolation type + + wr = src_width / dst_width + hr = src_height / dst_height + */ + bool isResizeLinearOpenCVSupported(const Size2D &ssize, const Size2D &dsize, u32 channels); + bool isResizeLinearSupported(const Size2D &ssize, const Size2D &dsize, + f32 wr, f32 hr, u32 channels); + void resizeLinearOpenCV(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, u32 channels); + void resizeLinear(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, u32 channels); + + /* + For each point `p` within `size`, set `dst[p]` to the convolution + of `src[p]` and the (ksize * ksize - 1) points around it + The function uses OpenVX semantics (so, in order to use this function + in OpenCV you should flip the kernel in both directions) + NOTE: the function cannot operate inplace + */ + bool isConvolutionSupported(const Size2D &size, const Size2D &ksize, BORDER_MODE border); + void convolution(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue, + const Size2D & ksize, s16 * kernelBase, u32 scale); + + /* + For each point `p` within `dstSize`, do convolution + of the tmpl points and the size*size square of src points starting at `src[p]`. + Src should be of size (dstSize+size-1)*(dstSize+size-1) + NOTE: the function cannot operate inplace + */ + bool isMatchTemplateSupported(const Size2D &tmplSize); + void matchTemplate(const Size2D &srcSize, + const u8 * srcBase, ptrdiff_t srcStride, + const Size2D &tmplSize, + const u8 * tmplBase, ptrdiff_t tmplStride, + f32 * dstBase, ptrdiff_t dstStride, + bool normalize); + + /* + Calculation of Laplacian operator + + 1 1 1 + 1 -8 1 + 1 1 1 + + NOTE: the function cannot operate inplace + */ + bool isLaplacian3x3Supported(const Size2D &size, BORDER_MODE border); + void Laplacian3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + + /* + OpenCV-like calculation of Laplacian operator + + kernel 1 kernel 3 kernel 5 + 0 1 0 2 0 2 1 2 2 2 1 + 1 -4 1 0 -8 0 2 0 -4 0 2 + 0 1 0 2 0 2 2 -4 -12 -4 2 + 2 0 -4 0 2 + 1 2 2 2 1 + + NOTE: the function cannot operate inplace + */ + bool isLaplacianOpenCVSupported(const Size2D &size, BORDER_MODE border); + void Laplacian1OpenCV(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + void Laplacian3OpenCV(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + void Laplacian5OpenCV(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue); + + /* + Detect image edges using Canny algorithm + These functions estimate derivatives using the Sobel algorithm + */ + bool isCanny3x3Supported(const Size2D &size); + void Canny3x3L1(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 low_thresh, f64 high_thresh, + Margin borderMargin); + + void Canny3x3L2(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 low_thresh, f64 high_thresh, + Margin borderMargin); + + /* + Detect image edges using 
Canny algorithm + These functions don't estimate derivatives themselves and thus require + precomputed derivative estimates instead of the source image + */ + void Canny3x3L1(const Size2D &size, s32 cn, + s16 * dxBase, ptrdiff_t dxStride, + s16 * dyBase, ptrdiff_t dyStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 low_thresh, f64 high_thresh); + + void Canny3x3L2(const Size2D &size, s32 cn, + s16 * dxBase, ptrdiff_t dxStride, + s16 * dyBase, ptrdiff_t dyStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 low_thresh, f64 high_thresh); + + /* + Performs detection of FAST features + */ + void FAST(const Size2D &size, + u8 *srcBase, ptrdiff_t srcStride, + KeypointStore *keypoints, + u8 threshold, bool nonmax_suppression); + + /* + Remap a source image using a table and the specified + extrapolation method + */ + bool isRemapNearestNeighborSupported(const Size2D &ssize); + void remapNearestNeighbor(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * tableBase, ptrdiff_t tableStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue); + + bool isRemapLinearSupported(const Size2D &ssize); + void remapLinear(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * tableBase, ptrdiff_t tableStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue); + + /* + Perform an affine transform on an input image + + src_x = dst_x * m[0] + dst_y * m[2] + m[4] + src_y = dst_x * m[1] + dst_y * m[3] + m[5] + */ + bool isWarpAffineNearestNeighborSupported(const Size2D &ssize); + void warpAffineNearestNeighbor(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * m, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue); + + bool isWarpAffineLinearSupported(const Size2D &ssize); + void warpAffineLinear(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * m, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue); + + /* + Perform a perspective transform on an input image + + src_x = dst_x * m[0] + dst_y * m[3] + m[6] + src_y = dst_x * m[1] + dst_y * m[4] + m[7] + w = dst_x * m[2] + dst_y * m[5] + m[8] + + src_x = w == 0 ? 0 : src_x / w + src_y = w == 0 ? 
0 : src_y / w + */ + bool isWarpPerspectiveNearestNeighborSupported(const Size2D &ssize); + void warpPerspectiveNearestNeighbor(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * m, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue); + + bool isWarpPerspectiveLinearSupported(const Size2D &ssize); + void warpPerspectiveLinear(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * m, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue); + + /* + Convert data from source to destination type + */ + void convert(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride); + + void convert(const 
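+
+    /*
+        The m[] coefficients above describe the inverse (destination to
+        source) mapping, with the coefficients of dst_x stored first. For
+        warpAffine, a transform that reads each destination pixel from
+        (dst_x + tx, dst_y + ty) in the source is therefore (tx/ty are
+        hypothetical):
+
+            f32 m[6] = { 1.0f, 0.0f,    // m[0], m[1]
+                         0.0f, 1.0f,    // m[2], m[3]
+                         tx,   ty };    // m[4], m[5]
+
+        since src_x = dst_x * m[0] + dst_y * m[2] + m[4] = dst_x + tx.
+    */
+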
Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride); + + void convert(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride); + + /* + Convert data from source to destination type with scaling + dst = saturate_cast(src * alpha + beta) + */ + void convertScale(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s16 * srcBase, ptrdiff_t 
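+
+    /*
+        A common use of convertScale is normalisation, e.g. mapping u8 data
+        into [0, 1] floats via the u8 -> f32 overload above:
+
+            convertScale(roi, src8, src8Stride,
+                         dstF32, dstF32Stride, 1.0 / 255.0, 0.0);
+
+        Since alpha and beta are f64, the same overloads also cover
+        zero-centering and other affine remaps.
+    */
+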
srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + void convertScale(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + f64 alpha, f64 beta); + + /* + Reduce a matrix to a vector by applying the given operation to each column + */ + void reduceColSum(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s32 * dstBase); + + void reduceColMax(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase); + + void reduceColMin(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase); + + void reduceColSum(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase); + + void reduceColMax(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase); + + void reduceColMin(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase); + + /* + For each point `p` within `size`, do: + dst[p] = (rng1[p] <= src[p] && src[p] <= rng2[p]) ? 
255 : 0 + */ + + void inRange(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + const u8 * rng1Base, ptrdiff_t rng1Stride, + const u8 * rng2Base, ptrdiff_t rng2Stride, + u8 * dstBase, ptrdiff_t dstStride); + + void inRange(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride, + const s8 * rng1Base, ptrdiff_t rng1Stride, + const s8 * rng2Base, ptrdiff_t rng2Stride, + u8 * dstBase, ptrdiff_t dstStride); + + void inRange(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride, + const u16 * rng1Base, ptrdiff_t rng1Stride, + const u16 * rng2Base, ptrdiff_t rng2Stride, + u8 * dstBase, ptrdiff_t dstStride); + + void inRange(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride, + const s16 * rng1Base, ptrdiff_t rng1Stride, + const s16 * rng2Base, ptrdiff_t rng2Stride, + u8 * dstBase, ptrdiff_t dstStride); + + void inRange(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride, + const s32 * rng1Base, ptrdiff_t rng1Stride, + const s32 * rng2Base, ptrdiff_t rng2Stride, + u8 * dstBase, ptrdiff_t dstStride); + + void inRange(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + const f32 * rng1Base, ptrdiff_t rng1Stride, + const f32 * rng2Base, ptrdiff_t rng2Stride, + u8 * dstBase, ptrdiff_t dstStride); + + /* + Count the number of non-zero elements + */ + s32 countNonZero(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride); + + s32 countNonZero(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride); + + s32 countNonZero(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride); + + s32 countNonZero(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride); + + s32 countNonZero(const Size2D &_size, + const f64 * srcBase, ptrdiff_t srcStride); + + /* + Calculates sum of all image pixel values and squared values + */ + bool isSumSupported(u32 channels); + + void sum(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + u32 * sumdst, u32 channels); + + void sum(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + f64 * sumdst, u32 channels); + + bool isSqsumSupported(u32 channels); + + void sqsum(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + f64 * sumdst, f64 * sqsumdst, u32 channels); + + /* + Calculates norm + */ + s32 normInf(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride); + + s32 normInf(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride); + + s32 normInf(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride); + + s32 normInf(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride); + + s32 normInf(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride); + + f32 normInf(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride); + + s32 normL1(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride); + + s32 normL1(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride); + + s32 normL1(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride); + + s32 normL1(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride); + + f64 normL1(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride); + + f64 normL1(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride); + + s32 normL2(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride); + + s32 normL2(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride); + + f64 normL2(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride); + + f64 normL2(const Size2D &_size, + const s16 * 
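+
+    /*
+        inRange and countNonZero combine naturally into a "how many pixels
+        fall inside this band" query. Note that rng1/rng2 are per-pixel
+        bounds, so constant thresholds require pre-filled bound images
+        (loImg/hiImg below are illustrative):
+
+            inRange(roi, img, imgStride,
+                    loImg, loStride, hiImg, hiStride,
+                    mask, maskStride);
+            s32 hits = countNonZero(roi, mask, maskStride);
+    */
+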
srcBase, ptrdiff_t srcStride); + + f64 normL2(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride); + + f64 normL2(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride); + + /* + Calculates norm of per element difference + */ + s32 diffNormInf(const Size2D &_size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride); + + f32 diffNormInf(const Size2D &_size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride); + + s32 diffNormL1(const Size2D &_size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride); + + f64 diffNormL1(const Size2D &_size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride); + + s32 diffNormL2(const Size2D &_size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride); + + f64 diffNormL2(const Size2D &_size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride); + + /* + * Pyramidal Lucas-Kanade Optical Flow level processing + */ + void pyrLKOptFlowLevel(const Size2D &size, s32 cn, + const u8 *prevData, ptrdiff_t prevStride, + const s16 *prevDerivData, ptrdiff_t prevDerivStride, + const u8 *nextData, ptrdiff_t nextStride, + u32 ptCount, + const f32 *prevPts, f32 *nextPts, + u8 *status, f32 *err, + const Size2D &winSize, + u32 terminationCount, f64 terminationEpsilon, + u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals, + f32 minEigThreshold); +} + +#endif diff --git a/3rdparty/carotene/include/carotene/types.hpp b/3rdparty/carotene/include/carotene/types.hpp new file mode 100644 index 0000000000..81b03d649a --- /dev/null +++ b/3rdparty/carotene/include/carotene/types.hpp @@ -0,0 +1,125 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#ifndef CAROTENE_TYPES_HPP +#define CAROTENE_TYPES_HPP + +#include +#include +#include + +#ifndef UINT32_MAX + #define UINT32_MAX (4294967295U) +#endif + +namespace CAROTENE_NS { + using std::size_t; + using std::ptrdiff_t; + + typedef int8_t s8; + typedef uint8_t u8; + typedef int16_t s16; + typedef uint16_t u16; + typedef int32_t s32; + typedef uint32_t u32; + typedef float f32; + typedef int64_t s64; + typedef uint64_t u64; + typedef double f64; + + typedef ptrdiff_t stride_t; + + enum CONVERT_POLICY + { + CONVERT_POLICY_WRAP, + CONVERT_POLICY_SATURATE + }; + + enum BORDER_MODE + { + BORDER_MODE_UNDEFINED, + BORDER_MODE_CONSTANT, + BORDER_MODE_REPLICATE, + BORDER_MODE_REFLECT, + BORDER_MODE_REFLECT101, + BORDER_MODE_WRAP + }; + + enum FLIP_MODE + { + FLIP_HORIZONTAL_MODE = 1, + FLIP_VERTICAL_MODE = 2, + FLIP_BOTH_MODE = FLIP_HORIZONTAL_MODE | FLIP_VERTICAL_MODE + }; + + enum COLOR_SPACE + { + COLOR_SPACE_BT601, + COLOR_SPACE_BT709 + }; + + struct Size2D { + Size2D() : width(0), height(0) {} + Size2D(size_t width_, size_t height_) : width(width_), height(height_) {} + + size_t width; + size_t height; + + inline size_t total() const + { + return width * height; + } + }; + + struct Margin { + Margin() : left(0), right(0), top(0), bottom(0) {} + Margin(size_t left_, size_t right_, size_t top_, size_t bottom_) + : left(left_), right(right_), top(top_), bottom(bottom_) {} + + // these are measured in elements + size_t left, right, top, bottom; + }; + + struct KeypointStore { + virtual void push(f32 kpX, f32 kpY, f32 kpSize, f32 kpAngle=-1, f32 kpResponse=0, s32 kpOctave=0, s32 kpClass_id=-1) = 0; + virtual ~KeypointStore() {}; + }; +} + +#endif diff --git a/3rdparty/carotene/src/absdiff.cpp b/3rdparty/carotene/src/absdiff.cpp new file mode 100644 index 0000000000..02008ceb3e --- /dev/null +++ b/3rdparty/carotene/src/absdiff.cpp @@ -0,0 +1,241 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include + +#include "common.hpp" +#include "vtransform.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +template <typename T> +struct AbsDiff +{ + typedef T type; + + void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, + const typename internal::VecTraits<T>::vec128 & v_src1, + typename internal::VecTraits<T>::vec128 & v_dst) const + { + v_dst = internal::vabdq(v_src0, v_src1); + } + + void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, + const typename internal::VecTraits<T>::vec64 & v_src1, + typename internal::VecTraits<T>::vec64 & v_dst) const + { + v_dst = internal::vabd(v_src0, v_src1); + } + + void operator() (const T * src0, const T * src1, T * dst) const + { + dst[0] = src0[0] >= src1[0] ? src0[0] - src1[0] : src1[0] - src0[0]; + } +}; + +template <typename T> +struct AbsDiffSigned +{ + typedef T type; + + void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, + const typename internal::VecTraits<T>::vec128 & v_src1, + typename internal::VecTraits<T>::vec128 & v_dst) const + { + typename internal::VecTraits<T>::vec128 v_min = internal::vminq(v_src0, v_src1); + typename internal::VecTraits<T>::vec128 v_max = internal::vmaxq(v_src0, v_src1); + v_dst = internal::vqsubq(v_max, v_min); + } + + void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, + const typename internal::VecTraits<T>::vec64 & v_src1, + typename internal::VecTraits<T>::vec64 & v_dst) const + { + typename internal::VecTraits<T>::vec64 v_min = internal::vmin(v_src0, v_src1); + typename internal::VecTraits<T>::vec64 v_max = internal::vmax(v_src0, v_src1); + v_dst = internal::vqsub(v_max, v_min); + } + + void operator() (const T * src0, const T * src1, T * dst) const + { + dst[0] = internal::saturate_cast<T>(src0[0] >= src1[0] ? 
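+        /*
+            The widening casts below mirror the vector path above: for signed
+            types the plain difference can overflow (e.g. INT32_MIN minus a
+            positive value), so the scalar tail computes the difference in
+            s64 and saturates, just as the vector code saturates via
+            vqsub(max, min).
+        */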
(s64)src0[0] - src1[0] : (s64)src1[0] - src0[0]); + } +}; + +} // namespace + +#endif + +void absDiff(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, AbsDiff<u8>()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void absDiff(const Size2D &size, + const u16 *src0Base, ptrdiff_t src0Stride, + const u16 *src1Base, ptrdiff_t src1Stride, + u16 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, AbsDiff<u16>()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void absDiff(const Size2D &size, + const s8 *src0Base, ptrdiff_t src0Stride, + const s8 *src1Base, ptrdiff_t src1Stride, + s8 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, AbsDiffSigned<s8>()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void absDiff(const Size2D &size, + const s16 *src0Base, ptrdiff_t src0Stride, + const s16 *src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, AbsDiffSigned<s16>()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void absDiff(const Size2D &size, + const s32 *src0Base, ptrdiff_t src0Stride, + const s32 *src1Base, ptrdiff_t src1Stride, + s32 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, AbsDiffSigned<s32>()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void absDiff(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, AbsDiff<f32>()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/accumulate.cpp b/3rdparty/carotene/src/accumulate.cpp new file mode 100644 index 0000000000..ee9ce22d35 --- /dev/null +++ b/3rdparty/carotene/src/accumulate.cpp @@ -0,0 +1,408 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. 
+ * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + + +#include "common.hpp" +#include "vtransform.hpp" + +#include + +namespace CAROTENE_NS { + +void accumulate(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + internal::prefetch(dst + j); + uint8x16_t v_src = vld1q_u8(src + j); + int16x8_t v_dst0 = vld1q_s16(dst + j); + int16x8_t v_dst1 = vld1q_s16(dst + j + 8); + int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src))); + int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src))); + v_dst0 = vqaddq_s16(v_dst0, v_src0); + v_dst1 = vqaddq_s16(v_dst1, v_src1); + vst1q_s16(dst + j, v_dst0); + vst1q_s16(dst + j + 8, v_dst1); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v_src = vld1_u8(src + j); + int16x8_t v_src16 = vreinterpretq_s16_u16(vmovl_u8(v_src)); + int16x8_t v_dst = vld1q_s16(dst + j); + v_dst = vqaddq_s16(v_dst, v_src16); + vst1q_s16(dst + j, v_dst); + } + + for (; j < size.width; j++) + dst[j] = internal::saturate_cast(src[j] + dst[j]); + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +#ifdef CAROTENE_NEON + +namespace { + +template +void accumulateSquareConst(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride) +{ + size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + internal::prefetch(dst + j); + uint8x16_t v_src = vld1q_u8(src + j); + int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8); + int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src))); + int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src))); + + int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0); + v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst0))), + vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst0)))); + + v_srclo = vget_low_s16(v_src1); + v_srchi = vget_high_s16(v_src1); + v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst1))), + vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst1)))); + + vst1q_s16(dst + j, v_dst0); + vst1q_s16(dst + j + 8, v_dst1); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j))); + int16x8_t v_dst = vld1q_s16(dst + j); + int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src); + v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srclo, v_srclo), shift), vget_low_s16(v_dst))), + vqmovn_s32(vaddw_s16(vshrq_n_s32(vmull_s16(v_srchi, v_srchi), shift), vget_high_s16(v_dst)))); + vst1q_s16(dst + j, v_dst); + } + + for (; j < size.width; j++) + { + s32 srcVal = src[j]; + dst[j] = internal::saturate_cast(dst[j] + ((srcVal * srcVal) >> shift)); + } + } +} + +template <> +void accumulateSquareConst<0>(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride) +{ + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + internal::prefetch(dst + j); + uint8x16_t v_src = vld1q_u8(src + j); + int16x8_t v_dst0 = vld1q_s16(dst + j), v_dst1 = vld1q_s16(dst + j + 8); + int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src))); + int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src))); + + int16x4_t v_srclo = vget_low_s16(v_src0), v_srchi = vget_high_s16(v_src0); + v_dst0 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst0))), + vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst0)))); + + v_srclo = vget_low_s16(v_src1); + v_srchi = vget_high_s16(v_src1); + v_dst1 = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst1))), + vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst1)))); + + vst1q_s16(dst + j, v_dst0); + vst1q_s16(dst + j + 8, v_dst1); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j))); + int16x8_t v_dst = vld1q_s16(dst + j); + int16x4_t v_srclo = vget_low_s16(v_src), v_srchi = vget_high_s16(v_src); + v_dst = vcombine_s16(vqmovn_s32(vaddw_s16(vmull_s16(v_srclo, v_srclo), vget_low_s16(v_dst))), + vqmovn_s32(vaddw_s16(vmull_s16(v_srchi, v_srchi), vget_high_s16(v_dst)))); + vst1q_s16(dst + j, v_dst); + } + + for (; j < size.width; j++) + { + s32 srcVal = src[j]; + dst[j] = internal::saturate_cast(dst[j] + srcVal * srcVal); + } + } +} + +typedef void (* accumulateSquareConstFunc)(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride); + +} // namespace + +#endif + +void accumulateSquare(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + u32 shift) +{ + if (shift >= 16) + { + for (size_t i = 0; i < size.height; ++i) + { + s16 * dst = internal::getRowPtr(dstBase, dstStride, i); + std::memset(dst, 0, sizeof(s16) * size.width); + } + return; + } + + internal::assertSupportedConfiguration(); + +#ifdef CAROTENE_NEON + // this ugly contruction is needed to avoid: + // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant + // return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1); + + accumulateSquareConstFunc funcs[16] = + { + accumulateSquareConst<0>, + accumulateSquareConst<1>, + accumulateSquareConst<2>, + accumulateSquareConst<3>, + accumulateSquareConst<4>, + accumulateSquareConst<5>, + accumulateSquareConst<6>, + accumulateSquareConst<7>, + accumulateSquareConst<8>, + accumulateSquareConst<9>, + accumulateSquareConst<10>, + accumulateSquareConst<11>, + accumulateSquareConst<12>, + accumulateSquareConst<13>, + accumulateSquareConst<14>, + accumulateSquareConst<15> + }, func = funcs[shift]; + + func(size, srcBase, srcStride, dstBase, dstStride); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)shift; +#endif +} + +#ifdef CAROTENE_NEON + +namespace { + +struct AccumulateWeightedHalf +{ + typedef u8 type; + + void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1, + uint8x16_t & v_dst) const + { + v_dst = vhaddq_u8(v_src0, v_src1); + } + + void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1, + uint8x8_t & v_dst) const + { + v_dst = 
vhadd_u8(v_src0, v_src1); + } + + void operator() (const u8 * src0, const u8 * src1, u8 * dst) const + { + dst[0] = ((u16)(src0[0]) + src1[0]) >> 1; + } +}; + +struct AccumulateWeighted +{ + typedef u8 type; + + float alpha, beta; + float32x4_t v_alpha, v_beta; + + explicit AccumulateWeighted(float _alpha) : + alpha(_alpha), beta(1 - _alpha) + { + v_alpha = vdupq_n_f32(alpha); + v_beta = vdupq_n_f32(beta); + } + + void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1, + uint8x16_t & v_dst) const + { + uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0)); + uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1)); + float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta), + v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p)))); + float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta), + v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p)))); + uint16x8_t v_dst0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)), + vmovn_u32(vcvtq_u32_f32(v_dst1f))); + + v_src0_p = vmovl_u8(vget_high_u8(v_src0)); + v_src1_p = vmovl_u8(vget_high_u8(v_src1)); + v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p))), v_beta), + v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p)))); + v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p))), v_beta), + v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p)))); + uint16x8_t v_dst1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)), + vmovn_u32(vcvtq_u32_f32(v_dst1f))); + + v_dst = vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1)); + } + + void operator() (const uint8x8_t & _v_src0, const uint8x8_t & _v_src1, + uint8x8_t & v_dst) const + { + uint16x8_t v_src0 = vmovl_u8(_v_src0), v_src1 = vmovl_u8(_v_src1); + + float32x4_t v_dst0f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), v_beta), + v_alpha, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0)))); + float32x4_t v_dst1f = vmlaq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), v_beta), + v_alpha, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0)))); + uint16x8_t _v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)), + vmovn_u32(vcvtq_u32_f32(v_dst1f))); + + v_dst = vmovn_u16(_v_dst); + } + + void operator() (const u8 * src0, const u8 * src1, u8 * dst) const + { + dst[0] = beta * src1[0] + alpha * src0[0]; + } +}; + +} // namespace + +#endif + +void accumulateWeighted(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + f32 alpha) +{ + if (alpha == 0.0f) + return; + if (alpha == 1.0f) + { + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + std::memcpy(dst, src, sizeof(u8) * size.width); + } + return; + } + + internal::assertSupportedConfiguration(); + +#ifdef CAROTENE_NEON + // in this case we can use the following scheme: + // dst[p] = (src[p] + dst[p]) >> 1 + // which is faster + if (alpha == 0.5f) + { + internal::vtransform(size, + srcBase, srcStride, + dstBase, dstStride, + dstBase, dstStride, + AccumulateWeightedHalf()); + + return; + } + + internal::vtransform(size, + srcBase, srcStride, + dstBase, dstStride, + dstBase, dstStride, + AccumulateWeighted(alpha)); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)alpha; +#endif +} + +} //namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/add.cpp 
b/3rdparty/carotene/src/add.cpp new file mode 100644 index 0000000000..e8ace53122 --- /dev/null +++ b/3rdparty/carotene/src/add.cpp @@ -0,0 +1,475 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */ + +#include "common.hpp" +#include "vtransform.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +template +struct AddWrap +{ + typedef T type; + + void operator() (const typename internal::VecTraits::vec128 & v_src0, + const typename internal::VecTraits::vec128 & v_src1, + typename internal::VecTraits::vec128 & v_dst) const + { + v_dst = internal::vaddq(v_src0, v_src1); + } + + void operator() (const typename internal::VecTraits::vec64 & v_src0, + const typename internal::VecTraits::vec64 & v_src1, + typename internal::VecTraits::vec64 & v_dst) const + { + v_dst = internal::vadd(v_src0, v_src1); + } + + void operator() (const T * src0, const T * src1, T * dst) const + { + dst[0] = (T)((WT)src0[0] + (WT)src1[0]); + } +}; + +template +struct AddSaturate +{ + typedef T type; + + void operator() (const typename internal::VecTraits::vec128 & v_src0, + const typename internal::VecTraits::vec128 & v_src1, + typename internal::VecTraits::vec128 & v_dst) const + { + v_dst = internal::vqaddq(v_src0, v_src1); + } + + void operator() (const typename internal::VecTraits::vec64 & v_src0, + const typename internal::VecTraits::vec64 & v_src1, + typename internal::VecTraits::vec64 & v_dst) const + { + v_dst = internal::vqadd(v_src0, v_src1); + } + + void operator() (const T * src0, const T * src1, T * dst) const + { + dst[0] = internal::saturate_cast((WT)src0[0] + (WT)src1[0]); + } +}; + +} // namespace + +#endif + +void add(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (policy == CONVERT_POLICY_SATURATE) + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddSaturate()); + } + else + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddWrap()); + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)policy; +#endif +} + +void add(const Size2D &size, + const s8 * src0Base, ptrdiff_t src0Stride, + const s8 * src1Base, ptrdiff_t src1Stride, + s8 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (policy == CONVERT_POLICY_SATURATE) + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddSaturate()); + } + else + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddWrap()); + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)policy; +#endif +} + +void add(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + u16 * dst = internal::getRowPtr((u16 *)dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src00 = vld1q_u8(src0 + j), v_src01 = vld1q_u8(src0 + j + 16); + uint8x16_t v_src10 = vld1q_u8(src1 + j), v_src11 = vld1q_u8(src1 + j + 16); + vst1q_u16(dst + j, vaddl_u8(vget_low_u8(v_src00), vget_low_u8(v_src10))); + vst1q_u16(dst + j + 8, vaddl_u8(vget_high_u8(v_src00), vget_high_u8(v_src10))); + vst1q_u16(dst + j + 16, vaddl_u8(vget_low_u8(v_src01), vget_low_u8(v_src11))); + vst1q_u16(dst + j + 24, vaddl_u8(vget_high_u8(v_src01), vget_high_u8(v_src11))); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v_src0 = vld1_u8(src0 + j); + uint8x8_t v_src1 = vld1_u8(src1 + j); + vst1q_u16(dst + j, vaddl_u8(v_src0, v_src1)); + } + + for (; j < size.width; j++) + dst[j] = (u16)src0[j] + (u16)src1[j]; + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void add(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + s16 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (policy == CONVERT_POLICY_SATURATE) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0))); + int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0))); + int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8); + int16x8_t v_dst0 = vqaddq_s16(v_src00, v_src10); + int16x8_t v_dst1 = vqaddq_s16(v_src01, v_src11); + vst1q_s16(dst + j, v_dst0); + vst1q_s16(dst + j + 8, v_dst1); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j))); + int16x8_t v_src1 = vld1q_s16(src1 + j); + int16x8_t v_dst = vqaddq_s16(v_src0, v_src1); + vst1q_s16(dst + j, v_dst); + } + + for (; j < size.width; j++) + dst[j] = internal::saturate_cast((s32)src0[j] + (s32)src1[j]); + } + else + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0))); + int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0))); + int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8); + int16x8_t v_dst0 = vaddq_s16(v_src00, v_src10); + int16x8_t v_dst1 = vaddq_s16(v_src01, v_src11); + vst1q_s16(dst + j, v_dst0); + vst1q_s16(dst + j + 8, v_dst1); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j))); + int16x8_t v_src1 = vld1q_s16(src1 + j); + int16x8_t v_dst = vaddq_s16(v_src0, v_src1); + vst1q_s16(dst + j, 
v_dst); + } + + for (; j < size.width; j++) + dst[j] = (s16)((s32)src0[j] + (s32)src1[j]); + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)policy; +#endif +} + +void add(const Size2D &size, + const s16 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (policy == CONVERT_POLICY_SATURATE) + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddSaturate()); + } + else + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddWrap()); + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)policy; +#endif +} + +void add(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (policy == CONVERT_POLICY_SATURATE) + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddSaturate()); + } + else + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddWrap()); + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)policy; +#endif +} + +void add(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 *dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (policy == CONVERT_POLICY_SATURATE) + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddSaturate()); + } + else + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddWrap()); + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)policy; +#endif +} + +void add(const Size2D &size, + const u32 * src0Base, ptrdiff_t src0Stride, + const u32 * src1Base, ptrdiff_t src1Stride, + u32 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY policy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (policy == CONVERT_POLICY_SATURATE) + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddSaturate()); + } + else + { + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddWrap()); + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)policy; +#endif +} + +void add(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + AddWrap()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; 
+ (void)dstStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/add_weighted.cpp b/3rdparty/carotene/src/add_weighted.cpp new file mode 100644 index 0000000000..1f89fb5372 --- /dev/null +++ b/3rdparty/carotene/src/add_weighted.cpp @@ -0,0 +1,265 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */ + +#include "common.hpp" +#include "vtransform.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +using namespace internal; + +template struct TypeTraits; +template <> struct TypeTraits< u8> { typedef u16 wide; typedef u8 unsign; typedef uint8x16_t vec128; }; +template <> struct TypeTraits< s8> { typedef s16 wide; typedef u8 unsign; typedef int8x16_t vec128; }; +template <> struct TypeTraits { typedef u32 wide; typedef u8 narrow; typedef u16 unsign; typedef uint16x8_t vec128; }; +template <> struct TypeTraits { typedef s32 wide; typedef s8 narrow; typedef u16 unsign; typedef int16x8_t vec128; }; +template <> struct TypeTraits { typedef u64 wide; typedef u16 narrow; typedef u32 unsign; typedef uint32x4_t vec128; }; +template <> struct TypeTraits { typedef s64 wide; typedef s16 narrow; typedef u32 unsign; typedef int32x4_t vec128; }; +template <> struct TypeTraits { typedef f64 wide; typedef float32x4_t vec128; }; + +template struct wAdd +{ + typedef T type; + + f32 alpha, beta, gamma; + typedef typename TypeTraits::wide wtype; + wAdd wideAdd; + wAdd(f32 _alpha, f32 _beta, f32 _gamma): + alpha(_alpha), beta(_beta), gamma(_gamma), + wideAdd(_alpha, _beta, _gamma) {} + + void operator() (const typename VecTraits::vec128 & v_src0, + const typename VecTraits::vec128 & v_src1, + typename VecTraits::vec128 & v_dst) const + { + typename VecTraits::vec128 vrl, vrh; + wideAdd(vmovl( vget_low(v_src0)), vmovl( vget_low(v_src1)), vrl); + wideAdd(vmovl(vget_high(v_src0)), vmovl(vget_high(v_src1)), vrh); + + v_dst = vcombine(vqmovn(vrl), vqmovn(vrh)); + } + + void operator() (const typename VecTraits::vec64 & v_src0, + const typename VecTraits::vec64 & v_src1, + typename VecTraits::vec64 & v_dst) const + { + typename VecTraits::vec128 vr; + wideAdd(vmovl(v_src0), vmovl(v_src1), vr); + + v_dst = vqmovn(vr); + } + + void operator() (const T * src0, const T * src1, T * dst) const + { + dst[0] = saturate_cast(alpha*src0[0] + beta*src1[0] + gamma); + } +}; + +template <> struct wAdd +{ + typedef s32 type; + + f32 alpha, beta, gamma; + float32x4_t valpha, vbeta, vgamma; + wAdd(f32 _alpha, f32 _beta, f32 _gamma): + alpha(_alpha), beta(_beta), gamma(_gamma) + { + valpha = vdupq_n_f32(_alpha); + vbeta = vdupq_n_f32(_beta); + vgamma = vdupq_n_f32(_gamma + 0.5); + } + + void operator() (const typename VecTraits::vec128 & v_src0, + const typename VecTraits::vec128 & v_src1, + typename VecTraits::vec128 & v_dst) const + { + float32x4_t vs1 = vcvtq_f32_s32(v_src0); + float32x4_t vs2 = vcvtq_f32_s32(v_src1); + + vs1 = vmlaq_f32(vgamma, vs1, valpha); + vs1 = vmlaq_f32(vs1, vs2, vbeta); + v_dst = vcvtq_s32_f32(vs1); + } + + void operator() (const typename VecTraits::vec64 & v_src0, + const typename VecTraits::vec64 & v_src1, + typename VecTraits::vec64 & v_dst) const + { + float32x2_t vs1 = vcvt_f32_s32(v_src0); + float32x2_t vs2 = vcvt_f32_s32(v_src1); + + vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha)); + vs1 = vmla_f32(vs1, vs2, vget_low(vbeta)); + v_dst = vcvt_s32_f32(vs1); + } + + void operator() (const s32 * src0, const s32 * src1, s32 * dst) const + { + dst[0] = saturate_cast(alpha*src0[0] + beta*src1[0] + gamma); + } +}; + +template <> struct wAdd +{ + typedef u32 type; + + f32 alpha, beta, gamma; + float32x4_t valpha, vbeta, vgamma; + wAdd(f32 _alpha, f32 _beta, f32 _gamma): + alpha(_alpha), beta(_beta), gamma(_gamma) + { + valpha = vdupq_n_f32(_alpha); + vbeta = vdupq_n_f32(_beta); + vgamma = vdupq_n_f32(_gamma + 0.5); + } + + void operator() (const typename VecTraits::vec128 
& v_src0, + const typename VecTraits::vec128 & v_src1, + typename VecTraits::vec128 & v_dst) const + { + float32x4_t vs1 = vcvtq_f32_u32(v_src0); + float32x4_t vs2 = vcvtq_f32_u32(v_src1); + + vs1 = vmlaq_f32(vgamma, vs1, valpha); + vs1 = vmlaq_f32(vs1, vs2, vbeta); + v_dst = vcvtq_u32_f32(vs1); + } + + void operator() (const typename VecTraits::vec64 & v_src0, + const typename VecTraits::vec64 & v_src1, + typename VecTraits::vec64 & v_dst) const + { + float32x2_t vs1 = vcvt_f32_u32(v_src0); + float32x2_t vs2 = vcvt_f32_u32(v_src1); + + vs1 = vmla_f32(vget_low(vgamma), vs1, vget_low(valpha)); + vs1 = vmla_f32(vs1, vs2, vget_low(vbeta)); + v_dst = vcvt_u32_f32(vs1); + } + + void operator() (const u32 * src0, const u32 * src1, u32 * dst) const + { + dst[0] = saturate_cast(alpha*src0[0] + beta*src1[0] + gamma); + } +}; + +template <> struct wAdd +{ + typedef f32 type; + + f32 alpha, beta, gamma; + float32x4_t valpha, vbeta, vgamma; + wAdd(f32 _alpha, f32 _beta, f32 _gamma): + alpha(_alpha), beta(_beta), gamma(_gamma) + { + valpha = vdupq_n_f32(_alpha); + vbeta = vdupq_n_f32(_beta); + vgamma = vdupq_n_f32(_gamma + 0.5); + } + + void operator() (const typename VecTraits::vec128 & v_src0, + const typename VecTraits::vec128 & v_src1, + typename VecTraits::vec128 & v_dst) const + { + float32x4_t vs1 = vmlaq_f32(vgamma, v_src0, valpha); + v_dst = vmlaq_f32(vs1, v_src1, vbeta); + } + + void operator() (const typename VecTraits::vec64 & v_src0, + const typename VecTraits::vec64 & v_src1, + typename VecTraits::vec64 & v_dst) const + { + float32x2_t vs1 = vmla_f32(vget_low(vgamma), v_src0, vget_low(valpha)); + v_dst = vmla_f32(vs1, v_src1, vget_low(vbeta)); + + } + + void operator() (const f32 * src0, const f32 * src1, f32 * dst) const + { + dst[0] = alpha*src0[0] + beta*src1[0] + gamma; + } +}; + +} // namespace + +#define IMPL_ADDWEIGHTED(type) \ +void addWeighted(const Size2D &size, \ + const type * src0Base, ptrdiff_t src0Stride, \ + const type * src1Base, ptrdiff_t src1Stride, \ + type * dstBase, ptrdiff_t dstStride, \ + f32 alpha, f32 beta, f32 gamma) \ +{ \ + internal::assertSupportedConfiguration(); \ + wAdd wgtAdd(alpha, \ + beta, \ + gamma); \ + internal::vtransform(size, \ + src0Base, src0Stride, \ + src1Base, src1Stride, \ + dstBase, dstStride, \ + wgtAdd); \ +} + +#else + +#define IMPL_ADDWEIGHTED(type) \ +void addWeighted(const Size2D &, \ + const type *, ptrdiff_t, \ + const type *, ptrdiff_t, \ + type *, ptrdiff_t, \ + f32, f32, f32) \ +{ \ + internal::assertSupportedConfiguration(); \ +} + +#endif + +IMPL_ADDWEIGHTED(u8) +IMPL_ADDWEIGHTED(s8) +IMPL_ADDWEIGHTED(u16) +IMPL_ADDWEIGHTED(s16) +IMPL_ADDWEIGHTED(u32) +IMPL_ADDWEIGHTED(s32) +IMPL_ADDWEIGHTED(f32) + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/bitwise.cpp b/3rdparty/carotene/src/bitwise.cpp new file mode 100644 index 0000000000..ee00775111 --- /dev/null +++ b/3rdparty/carotene/src/bitwise.cpp @@ -0,0 +1,225 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. 
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "vtransform.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +struct BitwiseAnd +{ + typedef u8 type; + + void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1, + uint8x16_t & v_dst) const + { + v_dst = vandq_u8(v_src0, v_src1); + } + + void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1, + uint8x8_t & v_dst) const + { + v_dst = vand_u8(v_src0, v_src1); + } + + void operator() (const u8 * src0, const u8 * src1, u8 * dst) const + { + dst[0] = src0[0] & src1[0]; + } +}; + +struct BitwiseOr +{ + typedef u8 type; + + void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1, + uint8x16_t & v_dst) const + { + v_dst = vorrq_u8(v_src0, v_src1); + } + + void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1, + uint8x8_t & v_dst) const + { + v_dst = vorr_u8(v_src0, v_src1); + } + + void operator() (const u8 * src0, const u8 * src1, u8 * dst) const + { + dst[0] = src0[0] | src1[0]; + } +}; + +struct BitwiseXor +{ + typedef u8 type; + + void operator() (const uint8x16_t & v_src0, const uint8x16_t & v_src1, + uint8x16_t & v_dst) const + { + v_dst = veorq_u8(v_src0, v_src1); + } + + void operator() (const uint8x8_t & v_src0, const uint8x8_t & v_src1, + uint8x8_t & v_dst) const + { + v_dst = veor_u8(v_src0, v_src1); + } + + void operator() (const u8 * src0, const u8 * src1, u8 * dst) const + { + dst[0] = src0[0] ^ src1[0]; + } +}; + +#endif + +void bitwiseNot(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + uint8x16_t v_src0 = vld1q_u8(src + j), v_src1 = vld1q_u8(src + j + 16); + uint8x16_t v_dst0 = vmvnq_u8(v_src0), v_dst1 = vmvnq_u8(v_src1); + vst1q_u8(dst + j, v_dst0); + vst1q_u8(dst + j + 16, v_dst1); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v_src = vld1_u8(src + j); + uint8x8_t v_dst = vmvn_u8(v_src); + vst1_u8(dst + j, v_dst); + } + + for (; j < size.width; j++) + { + dst[j] = ~src[j]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void bitwiseAnd(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, BitwiseAnd()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void bitwiseOr(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, BitwiseOr()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void bitwiseXor(const Size2D &size, + const u8 *src0Base, ptrdiff_t src0Stride, + const u8 *src1Base, ptrdiff_t src1Stride, + u8 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + internal::vtransform(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, BitwiseXor()); +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/blur.cpp b/3rdparty/carotene/src/blur.cpp new file mode 100644 index 0000000000..798cce5a71 --- /dev/null +++ b/3rdparty/carotene/src/blur.cpp @@ -0,0 +1,1337 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include + +#include "common.hpp" +#include "saturate_cast.hpp" + +namespace CAROTENE_NS { + +bool isBlur3x3Supported(const Size2D &size, BORDER_MODE border) +{ + return isSupportedConfiguration() && size.width >= 8 && + (border == BORDER_MODE_CONSTANT || + border == BORDER_MODE_REPLICATE); +} + +void blur3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue) +{ + internal::assertSupportedConfiguration(isBlur3x3Supported(size, border)); +#ifdef CAROTENE_NEON + const int16x8_t v_scale = vmovq_n_s16(3640); + const uint16x8_t v_border_x3 = vdupq_n_u16(borderValue * 3); + const uint16x8_t v_zero = vdupq_n_u16(0); + const uint8x8_t v_border = vdup_n_u8(borderValue); + + uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero; + uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero; + + ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height; + + for (ptrdiff_t y = 0; y < height; ++y) + { + const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max(y - 1, 0)); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y); + const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1)); + u8 * drow = internal::getRowPtr(dstBase, dstStride, y); + + s16 prevx = 0, currx = 0, nextx = 0; + ptrdiff_t x = 0; + const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8); + + // perform vertical convolution + for ( ; x <= bwidth; x += 8) + { + internal::prefetch(srow0 + x); + internal::prefetch(srow1 + x); + internal::prefetch(srow2 + x); + + uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x); + uint8x8_t x1 = vld1_u8(srow1 + x); + uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x); + + // calculate values for plain CPU part below if needed + if (x + 8 >= bwidth) + { + ptrdiff_t x3 = x == width ? width - 1 : x; + ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max(x3 - 1, 0); + + if (border == BORDER_MODE_CONSTANT && x4 < 0) + prevx = borderValue; + else + prevx = (srow2 ? srow2[x4] : borderValue) + srow1[x4] + (srow0 ? srow0[x4] : borderValue); + + currx = (srow2 ? srow2[x3] : borderValue) + srow1[x3] + (srow0 ? 
srow0[x3] : borderValue); + } + + // make shift + if (x) + { + tprev = tcurr; + tcurr = tnext; + } + + // and calculate next value + tnext = vaddw_u8(vaddl_u8(x0, x1), x2); + + // make extrapolation for the first elements + if (!x) + { + // make border + if (border == BORDER_MODE_CONSTANT) + tcurr = v_border_x3; + else if (border == BORDER_MODE_REPLICATE) + tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0)); + + continue; + } + + // combine 3 "shifted" vectors + t0 = vextq_u16(tprev, tcurr, 7); + t1 = tcurr; + t2 = vextq_u16(tcurr, tnext, 1); + + // and add them + t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2)); + + int16x8_t tt0 = vqrdmulhq_s16(vreinterpretq_s16_u16(t0), v_scale); + uint8x8_t it0 = vmovn_u16(vreinterpretq_u16_s16(tt0)); + vst1_u8(drow + x - 8, it0); + } + + x -= 8; + if (x == width) + --x; + + for ( ; x < width; ++x) + { + // make extrapolation for the last elements + if (x + 1 >= width) + { + if (border == BORDER_MODE_CONSTANT) + nextx = borderValue * 3; + else if (border == BORDER_MODE_REPLICATE) + nextx = srow2[x] + srow1[x] + srow0[x]; + } + else + nextx = (srow2 ? srow2[x + 1] : borderValue) + + srow1[x + 1] + + (srow0 ? srow0[x + 1] : borderValue); + + f32 val = (prevx + currx + nextx) * (1 / 9.f) + 0.5f; + drow[x] = internal::saturate_cast((s32)val); + + // make shift + prevx = currx; + currx = nextx; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)border; + (void)borderValue; +#endif +} + +bool isBlurU8Supported(const Size2D &size, s32 cn, BORDER_MODE border) +{ + return isSupportedConfiguration() && + cn > 0 && cn <= 4 && + size.width*cn >= 8 && size.height >= 2 && + (border == BORDER_MODE_CONSTANT || + border == BORDER_MODE_REFLECT101 || + border == BORDER_MODE_REFLECT || + border == BORDER_MODE_REPLICATE); +} + +void blur3x3(const Size2D &size, s32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, u8 borderValue) +{ + internal::assertSupportedConfiguration(isBlurU8Supported(size, cn, borderType)); +#ifdef CAROTENE_NEON +//#define FLOAT_VARIANT_1_9 +#ifdef FLOAT_VARIANT_1_9 + float32x4_t v1_9 = vdupq_n_f32 (1.0/9.0); + float32x4_t v0_5 = vdupq_n_f32 (.5); +#else + const int16x8_t vScale = vmovq_n_s16(3640); +#endif + + size_t colsn = size.width*cn; + + std::vector _tmp; + u8 *tmp = 0; + if (borderType == BORDER_MODE_CONSTANT) + { + _tmp.assign(colsn + 2*cn, borderValue); + tmp = &_tmp[cn]; + } + + uint16x8_t tprev = vdupq_n_u16(0x0); + uint16x8_t tcurr = tprev; + uint16x8_t tnext = tprev; + uint16x8_t t0, t1, t2; + if(cn == 1) + { + for( size_t y = 0; y < size.height; y++ ) + { + const u8* srow0; + const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y); + const u8* srow2; + u8* drow = internal::getRowPtr(dstBase, dstStride, y); + if (borderType == BORDER_MODE_REFLECT101) { + srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1); + srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2); + } else if (borderType == BORDER_MODE_CONSTANT) { + srow0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp; + srow2 = y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp; + } else { // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE + srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0); + srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1); + } + + // do vertical convolution + size_t x = 0; + const size_t bcols = y + 2 < size.height ? 
colsn : (colsn - 8); + for( ; x <= bcols; x += 8 ) + { + internal::prefetch(srow0 + x); + internal::prefetch(srow1 + x); + internal::prefetch(srow2 + x); + + uint8x8_t x0 = vld1_u8(srow0 + x); + uint8x8_t x1 = vld1_u8(srow1 + x); + uint8x8_t x2 = vld1_u8(srow2 + x); + + tprev = tcurr; + tcurr = tnext; + tnext = vaddw_u8(vaddl_u8(x0, x1), x2); + + if(!x) { + tcurr = tnext; + + // make border + if (borderType == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_u16(borderValue, tcurr, 7); + } + else if (borderType == BORDER_MODE_REFLECT101) + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7); + } + else // borderType == BORDER_MODE_REFLECT || borderType == BORDER_MODE_REPLICATE + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 7); + } + continue; + } + + t0 = vextq_u16(tprev, tcurr, 7); + t1 = tcurr; + t2 = vextq_u16(tcurr, tnext, 1); + + t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2)); + +#ifdef FLOAT_VARIANT_1_9 + uint32x4_t tres1 = vmovl_u16(vget_low_u16(t0)); + uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0)); + float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1)); + float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2)); + tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5)); + tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5)); + t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2)); + vst1_u8(drow + x - 8, vmovn_u16(t0)); +#else + int16x8_t tt0 = vqrdmulhq_s16(vreinterpretq_s16_u16(t0), vScale); + uint8x8_t it0 = vmovn_u16(vreinterpretq_u16_s16(tt0)); + vst1_u8(drow + x - 8, it0); +#endif + } + + x -= 8; + if(x == colsn){ + x--; + } + s16 prevx, rowx, nextx; + prevx = srow2[x-1] + srow1[x-1] + srow0[x-1]; + rowx = srow2[x] + srow1[x] + srow0[x]; + for( ; x < colsn; x++ ) + { + if(x+1 >= colsn) { + // make border + if (borderType == BORDER_MODE_CONSTANT) + { + nextx = borderValue; + } else if (borderType == BORDER_MODE_REFLECT101) + { + nextx = srow2[x-1] + srow1[x-1] + srow0[x-1]; + } else { + nextx = srow2[x] + srow1[x] + srow0[x]; + } + } else { + nextx = srow2[x+1] + srow1[x+1] + srow0[x+1]; + } + *(drow+x) = internal::saturate_cast((prevx + rowx + nextx)*(1/9.)); + prevx = rowx; + rowx = nextx; + } + } + } + else + { + for( size_t y = 0; y < size.height; y++ ) + { + const u8* srow0; + const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y); + const u8* srow2; + u8* drow = internal::getRowPtr(dstBase, dstStride, y); + if (borderType == BORDER_MODE_REFLECT101) { + srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1); + srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2); + } else if (borderType == BORDER_MODE_CONSTANT) { + srow0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp; + srow2 = y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp; + } else { // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE + srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0); + srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1); + } + + // do vertical convolution + size_t x = 0; + const size_t bcols = y + 2 < size.height ? 
colsn : (colsn - 8); + for( ; x <= bcols; x += 8 ) + { + internal::prefetch(srow0 + x); + internal::prefetch(srow1 + x); + internal::prefetch(srow2 + x); + + uint8x8_t x0 = vld1_u8(srow0 + x); + uint8x8_t x1 = vld1_u8(srow1 + x); + uint8x8_t x2 = vld1_u8(srow2 + x); + + tprev = tcurr; + tcurr = tnext; + tnext = vaddw_u8(vaddl_u8(x0, x1), x2); + + if(!x) { + tcurr = tnext; + + // make border + switch(cn) + { + case 2: + if (borderType == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_u16(borderValue, tcurr, 6); + tcurr = vsetq_lane_u16(borderValue, tcurr, 7); + } + else if (borderType == BORDER_MODE_REFLECT101) + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 6); + } + else + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7); + } + break; + case 3: + if (borderType == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_u16(borderValue, tcurr, 5); + tcurr = vsetq_lane_u16(borderValue, tcurr, 6); + tcurr = vsetq_lane_u16(borderValue, tcurr, 7); + } + else if (borderType == BORDER_MODE_REFLECT101) + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 5); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 4),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 5),tcurr, 7); + } + else + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 5); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 7); + } + break; + case 4: + if (borderType == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_u16(borderValue, tcurr, 4); + tcurr = vsetq_lane_u16(borderValue, tcurr, 5); + tcurr = vsetq_lane_u16(borderValue, tcurr, 6); + tcurr = vsetq_lane_u16(borderValue, tcurr, 7); + } + else if (borderType != BORDER_MODE_REFLECT101) + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 4); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 5); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 7); + } + break; + } + continue; + } + + if(cn==2) + t0 = vextq_u16(tprev, tcurr, 6); + else if(cn==3) + t0 = vextq_u16(tprev, tcurr, 5); + else if(cn==4) + t0 = vextq_u16(tprev, tcurr, 4); + + t1 = tcurr; + + if(cn==2) + t2 = vextq_u16(tcurr, tnext, 2); + else if(cn==3) + t2 = vextq_u16(tcurr, tnext, 3); + else if(cn==4) + t2 = vextq_u16(tcurr, tnext, 4); + + t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2)); + +#ifdef FLOAT_VARIANT_1_9 + uint32x4_t tres1 = vmovl_u16(vget_low_u16(t0)); + uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0)); + float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1)); + float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2)); + tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5)); + tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5)); + t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2)); + vst1_u8(drow + x - 8, vmovn_u16(t0)); +#else + int16x8_t tt0 = vqrdmulhq_s16(vreinterpretq_s16_u16(t0), vScale); + uint8x8_t it0 = vmovn_u16(vreinterpretq_u16_s16(tt0)); + vst1_u8(drow + x - 8, it0); +#endif + } + + x -= 8; + if(x == colsn){ + x -= cn; + } + s16 prevx[4], rowx[4], nextx[4]; + for( s32 k = 0; k < cn; k++ ) + { + prevx[(k + x%cn)%cn] = srow2[x+k-cn] + srow1[x+k-cn] + srow0[x+k-cn]; + rowx[(k + x%cn)%cn] = srow2[x+k] + srow1[x+k] + srow0[x+k]; + } + for( ; x < colsn; x++ ) + { + size_t xx = x%cn; + if(x+cn >= colsn) { + // make border + if (borderType == BORDER_MODE_CONSTANT) + { + nextx[xx] = borderValue; + } else if (borderType == 
BORDER_MODE_REFLECT101) + { + nextx[xx] = srow2[x-cn] + srow1[x-cn] + srow0[x-cn]; + } else { + nextx[xx] = srow2[x] + srow1[x] + srow0[x]; + } + } else { + nextx[xx] = srow2[x+cn] + srow1[x+cn] + srow0[x+cn]; + } + *(drow+x) = internal::saturate_cast((prevx[xx] + rowx[xx] + nextx[xx])*(1/9.)); + prevx[xx] = rowx[xx]; + rowx[xx] = nextx[xx]; + } + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; +#endif +} + +void blur5x5(const Size2D &size, s32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, u8 borderValue) +{ + internal::assertSupportedConfiguration(isBlurU8Supported(size, cn, borderType)); +#ifdef CAROTENE_NEON +#define FLOAT_VARIANT_1_25 +#ifdef FLOAT_VARIANT_1_25 + float32x4_t v1_25 = vdupq_n_f32 (1.0f/25.0f); + float32x4_t v0_5 = vdupq_n_f32 (.5f); +#else + const int16x8_t vScale = vmovq_n_s16(1310); +#endif + size_t colsn = size.width*cn; + + std::vector _tmp; + u8 *tmp = 0; + if (borderType == BORDER_MODE_CONSTANT) + { + _tmp.assign(colsn + 2*cn, borderValue); + tmp = &_tmp[cn]; + } + + uint16x8_t tprev = vdupq_n_u16(0x0); + uint16x8_t tcurr = tprev; + uint16x8_t tnext = tprev; + uint16x8_t t0, t1, t2, t3, t4; + for( size_t y = 0; y < size.height; y++ ) + { + const u8 *srow0, *srow1; + const u8 *srow2 = internal::getRowPtr(srcBase, srcStride, y); + const u8 *srow3, *srow4; + u8 *drow = internal::getRowPtr(dstBase, dstStride, y); + if (borderType == BORDER_MODE_REFLECT101) { + srow0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 2-y); + srow1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1); + srow3 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2); + srow4 = internal::getRowPtr(srcBase, srcStride, y < size.height-2 ? y+2 : (size.height<<1)-4-y); + } else if (borderType == BORDER_MODE_CONSTANT) { + srow0 = y > 1 ? internal::getRowPtr(srcBase, srcStride, y-2) : tmp; + srow1 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp; + srow3 = y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp; + srow4 = y < size.height-2 ? internal::getRowPtr(srcBase, srcStride, y+2) : tmp; + } else if (borderType == BORDER_MODE_REFLECT) { + srow0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 1-y); + srow1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0); + srow3 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1); + srow4 = internal::getRowPtr(srcBase, srcStride, y < size.height-2 ? y+2 : (size.height<<1)-3-y); + } else { // BORDER_MODE_REPLICATE + srow0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 0); + srow1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0); + srow3 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1); + srow4 = internal::getRowPtr(srcBase, srcStride, y < size.height-2 ? y+2 : size.height-1); + } + + // do vertical convolution + size_t x = 0; + const size_t bcols = y + 3 < size.height ? 
colsn : (colsn - 8); + for( ; x <= bcols; x += 8 ) + { + internal::prefetch(srow0 + x); + internal::prefetch(srow1 + x); + internal::prefetch(srow2 + x); + internal::prefetch(srow3 + x); + internal::prefetch(srow4 + x); + + uint8x8_t x0 = vld1_u8(srow0 + x); + uint8x8_t x1 = vld1_u8(srow1 + x); + uint8x8_t x2 = vld1_u8(srow2 + x); + uint8x8_t x3 = vld1_u8(srow3 + x); + uint8x8_t x4 = vld1_u8(srow4 + x); + + tprev = tcurr; + tcurr = tnext; + tnext = vaddw_u8(vaddq_u16(vaddl_u8(x0, x1), vaddl_u8(x2, x3)), x4); + + if(!x) { + tcurr = tnext; + + if(borderType == BORDER_MODE_REFLECT101 && size.width < 3) + { + x = 8; + break; + } + + // make border + switch(cn) + { + case 1: + if (borderType == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_u16(borderValue, tcurr, 6); + tcurr = vsetq_lane_u16(borderValue, tcurr, 7); + } + else if (borderType == BORDER_MODE_REFLECT101) + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7); + } + else if (borderType == BORDER_MODE_REFLECT) + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 7); + } + else + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 7); + } + break; + case 2: + if (borderType == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_u16(borderValue, tcurr, 4); + tcurr = vsetq_lane_u16(borderValue, tcurr, 5); + tcurr = vsetq_lane_u16(borderValue, tcurr, 6); + tcurr = vsetq_lane_u16(borderValue, tcurr, 7); + } + else if (borderType == BORDER_MODE_REFLECT101) + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 7); + } + else if (borderType == BORDER_MODE_REFLECT) + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 4); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 5); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7); + } + else + { + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 4); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 5); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7); + } + break; + case 3: + if (borderType == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_u16(borderValue, tcurr, 2); + tcurr = vsetq_lane_u16(borderValue, tcurr, 3); + tcurr = vsetq_lane_u16(borderValue, tcurr, 4); + tcurr = vsetq_lane_u16(borderValue, tcurr, 5); + tcurr = vsetq_lane_u16(borderValue, tcurr, 6); + tcurr = vsetq_lane_u16(borderValue, tcurr, 7); + } + else if (borderType == BORDER_MODE_REFLECT101) + { + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 6),tcurr, 2); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 7),tprev, 3); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tprev, 5); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 4),tprev, 6); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 5),tprev, 7); + s16 lane8 = srow4[8] + srow3[8] + srow2[8] + srow1[8] + srow0[8]; + tcurr = vsetq_lane_u16(lane8,tprev, 4); + } + else if (borderType == BORDER_MODE_REFLECT) + { + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 2); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 4),tprev, 3); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 5),tprev, 4); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tprev, 5); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tprev, 6); + tcurr = 
vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tprev, 7); + } + else + { + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 2); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tprev, 3); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tprev, 4); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tprev, 5); + tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tprev, 6); + tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tprev, 7); + } + break; + case 4: + if (borderType == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_u16(borderValue, tcurr, 0); + tcurr = vsetq_lane_u16(borderValue, tcurr, 1); + tcurr = vsetq_lane_u16(borderValue, tcurr, 2); + tcurr = vsetq_lane_u16(borderValue, tcurr, 3); + tcurr = vsetq_lane_u16(borderValue, tcurr, 4); + tcurr = vsetq_lane_u16(borderValue, tcurr, 5); + tcurr = vsetq_lane_u16(borderValue, tcurr, 6); + tcurr = vsetq_lane_u16(borderValue, tcurr, 7); + } + else if (borderType == BORDER_MODE_REFLECT101) + { + s16 lane8 = srow4[ 8] + srow3[ 8] + srow2[ 8] + srow1[ 8] + srow0[ 8]; + s16 lane9 = srow4[ 9] + srow3[ 9] + srow2[ 9] + srow1[ 9] + srow0[ 9]; + s16 lane10 = srow4[10] + srow3[10] + srow2[10] + srow1[10] + srow0[10]; + s16 lane11 = srow4[11] + srow3[11] + srow2[11] + srow1[11] + srow0[11]; + tprev = vsetq_lane_u16( lane8,tcurr, 0); + tprev = vsetq_lane_u16( lane9,tprev, 1); + tprev = vsetq_lane_u16(lane10,tprev, 2); + tcurr = vsetq_lane_u16(lane11,tprev, 3); + } + else if (borderType == BORDER_MODE_REFLECT) + { + tcurr = vcombine_u16(vget_high_u16(tcurr),vget_low_u16(tcurr));//swap 64-bit parts + } + else + { + tcurr = vcombine_u16(vget_low_u16(tcurr),vget_low_u16(tcurr));//double 64-bit part + } + break; + } + continue; + } + switch(cn) + { + case 1: + t0 = vextq_u16(tprev, tcurr, 6); + t1 = vextq_u16(tprev, tcurr, 7); + t2 = tcurr; + t3 = vextq_u16(tcurr, tnext, 1); + t4 = vextq_u16(tcurr, tnext, 2); + break; + case 2: + t0 = vextq_u16(tprev, tcurr, 4); + t1 = vextq_u16(tprev, tcurr, 6); + t2 = tcurr; + t3 = vextq_u16(tcurr, tnext, 2); + t4 = vextq_u16(tcurr, tnext, 4); + break; + case 3: + t0 = vextq_u16(tprev, tcurr, 2); + t1 = vextq_u16(tprev, tcurr, 5); + t2 = tcurr; + t3 = vextq_u16(tcurr, tnext, 3); + t4 = vextq_u16(tcurr, tnext, 6); + break; + case 4: + t0 = tprev; + t1 = vextq_u16(tprev, tcurr, 4); + t2 = tcurr; + t3 = vextq_u16(tcurr, tnext, 4); + t4 = tnext; + break; + default: + internal::assertSupportedConfiguration(false);//Unsupported channels number + return; + } + t0 = vqaddq_u16(vqaddq_u16(vqaddq_u16(t0, t1), vqaddq_u16(t2, t3)), t4); + +#ifdef FLOAT_VARIANT_1_25 + uint32x4_t tres1 = vmovl_u16(vget_low_u16(t0)); + uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0)); + float32x4_t vf1 = vmulq_f32(v1_25, vcvtq_f32_u32(tres1)); + float32x4_t vf2 = vmulq_f32(v1_25, vcvtq_f32_u32(tres2)); + tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5)); + tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5)); + t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2)); + vst1_u8(drow + x - 8, vmovn_u16(t0)); +#else + int16x8_t tt0 = vqrdmulhq_s16(vreinterpretq_s16_u16(t0), vScale); + uint8x8_t it0 = vmovn_u16(vreinterpretq_u16_s16(tt0)); + vst1_u8(drow + x - 8, it0); +#endif + } + + x -= 8; + if(x == colsn){ + x -= cn; + } + s16 pprevx[4], prevx[4], rowx[4], nextx[4], nnextx[4]; + ptrdiff_t px = x / cn; + for( s32 k = 0; k < cn; k++ ) + { + ptrdiff_t ploc; + ploc = internal::borderInterpolate(px-2, size.width, borderType); + pprevx[k] = ploc < 0 ? 
5*borderValue : + srow4[ploc*cn+k] + srow3[ploc*cn+k] + srow2[ploc*cn+k] + srow1[ploc*cn+k] + srow0[ploc*cn+k]; + + ploc = internal::borderInterpolate(px-1, size.width, borderType); + prevx[k] = ploc < 0 ? 5*borderValue : + srow4[ploc*cn+k] + srow3[ploc*cn+k] + srow2[ploc*cn+k] + srow1[ploc*cn+k] + srow0[ploc*cn+k]; + + rowx[k] = srow4[px*cn+k] + srow3[px*cn+k] + srow2[px*cn+k] + srow1[px*cn+k] + srow0[px*cn+k]; + + ploc = internal::borderInterpolate(px+1, size.width, borderType); + nextx[k] = ploc < 0 ? 5*borderValue : + srow4[ploc*cn+k] + srow3[ploc*cn+k] + srow2[ploc*cn+k] + srow1[ploc*cn+k] + srow0[ploc*cn+k]; + } + x = px*cn; + for( ; x < colsn; x+=cn, px++ ) + { + for( s32 k = 0; k < cn; k++ ) + { + ptrdiff_t ploc = internal::borderInterpolate(px+2, size.width, borderType); + nnextx[k] = ploc < 0 ? 5*borderValue : + srow4[ploc*cn+k] + srow3[ploc*cn+k] + srow2[ploc*cn+k] + srow1[ploc*cn+k] + srow0[ploc*cn+k]; + *(drow+x+k) = internal::saturate_cast<u8>((pprevx[k] + prevx[k] + rowx[k] + nextx[k] +nnextx[k])*(1/25.)); + pprevx[k] = prevx[k]; + prevx[k] = rowx[k]; + rowx[k] = nextx[k]; + nextx[k] = nnextx[k]; + } + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; +#endif +} + +bool isBlurF32Supported(const Size2D &size, s32 cn, BORDER_MODE border) +{ + return isSupportedConfiguration() && + cn > 0 && cn <= 4 && + size.width*cn >= 4 && size.height >= 2 && + (border == BORDER_MODE_CONSTANT || + border == BORDER_MODE_REFLECT101 || + border == BORDER_MODE_REFLECT || + border == BORDER_MODE_REPLICATE || + border == BORDER_MODE_WRAP); +} + +void blur3x3(const Size2D &size, s32 cn, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, f32 borderValue, Margin borderMargin) +{ + internal::assertSupportedConfiguration(isBlurF32Supported(size, cn, borderType)); +#ifdef CAROTENE_NEON + size_t colsn = size.width * cn; + + std::vector<f32> _tmp; + f32 *tmp = 0; + if (borderType == BORDER_MODE_CONSTANT) + { + _tmp.assign(colsn + 2*cn, borderValue); + tmp = &_tmp[cn]; + } + + ptrdiff_t idx_l = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r = internal::borderInterpolate(size.width, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + + //2-line buffer + std::vector<f32> _buf(4*(cn * (size.width + 2) + 32 / sizeof(f32))); + f32* lanea = internal::alignPtr(&_buf[cn], 32); + f32* laneA = internal::alignPtr(lanea + cn * (size.width + 2), 32); + + f32* laneb = internal::alignPtr(laneA + cn * (size.width + 2), 32); + f32* laneB = internal::alignPtr(laneb + cn * (size.width + 2), 32); + + if (borderType == BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lanea[-cn+k] = borderValue; + lanea[colsn+k] = borderValue; + laneA[-cn+k] = borderValue; + laneA[colsn+k] = borderValue; + laneb[-cn+k] = borderValue; + laneb[colsn+k] = borderValue; + laneB[-cn+k] = borderValue; + laneB[colsn+k] = borderValue; + } + + size_t i = 0; + f32* dsta = internal::getRowPtr(dstBase, dstStride, 0); + for (; i < size.height-1; i+=2) + { + //vertical convolution + ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom); + + const f32* ln0 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ?
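+ // a row index below -borderMargin.top no longer addresses a real row of the parent + // image; that only happens with BORDER_MODE_CONSTANT, where the read is redirected + // to tmp, a spare row prefilled with borderValue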
internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp; + const f32* ln1 = internal::getRowPtr(srcBase, srcStride, i); + const f32* ln2 = internal::getRowPtr(srcBase, srcStride, i + 1); + const f32* ln3 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp; + + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(ln1 + x); + internal::prefetch(ln2 + x); + internal::prefetch(ln0 + x); + internal::prefetch(ln3 + x); +box3x3f32_vert: + float32x4_t v1 = vld1q_f32(ln1 + x); + float32x4_t v2 = vld1q_f32(ln2 + x); + float32x4_t v0 = vld1q_f32(ln0 + x); + float32x4_t v3 = vld1q_f32(ln3 + x); + + float32x4_t v = vaddq_f32(v1, v2); + float32x4_t w0 = vaddq_f32(v, v0); + float32x4_t w1 = vaddq_f32(v, v3); + + vst1q_f32(lanea + x, w0); + vst1q_f32(laneb + x, w1); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3f32_vert; + } + + //left&right borders + if (borderType != BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lanea[-cn+k] = lanea[idx_l + k]; + lanea[colsn+k] = lanea[idx_r + k]; + laneb[-cn+k] = laneb[idx_l + k]; + laneb[colsn+k] = laneb[idx_r + k]; + } + + //horizontal convolution (2 lines from previous iteration) + if (i > 0) + { + f32* dstb = internal::getRowPtr(dstBase, dstStride, i-1); + x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(laneA + x + cn); + internal::prefetch(laneB + x + cn); +box3x3f32_horiz: + float32x4_t lane0a = vld1q_f32(laneA + x - cn); + float32x4_t lane2a = vld1q_f32(laneA + x + cn); + float32x4_t lane1a = vld1q_f32(laneA + x); + + float32x4_t lane0b = vld1q_f32(laneB + x - cn); + float32x4_t lane2b = vld1q_f32(laneB + x + cn); + float32x4_t lane1b = vld1q_f32(laneB + x); + + float32x4_t va = vaddq_f32(lane0a, lane2a); + float32x4_t vb = vaddq_f32(lane0b, lane2b); + float32x4_t wa = vaddq_f32(va, lane1a); + float32x4_t wb = vaddq_f32(vb, lane1b); + + vst1q_f32(dsta + x, wa); + vst1q_f32(dstb + x, wb); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3f32_horiz; + } + dsta = internal::getRowPtr(dstBase, dstStride, i); + } + + std::swap(lanea, laneA); + std::swap(laneb, laneB); + } + + //last line + if(i < size.height) + { + //vertical convolution + ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + + const f32* ln0 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp; + const f32* ln1 = internal::getRowPtr(srcBase, srcStride, i); + const f32* ln2 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? 
internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp; + + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(ln0 + x); + internal::prefetch(ln1 + x); + internal::prefetch(ln2 + x); +box3x3f32_vert_ll: + float32x4_t v0 = vld1q_f32(ln0+x); + float32x4_t v1 = vld1q_f32(ln1+x); + float32x4_t v2 = vld1q_f32(ln2+x); + + float32x4_t v = vaddq_f32(v0, v1); + float32x4_t w = vaddq_f32(v, v2); + + vst1q_f32(lanea + x, w); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3f32_vert_ll; + } + + //left&right borders + if (borderType != BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lanea[-cn+k] = lanea[idx_l + k]; + lanea[colsn+k] = lanea[idx_r + k]; + } + + //horizontal convolution (last 3 lines) + x = 0; + f32* dstb = internal::getRowPtr(dstBase, dstStride, i-1); + f32* dstc = internal::getRowPtr(dstBase, dstStride, i); + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(laneA + x + cn); + internal::prefetch(laneB + x + cn); + internal::prefetch(lanea + x + cn); +box3x3f32_horiz_ll: + float32x4_t lane0a = vld1q_f32(laneA + x - cn); + float32x4_t lane2a = vld1q_f32(laneA + x + cn); + float32x4_t lane1a = vld1q_f32(laneA + x); + + float32x4_t lane0b = vld1q_f32(laneB + x - cn); + float32x4_t lane2b = vld1q_f32(laneB + x + cn); + float32x4_t lane1b = vld1q_f32(laneB + x); + + float32x4_t lane0c = vld1q_f32(lanea + x - cn); + float32x4_t lane2c = vld1q_f32(lanea + x + cn); + float32x4_t lane1c = vld1q_f32(lanea + x); + + float32x4_t va = vaddq_f32(lane0a, lane2a); + float32x4_t vb = vaddq_f32(lane0b, lane2b); + float32x4_t vc = vaddq_f32(lane0c, lane2c); + float32x4_t wa = vaddq_f32(va, lane1a); + float32x4_t wb = vaddq_f32(vb, lane1b); + float32x4_t wc = vaddq_f32(vc, lane1c); + + vst1q_f32(dsta + x, wa); + vst1q_f32(dstb + x, wb); + vst1q_f32(dstc + x, wc); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3f32_horiz_ll; + } + } + else + { + //horizontal convolution (last 2 lines) + f32* dstb = internal::getRowPtr(dstBase, dstStride, i-1); + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(laneA + x + cn); + internal::prefetch(laneB + x + cn); +box3x3f32_horiz_last2: + float32x4_t lane0a = vld1q_f32(laneA + x - cn); + float32x4_t lane2a = vld1q_f32(laneA + x + cn); + float32x4_t lane1a = vld1q_f32(laneA + x); + + float32x4_t lane0b = vld1q_f32(laneB + x - cn); + float32x4_t lane2b = vld1q_f32(laneB + x + cn); + float32x4_t lane1b = vld1q_f32(laneB + x); + + float32x4_t va = vaddq_f32(lane0a, lane2a); + float32x4_t vb = vaddq_f32(lane0b, lane2b); + float32x4_t wa = vaddq_f32(va, lane1a); + float32x4_t wb = vaddq_f32(vb, lane1b); + + vst1q_f32(dsta + x, wa); + vst1q_f32(dstb + x, wb); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3f32_horiz_last2; + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; + (void)borderMargin; +#endif +} + +bool isBlurS32Supported(const Size2D &size, s32 cn, BORDER_MODE border) +{ + return isSupportedConfiguration() && + cn > 0 && cn <= 4 && + size.width*cn >= 4 && size.height >= 2 && + (border == BORDER_MODE_CONSTANT || + border == BORDER_MODE_REFLECT101 || + border == BORDER_MODE_REFLECT || + border == BORDER_MODE_REPLICATE || + border == BORDER_MODE_WRAP); +} + +void blur3x3(const Size2D &size, s32 cn, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, s32 borderValue, Margin borderMargin) +{ + internal::assertSupportedConfiguration(isBlurS32Supported(size, cn, borderType)); +#ifdef 
CAROTENE_NEON + size_t colsn = size.width * cn; + + std::vector<s32> _tmp; + s32 *tmp = 0; + if (borderType == BORDER_MODE_CONSTANT) + { + _tmp.assign(colsn + 2*cn, borderValue); + tmp = &_tmp[cn]; + } + + ptrdiff_t idx_l = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r = internal::borderInterpolate(size.width, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + + //2-line buffer + std::vector<s32> _buf(4*(cn * (size.width + 2) + 32 / sizeof(s32))); + s32* lanea = internal::alignPtr(&_buf[cn], 32); + s32* laneA = internal::alignPtr(lanea + cn * (size.width + 2), 32); + + s32* laneb = internal::alignPtr(laneA + cn * (size.width + 2), 32); + s32* laneB = internal::alignPtr(laneb + cn * (size.width + 2), 32); + + if (borderType == BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lanea[-cn+k] = borderValue; + lanea[colsn+k] = borderValue; + laneA[-cn+k] = borderValue; + laneA[colsn+k] = borderValue; + laneb[-cn+k] = borderValue; + laneb[colsn+k] = borderValue; + laneB[-cn+k] = borderValue; + laneB[colsn+k] = borderValue; + } + + size_t i = 0; + s32* dsta = internal::getRowPtr(dstBase, dstStride, 0); + for (; i < size.height-1; i+=2) + { + //vertical convolution + ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom); + + const s32* ln0 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp; + const s32* ln1 = internal::getRowPtr(srcBase, srcStride, i); + const s32* ln2 = internal::getRowPtr(srcBase, srcStride, i + 1); + const s32* ln3 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ?
internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp; + + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(ln1 + x); + internal::prefetch(ln2 + x); + internal::prefetch(ln0 + x); + internal::prefetch(ln3 + x); +box3x3s32_vert: + int32x4_t v1 = vld1q_s32(ln1 + x); + int32x4_t v2 = vld1q_s32(ln2 + x); + int32x4_t v0 = vld1q_s32(ln0 + x); + int32x4_t v3 = vld1q_s32(ln3 + x); + + int32x4_t v = vaddq_s32(v1, v2); + int32x4_t w0 = vaddq_s32(v, v0); + int32x4_t w1 = vaddq_s32(v, v3); + + vst1q_s32(lanea + x, w0); + vst1q_s32(laneb + x, w1); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3s32_vert; + } + + //left&right borders + if (borderType != BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lanea[-cn+k] = lanea[idx_l + k]; + lanea[colsn+k] = lanea[idx_r + k]; + laneb[-cn+k] = laneb[idx_l + k]; + laneb[colsn+k] = laneb[idx_r + k]; + } + + //horizontal convolution (2 lines from previous iteration) + if (i > 0) + { + s32* dstb = internal::getRowPtr(dstBase, dstStride, i-1); + x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(laneA + x + cn); + internal::prefetch(laneB + x + cn); +box3x3s32_horiz: + int32x4_t lane0a = vld1q_s32(laneA + x - cn); + int32x4_t lane2a = vld1q_s32(laneA + x + cn); + int32x4_t lane1a = vld1q_s32(laneA + x); + + int32x4_t lane0b = vld1q_s32(laneB + x - cn); + int32x4_t lane2b = vld1q_s32(laneB + x + cn); + int32x4_t lane1b = vld1q_s32(laneB + x); + + int32x4_t va = vaddq_s32(lane0a, lane2a); + int32x4_t vb = vaddq_s32(lane0b, lane2b); + int32x4_t wa = vaddq_s32(va, lane1a); + int32x4_t wb = vaddq_s32(vb, lane1b); + + vst1q_s32(dsta + x, wa); + vst1q_s32(dstb + x, wb); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3s32_horiz; + } + dsta = internal::getRowPtr(dstBase, dstStride, i); + } + + std::swap(lanea, laneA); + std::swap(laneb, laneB); + } + //last line + if(i < size.height) + { + //vertical convolution + ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + + const s32* ln0 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp; + const s32* ln1 = internal::getRowPtr(srcBase, srcStride, i); + const s32* ln2 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? 
internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp; + + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(ln0 + x); + internal::prefetch(ln1 + x); + internal::prefetch(ln2 + x); +box3x3s32_vert_ll: + int32x4_t v0 = vld1q_s32(ln0+x); + int32x4_t v1 = vld1q_s32(ln1+x); + int32x4_t v2 = vld1q_s32(ln2+x); + + int32x4_t v = vaddq_s32(v0, v1); + int32x4_t w = vaddq_s32(v, v2); + + vst1q_s32(lanea + x, w); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3s32_vert_ll; + } + + //left&right borders + if (borderType != BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lanea[-cn+k] = lanea[idx_l + k]; + lanea[colsn+k] = lanea[idx_r + k]; + } + + //horizontal convolution (last 3 lines) + x = 0; + s32* dstb = internal::getRowPtr(dstBase, dstStride, i-1); + s32* dstc = internal::getRowPtr(dstBase, dstStride, i); + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(laneA + x + cn); + internal::prefetch(laneB + x + cn); + internal::prefetch(lanea + x + cn); +box3x3s32_horiz_ll: + int32x4_t lane0a = vld1q_s32(laneA + x - cn); + int32x4_t lane2a = vld1q_s32(laneA + x + cn); + int32x4_t lane1a = vld1q_s32(laneA + x); + + int32x4_t lane0b = vld1q_s32(laneB + x - cn); + int32x4_t lane2b = vld1q_s32(laneB + x + cn); + int32x4_t lane1b = vld1q_s32(laneB + x); + + int32x4_t lane0c = vld1q_s32(lanea + x - cn); + int32x4_t lane2c = vld1q_s32(lanea + x + cn); + int32x4_t lane1c = vld1q_s32(lanea + x); + + int32x4_t va = vaddq_s32(lane0a, lane2a); + int32x4_t vb = vaddq_s32(lane0b, lane2b); + int32x4_t vc = vaddq_s32(lane0c, lane2c); + int32x4_t wa = vaddq_s32(va, lane1a); + int32x4_t wb = vaddq_s32(vb, lane1b); + int32x4_t wc = vaddq_s32(vc, lane1c); + + vst1q_s32(dsta + x, wa); + vst1q_s32(dstb + x, wb); + vst1q_s32(dstc + x, wc); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3s32_horiz_ll; + } + } + else + { + //horizontal convolution (last 2 lines) + s32* dstb = internal::getRowPtr(dstBase, dstStride, i-1); + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(laneA + x + cn); + internal::prefetch(laneB + x + cn); +box3x3s32_horiz_last2: + int32x4_t lane0a = vld1q_s32(laneA + x - cn); + int32x4_t lane2a = vld1q_s32(laneA + x + cn); + int32x4_t lane1a = vld1q_s32(laneA + x); + + int32x4_t lane0b = vld1q_s32(laneB + x - cn); + int32x4_t lane2b = vld1q_s32(laneB + x + cn); + int32x4_t lane1b = vld1q_s32(laneB + x); + + int32x4_t va = vaddq_s32(lane0a, lane2a); + int32x4_t vb = vaddq_s32(lane0b, lane2b); + int32x4_t wa = vaddq_s32(va, lane1a); + int32x4_t wb = vaddq_s32(vb, lane1b); + + vst1q_s32(dsta + x, wa); + vst1q_s32(dstb + x, wb); + } + if(x < colsn) + { + x = colsn-4; + goto box3x3s32_horiz_last2; + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; + (void)borderMargin; +#endif +} + +} //namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/canny.cpp b/3rdparty/carotene/src/canny.cpp new file mode 100644 index 0000000000..f61bc23e9b --- /dev/null +++ b/3rdparty/carotene/src/canny.cpp @@ -0,0 +1,773 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. 
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +#include "saturate_cast.hpp" +#include <cstring> +#include <cstdlib> + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON +namespace { +struct RowFilter3x3Canny +{ + inline RowFilter3x3Canny(const ptrdiff_t borderxl, const ptrdiff_t borderxr) + { + vfmask = vreinterpret_u8_u64(vmov_n_u64(borderxl ? 0x0000FFffFFffFFffULL : 0x0100FFffFFffFFffULL)); + vtmask = vreinterpret_u8_u64(vmov_n_u64(borderxr ?
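+ // vtbl1_u8 shuffle masks: a 0xFF index selects no source byte and yields zero, so + // vfmask builds the left-edge pixel pair in the top two lanes (replicating src[0] or + // passing src[-1], src[0]) and vtmask below reorders the tail load depending on + // whether a real pixel exists past the right border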
0x0707060504030201ULL : 0x0706050403020100ULL)); + lookLeft = offsetk - borderxl; + lookRight = offsetk - borderxr; + } + + inline void operator()(const u8* src, s16* dstx, s16* dsty, ptrdiff_t width) + { + uint8x8_t l = vtbl1_u8(vld1_u8(src - lookLeft), vfmask); + ptrdiff_t i = 0; + for (; i < width - 8 + lookRight; i += 8) + { + internal::prefetch(src + i); + uint8x8_t l18u = vld1_u8(src + i + 1); + + uint8x8_t l2 = l18u; + uint8x8_t l0 = vext_u8(l, l18u, 6); + int16x8_t l1x2 = vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l18u, 7), 1)); + + l = l18u; + + int16x8_t l02 = vreinterpretq_s16_u16(vaddl_u8(l2, l0)); + int16x8_t ldx = vreinterpretq_s16_u16(vsubl_u8(l2, l0)); + int16x8_t ldy = vaddq_s16(l02, l1x2); + + vst1q_s16(dstx + i, ldx); + vst1q_s16(dsty + i, ldy); + } + + //tail + if (lookRight == 0 || i != width) + { + uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1 + uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask); + uint8x8_t tail1 = vext_u8(vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(tail0), 8*6)), tail2, 7); + + int16x8_t tail02 = vreinterpretq_s16_u16(vaddl_u8(tail2, tail0)); + int16x8_t tail1x2 = vreinterpretq_s16_u16(vshll_n_u8(tail1, 1)); + int16x8_t taildx = vreinterpretq_s16_u16(vsubl_u8(tail2, tail0)); + int16x8_t taildy = vqaddq_s16(tail02, tail1x2); + + vst1q_s16(dstx + (width - 8), taildx); + vst1q_s16(dsty + (width - 8), taildy); + } + } + + uint8x8_t vfmask; + uint8x8_t vtmask; + enum { offsetk = 1}; + ptrdiff_t lookLeft; + ptrdiff_t lookRight; +}; + +template <bool L2gradient> +inline void ColFilter3x3Canny(const s16* src0, const s16* src1, const s16* src2, s16* dstx, s16* dsty, s32* mag, ptrdiff_t width) +{ + ptrdiff_t j = 0; + for (; j <= width - 8; j += 8) + { + ColFilter3x3CannyL1Loop: + int16x8_t line0x = vld1q_s16(src0 + j); + int16x8_t line1x = vld1q_s16(src1 + j); + int16x8_t line2x = vld1q_s16(src2 + j); + int16x8_t line0y = vld1q_s16(src0 + j + width); + int16x8_t line2y = vld1q_s16(src2 + j + width); + + int16x8_t l02 = vaddq_s16(line0x, line2x); + int16x8_t l1x2 = vshlq_n_s16(line1x, 1); + int16x8_t dy = vsubq_s16(line2y, line0y); + int16x8_t dx = vaddq_s16(l1x2, l02); + + int16x8_t dya = vabsq_s16(dy); + int16x8_t dxa = vabsq_s16(dx); + int16x8_t norm = vaddq_s16(dya, dxa); + + int32x4_t normh = vmovl_s16(vget_high_s16(norm)); + int32x4_t norml = vmovl_s16(vget_low_s16(norm)); + + vst1q_s16(dsty + j, dy); + vst1q_s16(dstx + j, dx); + vst1q_s32(mag + j + 4, normh); + vst1q_s32(mag + j, norml); + } + if (j != width) + { + j = width - 8; + goto ColFilter3x3CannyL1Loop; + } +} +template <> +inline void ColFilter3x3Canny<true>(const s16* src0, const s16* src1, const s16* src2, s16* dstx, s16* dsty, s32* mag, ptrdiff_t width) +{ + ptrdiff_t j = 0; + for (; j <= width - 8; j += 8) + { + ColFilter3x3CannyL2Loop: + int16x8_t line0x = vld1q_s16(src0 + j); + int16x8_t line1x = vld1q_s16(src1 + j); + int16x8_t line2x = vld1q_s16(src2 + j); + int16x8_t line0y = vld1q_s16(src0 + j + width); + int16x8_t line2y = vld1q_s16(src2 + j + width); + + int16x8_t l02 = vaddq_s16(line0x, line2x); + int16x8_t l1x2 = vshlq_n_s16(line1x, 1); + int16x8_t dy = vsubq_s16(line2y, line0y); + int16x8_t dx = vaddq_s16(l1x2, l02); + + int32x4_t norml = vmull_s16(vget_low_s16(dx), vget_low_s16(dx)); + int32x4_t normh = vmull_s16(vget_high_s16(dy), vget_high_s16(dy)); + + norml = vmlal_s16(norml, vget_low_s16(dy), vget_low_s16(dy)); + normh = vmlal_s16(normh, vget_high_s16(dx), vget_high_s16(dx)); + + vst1q_s16(dsty + j, dy); +
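+ // L2 variant: magnitude = dx*dx + dy*dy, widened to 32 bit with vmull_s16/vmlal_s16 + // and computed separately on the low and high 64-bit halves of each vector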
vst1q_s16(dstx + j, dx); + vst1q_s32(mag + j, norml); + vst1q_s32(mag + j + 4, normh); + } + if (j != width) + { + j = width - 8; + goto ColFilter3x3CannyL2Loop; + } +} + +template <bool L2gradient> +inline void NormCanny(const ptrdiff_t colscn, s16* _dx, s16* _dy, s32* _norm) +{ + ptrdiff_t j = 0; + if (colscn >= 8) + { + int16x8_t vx = vld1q_s16(_dx); + int16x8_t vy = vld1q_s16(_dy); + for (; j <= colscn - 16; j+=8) + { + internal::prefetch(_dx); + internal::prefetch(_dy); + + int16x8_t vx2 = vld1q_s16(_dx + j + 8); + int16x8_t vy2 = vld1q_s16(_dy + j + 8); + + int16x8_t vabsx = vabsq_s16(vx); + int16x8_t vabsy = vabsq_s16(vy); + + int16x8_t norm = vaddq_s16(vabsx, vabsy); + + int32x4_t normh = vmovl_s16(vget_high_s16(norm)); + int32x4_t norml = vmovl_s16(vget_low_s16(norm)); + + vst1q_s32(_norm + j + 4, normh); + vst1q_s32(_norm + j + 0, norml); + + vx = vx2; + vy = vy2; + } + int16x8_t vabsx = vabsq_s16(vx); + int16x8_t vabsy = vabsq_s16(vy); + + int16x8_t norm = vaddq_s16(vabsx, vabsy); + + int32x4_t normh = vmovl_s16(vget_high_s16(norm)); + int32x4_t norml = vmovl_s16(vget_low_s16(norm)); + + vst1q_s32(_norm + j + 4, normh); + vst1q_s32(_norm + j + 0, norml); + } + for (; j < colscn; j++) + _norm[j] = std::abs(s32(_dx[j])) + std::abs(s32(_dy[j])); +} + +template <> +inline void NormCanny<true>(const ptrdiff_t colscn, s16* _dx, s16* _dy, s32* _norm) +{ + ptrdiff_t j = 0; + if (colscn >= 8) + { + int16x8_t vx = vld1q_s16(_dx); + int16x8_t vy = vld1q_s16(_dy); + + for (; j <= colscn - 16; j+=8) + { + internal::prefetch(_dx); + internal::prefetch(_dy); + + int16x8_t vxnext = vld1q_s16(_dx + j + 8); + int16x8_t vynext = vld1q_s16(_dy + j + 8); + + int32x4_t norml = vmull_s16(vget_low_s16(vx), vget_low_s16(vx)); + int32x4_t normh = vmull_s16(vget_high_s16(vy), vget_high_s16(vy)); + + norml = vmlal_s16(norml, vget_low_s16(vy), vget_low_s16(vy)); + normh = vmlal_s16(normh, vget_high_s16(vx), vget_high_s16(vx)); + + vst1q_s32(_norm + j + 0, norml); + vst1q_s32(_norm + j + 4, normh); + + vx = vxnext; + vy = vynext; + } + int32x4_t norml = vmull_s16(vget_low_s16(vx), vget_low_s16(vx)); + int32x4_t normh = vmull_s16(vget_high_s16(vy), vget_high_s16(vy)); + + norml = vmlal_s16(norml, vget_low_s16(vy), vget_low_s16(vy)); + normh = vmlal_s16(normh, vget_high_s16(vx), vget_high_s16(vx)); + + vst1q_s32(_norm + j + 0, norml); + vst1q_s32(_norm + j + 4, normh); + } + for (; j < colscn; j++) + _norm[j] = s32(_dx[j])*_dx[j] + s32(_dy[j])*_dy[j]; +} + +template <bool L2gradient> +inline void prepareThresh(f64 low_thresh, f64 high_thresh, + s32 &low, s32 &high) +{ + if (low_thresh > high_thresh) + std::swap(low_thresh, high_thresh); +#if defined __GNUC__ + low = (s32)low_thresh; + high = (s32)high_thresh; + low -= (low > low_thresh); + high -= (high > high_thresh); +#else + low = internal::round(low_thresh); + high = internal::round(high_thresh); + f32 ldiff = (f32)(low_thresh - low); + f32 hdiff = (f32)(high_thresh - high); + low -= (ldiff < 0); + high -= (hdiff < 0); +#endif +} +template <> +inline void prepareThresh<true>(f64 low_thresh, f64 high_thresh, + s32 &low, s32 &high) +{ + if (low_thresh > high_thresh) + std::swap(low_thresh, high_thresh); + if (low_thresh > 0) low_thresh *= low_thresh; + if (high_thresh > 0) high_thresh *= high_thresh; +#if defined __GNUC__ + low = (s32)low_thresh; + high = (s32)high_thresh; + low -= (low > low_thresh); + high -= (high > high_thresh); +#else + low = internal::round(low_thresh); + high = internal::round(high_thresh); + f32 ldiff = (f32)(low_thresh - low); + f32 hdiff = (f32)(high_thresh - high); + low -=
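+ // both branches amount to floor(threshold): take the truncated/rounded integer and + // subtract 1 whenever it ended up above the exact value; the squaring above lets the + // L2 path compare squared magnitudes against squared thresholds with no sqrt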
(ldiff < 0); + high -= (hdiff < 0); +#endif +} + +template <bool L2gradient, bool channels1> +struct _normEstimator +{ + ptrdiff_t magstep; + ptrdiff_t dxOffset; + ptrdiff_t dyOffset; + ptrdiff_t shxOffset; + ptrdiff_t shyOffset; + std::vector<u8> buffer; + const ptrdiff_t offsetk; + ptrdiff_t borderyt, borderyb; + RowFilter3x3Canny sobelRow; + + inline _normEstimator(const Size2D &size, s32, Margin borderMargin, + ptrdiff_t &mapstep, s32** mag_buf, u8* &map): + offsetk(1), + sobelRow(std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.left), + std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.right)) + { + mapstep = size.width + 2; + magstep = size.width + 2 + size.width * (4 * sizeof(s16)/sizeof(s32)); + dxOffset = mapstep * sizeof(s32)/sizeof(s16); + dyOffset = dxOffset + size.width * 1; + shxOffset = dxOffset + size.width * 2; + shyOffset = dxOffset + size.width * 3; + buffer.resize( (size.width+2)*(size.height+2) + magstep*3*sizeof(s32) ); + mag_buf[0] = (s32*)&buffer[0]; + mag_buf[1] = mag_buf[0] + magstep; + mag_buf[2] = mag_buf[1] + magstep; + memset(mag_buf[0], 0, mapstep * sizeof(s32)); + + map = (u8*)(mag_buf[2] + magstep); + memset(map, 1, mapstep); + memset(map + mapstep*(size.height + 1), 1, mapstep); + borderyt = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.top); + borderyb = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.bottom); + } + inline void firstRow(const Size2D &size, s32, + const u8 *srcBase, ptrdiff_t srcStride, + s16*, ptrdiff_t, + s16*, ptrdiff_t, + s32** mag_buf) + { + //sobelH row #0 + const u8* _src = internal::getRowPtr(srcBase, srcStride, 0); + sobelRow(_src, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[0]) + shyOffset, size.width); + //sobelH row #1 + _src = internal::getRowPtr(srcBase, srcStride, 1); + sobelRow(_src, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[1]) + shyOffset, size.width); + + mag_buf[1][0] = mag_buf[1][size.width+1] = 0; + if (borderyt == 0) + { + //sobelH row #-1 + _src = internal::getRowPtr(srcBase, srcStride, -1); + sobelRow(_src, ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[2]) + shyOffset, size.width); + + ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset, + ((s16*)mag_buf[1]) + dxOffset, ((s16*)mag_buf[1]) + dyOffset, mag_buf[1] + 1, size.width); + } + else + { + ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset, + ((s16*)mag_buf[1]) + dxOffset, ((s16*)mag_buf[1]) + dyOffset, mag_buf[1] + 1, size.width); + } + } + inline void nextRow(const Size2D &size, s32, + const u8 *srcBase, ptrdiff_t srcStride, + s16*, ptrdiff_t, + s16*, ptrdiff_t, + const ptrdiff_t &mapstep, s32** mag_buf, + size_t i, const s16* &_x, const s16* &_y) + { + mag_buf[2][0] = mag_buf[2][size.width+1] = 0; + if (i < size.height - borderyb) + { + const u8* _src = internal::getRowPtr(srcBase, srcStride, i+1); + //sobelH row #i+1 + sobelRow(_src, ((s16*)mag_buf[2]) + shxOffset, ((s16*)mag_buf[2]) + shyOffset, size.width); + + ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[2]) + shxOffset, + ((s16*)mag_buf[2]) + dxOffset, ((s16*)mag_buf[2]) + dyOffset, mag_buf[2] + 1, size.width); + } + else if (i < size.height) + { + ColFilter3x3Canny<L2gradient>( ((s16*)mag_buf[0]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset, ((s16*)mag_buf[1]) + shxOffset, + ((s16*)mag_buf[2]) + dxOffset, ((s16*)mag_buf[2]) + dyOffset, mag_buf[2] + 1, size.width); + } + else + memset(mag_buf[2], 0, mapstep*sizeof(s32)); + _x = ((s16*)mag_buf[1]) + dxOffset; + _y =
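+ // mag_buf[] acts as a 3-row ring buffer (previous / current / next row of magnitudes); + // _x and _y expose the Sobel derivatives of the middle row, the row about to be + // scanned for local maxima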
((s16*)mag_buf[1]) + dyOffset; + } +}; +template <bool L2gradient> +struct _normEstimator<L2gradient, false> +{ + std::vector<u8> buffer; + + inline _normEstimator(const Size2D &size, s32 cn, Margin, + ptrdiff_t &mapstep, s32** mag_buf, u8* &map) + { + mapstep = size.width + 2; + buffer.resize( (size.width+2)*(size.height+2) + cn*mapstep*3*sizeof(s32) ); + mag_buf[0] = (s32*)&buffer[0]; + mag_buf[1] = mag_buf[0] + mapstep*cn; + mag_buf[2] = mag_buf[1] + mapstep*cn; + memset(mag_buf[0], 0, /* cn* */mapstep * sizeof(s32)); + + map = (u8*)(mag_buf[2] + mapstep*cn); + memset(map, 1, mapstep); + memset(map + mapstep*(size.height + 1), 1, mapstep); + } + inline void firstRow(const Size2D &size, s32 cn, + const u8 *, ptrdiff_t, + s16* dxBase, ptrdiff_t dxStride, + s16* dyBase, ptrdiff_t dyStride, + s32** mag_buf) + { + s32* _norm = mag_buf[1] + 1; + + s16* _dx = internal::getRowPtr(dxBase, dxStride, 0); + s16* _dy = internal::getRowPtr(dyBase, dyStride, 0); + + NormCanny<L2gradient>(size.width*cn, _dx, _dy, _norm); + + if(cn > 1) + { + for(size_t j = 0, jn = 0; j < size.width; ++j, jn += cn) + { + size_t maxIdx = jn; + for(s32 k = 1; k < cn; ++k) + if(_norm[jn + k] > _norm[maxIdx]) maxIdx = jn + k; + _norm[j] = _norm[maxIdx]; + _dx[j] = _dx[maxIdx]; + _dy[j] = _dy[maxIdx]; + } + } + + _norm[-1] = _norm[size.width] = 0; + } + inline void nextRow(const Size2D &size, s32 cn, + const u8 *, ptrdiff_t, + s16* dxBase, ptrdiff_t dxStride, + s16* dyBase, ptrdiff_t dyStride, + const ptrdiff_t &mapstep, s32** mag_buf, + size_t i, const s16* &_x, const s16* &_y) + { + s32* _norm = mag_buf[(i > 0) + 1] + 1; + if (i < size.height) + { + s16* _dx = internal::getRowPtr(dxBase, dxStride, i); + s16* _dy = internal::getRowPtr(dyBase, dyStride, i); + + NormCanny<L2gradient>(size.width*cn, _dx, _dy, _norm); + + if(cn > 1) + { + for(size_t j = 0, jn = 0; j < size.width; ++j, jn += cn) + { + size_t maxIdx = jn; + for(s32 k = 1; k < cn; ++k) + if(_norm[jn + k] > _norm[maxIdx]) maxIdx = jn + k; + _norm[j] = _norm[maxIdx]; + _dx[j] = _dx[maxIdx]; + _dy[j] = _dy[maxIdx]; + } + } + + _norm[-1] = _norm[size.width] = 0; + } + else + memset(_norm-1, 0, /* cn* */mapstep*sizeof(s32)); + + _x = internal::getRowPtr(dxBase, dxStride, i-1); + _y = internal::getRowPtr(dyBase, dyStride, i-1); + } +}; + +template <bool L2gradient, bool channels1> +inline void Canny3x3(const Size2D &size, s32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + s16 * dxBase, ptrdiff_t dxStride, + s16 * dyBase, ptrdiff_t dyStride, + f64 low_thresh, f64 high_thresh, + Margin borderMargin) +{ + s32 low, high; + prepareThresh<L2gradient>(low_thresh, high_thresh, low, high); + + ptrdiff_t mapstep; + s32* mag_buf[3]; + u8* map; + _normEstimator<L2gradient, channels1> normEstimator(size, cn, borderMargin, mapstep, mag_buf, map); + + size_t maxsize = std::max<size_t>( 1u << 10, size.width * size.height / 10 ); + std::vector<u8*> stack( maxsize ); + u8 **stack_top = &stack[0]; + u8 **stack_bottom = &stack[0]; + + /* sector numbers + (Top-Left Origin) + + 1 2 3 + * * * + * * * + 0*******0 + * * * + * * * + 3 2 1 + */ + + #define CANNY_PUSH(d) *(d) = u8(2), *stack_top++ = (d) + #define CANNY_POP(d) (d) = *--stack_top + + //i == 0 + normEstimator.firstRow(size, cn, srcBase, srcStride, dxBase, dxStride, dyBase, dyStride, mag_buf); + // calculate magnitude and angle of gradient, perform non-maxima suppression.
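+ // sector test in fixed point, CANNY_SHIFT = 15: TG22 = round(tan(22.5 deg) * 2^15) = 13573, + // so |dy|*2^15 < |dx|*TG22 means the gradient lies within 22.5 deg of horizontal, while + // tg67x = tg22x + |dx|*2^16 = |dx|*2^15*(tan(22.5 deg) + 2) = |dx|*2^15*tan(67.5 deg) + // marks the boundary to the vertical sector; everything in between is diagonal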
+ // fill the map with one of the following values: + // 0 - the pixel might belong to an edge + // 1 - the pixel can not belong to an edge + // 2 - the pixel does belong to an edge + for (size_t i = 1; i <= size.height; i++) + { + const s16 *_x, *_y; + normEstimator.nextRow(size, cn, srcBase, srcStride, dxBase, dxStride, dyBase, dyStride, mapstep, mag_buf, i, _x, _y); + + u8* _map = map + mapstep*i + 1; + _map[-1] = _map[size.width] = 1; + + s32* _mag = mag_buf[1] + 1; // take the central row + ptrdiff_t magstep1 = mag_buf[2] - mag_buf[1]; + ptrdiff_t magstep2 = mag_buf[0] - mag_buf[1]; + + if ((stack_top - stack_bottom) + size.width > maxsize) + { + ptrdiff_t sz = (ptrdiff_t)(stack_top - stack_bottom); + maxsize = maxsize * 3/2; + stack.resize(maxsize); + stack_bottom = &stack[0]; + stack_top = stack_bottom + sz; + } + + s32 prev_flag = 0; + for (ptrdiff_t j = 0; j < (ptrdiff_t)size.width; j++) + { + #define CANNY_SHIFT 15 + const s32 TG22 = (s32)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5); + + s32 m = _mag[j]; + + if (m > low) + { + s32 xs = _x[j]; + s32 ys = _y[j]; + s32 x = abs(xs); + s32 y = abs(ys) << CANNY_SHIFT; + + s32 tg22x = x * TG22; + + if (y < tg22x) + { + if (m > _mag[j-1] && m >= _mag[j+1]) goto __push; + } + else + { + s32 tg67x = tg22x + (x << (CANNY_SHIFT+1)); + if (y > tg67x) + { + if (m > _mag[j+magstep2] && m >= _mag[j+magstep1]) goto __push; + } + else + { + s32 s = (xs ^ ys) < 0 ? -1 : 1; + if(m > _mag[j+magstep2-s] && m > _mag[j+magstep1+s]) goto __push; + } + } + } + prev_flag = 0; + _map[j] = u8(1); + continue; + __push: + if (!prev_flag && m > high && _map[j-mapstep] != 2) + { + CANNY_PUSH(_map + j); + prev_flag = 1; + } + else + _map[j] = 0; + } + + // scroll the ring buffer + _mag = mag_buf[0]; + mag_buf[0] = mag_buf[1]; + mag_buf[1] = mag_buf[2]; + mag_buf[2] = _mag; + } + + // now track the edges (hysteresis thresholding) + while (stack_top > stack_bottom) + { + u8* m; + if ((size_t)(stack_top - stack_bottom) + 8u > maxsize) + { + ptrdiff_t sz = (ptrdiff_t)(stack_top - stack_bottom); + maxsize = maxsize * 3/2; + stack.resize(maxsize); + stack_bottom = &stack[0]; + stack_top = stack_bottom + sz; + } + + CANNY_POP(m); + + if (!m[-1]) CANNY_PUSH(m - 1); + if (!m[1]) CANNY_PUSH(m + 1); + if (!m[-mapstep-1]) CANNY_PUSH(m - mapstep - 1); + if (!m[-mapstep]) CANNY_PUSH(m - mapstep); + if (!m[-mapstep+1]) CANNY_PUSH(m - mapstep + 1); + if (!m[mapstep-1]) CANNY_PUSH(m + mapstep - 1); + if (!m[mapstep]) CANNY_PUSH(m + mapstep); + if (!m[mapstep+1]) CANNY_PUSH(m + mapstep + 1); + } + + // the final pass, form the final image + uint8x16_t v2 = vmovq_n_u8(2); + const u8* ptrmap = map + mapstep + 1; + for (size_t i = 0; i < size.height; i++, ptrmap += mapstep) + { + u8* _dst = internal::getRowPtr(dstBase, dstStride, i); + ptrdiff_t j = 0; + for (; j < (ptrdiff_t)size.width - 16; j += 16) + { + internal::prefetch(ptrmap); + uint8x16_t vmap = vld1q_u8(ptrmap + j); + uint8x16_t vdst = vceqq_u8(vmap, v2); + vst1q_u8(_dst+j, vdst); + } + for (; j < (ptrdiff_t)size.width; j++) + _dst[j] = (u8)-(ptrmap[j] >> 1); + } +} + +} // namespace +#endif + +bool isCanny3x3Supported(const Size2D &size) +{ + return isSupportedConfiguration() && + size.height >= 2 && size.width >= 9; +} + +void Canny3x3L1(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 low_thresh, f64 high_thresh, + Margin borderMargin) +{ + internal::assertSupportedConfiguration(isCanny3x3Supported(size)); +#ifdef CAROTENE_NEON + Canny3x3<false, true>(size, 1, + srcBase, srcStride, + dstBase, dstStride, + NULL, 0, +
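+ // dx/dy buffers are not used on this path: with channels1 == true the Sobel + // derivatives are computed internally, row by row, by the _normEstimator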
NULL, 0, + low_thresh, high_thresh, + borderMargin); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)low_thresh; + (void)high_thresh; + (void)borderMargin; +#endif +} + +void Canny3x3L2(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 low_thresh, f64 high_thresh, + Margin borderMargin) +{ + internal::assertSupportedConfiguration(isCanny3x3Supported(size)); +#ifdef CAROTENE_NEON + Canny3x3<true, true>(size, 1, + srcBase, srcStride, + dstBase, dstStride, + NULL, 0, + NULL, 0, + low_thresh, high_thresh, + borderMargin); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)low_thresh; + (void)high_thresh; + (void)borderMargin; +#endif +} + +void Canny3x3L1(const Size2D &size, s32 cn, + s16 * dxBase, ptrdiff_t dxStride, + s16 * dyBase, ptrdiff_t dyStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 low_thresh, f64 high_thresh) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Canny3x3<false, false>(size, cn, + NULL, 0, + dstBase, dstStride, + dxBase, dxStride, + dyBase, dyStride, + low_thresh, high_thresh, + Margin()); +#else + (void)size; + (void)cn; + (void)dstBase; + (void)dstStride; + (void)dxBase; + (void)dxStride; + (void)dyBase; + (void)dyStride; + (void)low_thresh; + (void)high_thresh; +#endif +} + +void Canny3x3L2(const Size2D &size, s32 cn, + s16 * dxBase, ptrdiff_t dxStride, + s16 * dyBase, ptrdiff_t dyStride, + u8 * dstBase, ptrdiff_t dstStride, + f64 low_thresh, f64 high_thresh) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Canny3x3<true, false>(size, cn, + NULL, 0, + dstBase, dstStride, + dxBase, dxStride, + dyBase, dyStride, + low_thresh, high_thresh, + Margin()); +#else + (void)size; + (void)cn; + (void)dstBase; + (void)dstStride; + (void)dxBase; + (void)dxStride; + (void)dyBase; + (void)dyStride; + (void)low_thresh; + (void)high_thresh; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/channel_extract.cpp b/3rdparty/carotene/src/channel_extract.cpp new file mode 100644 index 0000000000..fda8f6e153 --- /dev/null +++ b/3rdparty/carotene/src/channel_extract.cpp @@ -0,0 +1,486 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission.
+ * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "vtransform.hpp" + +namespace CAROTENE_NS { + +void extract2(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + u32 coi) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#ifndef ANDROID + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; +#endif + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + +#ifndef ANDROID + for (; dj < roiw32; sj += 64, dj += 32) + { + internal::prefetch(src + sj); + + uint8x16x2_t v_src = vld2q_u8(src + sj); + vst1q_u8(dst + dj, v_src.val[coi]); + + v_src = vld2q_u8(src + sj + 32); + vst1q_u8(dst + dj + 16, v_src.val[coi]); + } +#endif + + for (; dj < roiw8; sj += 16, dj += 8) + { + uint8x8x2_t v_src = vld2_u8(src + sj); + vst1_u8(dst + dj, v_src.val[coi]); + } + + for (; dj < size.width; sj += 2, ++dj) + { + dst[dj] = src[sj + coi]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)coi; +#endif +} + +void extract3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + u32 coi) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#ifndef ANDROID + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; +#endif + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + +#ifndef ANDROID + for (; dj < roiw32; sj += 96, dj += 32) + { + internal::prefetch(src + sj); + + uint8x16x3_t v_src = vld3q_u8(src + sj); + vst1q_u8(dst + dj, v_src.val[coi]); + + v_src = vld3q_u8(src + sj + 48); + vst1q_u8(dst + dj + 16, v_src.val[coi]); + } +#endif + + for (; dj < roiw8; sj += 24, dj += 8) + { + uint8x8x3_t v_src = vld3_u8(src + sj); + vst1_u8(dst + dj, v_src.val[coi]); + } + + for (; dj < size.width; sj += 3, ++dj) + { + dst[dj] = src[sj + coi]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)coi; +#endif +} + +void extract4(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + u32 coi) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#ifndef ANDROID + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; +#endif + size_t roiw8 = size.width >= 7 ? 
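+ // each vld4_u8 iteration consumes 8 destination pixels (32 interleaved source bytes), + // so the vector loops stop while a full block still remains and the scalar loop + // below handles the last 0..7 pixels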
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + +#ifndef ANDROID + for (; dj < roiw32; sj += 128, dj += 32) + { + internal::prefetch(src + sj); + + uint8x16x4_t v_src = vld4q_u8(src + sj); + vst1q_u8(dst + dj, v_src.val[coi]); + + v_src = vld4q_u8(src + sj + 64); + vst1q_u8(dst + dj + 16, v_src.val[coi]); + } +#endif + + for (; dj < roiw8; sj += 32, dj += 8) + { + uint8x8x4_t v_src = vld4_u8(src + sj); + vst1_u8(dst + dj, v_src.val[coi]); + } + + for (; dj < size.width; sj += 4, ++dj) + { + dst[dj] = src[sj + coi]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)coi; +#endif +} + +#define FILL_LINES2(macro,type) \ + macro##_LINE(type,0) \ + macro##_LINE(type,1) +#define FILL_LINES3(macro,type) \ + FILL_LINES2(macro,type) \ + macro##_LINE(type,2) +#define FILL_LINES4(macro,type) \ + FILL_LINES3(macro,type) \ + macro##_LINE(type,3) + +#define FARG_LINE(type, n) , type * dst##n##Base, ptrdiff_t dst##n##Stride + +#ifdef CAROTENE_NEON + +#define VROW_LINE(type, n) type * dst##n = internal::getRowPtr(dst##n##Base, dst##n##Stride, i); +#define VST1Q_LINE(type, n) vst1q_##type(dst##n + dj, v_src.val[n]); +#define VST1_LINE(type, n) vst1_##type(dst##n + dj, v_src.val[n]); +#define SST_LINE(type, n) dst##n[dj] = src[sj + n]; + +#define MUL2(val) (val << 1) +#define MUL3(val) (MUL2(val) + val) +#define MUL4(val) (val << 2) + +#define CONTDST2 srcStride == dst0Stride && \ + srcStride == dst1Stride && +#define CONTDST3 srcStride == dst0Stride && \ + srcStride == dst1Stride && \ + srcStride == dst2Stride && +#define CONTDST4 srcStride == dst0Stride && \ + srcStride == dst1Stride && \ + srcStride == dst2Stride && \ + srcStride == dst3Stride && + +#if __GNUC__ == 4 && __GNUC_MINOR__ < 7 + +#define SPLIT_ASM2(sgn, bits) __asm__ ( \ + "vld2." #bits " {d0, d2}, [%[in0]] \n\t" \ + "vld2." #bits " {d1, d3}, [%[in1]] \n\t" \ + "vst1." #bits " {d0-d1}, [%[out0]] \n\t" \ + "vst1." #bits " {d2-d3}, [%[out1]] \n\t" \ + : \ + : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), \ + [in0] "r" (src + sj), [in1] "r" (src + sj + MUL2(8)/sizeof(sgn##bits)) \ + : "d0","d1","d2","d3" \ + ); +#define SPLIT_ASM3(sgn, bits) __asm__ ( \ + "vld3." #bits " {d0, d2, d4}, [%[in0]] \n\t" \ + "vld3." #bits " {d1, d3, d5}, [%[in1]] \n\t" \ + "vst1." #bits " {d0-d1}, [%[out0]] \n\t" \ + "vst1." #bits " {d2-d3}, [%[out1]] \n\t" \ + "vst1." #bits " {d4-d5}, [%[out2]] \n\t" \ + : \ + : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), \ + [in0] "r" (src + sj), [in1] "r" (src + sj + MUL3(8)/sizeof(sgn##bits)) \ + : "d0","d1","d2","d3","d4","d5" \ + ); +#define SPLIT_ASM4(sgn, bits) __asm__ ( \ + "vld4." #bits " {d0, d2, d4, d6}, [%[in0]] \n\t" \ + "vld4." #bits " {d1, d3, d5, d7}, [%[in1]] \n\t" \ + "vst1." #bits " {d0-d1}, [%[out0]] \n\t" \ + "vst1." #bits " {d2-d3}, [%[out1]] \n\t" \ + "vst1." #bits " {d4-d5}, [%[out2]] \n\t" \ + "vst1." 
#bits " {d6-d7}, [%[out3]] \n\t" \ + : \ + : [out0] "r" (dst0 + dj), [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), [out3] "r" (dst3 + dj), \ + [in0] "r" (src + sj), [in1] "r" (src + sj + MUL4(8)/sizeof(sgn##bits)) \ + : "d0","d1","d2","d3","d4","d5","d6","d7" \ + ); + +#define SPLIT_QUAD(sgn, bits, n) { \ + internal::prefetch(src + sj); \ + SPLIT_ASM##n(sgn, bits) \ + } + +#else + +#define SPLIT_QUAD(sgn, bits, n) { \ + internal::prefetch(src + sj); \ + vec128 v_src = vld##n##q_##sgn##bits(src + sj); \ + FILL_LINES##n(VST1Q, sgn##bits) \ + } + +#endif // __GNUC__ == 4 && __GNUC_MINOR__ < 7 + +#define SPLIT(sgn,bits,n) void split##n(const Size2D &_size, \ + const sgn##bits * srcBase, ptrdiff_t srcStride \ + FILL_LINES##n(FARG, sgn##bits) ) \ +{ \ + internal::assertSupportedConfiguration(); \ + Size2D size(_size); \ + if (CONTDST##n \ + dst0Stride == (ptrdiff_t)(size.width)) \ + { \ + size.width *= size.height; \ + size.height = 1; \ + } \ + typedef internal::VecTraits::vec128 vec128; \ + size_t roiw16 = size.width >= (16/sizeof(sgn##bits)-1) ? size.width - (16/sizeof(sgn##bits)-1) : 0; \ + typedef internal::VecTraits::vec64 vec64; \ + size_t roiw8 = size.width >= (8/sizeof(sgn##bits)-1) ? size.width - (8/sizeof(sgn##bits)-1) : 0; \ + \ + for (size_t i = 0u; i < size.height; ++i) \ + { \ + const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i); \ + FILL_LINES##n(VROW, sgn##bits) \ + size_t sj = 0u, dj = 0u; \ + \ + for (; dj < roiw16; sj += MUL##n(16)/sizeof(sgn##bits), dj += 16/sizeof(sgn##bits)) \ + SPLIT_QUAD(sgn, bits, n) \ + \ + if (dj < roiw8) \ + { \ + vec64 v_src = vld##n##_##sgn##bits(src + sj); \ + FILL_LINES##n(VST1, sgn##bits) \ + sj += MUL##n(8)/sizeof(sgn##bits); \ + dj += 8/sizeof(sgn##bits); \ + } \ + \ + for (; dj < size.width; sj += n, ++dj) \ + { \ + FILL_LINES##n(SST, sgn##bits) \ + } \ + } \ +} + +#define SPLIT64(sgn,n) void split##n(const Size2D &_size, \ + const sgn##64 * srcBase, ptrdiff_t srcStride \ + FILL_LINES##n(FARG, sgn##64) ) \ +{ \ + internal::assertSupportedConfiguration(); \ + Size2D size(_size); \ + if (CONTDST##n \ + dst0Stride == (ptrdiff_t)(size.width)) \ + { \ + size.width *= size.height; \ + size.height = 1; \ + } \ + typedef internal::VecTraits::vec64 vec64; \ + \ + for (size_t i = 0u; i < size.height; ++i) \ + { \ + const sgn##64 * src = internal::getRowPtr(srcBase, srcStride, i); \ + FILL_LINES##n(VROW, sgn##64) \ + size_t sj = 0u, dj = 0u; \ + \ + for (; dj < size.width; sj += n, ++dj) \ + { \ + vec64 v_src = vld##n##_##sgn##64(src + sj); \ + FILL_LINES##n(VST1, sgn##64) \ + } \ + } \ +} + +#if __GNUC__ == 4 && __GNUC_MINOR__ < 7 + +#define ALPHA_QUAD(sgn, bits) { \ + internal::prefetch(src + sj); \ + __asm__ ( \ + "vld4." #bits " {d0, d2, d4, d6}, [%[in0]] \n\t" \ + "vld4." #bits " {d1, d3, d5, d7}, [%[in1]] \n\t" \ + "vst3." #bits " {d0, d2, d4}, [%[out3_1]] \n\t" \ + "vst3." #bits " {d1, d3, d5}, [%[out3_2]] \n\t" \ + "vst1." 
#bits " {d6-d7}, [%[out1]] \n\t" \ + : \ + : [out3_1] "r" (dst3 + d3j), [out3_2] "r" (dst3 + d3j + 24/sizeof(sgn##bits)), [out1] "r" (dst1 + d1j), \ + [in0] "r" (src + sj), [in1] "r" (src + sj + 32/sizeof(sgn##bits)) \ + : "d0","d1","d2","d3","d4","d5","d6","d7" \ + ); \ + } + +#else + +#define ALPHA_QUAD(sgn, bits) { \ + internal::prefetch(src + sj); \ + union { vec128_4 v4; vec128_3 v3; } vals; \ + vals.v4 = vld4q_##sgn##bits(src + sj); \ + vst3q_##sgn##bits(dst3 + d3j, vals.v3); \ + vst1q_##sgn##bits(dst1 + d1j, vals.v4.val[3]); \ + } + +#endif // __GNUC__ == 4 && __GNUC_MINOR__ < 7 + +#define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &_size, \ + const sgn##bits * srcBase, ptrdiff_t srcStride, \ + sgn##bits * dst3Base, ptrdiff_t dst3Stride, \ + sgn##bits * dst1Base, ptrdiff_t dst1Stride) \ +{ \ + internal::assertSupportedConfiguration(); \ + Size2D size(_size); \ + if (srcStride == dst3Stride && \ + srcStride == dst1Stride && \ + srcStride == (ptrdiff_t)(size.width)) \ + { \ + size.width *= size.height; \ + size.height = 1; \ + } \ + typedef internal::VecTraits::vec128 vec128_4; \ + typedef internal::VecTraits::vec128 vec128_3; \ + size_t roiw16 = size.width >= (16/sizeof(sgn##bits)-1) ? size.width - (16/sizeof(sgn##bits)-1) : 0; \ + typedef internal::VecTraits::vec64 vec64_4; \ + typedef internal::VecTraits::vec64 vec64_3; \ + size_t roiw8 = size.width >= (8/sizeof(sgn##bits)-1) ? size.width - (8/sizeof(sgn##bits)-1) : 0; \ + \ + for (size_t i = 0u; i < size.height; ++i) \ + { \ + const sgn##bits * src = internal::getRowPtr(srcBase, srcStride, i); \ + sgn##bits * dst3 = internal::getRowPtr(dst3Base, dst3Stride, i); \ + sgn##bits * dst1 = internal::getRowPtr(dst1Base, dst1Stride, i); \ + size_t sj = 0u, d3j = 0u, d1j = 0u; \ + \ + for (; d1j < roiw16; sj += MUL4(16)/sizeof(sgn##bits), d3j += MUL3(16)/sizeof(sgn##bits), \ + d1j += 16/sizeof(sgn##bits)) \ + ALPHA_QUAD(sgn, bits) \ + \ + if (d1j < roiw8) \ + { \ + union { vec64_4 v4; vec64_3 v3; } vals; \ + vals.v4 = vld4_##sgn##bits(src + sj); \ + vst3_u8(dst3 + d3j, vals.v3); \ + vst1_u8(dst1 + d1j, vals.v4.val[3]); \ + sj += MUL4(8)/sizeof(sgn##bits); \ + d3j += MUL3(8)/sizeof(sgn##bits); \ + d1j += 8/sizeof(sgn##bits); \ + } \ + \ + for (; d1j < size.width; sj += 4, d3j += 3, ++d1j) \ + { \ + dst3[d3j+0] = src[sj + 0]; \ + dst3[d3j+1] = src[sj + 1]; \ + dst3[d3j+2] = src[sj + 2]; \ + dst1[d1j] = src[sj + 3]; \ + } \ + } \ +} + +#else + +#define VOID_LINE(type, n) (void)dst##n##Base; (void)dst##n##Stride; + +#define SPLIT(sgn,bits,n) void split##n(const Size2D &size, \ + const sgn##bits * srcBase, ptrdiff_t srcStride \ + FILL_LINES##n(FARG, sgn##bits) ) \ +{ \ + internal::assertSupportedConfiguration(); \ + (void)size; \ + (void)srcBase; \ + (void)srcStride; \ + FILL_LINES##n(VOID, sgn##bits) \ +} + +#define SPLIT64(sgn,n) SPLIT(sgn,64,n) + +#define SPLIT4ALPHA(sgn,bits) void split4(const Size2D &size, \ + const sgn##bits * srcBase, ptrdiff_t srcStride, \ + sgn##bits * dst3Base, ptrdiff_t dst3Stride, \ + sgn##bits * dst1Base, ptrdiff_t dst1Stride) \ +{ \ + internal::assertSupportedConfiguration(); \ + (void)size; \ + (void)srcBase; \ + (void)srcStride; \ + (void)dst3Base; \ + (void)dst3Stride; \ + (void)dst1Base; \ + (void)dst1Stride; \ +} + +#endif //CAROTENE_NEON + +SPLIT(u, 8,2) +SPLIT(u, 8,3) +SPLIT(u, 8,4) +SPLIT(u,16,2) +SPLIT(u,16,3) +SPLIT(u,16,4) +SPLIT(s,32,2) +SPLIT(s,32,3) +SPLIT(s,32,4) + +SPLIT64(s, 2) +SPLIT64(s, 3) +SPLIT64(s, 4) + +SPLIT4ALPHA(u,8) + +} // namespace CAROTENE_NS diff --git 
a/3rdparty/carotene/src/channels_combine.cpp b/3rdparty/carotene/src/channels_combine.cpp new file mode 100644 index 0000000000..32b71470e2 --- /dev/null +++ b/3rdparty/carotene/src/channels_combine.cpp @@ -0,0 +1,389 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */ + +#include "common.hpp" +#include "vtransform.hpp" + +namespace CAROTENE_NS { + +#define FILL_LINES2(macro,type) \ + macro##_LINE(type,0) \ + macro##_LINE(type,1) +#define FILL_LINES3(macro,type) \ + FILL_LINES2(macro,type) \ + macro##_LINE(type,2) +#define FILL_LINES4(macro,type) \ + FILL_LINES3(macro,type) \ + macro##_LINE(type,3) + +#define FARG_LINE(type, n) , const type * src##n##Base, ptrdiff_t src##n##Stride + +#ifdef CAROTENE_NEON + +#define VROW_LINE(type, n) const type * src##n = internal::getRowPtr(src##n##Base, src##n##Stride, i); +#define PREF_LINE(type, n) internal::prefetch(src##n + sj); +#define VLD1Q_LINE(type, n) v_dst.val[n] = vld1q_##type(src##n + sj); +#define PRLD_LINE(type, n) internal::prefetch(src##n + sj); v_dst.val[n] = vld1q_##type(src##n + sj); +#define VLD1_LINE(type, n) v_dst.val[n] = vld1_##type(src##n + sj); +#define SLD_LINE(type, n) dst[dj + n] = src##n[sj]; + +#define MUL2(val) (val << 1) +#define MUL3(val) (MUL2(val) + val) +#define MUL4(val) (val << 2) + +#define CONTSRC2 dstStride == src0Stride && \ + dstStride == src1Stride && +#define CONTSRC3 dstStride == src0Stride && \ + dstStride == src1Stride && \ + dstStride == src2Stride && +#define CONTSRC4 dstStride == src0Stride && \ + dstStride == src1Stride && \ + dstStride == src2Stride && \ + dstStride == src3Stride && + +#if __GNUC__ == 4 && __GNUC_MINOR__ < 7 + +#define MERGE_ASM2(sgn, bits) __asm__ ( \ + "vld1." #bits " {d0-d1}, [%[in0]] \n\t" \ + "vld1." #bits " {d2-d3}, [%[in1]] \n\t" \ + "vst2." #bits " {d0, d2}, [%[out0]] \n\t" \ + "vst2." #bits " {d1, d3}, [%[out1]] \n\t" \ + : \ + : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), \ + [out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL2(8)/sizeof(sgn##bits)) \ + : "d0","d1","d2","d3" \ + ); +#define MERGE_ASM3(sgn, bits) __asm__ ( \ + "vld1." #bits " {d0-d1}, [%[in0]] \n\t" \ + "vld1." #bits " {d2-d3}, [%[in1]] \n\t" \ + "vld1." #bits " {d4-d5}, [%[in2]] \n\t" \ + "vst3." #bits " {d0, d2, d4}, [%[out0]] \n\t" \ + "vst3." #bits " {d1, d3, d5}, [%[out1]] \n\t" \ + : \ + : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), \ + [out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL3(8)/sizeof(sgn##bits)) \ + : "d0","d1","d2","d3","d4","d5" \ + ); +#define MERGE_ASM4(sgn, bits) __asm__ ( \ + "vld1." #bits " {d0-d1}, [%[in0]] \n\t" \ + "vld1." #bits " {d2-d3}, [%[in1]] \n\t" \ + "vld1." #bits " {d4-d5}, [%[in2]] \n\t" \ + "vld1." #bits " {d6-d7}, [%[in3]] \n\t" \ + "vst4." #bits " {d0, d2, d4, d6}, [%[out0]] \n\t" \ + "vst4." 
#bits " {d1, d3, d5, d7}, [%[out1]] \n\t" \ + : \ + : [in0] "r" (src0 + sj), [in1] "r" (src1 + sj), [in2] "r" (src2 + sj), [in3] "r" (src3 + sj), \ + [out0] "r" (dst + dj), [out1] "r" (dst + dj + MUL4(8)/sizeof(sgn##bits)) \ + : "d0","d1","d2","d3","d4","d5","d6","d7" \ + ); + +#define MERGE_QUAD(sgn, bits, n) { \ + FILL_LINES##n(PREF, sgn##bits) \ + MERGE_ASM##n(sgn, bits) \ + } + +#else + +#define MERGE_QUAD(sgn, bits, n) { \ + vec128 v_dst; \ + /*FILL_LINES##n(PREF, sgn##bits) \ + FILL_LINES##n(VLD1Q, sgn##bits)*/ \ + FILL_LINES##n(PRLD, sgn##bits) \ + vst##n##q_##sgn##bits(dst + dj, v_dst); \ + } + +#endif // __GNUC__ == 4 && __GNUC_MINOR__ < 7 + +#define COMBINE(sgn,bits,n) void combine##n(const Size2D &_size \ + FILL_LINES##n(FARG, sgn##bits), \ + sgn##bits * dstBase, ptrdiff_t dstStride) \ +{ \ + internal::assertSupportedConfiguration(); \ + Size2D size(_size); \ + if (CONTSRC##n \ + dstStride == (ptrdiff_t)(size.width)) \ + { \ + size.width *= size.height; \ + size.height = 1; \ + } \ + typedef internal::VecTraits::vec128 vec128; \ + size_t roiw16 = size.width >= (16/sizeof(sgn##bits) - 1) ? size.width - (16/sizeof(sgn##bits) - 1) : 0; \ + typedef internal::VecTraits::vec64 vec64; \ + size_t roiw8 = size.width >= (8/sizeof(sgn##bits) - 1) ? size.width - (8/sizeof(sgn##bits) - 1) : 0; \ + \ + for (size_t i = 0u; i < size.height; ++i) \ + { \ + FILL_LINES##n(VROW, sgn##bits) \ + sgn##bits * dst = internal::getRowPtr(dstBase, dstStride, i); \ + size_t sj = 0u, dj = 0u; \ + \ + for (; sj < roiw16; sj += 16/sizeof(sgn##bits), dj += MUL##n(16)/sizeof(sgn##bits)) \ + MERGE_QUAD(sgn, bits, n) \ + \ + if ( sj < roiw8 ) \ + { \ + vec64 v_dst; \ + FILL_LINES##n(VLD1, sgn##bits) \ + vst##n##_##sgn##bits(dst + dj, v_dst); \ + sj += 8/sizeof(sgn##bits); dj += MUL##n(8)/sizeof(sgn##bits); \ + } \ + \ + for (; sj < size.width; ++sj, dj += n) \ + { \ + FILL_LINES##n(SLD, sgn##bits) \ + } \ + } \ +} + +#define COMBINE64(sgn,n) void combine##n(const Size2D &_size \ + FILL_LINES##n(FARG, sgn##64), \ + sgn##64 * dstBase, ptrdiff_t dstStride) \ +{ \ + internal::assertSupportedConfiguration(); \ + Size2D size(_size); \ + if (CONTSRC##n \ + dstStride == (ptrdiff_t)(size.width)) \ + { \ + size.width *= size.height; \ + size.height = 1; \ + } \ + typedef internal::VecTraits::vec64 vec64; \ + \ + for (size_t i = 0u; i < size.height; ++i) \ + { \ + FILL_LINES##n(VROW, sgn##64) \ + sgn##64 * dst = internal::getRowPtr(dstBase, dstStride, i); \ + size_t sj = 0u, dj = 0u; \ + \ + for (; sj < size.width; ++sj, dj += n) \ + { \ + vec64 v_dst; \ + FILL_LINES##n(VLD1, sgn##64) \ + vst##n##_##sgn##64(dst + dj, v_dst); \ + /*FILL_LINES##n(SLD, sgn##64)*/ \ + } \ + } \ +} + +#else + +#define VOID_LINE(type, n) (void)src##n##Base; (void)src##n##Stride; + +#define COMBINE(sgn,bits,n) void combine##n(const Size2D &size \ + FILL_LINES##n(FARG, sgn##bits), \ + sgn##bits * dstBase, ptrdiff_t dstStride) \ +{ \ + internal::assertSupportedConfiguration(); \ + (void)size; \ + FILL_LINES##n(VOID, sgn##bits) \ + (void)dstBase; \ + (void)dstStride; \ +} +#define COMBINE64(sgn,n) COMBINE(sgn,64,n) + +#endif //CAROTENE_NEON + +COMBINE(u, 8,2) +COMBINE(u, 8,3) +COMBINE(u, 8,4) +COMBINE(u,16,2) +COMBINE(u,16,3) +COMBINE(u,16,4) +COMBINE(s,32,2) +COMBINE(s,32,3) +COMBINE(s,32,4) +COMBINE64(s, 2) +COMBINE64(s, 3) +COMBINE64(s, 4) + +void combineYUYV(const Size2D &size, + const u8 * srcyBase, ptrdiff_t srcyStride, + const u8 * srcuBase, ptrdiff_t srcuStride, + const u8 * srcvBase, ptrdiff_t srcvStride, + u8 * dstBase, ptrdiff_t dstStride) 
+{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#ifndef ANDROID + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; +#endif + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; i += 1) + { + const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i); + const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i); + const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t syj = 0u, sj = 0u, dj = 0u; + +#ifndef ANDROID + for (; sj < roiw32; sj += 32, syj += 64, dj += 128) + { + internal::prefetch(srcy + syj); + internal::prefetch(srcu + sj); + internal::prefetch(srcv + sj); + + uint8x16x2_t v_y = vld2q_u8(srcy + syj); + uint8x16x4_t v_dst; + v_dst.val[0] = v_y.val[0]; + v_dst.val[1] = vld1q_u8(srcu + sj); + v_dst.val[2] = v_y.val[1]; + v_dst.val[3] = vld1q_u8(srcv + sj); + vst4q_u8(dst + dj, v_dst); + + v_y = vld2q_u8(srcy + syj + 32); + v_dst.val[0] = v_y.val[0]; + v_dst.val[1] = vld1q_u8(srcu + sj + 16); + v_dst.val[2] = v_y.val[1]; + v_dst.val[3] = vld1q_u8(srcv + sj + 16); + vst4q_u8(dst + dj + 64, v_dst); + } +#endif + + for (; sj < roiw8; sj += 8, syj += 16, dj += 32) + { + uint8x8x2_t v_y = vld2_u8(srcy + syj); + uint8x8x4_t v_dst; + v_dst.val[0] = v_y.val[0]; + v_dst.val[1] = vld1_u8(srcu + sj); + v_dst.val[2] = v_y.val[1]; + v_dst.val[3] = vld1_u8(srcv + sj); + vst4_u8(dst + dj, v_dst); + } + + for (; sj < size.width; ++sj, syj += 2, dj += 4) + { + dst[dj] = srcy[syj]; + dst[dj + 1] = srcu[sj]; + dst[dj + 2] = srcy[syj + 1]; + dst[dj + 3] = srcv[sj]; + } + } +#else + (void)size; + (void)srcyBase; + (void)srcyStride; + (void)srcuBase; + (void)srcuStride; + (void)srcvBase; + (void)srcvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void combineUYVY(const Size2D &size, + const u8 * srcyBase, ptrdiff_t srcyStride, + const u8 * srcuBase, ptrdiff_t srcuStride, + const u8 * srcvBase, ptrdiff_t srcvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#ifndef ANDROID + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; +#endif + size_t roiw8 = size.width >= 7 ? 
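+ // Note (added commentary, not part of the original patch): the roiw
+ // bounds are computed with an explicit >= guard so the unsigned
+ // subtraction cannot wrap when a row is narrower than one vector.
+ // With roiw8 = width - 7, the loop condition sj < roiw8 guarantees
+ // sj + 8 <= width, so a full 8-lane load/store never runs past the row;
+ // roiw32 plays the same role for the 32-pixel main loop.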
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, i); + const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, i); + const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t syj = 0u, sj = 0u, dj = 0u; + +#ifndef ANDROID + for (; sj < roiw32; sj += 32, syj += 64, dj += 128) + { + internal::prefetch(srcy + syj); + internal::prefetch(srcu + sj); + internal::prefetch(srcv + sj); + + uint8x16x2_t v_y = vld2q_u8(srcy + syj); + uint8x16x4_t v_dst; + v_dst.val[0] = vld1q_u8(srcu + sj); + v_dst.val[1] = v_y.val[0]; + v_dst.val[2] = vld1q_u8(srcv + sj); + v_dst.val[3] = v_y.val[1]; + vst4q_u8(dst + dj, v_dst); + + v_y = vld2q_u8(srcy + syj + 32); + v_dst.val[0] = vld1q_u8(srcu + sj + 16); + v_dst.val[1] = v_y.val[0]; + v_dst.val[2] = vld1q_u8(srcv + sj + 16); + v_dst.val[3] = v_y.val[1]; + vst4q_u8(dst + dj + 64, v_dst); + } +#endif + + for (; sj < roiw8; sj += 8, syj += 16, dj += 32) + { + uint8x8x2_t v_y = vld2_u8(srcy + syj); + uint8x8x4_t v_dst; + v_dst.val[0] = vld1_u8(srcu + sj); + v_dst.val[1] = v_y.val[0]; + v_dst.val[2] = vld1_u8(srcv + sj); + v_dst.val[3] = v_y.val[1]; + vst4_u8(dst + dj, v_dst); + } + + for (; sj < size.width; ++sj, syj += 2, dj += 4) + { + dst[dj] = srcu[sj]; + dst[dj + 1] = srcy[syj]; + dst[dj + 2] = srcv[sj]; + dst[dj + 3] = srcy[syj + 1]; + } + } +#else + (void)size; + (void)srcyBase; + (void)srcyStride; + (void)srcuBase; + (void)srcuStride; + (void)srcvBase; + (void)srcvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/cmp.cpp b/3rdparty/carotene/src/cmp.cpp new file mode 100644 index 0000000000..eda121985e --- /dev/null +++ b/3rdparty/carotene/src/cmp.cpp @@ -0,0 +1,340 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "vtransform.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +inline void vnst(u8* dst, uint8x16_t v1, uint8x16_t v2) { vst1q_u8(dst, v1); vst1q_u8(dst+16, v2); } +inline void vnst(u8* dst, uint16x8_t v1, uint16x8_t v2) { vst1q_u8(dst, vcombine_u8(vmovn_u16(v1), vmovn_u16(v2))); } +inline void vnst(u8* dst, uint32x4_t v1, uint32x4_t v2) { vst1_u8(dst, vmovn_u16(vcombine_u16(vmovn_u32(v1), vmovn_u32(v2)))); } + +template <typename Op, int elsize> struct vtail +{ + static inline void compare(const typename Op::type * src0, const typename Op::type * src1, + u8 * dst, const Op & op, + size_t &x, size_t width) + { + // do nothing since there cannot be enough data + (void)src0; + (void)src1; + (void)dst; + (void)op; + (void)x; + (void)width; + } +}; +template <typename Op> struct vtail<Op, 2> +{ + static inline void compare(const typename Op::type * src0, const typename Op::type * src1, + u8 * dst, const Op & op, + size_t &x, size_t width) + { + typedef typename Op::type type; + typedef typename internal::VecTraits<type>::vec128 vec128; + typedef typename internal::VecTraits<type>::unsign::vec128 uvec128; + // There are no more than 15 elements in the tail, so we can handle an 8-element vector only once + if( x + 8 < width) + { + vec128 v_src0, v_src1; + uvec128 v_dst; + + v_src0 = internal::vld1q(src0 + x); + v_src1 = internal::vld1q(src1 + x); + op(v_src0, v_src1, v_dst); + internal::vst1(dst + x, internal::vmovn(v_dst)); + x+=8; + } + } +}; +template <typename Op> struct vtail<Op, 1> +{ + static inline void compare(const typename Op::type * src0, const typename Op::type * src1, + u8 * dst, const Op & op, + size_t &x, size_t width) + { + typedef typename Op::type type; + typedef typename internal::VecTraits<type>::vec128 vec128; + typedef typename internal::VecTraits<type>::unsign::vec128 uvec128; + typedef typename internal::VecTraits<type>::vec64 vec64; + typedef typename internal::VecTraits<type>::unsign::vec64 uvec64; + // There are no more than 31 elements in the tail, so we can handle 16+8, 16, or 8 elements at once + if( x + 16 < width) + { + vec128 v_src0, v_src1; + uvec128 v_dst; + + v_src0 = internal::vld1q(src0 + x); + v_src1 = internal::vld1q(src1 + x); + op(v_src0, v_src1, v_dst); + internal::vst1q(dst + x, v_dst); + x+=16; + } + if( x + 8 < width) + { + vec64 v_src0, v_src1; + uvec64 v_dst; + + v_src0 = internal::vld1(src0 + x); + v_src1 = internal::vld1(src1 + x); + op(v_src0, v_src1, v_dst); + internal::vst1(dst + x, v_dst); + x+=8; + } + } +}; + +template <typename Op> +void vcompare(Size2D size, + const typename Op::type * src0Base, ptrdiff_t src0Stride, + const typename Op::type * src1Base, ptrdiff_t src1Stride, + u8 * dstBase, ptrdiff_t dstStride, const Op & op) +{ + typedef typename Op::type type; + typedef typename internal::VecTraits<type>::vec128 vec128; + typedef typename internal::VecTraits<type>::unsign::vec128 uvec128; + + if (src0Stride == src1Stride && src0Stride == dstStride && + src0Stride == (ptrdiff_t)(size.width * sizeof(type))) + { + size.width *= size.height; + size.height = 1; + } + + const u32
step_base = 32 / sizeof(type); + size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0; + + for (size_t y = 0; y < size.height; ++y) + { + const type * src0 = internal::getRowPtr(src0Base, src0Stride, y); + const type * src1 = internal::getRowPtr(src1Base, src1Stride, y); + u8 * dst = internal::getRowPtr(dstBase, dstStride, y); + size_t x = 0; + + for( ; x < roiw_base; x += step_base ) + { + internal::prefetch(src0 + x); + internal::prefetch(src1 + x); + + vec128 v_src00 = internal::vld1q(src0 + x), v_src01 = internal::vld1q(src0 + x + 16 / sizeof(type)); + vec128 v_src10 = internal::vld1q(src1 + x), v_src11 = internal::vld1q(src1 + x + 16 / sizeof(type)); + uvec128 v_dst0; + uvec128 v_dst1; + + op(v_src00, v_src10, v_dst0); + op(v_src01, v_src11, v_dst1); + + vnst(dst + x, v_dst0, v_dst1); + } + + vtail<Op, sizeof(type)>::compare(src0, src1, dst, op, x, size.width); + + for (; x < size.width; ++x) + { + op(src0 + x, src1 + x, dst + x); + } + } +} + +template <typename T> +struct OpCmpEQ +{ + typedef T type; + + void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1, + typename internal::VecTraits<T>::unsign::vec128 & v_dst) const + { + v_dst = internal::vceqq(v_src0, v_src1); + } + + void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1, + typename internal::VecTraits<T>::unsign::vec64 & v_dst) const + { + v_dst = internal::vceq(v_src0, v_src1); + } + + void operator() (const T * src0, const T * src1, u8 * dst) const + { + dst[0] = src0[0] == src1[0] ? 255 : 0; + } +}; + +template <typename T> +struct OpCmpNE +{ + typedef T type; + + void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1, + typename internal::VecTraits<T>::unsign::vec128 & v_dst) const + { + v_dst = internal::vmvnq(internal::vceqq(v_src0, v_src1)); + } + + void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1, + typename internal::VecTraits<T>::unsign::vec64 & v_dst) const + { + v_dst = internal::vmvn(internal::vceq(v_src0, v_src1)); + } + + void operator() (const T * src0, const T * src1, u8 * dst) const + { + dst[0] = src0[0] == src1[0] ? 0 : 255; + } +}; + +template <typename T> +struct OpCmpGT +{ + typedef T type; + + void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1, + typename internal::VecTraits<T>::unsign::vec128 & v_dst) const + { + v_dst = internal::vcgtq(v_src0, v_src1); + } + + void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1, + typename internal::VecTraits<T>::unsign::vec64 & v_dst) const + { + v_dst = internal::vcgt(v_src0, v_src1); + } + + void operator() (const T * src0, const T * src1, u8 * dst) const + { + dst[0] = src0[0] > src1[0] ?
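+ // Note (added commentary, not part of the original patch): the
+ // scalar tails produce 255/0 so that they match the all-ones /
+ // all-zeros u8 lanes generated by the NEON compare intrinsics
+ // (vceq/vcgt/vcge) once the wider masks are narrowed by vnst.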
255 : 0; + } +}; + +template <typename T> +struct OpCmpGE +{ + typedef T type; + + void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1, + typename internal::VecTraits<T>::unsign::vec128 & v_dst) const + { + v_dst = internal::vcgeq(v_src0, v_src1); + } + + void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1, + typename internal::VecTraits<T>::unsign::vec64 & v_dst) const + { + v_dst = internal::vcge(v_src0, v_src1); + } + + void operator() (const T * src0, const T * src1, u8 * dst) const + { + dst[0] = src0[0] >= src1[0] ? 255 : 0; + } +}; + +} + +#define IMPL_CMPOP(op, type) \ +void cmp##op(const Size2D &size, \ + const type * src0Base, ptrdiff_t src0Stride, \ + const type * src1Base, ptrdiff_t src1Stride, \ + u8 *dstBase, ptrdiff_t dstStride) \ +{ \ + internal::assertSupportedConfiguration(); \ + vcompare(size, \ + src0Base, src0Stride, \ + src1Base, src1Stride, \ + dstBase, dstStride, \ + OpCmp##op<type>()); \ +} + +#else + +#define IMPL_CMPOP(op, type) \ +void cmp##op(const Size2D &size, \ + const type * src0Base, ptrdiff_t src0Stride, \ + const type * src1Base, ptrdiff_t src1Stride, \ + u8 *dstBase, ptrdiff_t dstStride) \ +{ \ + internal::assertSupportedConfiguration(); \ + (void)size; \ + (void)src0Base; \ + (void)src0Stride; \ + (void)src1Base; \ + (void)src1Stride; \ + (void)dstBase; \ + (void)dstStride; \ +} + +#endif + +IMPL_CMPOP(EQ, u8) +IMPL_CMPOP(EQ, s8) +IMPL_CMPOP(EQ, u16) +IMPL_CMPOP(EQ, s16) +IMPL_CMPOP(EQ, u32) +IMPL_CMPOP(EQ, s32) +IMPL_CMPOP(EQ, f32) + +IMPL_CMPOP(NE, u8) +IMPL_CMPOP(NE, s8) +IMPL_CMPOP(NE, u16) +IMPL_CMPOP(NE, s16) +IMPL_CMPOP(NE, u32) +IMPL_CMPOP(NE, s32) +IMPL_CMPOP(NE, f32) + +IMPL_CMPOP(GT, u8) +IMPL_CMPOP(GT, s8) +IMPL_CMPOP(GT, u16) +IMPL_CMPOP(GT, s16) +IMPL_CMPOP(GT, u32) +IMPL_CMPOP(GT, s32) +IMPL_CMPOP(GT, f32) + +IMPL_CMPOP(GE, u8) +IMPL_CMPOP(GE, s8) +IMPL_CMPOP(GE, u16) +IMPL_CMPOP(GE, s16) +IMPL_CMPOP(GE, u32) +IMPL_CMPOP(GE, s32) +IMPL_CMPOP(GE, f32) + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/colorconvert.cpp b/3rdparty/carotene/src/colorconvert.cpp new file mode 100644 index 0000000000..ea2db6043a --- /dev/null +++ b/3rdparty/carotene/src/colorconvert.cpp @@ -0,0 +1,2846 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission.
+ * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +#include "saturate_cast.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +enum +{ + SHIFT = 14, + SHIFT_DELTA = 1 << (SHIFT - 1), + + R2Y_BT601 = 4899, + G2Y_BT601 = 9617, + B2Y_BT601 = 1868, + + R2Y_BT709 = 3483, + G2Y_BT709 = 11718, + B2Y_BT709 = 1183, +}; + +inline uint8x8_t convertToGray(const uint16x8_t & v_r, + const uint16x8_t & v_g, + const uint16x8_t & v_b, + const uint16x4_t & v_r2y, + const uint16x4_t & v_g2y, + const uint16x4_t & v_b2y) +{ + uint32x4_t v_dst0 = vmull_u16(vget_low_u16(v_g), v_g2y); + uint32x4_t v_dst1 = vmull_u16(vget_high_u16(v_g), v_g2y); + + v_dst0 = vmlal_u16(v_dst0, vget_low_u16(v_r), v_r2y); + v_dst1 = vmlal_u16(v_dst1, vget_high_u16(v_r), v_r2y); + + v_dst0 = vmlal_u16(v_dst0, vget_low_u16(v_b), v_b2y); + v_dst1 = vmlal_u16(v_dst1, vget_high_u16(v_b), v_b2y); + + uint8x8_t v_gray = vqmovn_u16(vcombine_u16(vrshrn_n_u32(v_dst0, SHIFT), + vrshrn_n_u32(v_dst1, SHIFT))); + + return v_gray; +} + +} // namespace + +#endif + +void rgb2gray(const Size2D &size, COLOR_SPACE color_space, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709; + const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709; + const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y); + register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y); + register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y); +#else + uint16x4_t v_r2y = vdup_n_u16(R2Y), + v_g2y = vdup_n_u16(G2Y), + v_b2y = vdup_n_u16(B2Y); + + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; +#endif + size_t roiw8 = size.width >= 7 ? 
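+ // Note (added commentary, not part of the original patch): the
+ // conversion weights are Q14 fixed point and sum exactly to
+ // 1 << SHIFT (4899 + 9617 + 1868 = 16384 for BT.601 and
+ // 3483 + 11718 + 1183 = 16384 for BT.709), so
+ //   gray = (R*R2Y + G*G2Y + B*B2Y + SHIFT_DELTA) >> SHIFT
+ // is a rounded weighted average that cannot exceed 255 for u8 inputs.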
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + for (; dj < roiw8; sj += 24, dj += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld3.8 {d0-d2}, [%[in]] @RGB \n\t" + "vmovl.u8 q2, d0 @R (d4,d5) \n\t" + "vmovl.u8 q3, d1 @G (d6,d7) \n\t" + "vmovl.u8 q4, d2 @B (d8,d9) \n\t" + "vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t" + "vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t" + "vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t" + "vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t" + "vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t" + "vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t" + "vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t" + "vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t" + "vqmovn.u16 d4, q4 \n\t" + "vst1.8 {d4}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13" + ); + } +#else + for (; dj < roiw16; sj += 48, dj += 16) + { + internal::prefetch(src + sj); + uint8x16x3_t v_src0 = vld3q_u8(src + sj); + // 0 + uint16x8_t v_r = vmovl_u8(vget_low_u8(v_src0.val[0])), + v_g = vmovl_u8(vget_low_u8(v_src0.val[1])), + v_b = vmovl_u8(vget_low_u8(v_src0.val[2])); + uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + v_r = vmovl_u8(vget_high_u8(v_src0.val[0])), + v_g = vmovl_u8(vget_high_u8(v_src0.val[1])), + v_b = vmovl_u8(vget_high_u8(v_src0.val[2])); + uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1)); + } + + if (dj < roiw8) + { + uint8x8x3_t v_src = vld3_u8(src + sj); + uint16x8_t v_r = vmovl_u8(v_src.val[0]), + v_g = vmovl_u8(v_src.val[1]), + v_b = vmovl_u8(v_src.val[2]); + uint8x8_t v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + vst1_u8(dst + dj, v_gray); + sj += 24; dj += 8; + } +#endif + + for (; dj < size.width; sj += 3, dj++) + { + u32 val = src[sj] * R2Y + src[sj + 1] * G2Y + src[sj + 2] * B2Y; + dst[dj] = internal::saturate_cast<u8>((val + SHIFT_DELTA) >> SHIFT); + } + } +#else + (void)size; + (void)color_space; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgbx2gray(const Size2D &size, COLOR_SPACE color_space, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709; + const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709; + const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y); + register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y); + register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y); +#else + uint16x4_t v_r2y = vdup_n_u16(R2Y), + v_g2y = vdup_n_u16(G2Y), + v_b2y = vdup_n_u16(B2Y); + + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; +#endif + size_t roiw8 = size.width >= 7 ?
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + for (; dj < roiw8; sj += 32, dj += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld4.8 {d0-d3}, [%[in]] @RGBA \n\t" + "vmovl.u8 q2, d0 @R (d4,d5) \n\t" + "vmovl.u8 q3, d1 @G (d6,d7) \n\t" + "vmovl.u8 q4, d2 @B (d8,d9) \n\t" + "vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t" + "vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t" + "vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t" + "vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t" + "vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t" + "vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t" + "vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t" + "vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t" + "vqmovn.u16 d4, q4 \n\t" + "vst1.8 {d4}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13" + ); + } +#else + for (; dj < roiw16; sj += 64, dj += 16) + { + internal::prefetch(src + sj); + uint8x16x4_t v_src0 = vld4q_u8(src + sj); + + // 0 + uint16x8_t v_r = vmovl_u8(vget_low_u8(v_src0.val[0])), + v_g = vmovl_u8(vget_low_u8(v_src0.val[1])), + v_b = vmovl_u8(vget_low_u8(v_src0.val[2])); + uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + v_r = vmovl_u8(vget_high_u8(v_src0.val[0])), + v_g = vmovl_u8(vget_high_u8(v_src0.val[1])), + v_b = vmovl_u8(vget_high_u8(v_src0.val[2])); + uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1)); + } + + if (dj < roiw8) + { + uint8x8x4_t v_src = vld4_u8(src + sj); + uint16x8_t v_r = vmovl_u8(v_src.val[0]), + v_g = vmovl_u8(v_src.val[1]), + v_b = vmovl_u8(v_src.val[2]); + uint8x8_t v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + vst1_u8(dst + dj, v_gray); + sj += 32; dj += 8; + } +#endif + + for (; dj < size.width; sj += 4, dj++) + { + u32 val = src[sj] * R2Y + src[sj + 1] * G2Y + src[sj + 2] * B2Y; + dst[dj] = internal::saturate_cast<u8>((val + SHIFT_DELTA) >> SHIFT); + } + } +#else + (void)size; + (void)color_space; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void bgr2gray(const Size2D &size, COLOR_SPACE color_space, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709; + const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709; + const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y); + register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y); + register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y); +#else + uint16x4_t v_r2y = vdup_n_u16(R2Y), + v_g2y = vdup_n_u16(G2Y), + v_b2y = vdup_n_u16(B2Y); + + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; +#endif + size_t roiw8 = size.width >= 7 ?
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + for (; dj < roiw8; sj += 24, dj += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld3.8 {d0-d2}, [%[in]] @BGR \n\t" + "vmovl.u8 q2, d2 @R (d4,d5) \n\t" + "vmovl.u8 q3, d1 @G (d6,d7) \n\t" + "vmovl.u8 q4, d0 @B (d8,d9) \n\t" + "vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t" + "vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t" + "vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t" + "vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t" + "vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t" + "vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t" + "vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t" + "vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t" + "vqmovn.u16 d4, q4 \n\t" + "vst1.8 {d4}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13" + ); + } +#else + for (; dj < roiw16; sj += 48, dj += 16) + { + internal::prefetch(src + sj); + uint8x16x3_t v_src0 = vld3q_u8(src + sj); + + // 0 + uint16x8_t v_b = vmovl_u8(vget_low_u8(v_src0.val[0])), + v_g = vmovl_u8(vget_low_u8(v_src0.val[1])), + v_r = vmovl_u8(vget_low_u8(v_src0.val[2])); + uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + v_b = vmovl_u8(vget_high_u8(v_src0.val[0])), + v_g = vmovl_u8(vget_high_u8(v_src0.val[1])), + v_r = vmovl_u8(vget_high_u8(v_src0.val[2])); + uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1)); + } + + if (dj < roiw8) + { + uint8x8x3_t v_src = vld3_u8(src + sj); + uint16x8_t v_b = vmovl_u8(v_src.val[0]), + v_g = vmovl_u8(v_src.val[1]), + v_r = vmovl_u8(v_src.val[2]); + uint8x8_t v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + vst1_u8(dst + dj, v_gray); + sj += 24; dj += 8; + } +#endif + + for (; dj < size.width; sj += 3, dj++) + { + u32 val = src[sj] * B2Y + src[sj + 1] * G2Y + src[sj + 2] * R2Y; + dst[dj] = internal::saturate_cast<u8>((val + SHIFT_DELTA) >> SHIFT); + } + } +#else + (void)size; + (void)color_space; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void bgrx2gray(const Size2D &size, COLOR_SPACE color_space, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709; + const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709; + const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y); + register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y); + register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y); +#else + uint16x4_t v_r2y = vdup_n_u16(R2Y), + v_g2y = vdup_n_u16(G2Y), + v_b2y = vdup_n_u16(B2Y); + + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; +#endif + size_t roiw8 = size.width >= 7 ?
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + for (; dj < roiw8; sj += 32, dj += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld4.8 {d0-d3}, [%[in]] @BGRA \n\t" + "vmovl.u8 q2, d2 @R (d4,d5) \n\t" + "vmovl.u8 q3, d1 @G (d6,d7) \n\t" + "vmovl.u8 q4, d0 @B (d8,d9) \n\t" + "vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t" + "vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t" + "vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t" + "vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t" + "vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t" + "vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t" + "vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t" + "vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t" + "vqmovn.u16 d4, q4 \n\t" + "vst1.8 {d4}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13" + ); + } +#else + for (; dj < roiw16; sj += 64, dj += 16) + { + internal::prefetch(src + sj); + uint8x16x4_t v_src0 = vld4q_u8(src + sj); + + // 0 + uint16x8_t v_b = vmovl_u8(vget_low_u8(v_src0.val[0])), + v_g = vmovl_u8(vget_low_u8(v_src0.val[1])), + v_r = vmovl_u8(vget_low_u8(v_src0.val[2])); + uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + v_b = vmovl_u8(vget_high_u8(v_src0.val[0])), + v_g = vmovl_u8(vget_high_u8(v_src0.val[1])), + v_r = vmovl_u8(vget_high_u8(v_src0.val[2])); + uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1)); + } + + if (dj < roiw8) + { + uint8x8x4_t v_src = vld4_u8(src + sj); + uint16x8_t v_b = vmovl_u8(v_src.val[0]), + v_g = vmovl_u8(v_src.val[1]), + v_r = vmovl_u8(v_src.val[2]); + uint8x8_t v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y); + + vst1_u8(dst + dj, v_gray); + sj += 32; dj += 8; + } +#endif + + for (; dj < size.width; sj += 4, dj++) + { + u32 val = src[sj] * B2Y + src[sj + 1] * G2Y + src[sj + 2] * R2Y; + dst[dj] = internal::saturate_cast<u8>((val + SHIFT_DELTA) >> SHIFT); + } + } +#else + (void)size; + (void)color_space; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void gray2rgb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ?
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + + for (; sj < roiw16; sj += 16, dj += 48) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + __asm__ ( + "vld1.8 {d0-d1}, [%[in0]] \n\t" + "vmov.8 q1, q0 \n\t" + "vmov.8 q2, q0 \n\t" + "vmov.8 q3, q1 \n\t" + "vst3.8 {d2, d4, d6}, [%[out0]] \n\t" + "vst3.8 {d3, d5, d7}, [%[out1]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), [out1] "r" (dst + dj + 24), + [in0] "r" (src + sj) + : "d0","d1","d2","d3","d4","d5","d6","d7" + ); +#else + uint8x16x3_t vRgb1; + vRgb1.val[0] = vld1q_u8(src + sj); + + vRgb1.val[1] = vRgb1.val[0]; + vRgb1.val[2] = vRgb1.val[0]; + + vst3q_u8(dst + dj, vRgb1); +#endif + } + + if (sj < roiw8) + { +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + __asm__ ( + "vld1.8 {d0}, [%[in]] \n\t" + "vmov.8 d1, d0 \n\t" + "vmov.8 d2, d0 \n\t" + "vst3.8 {d0-d2}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + dj), [in] "r" (src + sj) + : "d0","d1","d2" + ); +#else + uint8x8x3_t vRgb2; + vRgb2.val[0] = vld1_u8(src + sj); + vRgb2.val[1] = vRgb2.val[0]; + vRgb2.val[2] = vRgb2.val[0]; + + vst3_u8(dst + dj, vRgb2); +#endif + sj += 8; dj += 24; + } + + for (; sj < size.width; sj++, dj += 3) + { + dst[dj+0] = src[sj]; + dst[dj+1] = src[sj]; + dst[dj+2] = src[sj]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void gray2rgbx(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + register uint8x16_t vc255 asm ("q4") = vmovq_n_u8(255); +#else + uint8x16x4_t vRgba; + uint8x8x4_t vRgba2; + vRgba.val[3] = vmovq_n_u8(255); + vRgba2.val[3] = vget_low_u8(vRgba.val[3]); +#endif + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u; + + for (; sj < roiw16; sj += 16, dj += 64) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + __asm__ ( + "vld1.8 {d0-d1}, [%[in0]] \n\t" + "vmov.8 q1, q0 \n\t" + "vmov.8 q2, q0 \n\t" + "vmov.8 q3, q1 \n\t" + "vst4.8 {d2, d4, d6, d8}, [%[out0]] \n\t" + "vst4.8 {d3, d5, d7, d9}, [%[out1]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), [out1] "r" (dst + dj + 32), + [in0] "r" (src + sj), + "w" (vc255) + : "d0","d1","d2","d3","d4","d5","d6","d7" + ); +#else + vRgba.val[0] = vld1q_u8(src + sj); + + vRgba.val[1] = vRgba.val[0]; + vRgba.val[2] = vRgba.val[0]; + + vst4q_u8(dst + dj, vRgba); +#endif + } + + if (sj < roiw8) + { +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + __asm__ ( + "vld1.8 {d5}, [%[in]] \n\t" + "vmov.8 d6, d5 \n\t" + "vmov.8 d7, d5 \n\t" + "vst4.8 {d5-d8}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + dj), [in] "r" (src + sj), "w" (vc255) + : "d5","d6","d7" + ); +#else + vRgba2.val[0] = vld1_u8(src + sj); + vRgba2.val[1] = vRgba2.val[0]; + vRgba2.val[2] = vRgba2.val[0]; + + vst4_u8(dst + dj, vRgba2); +#endif + sj += 8; dj += 32; + } + + for (; sj < size.width; sj++, dj += 4) + { + dst[dj+0] = src[sj]; + dst[dj+1] = src[sj]; + dst[dj+2] = src[sj]; + dst[dj+3] = 255; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgb2rgbx(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; +#if defined(__GNUC__) && defined(__arm__) + register uint8x8_t vc255_0 asm ("d3") = vmov_n_u8(255); +#else + size_t roiw16 = size.width >= 15 ? 
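+ // Note (added commentary, not part of the original patch): the unions
+ // below pun uint8x16x4_t / uint8x16x3_t so the alpha lane can be set
+ // once outside the loop (val[3] = 255); each iteration then performs a
+ // 3-channel deinterleaving load into v3 and a 4-channel store from v4,
+ // appending alpha without any per-pixel shuffling.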
size.width - 15 : 0; + union { uint8x16x4_t v4; uint8x16x3_t v3; } v_dst0; + v_dst0.v4.val[3] = vdupq_n_u8(255); + union { uint8x8x4_t v4; uint8x8x3_t v3; } v_dst; + v_dst.v4.val[3] = vdup_n_u8(255); +#endif + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + +#if defined(__GNUC__) && defined(__arm__) + for (; j < roiw8; sj += 24, dj += 32, j += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld3.8 {d0, d1, d2}, [%[in0]] \n\t" + "vst4.8 {d0, d1, d2, d3}, [%[out0]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [in0] "r" (src + sj), + "w" (vc255_0) + : "d0","d1","d2" + ); + } +#else + for (; j < roiw16; sj += 48, dj += 64, j += 16) + { + internal::prefetch(src + sj); + v_dst0.v3 = vld3q_u8(src + sj); + vst4q_u8(dst + dj, v_dst0.v4); + } + + if (j < roiw8) + { + v_dst.v3 = vld3_u8(src + sj); + vst4_u8(dst + dj, v_dst.v4); + sj += 24; dj += 32; j += 8; + } +#endif + + for (; j < size.width; ++j, sj += 3, dj += 4) + { + dst[dj] = src[sj]; + dst[dj + 1] = src[sj + 1]; + dst[dj + 2] = src[sj + 2]; + dst[dj + 3] = 255; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgbx2rgb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; +#if !defined(__GNUC__) || !defined(__arm__) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + union { uint8x16x4_t v4; uint8x16x3_t v3; } v_dst0; + union { uint8x8x4_t v4; uint8x8x3_t v3; } v_dst; +#endif + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + +#if defined(__GNUC__) && defined(__arm__) + for (; j < roiw8; sj += 32, dj += 24, j += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld4.8 {d0, d1, d2, d3}, [%[in0]] \n\t" + "vst3.8 {d0, d1, d2}, [%[out0]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [in0] "r" (src + sj) + : "d0","d1","d2","d3" + ); + } +#else + for (; j < roiw16; sj += 64, dj += 48, j += 16) + { + internal::prefetch(src + sj); + v_dst0.v4 = vld4q_u8(src + sj); + vst3q_u8(dst + dj, v_dst0.v3); + } + + if (j < roiw8) + { + v_dst.v4 = vld4_u8(src + sj); + vst3_u8(dst + dj, v_dst.v3); + sj += 32; dj += 24; j += 8; + } +#endif + + for (; j < size.width; ++j, sj += 4, dj += 3) + { + dst[dj] = src[sj]; + dst[dj + 1] = src[sj + 1]; + dst[dj + 2] = src[sj + 2]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgb2bgr(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#if !defined(__GNUC__) || !defined(__arm__) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; +#endif + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + +#if defined(__GNUC__) && defined(__arm__) + for (; j < roiw8; sj += 24, dj += 24, j += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld3.8 {d0, d1, d2}, [%[in0]] \n\t" + "vswp d0, d2 \n\t" + "vst3.8 {d0, d1, d2}, [%[out0]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [in0] "r" (src + sj) + : "d0","d1","d2" + ); + } +#else + for (; j < roiw16; sj += 48, dj += 48, j += 16) + { + internal::prefetch(src + sj); + uint8x16x3_t vals0 = vld3q_u8(src + sj); + + std::swap(vals0.val[0], vals0.val[2]); + + vst3q_u8(dst + dj, vals0); + } + + if (j < roiw8) + { + uint8x8x3_t vals = vld3_u8(src + sj); + std::swap(vals.val[0], vals.val[2]); + vst3_u8(dst + dj, vals); + sj += 24; dj += 24; j += 8; + } +#endif + + for (; j < size.width; ++j, sj += 3, dj += 3) + { + u8 b = src[sj + 2];//Handle src == dst case + dst[dj + 2] = src[sj ]; + dst[dj + 1] = src[sj + 1]; + dst[dj ] = b; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgbx2bgrx(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#if !defined(__GNUC__) || !defined(__arm__) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; +#endif + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + +#if defined(__GNUC__) && defined(__arm__) + for (; j < roiw8; sj += 32, dj += 32, j += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld4.8 {d0, d1, d2, d3}, [%[in0]] \n\t" + "vswp d0, d2 \n\t" + "vst4.8 {d0, d1, d2, d3}, [%[out0]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [in0] "r" (src + sj) + : "d0","d1","d2","d3" + ); + } +#else + for (; j < roiw16; sj += 64, dj += 64, j += 16) + { + internal::prefetch(src + sj); + uint8x16x4_t vals0 = vld4q_u8(src + sj); + + std::swap(vals0.val[0], vals0.val[2]); + + vst4q_u8(dst + dj, vals0); + } + + if (j < roiw8) + { + uint8x8x4_t vals = vld4_u8(src + sj); + std::swap(vals.val[0], vals.val[2]); + vst4_u8(dst + dj, vals); + sj += 32; dj += 32; j += 8; + } +#endif + + for (; j < size.width; ++j, sj += 4, dj += 4) + { + u8 b = src[sj + 2];//Handle src == dst case + dst[dj + 2] = src[sj ]; + dst[dj + 1] = src[sj + 1]; + dst[dj ] = b; + dst[dj + 3] = src[sj + 3]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgbx2bgr(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#if !defined(__GNUC__) || !defined(__arm__) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; +#endif + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + +#if defined(__GNUC__) && defined(__arm__) + for (; j < roiw8; sj += 32, dj += 24, j += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld4.8 {d0, d1, d2, d3}, [%[in0]] \n\t" + "vswp d0, d2 \n\t" + "vst3.8 {d0, d1, d2}, [%[out0]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [in0] "r" (src + sj) + : "d0","d1","d2","d3" + ); + } +#else + for (; j < roiw16; sj += 64, dj += 48, j += 16) + { + internal::prefetch(src + sj); + union { uint8x16x4_t v4; uint8x16x3_t v3; } vals0; + vals0.v4 = vld4q_u8(src + sj); + std::swap(vals0.v3.val[0], vals0.v3.val[2]); + vst3q_u8(dst + dj, vals0.v3); + } + + if (j < roiw8) + { + union { uint8x8x4_t v4; uint8x8x3_t v3; } vals; + vals.v4 = vld4_u8(src + sj); + std::swap(vals.v3.val[0], vals.v3.val[2]); + vst3_u8(dst + dj, vals.v3); + sj += 32; dj += 24; j += 8; + } +#endif + + for (; j < size.width; ++j, sj += 4, dj += 3) + { + dst[dj + 2] = src[sj ]; + dst[dj + 1] = src[sj + 1]; + dst[dj ] = src[sj + 2]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgb2bgrx(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON +#if defined(__GNUC__) && defined(__arm__) + register uint8x8_t vc255 asm ("d3") = vmov_n_u8(255); +#else + union { uint8x16x4_t v4; uint8x16x3_t v3; } vals0; + vals0.v4.val[3] = vmovq_n_u8(255); + union { uint8x8x4_t v4; uint8x8x3_t v3; } vals8; + vals8.v4.val[3] = vmov_n_u8(255); +#endif + +#if !defined(__GNUC__) || !defined(__arm__) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; +#endif + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + +#if defined(__GNUC__) && defined(__arm__) + for (; j < roiw8; sj += 24, dj += 32, j += 8) + { + internal::prefetch(src + sj); + __asm__ ( + "vld3.8 {d0, d1, d2}, [%[in0]] \n\t" + "vswp d0, d2 \n\t" + "vst4.8 {d0, d1, d2, d3}, [%[out0]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [in0] "r" (src + sj), + "w" (vc255) + : "d0","d1","d2" + ); + } +#else + for (; j < roiw16; sj += 48, dj += 64, j += 16) + { + internal::prefetch(src + sj); + vals0.v3 = vld3q_u8(src + sj); + std::swap(vals0.v4.val[0], vals0.v4.val[2]); + vst4q_u8(dst + dj, vals0.v4); + } + + if (j < roiw8) + { + vals8.v3 = vld3_u8(src + sj); + std::swap(vals8.v4.val[0], vals8.v4.val[2]); + vst4_u8(dst + dj, vals8.v4); + sj += 24; dj += 32; j += 8; + } +#endif + + for (; j < size.width; ++j, sj += 3, dj += 4) + { + dst[dj + 3] = 255; + dst[dj + 2] = src[sj ]; + dst[dj + 1] = src[sj + 1]; + dst[dj ] = src[sj + 2]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +namespace { + +#ifdef CAROTENE_NEON +inline uint8x8x3_t convertToHSV(const uint8x8_t vR, const uint8x8_t vG, const uint8x8_t vB, + const s32 hrange ) +{ + const s32 hsv_shift = 12; + register const f32 vsdiv_table = f32(255 << hsv_shift); + register f32 vhdiv_table = f32(hrange << hsv_shift); + register const s32 vhrange = hrange; + register const s32 v0 = s32(0); + register const s32 vshift = s32(1 << (hsv_shift-1)); + register const s32 v6 = s32(6); + + uint8x8_t vMin = vmin_u8(vR, vG); + uint8x8_t vMax = vmax_u8(vR, vG); + + uint16x8_t vR_u16 = vmovl_u8(vR); + uint16x8_t vG_u16 = vmovl_u8(vG); + + vMax = vmax_u8(vMax, vB); + vMin = vmin_u8(vMin, vB); + uint16x8_t vB_u16 = vmovl_u8(vB); + + uint16x8_t vDiff = vsubl_u8(vMax, vMin); + + uint16x8_t vV = vmovl_u8(vMax); + uint16x8_t vDiffx2 = vaddq_u16(vDiff, vDiff); + uint32x4_t vDiffL = vmovl_u16(vget_low_u16(vDiff)); + uint32x4_t vDiffH = vmovl_u16(vget_high_u16(vDiff)); + + uint16x8_t vVEqR = vceqq_u16(vR_u16, vV); + uint16x8_t vVEqG = vceqq_u16(vG_u16, vV); + + int16x8_t vG_B = vsubq_s16(vreinterpretq_s16_u16(vG_u16), vreinterpretq_s16_u16(vB_u16)); + uint16x8_t vInvR = vmvnq_u16(vVEqR); + int16x8_t vB_R = vsubq_s16(vreinterpretq_s16_u16(vB_u16), vreinterpretq_s16_u16(vR_u16)); + int16x8_t vR_G = vsubq_s16(vreinterpretq_s16_u16(vR_u16), vreinterpretq_s16_u16(vG_u16)); + + uint16x8_t vMask2 = vandq_u16(vVEqG, vInvR); + vR_u16 = vandq_u16(vreinterpretq_u16_s16(vG_B), vVEqR); + int16x8_t vH2 = vaddq_s16(vB_R, vreinterpretq_s16_u16(vDiffx2)); + + vVEqR = vmvnq_u16(vVEqG); + vB_R = vaddq_s16(vreinterpretq_s16_u16(vDiffx2), vreinterpretq_s16_u16(vDiffx2)); + vG_B = vandq_s16(vreinterpretq_s16_u16(vInvR), vreinterpretq_s16_u16(vVEqR)); + vInvR = vandq_u16(vreinterpretq_u16_s16(vH2), vMask2); + vR_G = vaddq_s16(vR_G, vB_R); + int16x8_t vH = vaddq_s16(vreinterpretq_s16_u16(vR_u16), vreinterpretq_s16_u16(vInvR)); + + uint32x4_t vV_L = vmovl_u16(vget_low_u16(vV)); + vR_G = vandq_s16(vR_G, vG_B); + uint32x4_t vV_H = vmovl_u16(vget_high_u16(vV)); + int16x8_t vDiff4 = vaddq_s16(vH, vR_G); + + int32x4_t vc6 = vdupq_n_s32(v6); + uint32x4_t vLine1 = vmulq_u32(vDiffL, vreinterpretq_u32_s32(vc6)); + uint32x4_t vLine2 = vmulq_u32(vDiffH, vreinterpretq_u32_s32(vc6)); + + float32x4_t vF1 = vcvtq_f32_u32(vV_L); + float32x4_t vF2 = 
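+ // Note (added commentary, not part of the original patch): NEON has no
+ // vector divide, so the saturation and hue divisions here are carried
+ // out with vrecpe (a coarse 1/x estimate) refined by one vrecps
+ // Newton-Raphson step (vrecps(e, x) = 2 - e*x, refined = e * vrecps),
+ // after which the results are scaled by the fixed-point div tables.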
vcvtq_f32_u32(vV_H); + float32x4_t vHF1 = vcvtq_f32_u32(vLine1); + float32x4_t vHF2 = vcvtq_f32_u32(vLine2); + + float32x4_t vXInv1 = vrecpeq_f32(vF1); + float32x4_t vXInv2 = vrecpeq_f32(vF2); + float32x4_t vXInv3 = vrecpeq_f32(vHF1); + float32x4_t vXInv4 = vrecpeq_f32(vHF2); + + float32x4_t vSt1 = vrecpsq_f32(vXInv1, vF1); + float32x4_t vSt2 = vrecpsq_f32(vXInv2, vF2); + float32x4_t vSt3 = vrecpsq_f32(vXInv3, vHF1); + float32x4_t vSt4 = vrecpsq_f32(vXInv4, vHF2); + + vF1 = vmulq_f32(vXInv1, vSt1); + vF2 = vmulq_f32(vXInv2, vSt2); + vHF1 = vmulq_f32(vXInv3, vSt3); + vHF2 = vmulq_f32(vXInv4, vSt4); + + float32x4_t vDivTab = vdupq_n_f32(vsdiv_table); + vSt1 = vmulq_f32(vF1, vDivTab); + vSt2 = vmulq_f32(vF2, vDivTab); + vDivTab = vdupq_n_f32(vhdiv_table); + vSt3 = vmulq_f32(vHF1, vDivTab); + vSt4 = vmulq_f32(vHF2, vDivTab); + + float32x4_t bias = vdupq_n_f32(0.5f); + + vSt1 = vaddq_f32(vSt1, bias); + vSt2 = vaddq_f32(vSt2, bias); + vSt3 = vaddq_f32(vSt3, bias); + vSt4 = vaddq_f32(vSt4, bias); + + uint32x4_t vRes1 = vcvtq_u32_f32(vSt1); + uint32x4_t vRes2 = vcvtq_u32_f32(vSt2); + uint32x4_t vRes3 = vcvtq_u32_f32(vSt3); + uint32x4_t vRes4 = vcvtq_u32_f32(vSt4); + + int32x4_t vH_L = vmovl_s16(vget_low_s16(vDiff4)); + int32x4_t vH_H = vmovl_s16(vget_high_s16(vDiff4)); + + uint32x4_t vDiff_Res1 = vmulq_u32(vDiffL, vRes1); + uint32x4_t vDiff_Res2 = vmulq_u32(vDiffH, vRes2); + uint32x4_t vDiff_Res3 = vmulq_u32(vreinterpretq_u32_s32(vH_L), vRes3); + uint32x4_t vDiff_Res4 = vmulq_u32(vreinterpretq_u32_s32(vH_H), vRes4); + + int32x4_t vShift = vdupq_n_s32(vshift); + uint32x4_t vAddRes1 = vaddq_u32(vDiff_Res1, vreinterpretq_u32_s32(vShift)); + uint32x4_t vAddRes2 = vaddq_u32(vDiff_Res2, vreinterpretq_u32_s32(vShift)); + uint32x4_t vAddRes3 = vaddq_u32(vDiff_Res3, vreinterpretq_u32_s32(vShift)); + uint32x4_t vAddRes4 = vaddq_u32(vDiff_Res4, vreinterpretq_u32_s32(vShift)); + int16x4_t vShrRes1 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes1), 8); + int16x4_t vShrRes2 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes2), 8); + int16x4_t vShrRes3 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes3), 8); + int16x4_t vShrRes4 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes4), 8); + + int16x8_t vc0 = vdupq_n_s16((s16)v0); + int8x8_t vShrRes1_s8 = vshrn_n_s16(vcombine_s16(vShrRes1, vShrRes2), 4); + uint16x8_t vCltRes_u16 = vcltq_s16(vcombine_s16(vShrRes3, vShrRes4), vc0); + int8x8_t vShrRes2_s8 = vshrn_n_s16(vcombine_s16(vShrRes3, vShrRes4), 4); + + int8x8_t vCltRes_s8 = vmovn_s16(vreinterpretq_s16_u16(vCltRes_u16)); + int8x8_t vcHRange = vdup_n_s8((s8)vhrange); + uint8x8_t vHResAdd = vand_u8(vreinterpret_u8_s8(vCltRes_s8), vreinterpret_u8_s8(vcHRange)); + int8x8_t vHRes = vadd_s8(vShrRes2_s8, vreinterpret_s8_u8(vHResAdd)); + + uint8x8x3_t vHsv; + vHsv.val[0] = vreinterpret_u8_s8(vHRes); + vHsv.val[1] = vreinterpret_u8_s8(vShrRes1_s8); + vHsv.val[2] = vMax; + + return vHsv; +} + +const u8 fastSaturate8u[] = +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, + 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255 +}; + +inline void convertToHSV(const s32 r, const s32 g, const s32 b, + const s32 &hrange, const s32 &hsv_shift, + u8* dst) +{ + s32 h, s, v = b; + s32 vmin = b, diff; + s32 vr, vg; + + v += fastSaturate8u[g-v+256]; + v += fastSaturate8u[r-v+256]; + vmin -= fastSaturate8u[vmin-g+256]; + vmin -= fastSaturate8u[vmin-r+256]; + + diff = v - vmin; + vr = v == r ? -1 : 0; + vg = v == g ? -1 : 0; + + s = (s32(diff * (255 << hsv_shift) * (1.0f/(f32)v)) + (1 << (hsv_shift-1))) >> hsv_shift; + h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff)))); + h = ((h * s32((hrange << hsv_shift)/(6.f*diff) + 0.5)) + (1 << (hsv_shift-1))) >> hsv_shift; + h += h < 0 ? 
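+ /*
+  * Fixed-point layout: hsv_shift = 12 and adding (1 << (hsv_shift-1))
+  * before the shift rounds to nearest.  Worked example for pure green
+  * (r = 0, g = 255, b = 0, hrange = 180):
+  *     v = 255, vmin = 0, diff = 255, vr = 0, vg = -1
+  *     s = 255 (fully saturated)
+  *     h = b - r + 2*diff = 510, then (510 * 482 + 2048) >> 12 = 60,
+  * where 482 ~= (180 << 12) / (6 * 255) + 0.5 -- the conventional H = 60
+  * for green at hrange 180.
+  */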
hrange : 0; + + dst[0] = internal::saturate_cast(h); + dst[1] = (u8)s; + dst[2] = (u8)v; +} + +#define CONVERT_TO_HSV_ASM(loadop, rreg, breg) \ + __asm__ ( \ + #loadop ", [%[in]] @RGB \n\t" \ + "vmin.u8 d3, d0, d1 @VMin (d3) \n\t" \ + "vmax.u8 d6, d0, d1 @V (d6) \n\t" \ + "vmovl.u8 q2, " #rreg " @V16_R (d4,d5) \n\t" \ + "vmovl.u8 q4, d1 @V16_G (d8,d9) \n\t" \ + "vmax.u8 d6, d6, d2 \n\t" \ + "vmin.u8 d3, d3, d2 \n\t" \ + "vmovl.u8 q0, " #breg " @V16_B (d0,d1) \n\t" \ + "vsubl.u8 q8, d6, d3 @V16_Diff (d16,d17) \n\t" \ + \ + "vmovl.u8 q5, d6 @V16_V (d10,d11) \n\t" \ + "vadd.s16 q10, q8, q8 @V16_Diff_2 (d20,d21) \n\t" \ + "vmovl.u16 q9, d16 @V32_Diff_L (d18,d19) \n\t" \ + "vmovl.u16 q11, d17 @V32_Diff_H (d22,d23) \n\t" \ + "vceq.u16 q12, q2, q5 @V==R(d24,d25) \n\t" \ + "vceq.u16 q13, q4, q5 @V==G(d26,d27) \n\t" \ + \ + "vsub.s16 q8, q4, q0 @V16_G-B (d16,d17) \n\t" \ + "vmvn.u16 q15, q12 @V16~R \n\t" \ + "vsub.s16 q6, q0, q2 @V16_B-R (d12,d13) \n\t" \ + "vsub.s16 q7, q2, q4 @V16_R-G (d14,d15) \n\t" \ + "vand.u16 q1, q13, q15 @VMask2 \n\t" \ + "vand.u16 q2, q8, q12 @V16_H(d4,d5) \n\t" \ + "vadd.s16 q4, q6, q10 @V16_H2 \n\t" \ + "vmvn.u16 q12, q13 @V16~G \n\t" \ + "vadd.s16 q6, q10, q10 @VDiff16_4 (d12,d13) \n\t" \ + "vand.u16 q8, q15, q12 @VMask3 \n\t" \ + "vand.u16 q15, q4, q1 @vH2(d30,d31) \n\t" \ + "vadd.s16 q7, q7, q6 @V16_H3 (d14,d15) \n\t" \ + "vadd.s16 q14, q2, q15 @vH16 \n\t" \ + "vmovl.u16 q12, d10 @V32_V_L \n\t" \ + "vand.s16 q7, q7, q8 @vH16 \n\t" \ + "vmovl.u16 q13, d11 @V32_V_H \n\t" \ + "vadd.s16 q2, q14, q7 @V16_Diff_4 \n\t" \ + \ + "vdup.32 q4, %[v6] \n\t" \ + "vmul.u32 q14, q9, q4 \n\t" \ + "vmul.u32 q15, q11, q4 \n\t" \ + "vcvt.f32.u32 q4, q12 @VF1 (d8,d9) \n\t" \ + "vcvt.f32.u32 q8, q13 @VF2 \n\t" \ + "vcvt.f32.u32 q0, q14 @HF1 \n\t" \ + "vcvt.f32.u32 q1, q15 @HF2 \n\t" \ + "vrecpe.f32 q12, q4 @Vxinv \n\t" \ + "vrecpe.f32 q13, q8 @Vxinv \n\t" \ + "vrecpe.f32 q5, q0 @Vxinv \n\t" \ + "vrecpe.f32 q7, q1 @Vxinv \n\t" \ + "vrecps.f32 q14, q12, q4 @Vst1 \n\t" \ + "vrecps.f32 q15, q13, q8 @Vst1 \n\t" \ + "vrecps.f32 q10, q5, q0 @Vst1 \n\t" \ + "vrecps.f32 q6, q7, q1 @Vst1 \n\t" \ + "vmul.f32 q4, q12, q14 \n\t" \ + "vmul.f32 q8, q13, q15 \n\t" \ + "vmul.f32 q0, q5, q10 \n\t" \ + "vmul.f32 q1, q7, q6 \n\t" \ + "vdup.32 q12, %[vsdiv_table] \n\t" \ + "vmul.f32 q14, q4, q12 \n\t" \ + "vmul.f32 q15, q8, q12 \n\t" \ + "vdup.32 q12, %[vhdiv_table] \n\t" \ + "vmul.f32 q10, q0, q12 \n\t" \ + "vmul.f32 q6, q1, q12 \n\t" \ + \ + "vdup.32 q12, %[bias] \n\t" \ + \ + "vadd.f32 q7, q14, q12 \n\t" \ + "vadd.f32 q13, q15, q12 \n\t" \ + "vcvt.u32.f32 q4, q7 \n\t" \ + "vcvt.u32.f32 q8, q13 \n\t" \ + \ + "vadd.f32 q14, q10, q12 \n\t" \ + "vadd.f32 q7, q6, q12 \n\t" \ + "vcvt.u32.f32 q0, q14 \n\t" \ + "vcvt.u32.f32 q1, q7 @Vres \n\t" \ + \ + "vmovl.s16 q7, d4 @V32_H_L (d14,d15) \n\t" \ + "vmovl.s16 q5, d5 @V32_H_H (d10,d11) \n\t" \ + "vmul.u32 q14, q9, q4 \n\t" \ + "vmul.u32 q15, q11, q8 \n\t" \ + "vmul.u32 q10, q7, q0 \n\t" \ + "vmul.u32 q6, q5, q1 \n\t" \ + \ + "vdup.32 q12, %[vshift] \n\t" \ + "vadd.u32 q13, q14, q12 \n\t" \ + "vadd.u32 q8, q15, q12 \n\t" \ + "vadd.u32 q0, q10, q12 \n\t" \ + "vadd.u32 q1, q6, q12 \n\t" \ + "vshrn.s32 d8, q13, #8 \n\t" \ + "vshrn.s32 d9, q8, #8 \n\t" \ + "vshrn.s32 d10, q0, #8 \n\t" \ + "vshrn.s32 d11, q1, #8 \n\t" \ + \ + "vdup.16 q8, %[v0] \n\t" \ + "vshrn.s16 d5, q4, #4 \n\t" \ + "vclt.s16 q9, q5, q8 \n\t" \ + "vshrn.s16 d4, q5, #4 \n\t" \ + \ + "vmovn.s16 d9, q9 \n\t" \ + "vdup.8 d7, %[vhrange] \n\t" \ + "vand.u8 d10, d9, d7 \n\t" \ + "vadd.s8 d4, d4, d10 \n\t" \ + 
"vst3.8 {d4-d6}, [%[out]] @HSV \n\t" \ + : /*no output*/ \ + : [out] "r" (dst + dj), [in] "r" (src + sj), \ + [vsdiv_table] "r" (vsdiv_table), \ + [vshift] "r" (vshift), \ + [vhdiv_table] "r" (vhdiv_table), \ + [v6] "r" (v6), [vhrange] "r" (vhrange), \ + [v0] "r" (v0), [bias] "r" (bias) \ + : "d0","d1","d2","d3","d4","d5","d6","d7", \ + "d8","d9","d10","d11","d12","d13","d14","d15", \ + "d16","d17","d18","d19","d20","d21","d22","d23", \ + "d24","d25","d26","d27","d28","d29","d30","d31" \ + ); + +#if __GNUC_MINOR__ < 7 + +#define YCRCB_CONSTS \ + register int16x4_t vcYR asm ("d31") = vmov_n_s16(4899); \ + register int16x4_t vcYG asm ("d30") = vmov_n_s16(9617); \ + register int16x4_t vcYB asm ("d29") = vmov_n_s16(1868); \ + register int16x4_t vcCrG asm ("d28") = vmov_n_s16(6860); \ + register int16x4_t vcCrB asm ("d27") = vmov_n_s16(1332); \ + register int16x4_t vcCbR asm ("d26") = vmov_n_s16(2765); \ + register int16x4_t vcCbG asm ("d25") = vmov_n_s16(5427); + +#else + +#define YCRCB_CONSTS \ + const s16 convertCoeffs[] = { 4899, 4899, 4899, 4899, \ + 9617, 9617, 9617, 9617, \ + 1868, 1868, 1868, 1868, \ + 6860, 6860, 6860, 6860, \ + 1332, 1332, 1332, 1332, \ + 2765, 2765, 2765, 2765, \ + 5427, 5427, 5427, 5427 }; \ + const int16x8_t vcYRG = vld1q_s16(convertCoeffs); /*YR and YG*/ \ + const int16x4_t vcYB = vld1_s16(convertCoeffs + 8); /*YB*/ \ + const int16x8_t vcCrGB = vld1q_s16(convertCoeffs + 12); /*CrG and CrB*/ \ + const int16x8_t vcCbRG = vld1q_s16(convertCoeffs + 20); /*CbR and CbG*/ + +#endif + +#define CONVERTTOYCRCB(loadcmd, rreg, greg, breg) \ + __asm__ ( \ + #loadcmd ", [%[in]] @RGB \n\t" \ + "vmovl.u8 q2, " #rreg " @R (d4,d5) \n\t" \ + "vmovl.u8 q3, " #greg " @G (d6,d7) \n\t" \ + "vmovl.u8 q4, " #breg " @B (d8,d9) \n\t" \ + \ + "vshll.u16 q7, d4, #13 @Cr(q7,q8): R \n\t" \ + "vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t" \ + "vshll.u16 q9, d8, #13 @Cb(q9,q10): B \n\t" \ + "vshll.u16 q8, d5, #13 @Cr(q7,q8): R \n\t" \ + "vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t" \ + "vshll.u16 q10, d9, #13 @Cb(q9,q10): B \n\t" \ + \ + "vmlsl.s16 q7, d6, d28 @Cr(q7,q8): RG \n\t" \ + "vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t" \ + "vmlsl.s16 q9, d4, d26 @Cb(q9,q10): BR \n\t" \ + "vmlsl.s16 q8, d7, d28 @Cr(q7,q8): RG \n\t" \ + "vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t" \ + "vmlsl.s16 q10, d5, d26 @Cb(q9,q10): BR \n\t" \ + \ + "vmlsl.s16 q7, d8, d27 @Cr(q7,q8): RGB \n\t" \ + "vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t" \ + "vmlsl.s16 q9, d6, d25 @Cb(q9,q10): BRG \n\t" \ + "vmlsl.s16 q8, d9, d27 @Cr(q7,q8): RGB \n\t" \ + "vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t" \ + "vmlsl.s16 q10, d7, d25 @Cb(q9,q10): BRG \n\t" \ + \ + "vrshrn.s32 d4, q7, #14 @Cr -> q2 \n\t" \ + "vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t" \ + "vrshrn.s32 d6, q9, #14 @Cb -> q3 \n\t" \ + "vrshrn.s32 d5, q8, #14 @Cr -> q2 \n\t" \ + "vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t" \ + "vrshrn.s32 d7, q10, #14 @Cb -> q3 \n\t" \ + \ + "vmov.s16 q5, #128 \n\t" \ + "vmov.s16 q6, #128 \n\t" \ + "vadd.i16 q5, q2 @Cr -> q5 \n\t" \ + "vadd.i16 q6, q3 @Cb -> q6 \n\t" \ + \ + "vqmovn.u16 d4, q4 \n\t" \ + "vqmovun.s16 d5, q5 \n\t" \ + "vqmovun.s16 d6, q6 \n\t" \ + \ + "vst3.8 {d4-d6}, [%[out]] \n\t" \ + : /*no output*/ \ + : [out] "r" (dst + dj), [in] "r" (src + sj), \ + "w" (vcYR), "w" (vcYG), "w" (vcYB), \ + "w" (vcCrB), "w" (vcCrG), "w" (vcCbG), "w" (vcCbR) \ + : "d0","d1","d2","d3","d4","d5","d6","d7", \ + "d8","d9","d10","d11","d12","d13","d14","d15", \ + "d16","d17","d18","d19","d20","d21" \ + ); + + +inline uint8x8x3_t convertToYCrCb( const int16x8_t& vR, 
const int16x8_t& vG, const int16x8_t& vB, + const int16x8_t& vcYRG, const int16x4_t& vcYB, + const int16x8_t& vcCrGB, const int16x8_t& vcCbRG ) +{ + int32x4_t vCrL = vshll_n_s16(vget_low_s16(vR), 13); // R + int32x4_t vCrH = vshll_n_s16(vget_high_s16(vR), 13); // R + int32x4_t vYL = vmull_s16(vget_low_s16(vG), vget_high_s16(vcYRG)); // G + int32x4_t vYH = vmull_s16(vget_high_s16(vG), vget_high_s16(vcYRG)); // G + int32x4_t vCbL = vshll_n_s16(vget_low_s16(vB), 13); // B + int32x4_t vCbH = vshll_n_s16(vget_high_s16(vB), 13); // B + + vCrL = vmlsl_s16(vCrL, vget_low_s16(vG), vget_low_s16(vcCrGB)); // RG + vCrH = vmlsl_s16(vCrH, vget_high_s16(vG), vget_low_s16(vcCrGB)); // RG + vYL = vmlal_s16(vYL, vget_low_s16(vB), vcYB); // GB + vYH = vmlal_s16(vYH, vget_high_s16(vB), vcYB); // GB + vCbL = vmlsl_s16(vCbL, vget_low_s16(vR), vget_low_s16(vcCbRG)); // BR + vCbH = vmlsl_s16(vCbH, vget_high_s16(vR), vget_low_s16(vcCbRG)); // BR + + vCrL = vmlsl_s16(vCrL, vget_low_s16(vB), vget_high_s16(vcCrGB)); // RGB + vCrH = vmlsl_s16(vCrH, vget_high_s16(vB), vget_high_s16(vcCrGB)); // RGB + vYL = vmlal_s16(vYL, vget_low_s16(vR), vget_low_s16(vcYRG)); // GBR + vYH = vmlal_s16(vYH, vget_high_s16(vR), vget_low_s16(vcYRG)); // GBR + vCbL = vmlsl_s16(vCbL, vget_low_s16(vG), vget_high_s16(vcCbRG)); // BRG + vCbH = vmlsl_s16(vCbH, vget_high_s16(vG), vget_high_s16(vcCbRG)); // BRG + + int16x4_t vCrL_ = vrshrn_n_s32(vCrL, 14); + int16x4_t vCrH_ = vrshrn_n_s32(vCrH, 14); + int16x4_t vYL_ = vrshrn_n_s32(vYL, 14); + int16x4_t vYH_ = vrshrn_n_s32(vYH, 14); + int16x4_t vCbL_ = vrshrn_n_s32(vCbL, 14); + int16x4_t vCbH_ = vrshrn_n_s32(vCbH, 14); + + int16x8_t vCr = vmovq_n_s16(128); + int16x8_t vCb = vmovq_n_s16(128); + + vCr = vaddq_s16(vCr, vcombine_s16(vCrL_, vCrH_)); + vCb = vaddq_s16(vCb, vcombine_s16(vCbL_, vCbH_)); + + uint8x8x3_t vYCrCb; + vYCrCb.val[0] = vqmovn_u16(vreinterpretq_u16_s16(vcombine_s16(vYL_, vYH_))); + vYCrCb.val[1] = vqmovun_s16(vCr); + vYCrCb.val[2] = vqmovun_s16(vCb); + + return vYCrCb; +} + +#define S_CONVERTTOYCRCB(R, G, B) \ + s32 Y = (R * 4899 + G * 9617 + B * 1868 + (1 << 13)) >> 14; \ + s32 Cr = 128 + ((R * 8192 - G * 6860 - B * 1332 + (1 << 13)) >> 14); \ + s32 Cb = 128 + ((R * (-2765) - G * 5427 + B * 8192 + (1 << 13)) >> 14); \ + dst[dj + 0] = internal::saturate_cast(Y); \ + dst[dj + 1] = internal::saturate_cast(Cr); \ + dst[dj + 2] = internal::saturate_cast(Cb); + +#define COEFF_Y ( 149) +#define COEFF_BU ( 129) +#define COEFF_RV ( 102) +#define COEFF_GU ( 25) +#define COEFF_GV ( 52) +#define COEFF_R (-14248) +#define COEFF_G ( 8663) +#define COEFF_B (-17705) + +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 +#define YUV420ALPHA3_CONST +#define YUV420ALPHA4_CONST register uint8x16_t c255 asm ("q13") = vmovq_n_u8(255); +#define YUV420ALPHA3_CONVERT +#define YUV420ALPHA4_CONVERT , "w" (c255) +#define YUV420STORE1CMD3 "vst3.8 {d20, d22, d24}" +#define YUV420STORE2CMD3 "vst3.8 {d21, d23, d25}" +#define YUV420STORE1CMD4 "vst4.8 {d20, d22, d24, d26}" +#define YUV420STORE2CMD4 "vst4.8 {d21, d23, d25, d27}" + +#define YUV420_CONSTS(cn, bIdx, vIdx) \ + register const s32 cR = s16(COEFF_R); \ + register const s32 cG = s16(COEFF_G); \ + register const s32 cB = s16(COEFF_B); \ + \ + register uint8x16_t vc16 asm ("q15") = vmovq_n_u8(16); \ + register uint8x8_t cGU asm ("d14") = vmov_n_u8(COEFF_GU); \ + register uint8x8_t cGV asm ("d15") = vmov_n_u8(COEFF_GV); \ + register uint8x8_t cRV asm ("d16") = vmov_n_u8(COEFF_RV); \ + register uint8x8_t cBU asm ("d17") = vmov_n_u8(COEFF_BU); \ + register uint8x16_t 
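+ /* The COEFF_* values are fixed-point forms of the usual BT.601 video-  */ \
+ /* range YUV->RGB weights: 149/128 ~= 1.164, 102/64 ~= 1.596,           */ \
+ /* 129/64 ~= 2.018, 25/64 ~= 0.391, 52/64 ~= 0.813, with COEFF_R/G/B    */ \
+ /* absorbing the -16 (luma) and -128 (chroma) offsets up front.         */ \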
cRGBY asm ("q3") = vmovq_n_u8(COEFF_Y); \ + YUV420ALPHA##cn##_CONST + +#define CONVERTYUV420TORGB(cn, ureg, vreg, rreg, breg) \ + __asm__ ( \ + "vld2.8 {d0-d1}, [%[inUV]] @UV \n\t" \ + "vdup.16 q4, %[cG] @cG \n\t" \ + "vld2.8 {d2-d3}, [%[inY1]] @YY \n\t" \ + "vdup.16 "#rreg", %[cR] @cR \n\t" \ + "vld2.8 {d4-d5}, [%[inY2]] @YY \n\t" \ + "vdup.16 "#breg", %[cB] @cB \n\t" \ + "vmlsl.u8 q4, "#ureg", d14 @cG-25u \n\t" \ + "vmax.u8 q1, q15 @max(Y,16) \n\t" \ + "vmlal.u8 "#rreg", "#vreg", d16 @cR+102*v \n\t" \ + "vmlal.u8 "#breg", "#ureg", d17 @cB+129*u \n\t" \ + "vmax.u8 q2, q15 @max(Y,16) \n\t" \ + "vmlsl.u8 q4, "#vreg", d15 @cG-25u-52v \n\t" \ + /*q10,q11,q12,q13 - for output*/ \ + "vmull.u8 q9, d3, d6 @h 149*y \n\t" \ + "vmull.u8 q10, d2, d7 @l 149*y \n\t" \ + "vshr.u16 q9, #1 @h (149*y)/2 \n\t" \ + "vshr.u16 q10, #1 @l (149*y)/2 \n\t" \ + \ + "vhadd.s16 q0, q9, q4 @hG ((149*y)/2 + cG - 25*u - 52*v)/2 \n\t" \ + "vhadd.s16 q12, q10, q6 @lB ((149*y)/2 + cB + 129*u)/2 \n\t" \ + "vhadd.s16 q1, q9, q5 @hR ((149*y)/2 + cR + 102*v)/2 \n\t" \ + "vhadd.s16 q11, q10, q4 @lG ((149*y)/2 + cG - 25*u - 52*v)/2 \n\t" \ + "vhadd.s16 q9, q6 @hB ((149*y)/2 + cB + 129*u)/2 \n\t" \ + "vhadd.s16 q10, q5 @lR ((149*y)/2 + cR + 102*v)/2 \n\t" \ + \ + "vqrshrun.s16 d24, q12, #5 @lB ((149*y)/2 + cB + 129*u)/2/32 \n\t" \ + "vqrshrun.s16 d22, q11, #5 @lG ((149*y)/2 + cG - 25*u - 52*v)/2/32 \n\t" \ + "vqrshrun.s16 d20, q10, #5 @lR ((149*y)/2 + cR + 102*v)/2/32 \n\t" \ + "vqrshrun.s16 d23, q0, #5 @hG ((149*y)/2 + cG - 25*u - 52*v)/2/32 \n\t" \ + "vqrshrun.s16 d21, q1, #5 @hR ((149*y)/2 + cR + 102*v)/2/32 \n\t" \ + "vqrshrun.s16 d25, q9, #5 @hB ((149*y)/2 + cB + 129*u)/2/32 \n\t" \ + \ + "vzip.8 d22, d23 @G \n\t" \ + "vzip.8 d20, d21 @R \n\t" \ + "vzip.8 d24, d25 @B \n\t" \ + \ + YUV420STORE1CMD##cn", [%[out1]] \n\t" \ + YUV420STORE2CMD##cn", [%[out1x]] \n\t" \ + \ + "vmull.u8 q9, d5, d6 @h 149*y \n\t" \ + "vmull.u8 q10, d4, d7 @l 149*y \n\t" \ + "vshr.u16 q9, #1 @h (149*y)/2 \n\t" \ + "vshr.u16 q10, #1 @l (149*y)/2 \n\t" \ + \ + "vhadd.s16 q0, q9, q4 @hG ((149*y)/2 + cG - 25*u - 52*v)/2 \n\t" \ + "vhadd.s16 q12, q10, q6 @lB ((149*y)/2 + cB + 129*u)/2 \n\t" \ + "vhadd.s16 q1, q9, q5 @hR ((149*y)/2 + cR + 102*v)/2 \n\t" \ + "vhadd.s16 q11, q10, q4 @lG ((149*y)/2 + cG - 25*u - 52*v)/2 \n\t" \ + "vhadd.s16 q9, q6 @hB ((149*y)/2 + cB + 129*u)/2 \n\t" \ + "vhadd.s16 q10, q5 @lR ((149*y)/2 + cR + 102*v)/2 \n\t" \ + \ + "vqrshrun.s16 d24, q12, #5 @lB ((149*y)/2 + cB + 129*u)/2/32 \n\t" \ + "vqrshrun.s16 d22, q11, #5 @lG ((149*y)/2 + cG - 25*u - 52*v)/2/32 \n\t" \ + "vqrshrun.s16 d20, q10, #5 @lR ((149*y)/2 + cR + 102*v)/2/32 \n\t" \ + "vqrshrun.s16 d23, q0, #5 @hG ((149*y)/2 + cG - 25*u - 52*v)/2/32 \n\t" \ + "vqrshrun.s16 d21, q1, #5 @hR ((149*y)/2 + cR + 102*v)/2/32 \n\t" \ + "vqrshrun.s16 d25, q9, #5 @hB ((149*y)/2 + cB + 129*u)/2/32 \n\t" \ + \ + "vzip.8 d22, d23 @G \n\t" \ + "vzip.8 d20, d21 @R \n\t" \ + "vzip.8 d24, d25 @B \n\t" \ + \ + YUV420STORE1CMD##cn", [%[out2]] \n\t" \ + YUV420STORE2CMD##cn", [%[out2x]] \n\t" \ + \ + : /*no output*/ \ + : [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), \ + [out1x] "r" (dst1 + dj+cn*8), [out2x] "r" (dst2 + dj+cn*8), \ + [inUV] "r" (uv+j), [inY1] "r" (y1+j), [inY2] "r" (y2+j), \ + [cR] "r" (cR), [cG] "r" (cG), [cB] "r" (cB), \ + "w" (vc16), "w" (cGU), "w" (cGV), "w" (cBU), "w" (cRV), "w" (cRGBY) YUV420ALPHA##cn##_CONVERT \ + : "d0","d1","d2","d3","d4","d5","d8","d9","d10","d11","d12", \ + "d13","d18","d19","d20","d21","d22","d23","d24","d25" \ + ); + +#else + +template +struct 
_convertYUV420Internals +{ + uint16x8_t vc14216; + uint16x8_t vc17672; + uint16x8_t vc8696; + uint8x8_t vc102; + uint8x8_t vc25; + uint8x8_t vc129; + uint8x8_t vc52; + uint16x8_t vc_1; + uint8x8_t vc149; + uint8x8_t vc16; + _convertYUV420Internals() + { + vc14216 = vdupq_n_u16(-COEFF_R); + vc17672 = vdupq_n_u16(-COEFF_B); + vc8696 = vdupq_n_u16(COEFF_G); + vc102 = vdup_n_u8(COEFF_RV); + vc25 = vdup_n_u8(COEFF_GU); + vc129 = vdup_n_u8(COEFF_BU); + vc52 = vdup_n_u8(COEFF_GV); + vc_1 = vdupq_n_u16((uint16_t)-1); + vc149 = vdup_n_u8(COEFF_Y); + vc16 = vdup_n_u8(16); + } + + inline void UVrgbToRGB( const int16x8_t &ruv, const int16x8_t &guv, const int16x8_t &buv, + const u8 *y, uint8x16x3_t &rgbl ) + { + //y get line + uint8x8x2_t yl = vld2_u8(y); + yl.val[0] = vmax_u8(yl.val[0], vc16); + yl.val[1] = vmax_u8(yl.val[1], vc16); + + //y part line + uint16x8_t yodd1 = vmlal_u8(vc_1, yl.val[0], vc149); //(-1+149*y) + uint16x8_t yevn1 = vmlal_u8(vc_1, yl.val[1], vc149); //(-1+149*y) + int16x8_t yodd1h = (int16x8_t)vshrq_n_u16(yodd1, 1); //(-1+149*y)/2 + int16x8_t yevn1h = (int16x8_t)vshrq_n_u16(yevn1, 1); //(-1+149*y)/2 + + //y line calc rgb + int16x8_t rodd1w = vhsubq_s16(yodd1h, ruv); //((-1+149*y)/2 - (14216-102*v))/2 + int16x8_t gevn1w = vhaddq_s16(yevn1h, guv); //((-1+149*y)/2 + ((8696-25*u)-52*v))/2 + int16x8_t bodd1w = vhsubq_s16(yodd1h, buv); //((-1+149*y)/2 - (17672-129*u))/2 + int16x8_t revn1w = vhsubq_s16(yevn1h, ruv); //((-1+149*y)/2 - (14216-102*v))/2 + int16x8_t godd1w = vhaddq_s16(yodd1h, guv); //((-1+149*y)/2 + ((8696-25*u)-52*v))/2 + int16x8_t bevn1w = vhsubq_s16(yevn1h, buv); //((-1+149*y)/2 - (17672-129*u))/2 + + //y line clamp + narrow + uint8x8_t rodd1n = vqshrun_n_s16(rodd1w, 5); + uint8x8_t revn1n = vqshrun_n_s16(revn1w, 5); + uint8x8_t godd1n = vqshrun_n_s16(godd1w, 5); + uint8x8x2_t r1 = vzip_u8 (rodd1n, revn1n); + uint8x8_t gevn1n = vqshrun_n_s16(gevn1w, 5); + uint8x8_t bodd1n = vqshrun_n_s16(bodd1w, 5); + uint8x8x2_t g1 = vzip_u8 (godd1n, gevn1n); + uint8x8_t bevn1n = vqshrun_n_s16(bevn1w, 5); + uint8x8x2_t b1 = vzip_u8 (bodd1n, bevn1n); + rgbl.val[2 - bIdx] = vcombine_u8(r1.val[0], r1.val[1]); + rgbl.val[1] = vcombine_u8(g1.val[0], g1.val[1]); + rgbl.val[0 + bIdx] = vcombine_u8(b1.val[0], b1.val[1]); + } +}; + +template +struct _convertYUV420 +{ + _convertYUV420Internals convertYUV420Internals; + + inline void ToRGB( const u8 *y1, const u8 *y2, const u8 *uv, + u8 *dst1, u8 *dst2 ) + { + uint8x8x2_t raw_uv = vld2_u8(uv); + uint16x8_t gu = vmlsl_u8(convertYUV420Internals.vc8696, raw_uv.val[1-vIdx], convertYUV420Internals.vc25); //(8696-25*u) + int16x8_t ruv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc14216, raw_uv.val[vIdx], convertYUV420Internals.vc102); //(14216-102*v) + + int16x8_t buv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc17672, raw_uv.val[1-vIdx], convertYUV420Internals.vc129); //(17672-129*u) + int16x8_t guv = (int16x8_t)vmlsl_u8(gu, raw_uv.val[vIdx], convertYUV420Internals.vc52); //((8696-25*u)-52*v)) + + uint8x16x3_t rgbl; + //y line1 + convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y1, rgbl); + vst3q_u8(dst1, rgbl); + //y line2 + convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y2, rgbl); + vst3q_u8(dst2, rgbl); + } +}; + +template +struct _convertYUV420<4, bIdx, vIdx> +{ + _convertYUV420Internals convertYUV420Internals; + + inline void ToRGB( const u8 *y1, const u8 *y2, const u8 *uv, + u8 *dst1, u8 *dst2 ) + { + uint8x8x2_t raw_uv = vld2_u8(uv); + uint16x8_t gu = vmlsl_u8(convertYUV420Internals.vc8696, raw_uv.val[1-vIdx], convertYUV420Internals.vc25); 
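+ /* UVrgbToRGB above de-interleaves 16 Y samples into two 8-lane halves
+  * with vld2_u8 so one set of chroma terms (ruv/guv/buv) is shared by
+  * horizontally adjacent pixels, as 4:2:0 subsampling implies, then
+  * vzip_u8 restores pixel order before the 16-wide store. */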
//(8696-25*u) + int16x8_t ruv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc14216, raw_uv.val[vIdx], convertYUV420Internals.vc102); //(14216-102*v) + + int16x8_t buv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc17672, raw_uv.val[1-vIdx], convertYUV420Internals.vc129); //(17672-129*u) + int16x8_t guv = (int16x8_t)vmlsl_u8(gu, raw_uv.val[vIdx], convertYUV420Internals.vc52); //((8696-25*u)-52*v)) + + union { uint8x16x4_t v4; uint8x16x3_t v3; } rgbl; + rgbl.v4.val[3] = vdupq_n_u8(0xff); + //y line1 + convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y1, rgbl.v3); + vst4q_u8(dst1, rgbl.v4); + //y line2 + convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y2, rgbl.v3); + vst4q_u8(dst2, rgbl.v4); + } +}; + +#define YUV420_CONSTS(cn, bIdx, vIdx) _convertYUV420 convertYUV420; + +#endif + +template inline void fillAlpha(u8 *, u8 *){} +template <> inline void fillAlpha<4>(u8 *dst1, u8 *dst2) +{ + dst1[3] = 255; + dst1[7] = 255; + dst2[3] = 255; + dst2[7] = 255; +} +template +inline void convertYUV420ToRGB(const u8 *y1, const u8 *y2, const u8 *uv, u8* dst1, u8 *dst2) +{ + int Y11 = y1[0]; + int Y12 = y1[1]; + int Y21 = y2[0]; + int Y22 = y2[1]; + + int U = uv[1 - vIdx]; + int V = uv[vIdx]; + + int y11 = (COEFF_Y * std::max(16, Y11)) >> 1; + int y12 = (COEFF_Y * std::max(16, Y12)) >> 1; + int y21 = (COEFF_Y * std::max(16, Y21)) >> 1; + int y22 = (COEFF_Y * std::max(16, Y22)) >> 1; + + int uvR = COEFF_R + COEFF_RV * V; + int uvG = COEFF_G - COEFF_GU * U - COEFF_GV * V; + int uvB = COEFF_B + COEFF_BU * U; + + dst1[2-bIdx] = internal::saturate_cast((((y11 + uvR) >> 1) + (1 << 4)) >> 5); + dst1[1] = internal::saturate_cast((((y11 + uvG) >> 1) + (1 << 4)) >> 5); + dst1[bIdx] = internal::saturate_cast((((y11 + uvB) >> 1) + (1 << 4)) >> 5); + + dst1[cn+2-bIdx] = internal::saturate_cast((((y12 + uvR) >> 1) + (1 << 4)) >> 5); + dst1[cn+1] = internal::saturate_cast((((y12 + uvG) >> 1) + (1 << 4)) >> 5); + dst1[cn+bIdx] = internal::saturate_cast((((y12 + uvB) >> 1) + (1 << 4)) >> 5); + + dst2[2-bIdx] = internal::saturate_cast((((y21 + uvR) >> 1) + (1 << 4)) >> 5); + dst2[1] = internal::saturate_cast((((y21 + uvG) >> 1) + (1 << 4)) >> 5); + dst2[bIdx] = internal::saturate_cast((((y21 + uvB) >> 1) + (1 << 4)) >> 5); + + dst2[cn+2-bIdx] = internal::saturate_cast((((y22 + uvR) >> 1) + (1 << 4)) >> 5); + dst2[cn+1] = internal::saturate_cast((((y22 + uvG) >> 1) + (1 << 4)) >> 5); + dst2[cn+bIdx] = internal::saturate_cast((((y22 + uvB) >> 1) + (1 << 4)) >> 5); + + fillAlpha(dst1, dst2); +} + +// converts R, G, B (B, G, R) pixels to RGB(BGR)565 format respectively +inline uint8x16x2_t convertTo565( const uint8x16_t& vR, const uint8x16_t& vG, const uint8x16_t& vB ) +{ + uint8x16x2_t vRgb565; // rrrrRRRR ggggGGGG bbbbBBBB + + vRgb565.val[1] = vsriq_n_u8(vB, vG, 5); // xxxxxxxx bbbbBggg + vRgb565.val[0] = vshlq_n_u8(vG, 3); // gGGGG000 bbbbBggg + vRgb565.val[0] = vsriq_n_u8(vRgb565.val[0], vR, 3); // gGGrrrrR bbbbBggg + + return vRgb565; +} +inline void convertTo565( const u16 R, const u16 G, const u16 B, u8 * dst ) +{ + *((u16*)dst) = (R >> 3)|((G&~3) << 3)|((B&~7) << 8); +} +#endif + +} //namespace + +void rgb2hsv(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + s32 hrange) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw8 = size.width >= 7 ? 
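+ /* Main-loop bound: for width >= 8 this gives roiw8 = width - 7, so the
+  * 8-pixel NEON loop runs while j < width - 7 and never reads past the
+  * row; e.g. width = 20 yields two vector iterations (16 pixels) and a
+  * 4-pixel scalar tail. */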
size.width - 7 : 0; + const s32 hsv_shift = 12; +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + register const f32 vsdiv_table = f32(255 << hsv_shift); + register f32 vhdiv_table = f32(hrange << hsv_shift); + register const s32 vhrange = hrange; + register const s32 v0 = s32(0); + register const s32 vshift = s32(1 << (hsv_shift-1)); + register const s32 v6 = s32(6); + register const f32 bias = 0.5f; +#endif + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw8; sj += 24, dj += 24, j += 8) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERT_TO_HSV_ASM(vld3.8 {d0-d2}, d0, d2) +#else + uint8x8x3_t vRgb = vld3_u8(src + sj); + uint8x8x3_t vHsv = convertToHSV(vRgb.val[0], vRgb.val[1], vRgb.val[2], hrange); + vst3_u8(dst + dj, vHsv); +#endif + } + + for (; j < size.width; ++j, sj += 3, dj += 3) + { + convertToHSV(src[sj], src[sj+1], src[sj+2], hrange, hsv_shift, dst+dj); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)hrange; +#endif +} + +void rgbx2hsv(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + s32 hrange) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + const s32 hsv_shift = 12; +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + register const f32 vsdiv_table = f32(255 << hsv_shift); + register f32 vhdiv_table = f32(hrange << hsv_shift); + register const s32 vhrange = hrange; + register const s32 v0 = s32(0); + register const s32 vshift = s32(1 << (hsv_shift-1)); + register const s32 v6 = s32(6); + register const f32 bias = 0.5f; +#endif + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw8; sj += 32, dj += 24, j += 8) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERT_TO_HSV_ASM(vld4.8 {d0-d3}, d0, d2) +#else + uint8x8x4_t vRgb = vld4_u8(src + sj); + uint8x8x3_t vHsv = convertToHSV(vRgb.val[0], vRgb.val[1], vRgb.val[2], hrange); + vst3_u8(dst + dj, vHsv); +#endif + } + + for (; j < size.width; ++j, sj += 4, dj += 3) + { + convertToHSV(src[sj], src[sj+1], src[sj+2], hrange, hsv_shift, dst+dj); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)hrange; +#endif +} + +void bgr2hsv(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + s32 hrange) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + const s32 hsv_shift = 12; +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + register const f32 vsdiv_table = f32(255 << hsv_shift); + register f32 vhdiv_table = f32(hrange << hsv_shift); + register const s32 vhrange = hrange; + register const s32 v0 = s32(0); + register const s32 vshift = s32(1 << (hsv_shift-1)); + register const s32 v6 = s32(6); + register const f32 bias = 0.5f; +#endif + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw8; sj += 24, dj += 24, j += 8) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERT_TO_HSV_ASM(vld3.8 {d0-d2}, d2, d0) +#else + uint8x8x3_t vRgb = vld3_u8(src + sj); + uint8x8x3_t vHsv = convertToHSV(vRgb.val[2], vRgb.val[1], vRgb.val[0], hrange); + vst3_u8(dst + dj, vHsv); +#endif + } + + for (; j < size.width; ++j, sj += 3, dj += 3) + { + convertToHSV(src[sj+2], src[sj+1], src[sj], hrange, hsv_shift, dst+dj); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)hrange; +#endif +} + +void bgrx2hsv(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + s32 hrange) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + const s32 hsv_shift = 12; +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + register const f32 vsdiv_table = f32(255 << hsv_shift); + register f32 vhdiv_table = f32(hrange << hsv_shift); + register const s32 vhrange = hrange; + register const s32 v0 = s32(0); + register const s32 vshift = s32(1 << (hsv_shift-1)); + register const s32 v6 = s32(6); + register const f32 bias = 0.5f; +#endif + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw8; sj += 32, dj += 24, j += 8) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERT_TO_HSV_ASM(vld4.8 {d0-d3}, d2, d0) +#else + uint8x8x4_t vRgb = vld4_u8(src + sj); + uint8x8x3_t vHsv = convertToHSV(vRgb.val[2], vRgb.val[1], vRgb.val[0], hrange); + vst3_u8(dst + dj, vHsv); +#endif + } + + for (; j < size.width; ++j, sj += 4, dj += 3) + { + convertToHSV(src[sj+2], src[sj+1], src[sj], hrange, hsv_shift, dst+dj); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)hrange; +#endif +} + +void rgbx2bgr565(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw16; sj += 64, dj += 32, j += 16) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + __asm__ ( + "vld4.8 {d2, d4, d6, d8}, [%[in0]] @ q0 q1 q2 q3 q4 \n\t" + "vld4.8 {d3, d5, d7, d9}, [%[in1]] @ xxxxxxxx rrrrRRRR ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vsri.8 q1, q2, #5 @ xxxxxxxx rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vshl.u8 q0, q2, #3 @ gGGGG000 rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vsri.8 q0, q3, #3 @ gGGbbbbB rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vst2.8 {d0, d2}, [%[out0]] \n\t" + "vst2.8 {d1, d3}, [%[out1]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [out1] "r" (dst + dj + 16), + [in0] "r" (src + sj), + [in1] "r" (src + sj + 32) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9" + ); +#else + uint8x16x4_t vRgba = vld4q_u8(src + sj); + uint8x16x2_t vVal565 = convertTo565(vRgba.val[2], vRgba.val[1], vRgba.val[0]); + vst2q_u8(dst + dj, vVal565); +#endif + } + + for (; j < size.width; ++j, sj += 4, dj += 2) + { + convertTo565(src[sj + 2], src[sj + 1], src[sj], dst + dj); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgb2bgr565(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw16; sj += 48, dj += 32, j += 16) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + __asm__ ( + "vld3.8 {d2, d4, d6}, [%[in0]] @ q0 q1 q2 q3 q4 \n\t" + "vld3.8 {d3, d5, d7}, [%[in1]] @ xxxxxxxx rrrrRRRR ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vsri.8 q1, q2, #5 @ xxxxxxxx rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vshl.u8 q0, q2, #3 @ gGGGG000 rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vsri.8 q0, q3, #3 @ gGGbbbbB rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vst2.8 {d0, d2}, [%[out0]] \n\t" + "vst2.8 {d1, d3}, [%[out1]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [out1] "r" (dst + dj + 16), + [in0] "r" (src + sj), + [in1] "r" (src + sj + 24) + : "d0","d1","d2","d3","d4","d5","d6","d7" + ); +#else + uint8x16x3_t vRgba = vld3q_u8(src + sj); + uint8x16x2_t vVal565 = convertTo565(vRgba.val[2], vRgba.val[1], vRgba.val[0]); + vst2q_u8(dst + dj, vVal565); +#endif + } + + for (; j < size.width; ++j, sj += 3, dj += 2) + { + convertTo565(src[sj + 2], src[sj + 1], src[sj], dst + dj); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgbx2rgb565(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? 
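+ /* Layout produced by convertTo565(): bits 0-4 hold the first channel
+  * >> 3, bits 5-10 green >> 2, bits 11-15 the last channel >> 3, stored
+  * little-endian.  Worked example: R = G = B = 255 packs to 0xFFFF, since
+  * (255>>3) | ((255&~3)<<3) | ((255&~7)<<8) = 0x001F | 0x07E0 | 0xF800. */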
size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw16; sj += 64, dj += 32, j += 16) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + __asm__ ( + "vld4.8 {d0, d2, d4, d6}, [%[in0]] @ q0 q1 q2 q3 \n\t" + "vld4.8 {d1, d3, d5, d7}, [%[in1]] @ rrrrRRRR ggggGGGG bbbbBBBB aaaaAAAA \n\t" + "vsri.8 q2, q1, #5 @ rrrrRRRR ggggGGGG bbbbBggg aaaaAAAA \n\t" + "vshl.u8 q1, #3 @ rrrrRRRR gGGGG000 bbbbBggg aaaaAAAA \n\t" + "vsri.8 q1, q0, #3 @ rrrrRRRR gGGrrrrR bbbbBggg aaaaAAAA \n\t" + "vst2.8 {d2, d4}, [%[out0]] \n\t" + "vst2.8 {d3, d5}, [%[out1]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [out1] "r" (dst + dj + 16), + [in0] "r" (src + sj), + [in1] "r" (src + sj + 32) + : "d0","d1","d2","d3","d4","d5","d6","d7" + ); +#else + uint8x16x4_t vRgba = vld4q_u8(src + sj); + uint8x16x2_t vVal565 = convertTo565(vRgba.val[0], vRgba.val[1], vRgba.val[2]); + vst2q_u8(dst + dj, vVal565); +#endif + } + + for (; j < size.width; ++j, sj += 4, dj += 2) + { + convertTo565(src[sj], src[sj + 1], src[sj + 2], dst + dj); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgb2rgb565(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw16; sj += 48, dj += 32, j += 16) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + __asm__ ( + "vld3.8 {d0, d2, d4}, [%[in0]] @ q0 q1 q2 q3 \n\t" + "vld3.8 {d1, d3, d5}, [%[in1]] @ rrrrRRRR ggggGGGG bbbbBBBB xxxxxxxx \n\t" + "vsri.8 q2, q1, #5 @ rrrrRRRR ggggGGGG bbbbBggg xxxxxxxx \n\t" + "vshl.u8 q1, #3 @ rrrrRRRR gGGGG000 bbbbBggg xxxxxxxx \n\t" + "vsri.8 q1, q0, #3 @ rrrrRRRR gGGrrrrR bbbbBggg xxxxxxxx \n\t" + "vst2.8 {d2, d4}, [%[out0]] \n\t" + "vst2.8 {d3, d5}, [%[out1]] \n\t" + : /*no output*/ + : [out0] "r" (dst + dj), + [out1] "r" (dst + dj + 16), + [in0] "r" (src + sj), + [in1] "r" (src + sj + 24) + : "d0","d1","d2","d3","d4","d5" + ); +#else + uint8x16x3_t vRgba = vld3q_u8(src + sj); + uint8x16x2_t vVal565 = convertTo565(vRgba.val[0], vRgba.val[1], vRgba.val[2]); + vst2q_u8(dst + dj, vVal565); +#endif + } + + for (; j < size.width; ++j, sj += 3, dj += 2) + { + convertTo565(src[sj], src[sj + 1], src[sj + 2], dst + dj); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgb2ycrcb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YCRCB_CONSTS + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw8; sj += 24, dj += 24, j += 8) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTTOYCRCB(vld3.8 {d0-d2}, d0, d1, d2) +#else + uint8x8x3_t vRgb = vld3_u8(src + sj); + int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vRgb.val[0])); + int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vRgb.val[1])); + int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vRgb.val[2])); + uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG); + vst3_u8(dst + dj, vYCrCb); +#endif + } + + for (; j < size.width; ++j, sj += 3, dj += 3) + { + S_CONVERTTOYCRCB(src[sj], src[sj + 1], src[sj + 2]); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void rgbx2ycrcb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YCRCB_CONSTS + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw8; sj += 32, dj += 24, j += 8) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTTOYCRCB(vld4.8 {d0-d3}, d0, d1, d2) +#else + uint8x8x4_t vRgba = vld4_u8(src + sj); + int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vRgba.val[0])); + int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vRgba.val[1])); + int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vRgba.val[2])); + uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG); + vst3_u8(dst + dj, vYCrCb); +#endif + } + + for (; j < size.width; ++j, sj += 4, dj += 3) + { + S_CONVERTTOYCRCB(src[sj], src[sj + 1], src[sj + 2]); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void bgr2ycrcb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YCRCB_CONSTS + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw8; sj += 24, dj += 24, j += 8) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTTOYCRCB(vld3.8 {d0-d2}, d2, d1, d0) +#else + uint8x8x3_t vBgr = vld3_u8(src + sj); + int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vBgr.val[0])); + int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vBgr.val[1])); + int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vBgr.val[2])); + uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG); + vst3_u8(dst + dj, vYCrCb); +#endif + } + + for (; j < size.width; ++j, sj += 3, dj += 3) + { + S_CONVERTTOYCRCB(src[sj + 2], src[sj + 1], src[sj]); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void bgrx2ycrcb(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YCRCB_CONSTS + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0u; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0u, dj = 0u, j = 0u; + + for (; j < roiw8; sj += 32, dj += 24, j += 8) + { + internal::prefetch(src + sj); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTTOYCRCB(vld4.8 {d0-d3}, d2, d1, d0) +#else + uint8x8x4_t vBgra = vld4_u8(src + sj); + int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vBgra.val[0])); + int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vBgra.val[1])); + int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vBgra.val[2])); + uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG); + vst3_u8(dst + dj, vYCrCb); +#endif + } + + for (; j < size.width; ++j, sj += 4, dj += 3) + { + S_CONVERTTOYCRCB(src[sj + 2], src[sj + 1], src[sj]); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void yuv420sp2rgb(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + // input data: + ////////////// Y matrix: + // {y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16} + // {Y1, Y2, Y3, Y4, Y5, Y6, Y7, Y8, Y9, Y10, Y11, Y12, Y13, Y14, Y15, Y16} + ////////////// UV matrix: + // {v12, u12, v34, u34, v56, u56, v78, u78, v90 u90, V12, U12, V34, U34, V56, U56} + + // fp version + // R = 1.164(Y - 16) + 1.596(V - 128) + // G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) + // B = 1.164(Y - 16) + 2.018(U - 128) + + // integer version + // R = [((149*y)/2 + (-14248+102*v) )/2]/32 + // G = [((149*y)/2 + ((8663- 25*u)-52*v))/2]/32 + // B = [((149*y)/2 + (-17705+129*u) )/2]/32 + + // error estimation: + //Rerr = 0.0000625 * y − 0.00225 * v − 0.287 + //Gerr = 0.0000625 * y + 0.0005 * v + 0.000375 * u + 0.128625 + //Berr = 0.0000625 * y − 0.002375 * u - 0.287375 + + //real error test: + //================= + //R: 1 less: 520960 == 3.11% of full space + //G: 1 less: 251425 == 1.50% of full space + //B: 1 less: 455424 == 2.71% of full space + //================= + //R: 1 more: 642048 == 3.83% of full space + //G: 1 more: 192458 == 1.15% of full space + //B: 1 more: 445184 == 2.65% of full space + + 
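+ // Scaling check for the integer forms: the net divisor is 2*2*32 = 128,
+ // so 149/128 ~= 1.164 (Y), 102/64 ~= 1.596 (V term in R), 129/64 ~= 2.018
+ // (U term in B) and (25*u + 52*v)/64 ~= 0.391u + 0.813v (G), with the
+ // constants -14248, 8663, -17705 absorbing the -16/-128 offsets.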
internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YUV420_CONSTS(3, 2, 0) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; i+=2) + { + const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1); + const u8 * y1 = internal::getRowPtr(yBase, yStride, i); + const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1); + u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1); + + size_t dj = 0u, j = 0u; + for (; j < roiw16; dj += 48, j += 16) + { + internal::prefetch(uv + j); + internal::prefetch(y1 + j); + internal::prefetch(y2 + j); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTYUV420TORGB(3, d1, d0, q5, q6) +#else + convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj); +#endif + } + for (; j + 2 <= size.width; j+=2, dj += 6) + { + convertYUV420ToRGB<3, 2, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj); + } + } +#else + (void)size; + (void)yBase; + (void)yStride; + (void)uvBase; + (void)uvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void yuv420sp2rgbx(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YUV420_CONSTS(4, 2, 0) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; i+=2) + { + const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1); + const u8 * y1 = internal::getRowPtr(yBase, yStride, i); + const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1); + u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1); + + size_t dj = 0u, j = 0u; + for (; j < roiw16; dj += 64, j += 16) + { + internal::prefetch(uv + j); + internal::prefetch(y1 + j); + internal::prefetch(y2 + j); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTYUV420TORGB(4, d1, d0, q5, q6) +#else + convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj); +#endif + } + for (; j + 2 <= size.width; j+=2, dj += 8) + { + convertYUV420ToRGB<4, 2, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj); + } + } +#else + (void)size; + (void)yBase; + (void)yStride; + (void)uvBase; + (void)uvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void yuv420i2rgb(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YUV420_CONSTS(3, 2, 1) + size_t roiw16 = size.width >= 15 ? 
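+ /* The yuv420sp* functions above read V first in each chroma pair
+  * (vIdx = 0, matching NV21-style data); the yuv420i* variants here use
+  * vIdx = 1 for U-first (NV12-style) input, sharing everything else
+  * through the same templates and macros. */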
size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; i+=2) + { + const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1); + const u8 * y1 = internal::getRowPtr(yBase, yStride, i); + const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1); + u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1); + + size_t dj = 0u, j = 0u; + for (; j < roiw16; dj += 48, j += 16) + { + internal::prefetch(uv + j); + internal::prefetch(y1 + j); + internal::prefetch(y2 + j); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTYUV420TORGB(3, d0, d1, q5, q6) +#else + convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj); +#endif + } + for (; j + 2 <= size.width; j+=2, dj += 6) + { + convertYUV420ToRGB<3, 2, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj); + } + } +#else + (void)size; + (void)yBase; + (void)yStride; + (void)uvBase; + (void)uvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void yuv420i2rgbx(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YUV420_CONSTS(4, 2, 1) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; i+=2) + { + const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1); + const u8 * y1 = internal::getRowPtr(yBase, yStride, i); + const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1); + u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1); + + size_t dj = 0u, j = 0u; + for (; j < roiw16; dj += 64, j += 16) + { + internal::prefetch(uv + j); + internal::prefetch(y1 + j); + internal::prefetch(y2 + j); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTYUV420TORGB(4, d0, d1, q5, q6) +#else + convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj); +#endif + } + for (; j + 2 <= size.width; j+=2, dj += 8) + { + convertYUV420ToRGB<4, 2, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj); + } + } +#else + (void)size; + (void)yBase; + (void)yStride; + (void)uvBase; + (void)uvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void yuv420sp2bgr(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YUV420_CONSTS(3, 0, 0) + size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; i+=2) + { + const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1); + const u8 * y1 = internal::getRowPtr(yBase, yStride, i); + const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1); + u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1); + + size_t dj = 0u, j = 0u; + for (; j < roiw16; dj += 48, j += 16) + { + internal::prefetch(uv + j); + internal::prefetch(y1 + j); + internal::prefetch(y2 + j); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTYUV420TORGB(3, d1, d0, q6, q5) +#else + convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj); +#endif + } + for (; j + 2 <= size.width; j+=2, dj += 6) + { + convertYUV420ToRGB<3, 0, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj); + } + } +#else + (void)size; + (void)yBase; + (void)yStride; + (void)uvBase; + (void)uvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void yuv420sp2bgrx(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YUV420_CONSTS(4, 0, 0) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; i+=2) + { + const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1); + const u8 * y1 = internal::getRowPtr(yBase, yStride, i); + const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1); + u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1); + + size_t dj = 0u, j = 0u; + for (; j < roiw16; dj += 64, j += 16) + { + internal::prefetch(uv + j); + internal::prefetch(y1 + j); + internal::prefetch(y2 + j); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTYUV420TORGB(4, d1, d0, q6, q5) +#else + convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj); +#endif + } + for (; j + 2 <= size.width; j+=2, dj += 8) + { + convertYUV420ToRGB<4, 0, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj); + } + } +#else + (void)size; + (void)yBase; + (void)yStride; + (void)uvBase; + (void)uvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void yuv420i2bgr(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YUV420_CONSTS(3, 0, 1) + size_t roiw16 = size.width >= 15 ? 
size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; i+=2) + { + const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1); + const u8 * y1 = internal::getRowPtr(yBase, yStride, i); + const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1); + u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1); + + size_t dj = 0u, j = 0u; + for (; j < roiw16; dj += 48, j += 16) + { + internal::prefetch(uv + j); + internal::prefetch(y1 + j); + internal::prefetch(y2 + j); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTYUV420TORGB(3, d0, d1, q6, q5) +#else + convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj); +#endif + } + for (; j + 2 <= size.width; j+=2, dj += 6) + { + convertYUV420ToRGB<3, 0, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj); + } + } +#else + (void)size; + (void)yBase; + (void)yStride; + (void)uvBase; + (void)uvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void yuv420i2bgrx(const Size2D &size, + const u8 * yBase, ptrdiff_t yStride, + const u8 * uvBase, ptrdiff_t uvStride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + YUV420_CONSTS(4, 0, 1) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0u; i < size.height; i+=2) + { + const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1); + const u8 * y1 = internal::getRowPtr(yBase, yStride, i); + const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1); + u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1); + + size_t dj = 0u, j = 0u; + for (; j < roiw16; dj += 64, j += 16) + { + internal::prefetch(uv + j); + internal::prefetch(y1 + j); + internal::prefetch(y2 + j); +#if defined(__GNUC__) && __GNUC_MINOR__ < 7 + CONVERTYUV420TORGB(4, d0, d1, q6, q5) +#else + convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj); +#endif + } + for (; j + 2 <= size.width; j+=2, dj += 8) + { + convertYUV420ToRGB<4, 0, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj); + } + } +#else + (void)size; + (void)yBase; + (void)yStride; + (void)uvBase; + (void)uvStride; + (void)dstBase; + (void)dstStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/common.cpp b/3rdparty/carotene/src/common.cpp new file mode 100644 index 0000000000..c85b0123b6 --- /dev/null +++ b/3rdparty/carotene/src/common.cpp @@ -0,0 +1,108 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include +#include + +#include "common.hpp" + +namespace CAROTENE_NS { + +bool isSupportedConfiguration() +{ +#ifdef CAROTENE_NEON + return true; +#else + return false; +#endif +} + +namespace internal { + +void assertSupportedConfiguration(bool parametersSupported) +{ + if (!isSupportedConfiguration()) { + std::cerr << "internal error: attempted to use an unavailable function" << std::endl; + std::abort(); + } + + if (!parametersSupported) { + std::cerr << "internal error: attempted to use a function with unsupported parameters" << std::endl; + std::abort(); + } +} + +ptrdiff_t borderInterpolate(ptrdiff_t _p, size_t _len, BORDER_MODE borderType, size_t startMargin, size_t endMargin) +{ + ptrdiff_t p = _p + (ptrdiff_t)startMargin; + size_t len = _len + startMargin + endMargin; + if( (size_t)p < len ) + return _p; + else if( borderType == BORDER_MODE_REPLICATE ) + p = p < 0 ? 0 : (ptrdiff_t)len - 1; + else if( borderType == BORDER_MODE_REFLECT || borderType == BORDER_MODE_REFLECT101 ) + { + s32 delta = borderType == BORDER_MODE_REFLECT101; + if( len == 1 ) + return 0; + do + { + if( p < 0 ) + p = -p - 1 + delta; + else + p = (ptrdiff_t)len - 1 - (p - (ptrdiff_t)len) - delta; + } + while( (size_t)p >= len ); + } + else if( borderType == BORDER_MODE_WRAP ) + { + if( p < 0 ) + p -= ((p-(ptrdiff_t)len+1)/(ptrdiff_t)len)*(ptrdiff_t)len; + if( p >= (ptrdiff_t)len ) + p %= (ptrdiff_t)len; + } + else if( borderType == BORDER_MODE_CONSTANT ) + p = -1; + else + internal::assertSupportedConfiguration(false); + return p - (ptrdiff_t)startMargin; +} + +} // namespace internal +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/common.hpp b/3rdparty/carotene/src/common.hpp new file mode 100644 index 0000000000..e46231a58a --- /dev/null +++ b/3rdparty/carotene/src/common.hpp @@ -0,0 +1,96 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. 
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * * Neither the names of the copyright holders nor the names of the contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#ifndef CAROTENE_SRC_COMMON_HPP
+#define CAROTENE_SRC_COMMON_HPP
+
+#include <cstddef>
+#include <cstdlib>
+
+#if defined WITH_NEON && (defined __ARM_NEON__ || defined __ARM_NEON)
+#define CAROTENE_NEON
+#endif
+
+#ifdef CAROTENE_NEON
+#include <arm_neon.h>
+#include "intrinsics.hpp"
+#endif
+
+#include <carotene/functions.hpp>
+#include "saturate_cast.hpp"
+
+namespace CAROTENE_NS { namespace internal {
+
+inline void prefetch(const void *ptr, size_t offset = 32*10)
+{
+#if defined __GNUC__
+    __builtin_prefetch(reinterpret_cast<const char *>(ptr) + offset);
+#elif defined _MSC_VER && defined CAROTENE_NEON
+    __prefetch(reinterpret_cast<const char *>(ptr) + offset);
+#else
+    (void)ptr;
+    (void)offset;
+#endif
+}
+
+template <typename T>
+inline T *getRowPtr(T *base, ptrdiff_t stride, size_t row)
+{
+    char *baseRaw = const_cast<char *>(reinterpret_cast<const char *>(base));
+    return reinterpret_cast<T *>(baseRaw + ptrdiff_t(row) * stride);
+}
+
+void assertSupportedConfiguration(bool parametersSupported = true);
+
+ptrdiff_t borderInterpolate(ptrdiff_t _p, size_t _len, BORDER_MODE borderType, size_t startMargin = 0, size_t endMargin = 0);
+
+/*!
+ * Aligns a pointer to the given number of bytes
+ *
+ * This small inline function shifts the pointer forward by zero or a positive
+ * offset so that the result is a multiple of n (which must be a power of two).
+ */
+template <typename T> inline T* alignPtr(T* ptr, size_t n=sizeof(T))
+{
+    return (T*)(((size_t)ptr + n-1) & -n);
+}
+
+}}
+
+#endif
diff --git a/3rdparty/carotene/src/convert.cpp b/3rdparty/carotene/src/convert.cpp
new file mode 100644
index 0000000000..2f95e29cb3
--- /dev/null
+++ b/3rdparty/carotene/src/convert.cpp
@@ -0,0 +1,1331 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ * License Agreement
+ * For Open Source Computer Vision Library
+ * (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
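The alignPtr() helper in common.hpp above relies on the classic power-of-two round-up: adding n-1 and masking with -n (in two's complement, all ones except the low log2(n) bits) snaps an address forward to the next multiple of n. A self-contained check of the same arithmetic, assuming n is a power of two:

```
#include <cassert>
#include <cstddef>

int main()
{
    unsigned char buf[64];
    // Same arithmetic as alignPtr(buf, 16): round the address up to 16 bytes.
    size_t a = (size_t)buf;
    size_t aligned = (a + 15) & (size_t)-16;
    assert(aligned % 16 == 0);
    assert(aligned >= a && aligned - a < 16);
    return 0;
}
```

getRowPtr() next to it is why strides are byte-valued ptrdiff_t throughout the library: row addresses are computed on a char pointer and only then cast back to the element type, so both padded rows and negative strides (bottom-up images) work.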
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +#define CVT_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW) \ + void convert(const Size2D &_size, \ + const T1 * srcBase, ptrdiff_t srcStride, \ + T2 * dstBase, ptrdiff_t dstStride) \ + { \ + internal::assertSupportedConfiguration(); \ + Size2D size(_size); \ + if (srcStride == dstStride && \ + srcStride == (ptrdiff_t)(size.width)) \ + { \ + size.width *= size.height; \ + size.height = 1; \ + } \ + const ptrdiff_t sstep = srcStride / sizeof(T1); \ + const ptrdiff_t dstep = dstStride / sizeof(T2); \ + const size_t w = size.width & ~(SIMD_SIZE-1); \ + if (size.width >= SIMD_SIZE) \ + { \ + const T1* _src = srcBase; \ + T2* _dst = dstBase; \ + CVTINIT \ + for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \ + CVTROW \ + } \ + if(w < size.width) \ + { \ + const T1* _src = srcBase; \ + T2* _dst = dstBase; \ + for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \ + for(size_t i = w; i < size.width; i++ ) \ + _dst[i] = internal::saturate_cast(_src[i]); \ + } \ + } + +#else + +#define CVT_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW) \ + void convert(const Size2D &, \ + const T1 *, ptrdiff_t, \ + T2 *, ptrdiff_t) \ + { \ + internal::assertSupportedConfiguration(); \ + } + +#endif + +CVT_FUNC(u8, s8, 16, + uint8x16_t v127 = vdupq_n_u8(127);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint8x16_t vu8 = vld1q_u8(_src + i); + int8x16_t vu1 = vreinterpretq_s8_u8(vminq_u8(vu8, v127)); + vst1q_s8(_dst + i, vu1); + } +}) + +#if __GNUC_MINOR__ < 7 +CVT_FUNC(u8, u16, 16, + register uint8x16_t zero0 asm ("q1") = vmovq_n_u8(0);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src]] \n\t" + "vst2.8 {d0,d2}, [%[dst1]] \n\t" + "vst2.8 {d1,d3}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 8), + "w" (zero0) + : "d0","d1" + ); + } +}) +#else +CVT_FUNC(u8, u16, 
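/* Illustrative note, not part of the original patch: every CVT_FUNC()
 * instantiation in this file expands to a convert() overload with the same
 * three-part shape: (1) when both images are contiguous (stride equals the
 * row width) the whole image is collapsed into one long row, so the vector
 * loop runs with no per-row overhead; (2) the CVTROW block converts
 * w = width & ~(SIMD_SIZE-1) elements per row with NEON loads/stores;
 * (3) a scalar tail converts the remaining width - w elements through
 * internal::saturate_cast. A stripped-down sketch of the generated code:
 *
 *     void convert(Size2D size, const u8 *src, ptrdiff_t sstep,
 *                  u16 *dst, ptrdiff_t dstep)
 *     {
 *         if (contiguous) { size.width *= size.height; size.height = 1; }
 *         for (each row) {
 *             size_t i = 0;
 *             for (; i < w; i += SIMD_SIZE)  vector_convert(src + i, dst + i);
 *             for (; i < size.width; ++i)    dst[i] = saturate_cast<u16>(src[i]);
 *         }
 *     }
 */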
16, + uint8x16x2_t vline; + vline.val[1] = vmovq_n_u8(0);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + vline.val[0] = vld1q_u8(_src + i); + vst2q_u8((uint8_t*)(_dst + i), vline); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVT_FUNC(u8, s32, 16, + register uint8x16_t zero0 asm ("q1") = vmovq_n_u8(0); + register uint8x16_t zero1 asm ("q2") = vmovq_n_u8(0); + register uint8x16_t zero2 asm ("q3") = vmovq_n_u8(0);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src]] \n\t" + "vst4.8 {d0,d2,d4,d6}, [%[dst1]] \n\t" + "vst4.8 {d1,d3,d5,d7}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 8), + "w" (zero0), "w" (zero1), "w" (zero2) + : "d0","d1" + ); + } +}) +#else +CVT_FUNC(u8, s32, 16, + uint8x16x4_t vline; + vline.val[1] = vmovq_n_u8(0); + vline.val[2] = vmovq_n_u8(0); + vline.val[3] = vmovq_n_u8(0);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + vline.val[0] = vld1q_u8(_src + i); + vst4q_u8((uint8_t*)(_dst + i), vline); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(u8, f32, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src]] \n\t" + "vmovl.u8 q1, d0 \n\t" + "vmovl.u8 q2, d1 \n\t" + "vmovl.u16 q3, d2 \n\t" + "vmovl.u16 q4, d3 \n\t" + "vmovl.u16 q5, d4 \n\t" + "vmovl.u16 q6, d5 \n\t" + "vcvt.f32.u32 q7, q3 \n\t" + "vcvt.f32.u32 q8, q4 \n\t" + "vcvt.f32.u32 q9, q5 \n\t" + "vcvt.f32.u32 q10, q6 \n\t" + "vst1.32 {d14-d15}, [%[dst1]] \n\t" + "vst1.32 {d16-d17}, [%[dst2]] \n\t" + "vst1.32 {d18-d19}, [%[dst3]] \n\t" + "vst1.32 {d20-d21}, [%[dst4]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + [dst3] "r" (_dst + i + 8), + [dst4] "r" (_dst + i + 12) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21" + ); + } +}) +#else +CVT_FUNC(u8, f32, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint8x16_t vline_u8 = vld1q_u8(_src + i); + + uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8(vline_u8)); + uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline_u8)); + + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16(vline1_u16)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16)); + uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16(vline2_u16)); + uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16)); + + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32); + float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32); + + vst1q_f32(_dst + i, vline1_f32); + vst1q_f32(_dst + i + 4, vline2_f32); + vst1q_f32(_dst + i + 8, vline3_f32); + vst1q_f32(_dst + i + 12, vline4_f32); + } +}) +#endif + +CVT_FUNC(s8, u8, 16, + int8x16_t vZero = vdupq_n_s8(0);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vu8 = vld1q_s8(_src + i); + uint8x16_t vu1 = vreinterpretq_u8_s8(vmaxq_s8(vu8, vZero)); + vst1q_u8(_dst + i, vu1); + } +}) + +#if __GNUC_MINOR__ < 7 +CVT_FUNC(s8, u16, 16, + register uint8x16_t zero0 asm ("q1") = vmovq_n_u8(0);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src]] \n\t" + "vmax.s8 q0, q1 \n\t" + "vst2.8 {d0,d2}, [%[dst1]] \n\t" + "vst2.8 {d1,d3}, [%[dst2]] \n\t" + : 
/*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 8), + "w" (zero0) + : "d0","d1" + ); + } +}) +#else +CVT_FUNC(s8, u16, 16, + int8x16x2_t vline_s8; + vline_s8.val[1] = vmovq_n_s8(0);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + vline_s8.val[0] = vld1q_s8(_src + i); + vline_s8.val[0] = vmaxq_s8(vline_s8.val[0], vline_s8.val[1]); + vst2q_s8((int8_t*)(_dst + i), vline_s8); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s8, s16, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src]] \n\t" + "vmovl.s8 q1, d0 \n\t" + "vmovl.s8 q2, d1 \n\t" + "vst1.16 {d2-d3}, [%[dst1]] \n\t" + "vst1.16 {d4-d5}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 8) + : "d0","d1","d2","d3","d4","d5" + ); + } +}) +#else +CVT_FUNC(s8, s16, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline_s8 = vld1q_s8(_src + i); + + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline_s8)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline_s8)); + + vst1q_s16(_dst + i, vline1_s16); + vst1q_s16(_dst + i + 8, vline2_s16); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVT_FUNC(s8, s32, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src]] \n\t" + "vmovl.s8 q1, d0 \n\t" + "vmovl.s8 q2, d1 \n\t" + "vmovl.s16 q3, d2 \n\t" + "vmovl.s16 q4, d3 \n\t" + "vmovl.s16 q5, d4 \n\t" + "vmovl.s16 q6, d5 \n\t" + "vst1.32 {d6-d7}, [%[dst1]] \n\t" + "vst1.32 {d8-d9}, [%[dst2]] \n\t" + "vst1.32 {d10-d11}, [%[dst3]] \n\t" + "vst1.32 {d12-d13}, [%[dst4]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + [dst3] "r" (_dst + i + 8), + [dst4] "r" (_dst + i + 12) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13" + ); + } +}) +#else +CVT_FUNC(s8, s32, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline_s8 = vld1q_s8(_src + i); + + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline_s8)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline_s8)); + + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline1_s16)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16)); + int32x4_t vline3_s32 = vmovl_s16(vget_low_s16(vline2_s16)); + int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16)); + + vst1q_s32(_dst + i, vline1_s32); + vst1q_s32(_dst + i + 4, vline2_s32); + vst1q_s32(_dst + i + 8, vline3_s32); + vst1q_s32(_dst + i + 12, vline4_s32); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s8, f32, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src]] \n\t" + "vmovl.s8 q1, d0 \n\t" + "vmovl.s8 q2, d1 \n\t" + "vmovl.s16 q3, d2 \n\t" + "vmovl.s16 q4, d3 \n\t" + "vmovl.s16 q5, d4 \n\t" + "vmovl.s16 q6, d5 \n\t" + "vcvt.f32.s32 q7, q3 \n\t" + "vcvt.f32.s32 q8, q4 \n\t" + "vcvt.f32.s32 q9, q5 \n\t" + "vcvt.f32.s32 q10, q6 \n\t" + "vst1.32 {d14-d15}, [%[dst1]] \n\t" + "vst1.32 {d16-d17}, [%[dst2]] \n\t" + "vst1.32 {d18-d19}, [%[dst3]] \n\t" + "vst1.32 {d20-d21}, [%[dst4]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + [dst3] "r" (_dst + i + 8), + [dst4] "r" (_dst + i + 12) + : 
"d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21" + ); + } +}) +#else +CVT_FUNC(s8, f32, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline_s8 = vld1q_s8(_src + i); + + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8(vline_s8)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline_s8)); + + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16(vline1_s16)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16)); + int32x4_t vline3_s32 = vmovl_s16(vget_low_s16(vline2_s16)); + int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16)); + + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32); + float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32); + + vst1q_f32(_dst + i, vline1_f32); + vst1q_f32(_dst + i + 4, vline2_f32); + vst1q_f32(_dst + i + 8, vline3_f32); + vst1q_f32(_dst + i + 12, vline4_f32); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(u16, u8, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src1]] \n\t" + "vqmovn.u16 d4, q0 \n\t" + "vld1.8 {d2-d3}, [%[src2]] \n\t" + "vqmovn.u16 d5, q1 \n\t" + "vst1.8 {d4-d5}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [src2] "r" (_src + i + 8), + [dst] "r" (_dst + i + 0) + : "d0","d1","d2","d3","d4","d5" + ); + } +}) +#else +CVT_FUNC(u16, u8, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint16x8_t vline1_u16 = vld1q_u16(_src + i); + uint16x8_t vline2_u16 = vld1q_u16(_src + i + 8); + + uint8x8_t vline1_u8 = vqmovn_u16(vline1_u16); + uint8x8_t vline2_u8 = vqmovn_u16(vline2_u16); + + vst1q_u8(_dst + i, vcombine_u8(vline1_u8, vline2_u8)); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(u16, s8, 16, + register uint8x16_t v127 asm ("q4") = vmovq_n_u8(127);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src1]] \n\t" + "vqmovn.u16 d4, q0 \n\t" + "vld1.8 {d2-d3}, [%[src2]] \n\t" + "vqmovn.u16 d5, q1 \n\t" + "vmin.u8 q3, q2, q4 \n\t" + "vst1.8 {d6-d7}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [src2] "r" (_src + i + 8), + [dst] "r" (_dst + i + 0), + "w" (v127) + : "d0","d1","d2","d3","d4","d5","d6","d7" + ); + } +}) +#else +CVT_FUNC(u16, s8, 16, + uint8x8_t v127 = vmov_n_u8(127);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint16x8_t vline1_u16 = vld1q_u16(_src + i); + uint16x8_t vline2_u16 = vld1q_u16(_src + i + 8); + + uint8x8_t vline1_u8 = vqmovn_u16(vline1_u16); + uint8x8_t vline2_u8 = vqmovn_u16(vline2_u16); + vline1_u8 = vmin_u8(vline1_u8, v127); + vline2_u8 = vmin_u8(vline2_u8, v127); + + vst1q_s8(_dst + i, vcombine_s8(vreinterpret_s8_u8(vline1_u8), vreinterpret_s8_u8(vline2_u8))); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVT_FUNC(u16, s16, 8, + register uint16x8_t v32767 asm ("q4") = vmovq_n_u16(0x7FFF);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d0-d1}, [%[src]] \n\t" + "vmin.u16 q1, q0, q4 \n\t" + "vst1.16 {d2-d3}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (v32767) + : "d0","d1","d2","d3" + ); + } +}) +#else +CVT_FUNC(u16, s16, 8, + uint16x8_t v32767 = vmovq_n_u16(0x7FFF);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + uint16x8_t vline_u16 
= vld1q_u16(_src + i); + vline_u16 = vminq_u16(vline_u16, v32767); + vst1q_s16((_dst + i), vreinterpretq_s16_u16(vline_u16)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVT_FUNC(u16, s32, 8, + register uint16x8_t zero0 asm ("q1") = vmovq_n_u16(0);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d0-d1}, [%[src]] \n\t" + "vst2.16 {d0,d2}, [%[dst1]] \n\t" + "vst2.16 {d1,d3}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i), + [dst2] "r" (_dst + i + 4), + "w" (zero0) + : "d0","d1"//,"d2","d3"//,"d4","d5","d6","d7" + ); + } +}) +#else +CVT_FUNC(u16, s32, 8, + uint16x8x2_t vline; + vline.val[1] = vmovq_n_u16(0);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + vline.val[0] = vld1q_u16(_src + i); + vst2q_u16((uint16_t*)(_dst + i), vline); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(u16, f32, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d0-d1}, [%[src]] \n\t" + "vmovl.u16 q1, d0 \n\t" + "vmovl.u16 q2, d1 \n\t" + "vcvt.f32.u32 q3, q1 \n\t" + "vcvt.f32.u32 q4, q2 \n\t" + "vst1.32 {d6-d7}, [%[dst1]] \n\t" + "vst1.32 {d8-d9}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9" + ); + } +}) +#else +CVT_FUNC(u16, f32, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + uint16x8_t vline_u16 = vld1q_u16(_src + i); + + uint32x4_t vline_u32_lo = vmovl_u16(vget_low_u16(vline_u16)); + uint32x4_t vline_u32_hi = vmovl_u16(vget_high_u16(vline_u16)); + + float32x4_t vline_f32_lo = vcvtq_f32_u32(vline_u32_lo); + float32x4_t vline_f32_hi = vcvtq_f32_u32(vline_u32_hi); + + vst1q_f32(_dst + i, vline_f32_lo); + vst1q_f32(_dst + i + 4, vline_f32_hi); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s16, u8, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src1]] \n\t" + "vld1.8 {d2-d3}, [%[src2]] \n\t" + "vqmovun.s16 d4, q0 \n\t" + "vqmovun.s16 d5, q1 \n\t" + "vst1.8 {d4-d5}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [src2] "r" (_src + i + 8), + [dst] "r" (_dst + i + 0) + : "d0","d1","d2","d3","d4","d5" + ); + } +}) +#else +CVT_FUNC(s16, u8, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int16x8_t vline1_s16 = vld1q_s16(_src + i); + int16x8_t vline2_s16 = vld1q_s16(_src + i + 8); + + uint8x8_t vline1_u8 = vqmovun_s16(vline1_s16); + uint8x8_t vline2_u8 = vqmovun_s16(vline2_s16); + + vst1q_u8(_dst + i, vcombine_u8(vline1_u8, vline2_u8)); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s16, s8, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d0-d1}, [%[src1]] \n\t" + "vld1.8 {d2-d3}, [%[src2]] \n\t" + "vqmovn.s16 d4, q0 \n\t" + "vqmovn.s16 d5, q1 \n\t" + "vst1.8 {d4-d5}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [src2] "r" (_src + i + 8), + [dst] "r" (_dst + i + 0) + : "d0","d1","d2","d3","d4","d5" + ); + } +}) +#else +CVT_FUNC(s16, s8, 16, +, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int16x8_t vline1_s16 = vld1q_s16(_src + i); + int16x8_t vline2_s16 = vld1q_s16(_src + i + 8); + + int8x8_t vline1_s8 = vqmovn_s16(vline1_s16); + int8x8_t vline2_s8 = vqmovn_s16(vline2_s16); + + vst1q_s8(_dst + i, vcombine_s8(vline1_s8, vline2_s8)); + } +}) +#endif + +#if 
__GNUC_MINOR__ < 7 +CVT_FUNC(s16, u16, 8, + register int16x8_t vZero asm ("q4") = vmovq_n_s16(0);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d0-d1}, [%[src]] \n\t" + "vmax.s16 q1, q0, q4 \n\t" + "vst1.16 {d2-d3}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vZero) + : "d0","d1","d2","d3" + ); + } +}) +#else +CVT_FUNC(s16, u16, 8, + int16x4_t vZero = vmov_n_s16(0);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline_s16 = vld1q_s16(_src + i); + + int16x4_t vline_s16_lo = vmax_s16(vget_low_s16(vline_s16), vZero); + int16x4_t vline_s16_hi = vmax_s16(vget_high_s16(vline_s16), vZero); + + vst1q_u16(_dst + i, vcombine_u16(vreinterpret_u16_s16(vline_s16_lo), vreinterpret_u16_s16(vline_s16_hi))); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s16, s32, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d0-d1}, [%[src]] \n\t" + "vmovl.s16 q1, d0 \n\t" + "vmovl.s16 q2, d1 \n\t" + "vst1.32 {d2-d3}, [%[dst1]] \n\t" + "vst1.32 {d4-d5}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4) + : "d0","d1","d2","d3","d4","d5" + ); + } +}) +#else +CVT_FUNC(s16, s32, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline_s16 = vld1q_s16(_src + i); + + int32x4_t vline_s32_lo = vmovl_s16(vget_low_s16(vline_s16)); + int32x4_t vline_s32_hi = vmovl_s16(vget_high_s16(vline_s16)); + + vst1q_s32(_dst + i, vline_s32_lo); + vst1q_s32(_dst + i + 4, vline_s32_hi); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s16, f32, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d0-d1}, [%[src]] \n\t" + "vmovl.s16 q1, d0 \n\t" + "vmovl.s16 q2, d1 \n\t" + "vcvt.f32.s32 q3, q1 \n\t" + "vcvt.f32.s32 q4, q2 \n\t" + "vst1.32 {d6-d7}, [%[dst1]] \n\t" + "vst1.32 {d8-d9}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9" + ); + } +}) +#else +CVT_FUNC(s16, f32, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline_s16 = vld1q_s16(_src + i); + + int32x4_t vline_s32_lo = vmovl_s16(vget_low_s16(vline_s16)); + int32x4_t vline_s32_hi = vmovl_s16(vget_high_s16(vline_s16)); + float32x4_t vline_f32_lo = vcvtq_f32_s32(vline_s32_lo); + float32x4_t vline_f32_hi = vcvtq_f32_s32(vline_s32_hi); + + vst1q_f32(_dst + i, vline_f32_lo); + vst1q_f32(_dst + i + 4, vline_f32_hi); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s32, u8, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d0-d1}, [%[src1]] \n\t" + "vld1.32 {d2-d3}, [%[src2]] \n\t" + "vqmovun.s32 d4, q0 \n\t" + "vqmovun.s32 d5, q1 \n\t" + "vqmovn.u16 d6, q2 \n\t" + "vst1.8 {d6}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i) + : "d0","d1","d2","d3","d4","d5","d6" + ); + } +}) +#else +CVT_FUNC(s32, u8, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + + uint16x4_t vline1_u16 = vqmovun_s32(vline1_s32); + uint16x4_t vline2_u16 = vqmovun_s32(vline2_s32); + uint8x8_t vline_u8 = vqmovn_u16(vcombine_u16(vline1_u16, 
vline2_u16)); + + vst1_u8(_dst + i, vline_u8); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s32, s8, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d0-d1}, [%[src1]] \n\t" + "vld1.32 {d2-d3}, [%[src2]] \n\t" + "vqmovn.s32 d4, q0 \n\t" + "vqmovn.s32 d5, q1 \n\t" + "vqmovn.s16 d6, q2 \n\t" + "vst1.8 {d6}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i) + : "d0","d1","d2","d3","d4","d5","d6" + ); + } +}) +#else +CVT_FUNC(s32, s8, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + + int16x4_t vline1_s16 = vqmovn_s32(vline1_s32); + int16x4_t vline2_s16 = vqmovn_s32(vline2_s32); + int8x8_t vline_s8 = vqmovn_s16(vcombine_s16(vline1_s16, vline2_s16)); + + vst1_s8(_dst + i, vline_s8); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s32, u16, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d0-d1}, [%[src1]] \n\t" + "vld1.32 {d2-d3}, [%[src2]] \n\t" + "vqmovun.s32 d4, q0 \n\t" + "vqmovun.s32 d5, q1 \n\t" + "vst1.16 {d4-d5}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i) + : "d0","d1","d2","d3","d4","d5" + ); + } +}) +#else +CVT_FUNC(s32, u16, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + + uint16x4_t vline1_u16 = vqmovun_s32(vline1_s32); + uint16x4_t vline2_u16 = vqmovun_s32(vline2_s32); + + vst1q_u16(_dst + i, vcombine_u16(vline1_u16, vline2_u16)); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s32, s16, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d0-d1}, [%[src1]] \n\t" + "vld1.32 {d2-d3}, [%[src2]] \n\t" + "vqmovn.s32 d4, q0 \n\t" + "vqmovn.s32 d5, q1 \n\t" + "vst1.8 {d4-d5}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i) + : "d0","d1","d2","d3","d4","d5" + ); + } +}) +#else +CVT_FUNC(s32, s16, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + + int16x4_t vline1_s16 = vqmovn_s32(vline1_s32); + int16x4_t vline2_s16 = vqmovn_s32(vline2_s32); + + vst1q_s16(_dst + i, vcombine_s16(vline1_s16, vline2_s16)); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(s32, f32, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d0-d1}, [%[src]] \n\t" + "vcvt.f32.s32 q1, q0 \n\t" + "vst1.32 {d2-d3}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i) + : "d0","d1","d2","d3"//,"d4","d5" + ); + __asm__ ( + "vld1.32 {d0-d1}, [%[src]] \n\t" + "vcvt.f32.s32 q1, q0 \n\t" + "vst1.32 {d2-d3}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i + 4), + [dst] "r" (_dst + i + 4) + : "d0","d1","d2","d3"//,"d4","d5" + ); + } +}) +#else +CVT_FUNC(s32, f32, 8, +, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline_s32 = vld1q_s32(_src + i); + float32x4_t vline_f32 = vcvtq_f32_s32(vline_s32); + vst1q_f32(_dst + i, vline_f32); + + vline_s32 = vld1q_s32(_src + i + 4); + vline_f32 = vcvtq_f32_s32(vline_s32); + vst1q_f32(_dst + i + 
4, vline_f32); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(f32, u8, 8, + register float32x4_t vmult asm ("q0") = vdupq_n_f32((float)(1 << 16)); + register uint32x4_t vmask asm ("q1") = vdupq_n_u32(1<<16);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vmul.f32 q4, q2, q0 \n\t" + "vmul.f32 q5, q3, q0 \n\t" + "vcvt.u32.f32 q6, q4 \n\t" + "vcvt.u32.f32 q7, q5 \n\t" + "vbic q8, q1, q6 \n\t" + "vbic q9, q1, q7 \n\t" + "vshr.u32 q10, q8, #16 \n\t" + "vshr.u32 q11, q9, #16 \n\t" + "vqsub.u32 q12, q6, q10 \n\t" + "vqsub.u32 q13, q7, q11 \n\t" + "vqrshrn.u32 d28, q12, #16 \n\t" + "vqrshrn.u32 d29, q13, #16 \n\t" + "vqmovn.u16 d30, q14 \n\t" + "vst1.8 {d30}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vmult), "w" (vmask) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30" + ); + } +}) +#else +CVT_FUNC(f32, u8, 8, + float32x4_t vmult = vdupq_n_f32((float)(1 << 16)); + uint32x4_t vmask = vdupq_n_u32(1<<16);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline1_f32 = vld1q_f32(_src + i); + float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); + + float32x4_t vline1w_f32 = vmulq_f32(vline1_f32, vmult); + float32x4_t vline2w_f32 = vmulq_f32(vline2_f32, vmult); + + uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1w_f32); + uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2w_f32); + + uint32x4_t vl1_masked = vbicq_u32(vmask, vline1_u32); + uint32x4_t vl2_masked = vbicq_u32(vmask, vline2_u32); + uint32x4_t vl1_masked2 = vshrq_n_u32(vl1_masked, 16); + uint32x4_t vl2_masked2 = vshrq_n_u32(vl2_masked, 16); + uint32x4_t vline1r_u32 = vqsubq_u32(vline1_u32, vl1_masked2); + uint32x4_t vline2r_u32 = vqsubq_u32(vline2_u32, vl2_masked2); + + uint16x4_t vline1_u16 = vqrshrn_n_u32(vline1r_u32, 16); + uint16x4_t vline2_u16 = vqrshrn_n_u32(vline2r_u32, 16); + + uint8x8_t vline_u8 = vqmovn_u16(vcombine_u16(vline1_u16, vline2_u16)); + vst1_u8(_dst + i, vline_u8); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(f32, s8, 8, + register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d2-d3}, [%[src1]] \n\t" + "vld1.32 {d4-d5}, [%[src2]] \n\t" + "vadd.f32 q3, q1, q0 \n\t" + "vadd.f32 q4, q2, q0 \n\t" + "vcvt.s32.f32 q5, q3 \n\t" + "vcvt.s32.f32 q6, q4 \n\t" + "vqmovn.s32 d14, q5 \n\t" + "vqmovn.s32 d15, q6 \n\t" + "vqmovn.s16 d16, q7 \n\t" + "vst1.8 {d16}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vhalf) + : "d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17" + ); + } +}) +#else +CVT_FUNC(f32, s8, 8, + float32x4_t vhalf = vdupq_n_f32(0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline1_f32 = vld1q_f32(_src + i); + float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); + + vline1_f32 = vaddq_f32(vline1_f32, vhalf); + vline2_f32 = vaddq_f32(vline2_f32, vhalf); + + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vline1_s16 = vqmovn_s32(vline1_s32); + int16x4_t vline2_s16 = vqmovn_s32(vline2_s32); + + int8x8_t vline_s8 = vqmovn_s16(vcombine_s16(vline1_s16, 
vline2_s16)); + + vst1_s8(_dst + i, vline_s8); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(f32, u16, 8, + register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d2-d3}, [%[src]] \n\t" + "vadd.f32 q2, q1, q0 \n\t" + "vcvt.u32.f32 q3, q2 \n\t" + "vqmovn.u32 d8, q3 \n\t" + "vst1.16 {d8}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i), + "w" (vhalf) + : "d2","d3","d4","d5","d6","d7","d8" + ); + __asm__ ( + "vld1.32 {d2-d3}, [%[src]] \n\t" + "vadd.f32 q2, q1, q0 \n\t" + "vcvt.u32.f32 q3, q2 \n\t" + "vqmovn.u32 d8, q3 \n\t" + "vst1.16 {d8}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i + 4), + [dst] "r" (_dst + i + 4), + "w" (vhalf) + : "d2","d3","d4","d5","d6","d7","d8" + ); + } +}) +#else +CVT_FUNC(f32, u16, 8, + float32x4_t vhalf = vdupq_n_f32(0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline_f32 = vld1q_f32(_src + i); + + vline_f32 = vaddq_f32(vline_f32, vhalf); + uint32x4_t vline_u32 = vcvtq_u32_f32(vline_f32); + uint16x4_t vline_u16 = vqmovn_u32(vline_u32); + + vst1_u16(_dst + i, vline_u16); + + vline_f32 = vld1q_f32(_src + i + 4); + + vline_f32 = vaddq_f32(vline_f32, vhalf); + vline_u32 = vcvtq_u32_f32(vline_f32); + vline_u16 = vqmovn_u32(vline_u32); + + vst1_u16(_dst + i + 4, vline_u16); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(f32, s16, 8, + register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d2-d3}, [%[src]] \n\t" + "vadd.f32 q2, q1, q0 \n\t" + "vcvt.s32.f32 q3, q2 \n\t" + "vqmovn.s32 d8, q3 \n\t" + "vst1.16 {d8}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i), + "w" (vhalf) + : "d2","d3","d4","d5","d6","d7","d8" + ); + __asm__ ( + "vld1.32 {d2-d3}, [%[src]] \n\t" + "vadd.f32 q2, q1, q0 \n\t" + "vcvt.s32.f32 q3, q2 \n\t" + "vqmovn.s32 d8, q3 \n\t" + "vst1.16 {d8}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i + 4), + [dst] "r" (_dst + i + 4), + "w" (vhalf) + : "d2","d3","d4","d5","d6","d7","d8" + ); + } +}) +#else +CVT_FUNC(f32, s16, 8, + float32x4_t vhalf = vdupq_n_f32(0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline_f32 = vld1q_f32(_src + i); + + vline_f32 = vaddq_f32(vline_f32, vhalf); + int32x4_t vline_s32 = vcvtq_s32_f32(vline_f32); + int16x4_t vline_s16 = vqmovn_s32(vline_s32); + + vst1_s16(_dst + i, vline_s16); + + vline_f32 = vld1q_f32(_src + i + 4); + + vline_f32 = vaddq_f32(vline_f32, vhalf); + vline_s32 = vcvtq_s32_f32(vline_f32); + vline_s16 = vqmovn_s32(vline_s32); + + vst1_s16(_dst + i + 4, vline_s16); + } +}) +#endif + +#if __GNUC_MINOR__ < 6 +CVT_FUNC(f32, s32, 8, + register float32x4_t vhalf asm ("q0") = vdupq_n_f32(0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d2-d3}, [%[src1]] \n\t" + "vld1.32 {d4-d5}, [%[src2]] \n\t" + "vadd.f32 q3, q1, q0 \n\t" + "vadd.f32 q4, q2, q0 \n\t" + "vcvt.s32.f32 q5, q3 \n\t" + "vcvt.s32.f32 q6, q4 \n\t" + "vst1.32 {q5}, [%[dst1]] \n\t" + "vst1.32 {q6}, [%[dst2]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [src2] "r" (_src + i + 4), + [dst1] "r" (_dst + i), + [dst2] "r" (_dst + i + 4), + "w" (vhalf) + : "d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13" + ); + } +}) +#else +CVT_FUNC(f32, s32, 8, + float32x4_t vhalf = vdupq_n_f32(0.5f);, 
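/* Illustrative note, not part of the original patch: the f32 -> integer
 * kernels above and below all implement rounding the same way. The value is
 * primed with +0.5 (the vhalf register) and then converted with the
 * truncating vcvt instruction, so e.g. 3.7f + 0.5f = 4.2f truncates to 4,
 * while 3.2f + 0.5f = 3.7f truncates to 3. Because vcvt truncates toward
 * zero, this is exact round-half-up only for non-negative inputs; the
 * saturating narrows (vqmovn/vqmovun) used by the u8/s8/u16/s16 variants
 * then clamp the result to the destination range.
 */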
+{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline_f32 = vld1q_f32(_src + i); + + vline_f32 = vaddq_f32(vline_f32, vhalf); + int32x4_t vline_s32 = vcvtq_s32_f32(vline_f32); + + vst1q_s32(_dst + i, vline_s32); + + vline_f32 = vld1q_f32(_src + i + 4); + + vline_f32 = vaddq_f32(vline_f32, vhalf); + vline_s32 = vcvtq_s32_f32(vline_f32); + + vst1q_s32(_dst + i + 4, vline_s32); + } +}) +#endif + +void convert(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride) +{ + convert(_size, srcBase, srcStride, (u16*)dstBase, dstStride); +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/convert_depth.cpp b/3rdparty/carotene/src/convert_depth.cpp new file mode 100644 index 0000000000..21b0c18a69 --- /dev/null +++ b/3rdparty/carotene/src/convert_depth.cpp @@ -0,0 +1,399 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +#include + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +template +void lshiftConst(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride) +{ + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + s16 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint8x16_t v_src = vld1q_u8(src + j); + int16x8_t v_dst0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src))); + int16x8_t v_dst1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src))); + + vst1q_s16(dst + j, vshlq_n_s16(v_dst0, shift)); + vst1q_s16(dst + j + 8, vshlq_n_s16(v_dst1, shift)); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_dst = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j))); + vst1q_s16(dst + j, vshlq_n_s16(v_dst, shift)); + } + + for (; j < size.width; j++) + { + dst[j] = ((s16)src[j] << shift); + } + } +} + +template <> +void lshiftConst<0>(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride) +{ + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + s16 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint8x16_t v_src = vld1q_u8(src + j); + int16x8_t v_dst0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src))); + int16x8_t v_dst1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src))); + + vst1q_s16(dst + j, v_dst0); + vst1q_s16(dst + j + 8, v_dst1); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_dst = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + j))); + vst1q_s16(dst + j, v_dst); + } + + for (; j < size.width; j++) + { + dst[j] = (s16)src[j]; + } + } +} + +template +void rshiftConst(const Size2D &size, + const s16 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY cpolicy) +{ + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0;
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
+        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t j = 0;
+
+        if (cpolicy == CONVERT_POLICY_SATURATE)
+        {
+            for (; j < roiw16; j += 16)
+            {
+                internal::prefetch(src + j);
+                int16x8_t v_src0 = vshrq_n_s16(vld1q_s16(src + j), shift),
+                          v_src1 = vshrq_n_s16(vld1q_s16(src + j + 8), shift);
+                uint8x16_t v_dst = vcombine_u8(vqmovun_s16(v_src0),
+                                               vqmovun_s16(v_src1));
+                vst1q_u8(dst + j, v_dst);
+            }
+            for (; j < roiw8; j += 8)
+            {
+                int16x8_t v_src = vshrq_n_s16(vld1q_s16(src + j), shift);
+                vst1_u8(dst + j, vqmovun_s16(v_src));
+            }
+
+            for (; j < size.width; j++)
+            {
+                dst[j] = internal::saturate_cast<u8>((src[j] >> shift));
+            }
+        }
+        else // CONVERT_POLICY_WRAP
+        {
+            for (; j < roiw16; j += 16)
+            {
+                internal::prefetch(src + j);
+                int16x8_t v_src0 = vshrq_n_s16(vld1q_s16(src + j), shift),
+                          v_src1 = vshrq_n_s16(vld1q_s16(src + j + 8), shift);
+                int8x16_t v_dst = vcombine_s8(vmovn_s16(v_src0),
+                                              vmovn_s16(v_src1));
+                vst1q_u8(dst + j, vreinterpretq_u8_s8(v_dst));
+            }
+            for (; j < roiw8; j += 8)
+            {
+                int16x8_t v_src = vshrq_n_s16(vld1q_s16(src + j), shift);
+                vst1_u8(dst + j, vreinterpret_u8_s8(vmovn_s16(v_src)));
+            }
+
+            for (; j < size.width; j++)
+            {
+                dst[j] = (u8)((src[j] >> shift));
+            }
+        }
+    }
+}
+
+template <>
+void rshiftConst<0>(const Size2D &size,
+                    const s16 * srcBase, ptrdiff_t srcStride,
+                    u8 * dstBase, ptrdiff_t dstStride,
+                    CONVERT_POLICY cpolicy)
+{
+    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
+    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
+        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t j = 0;
+
+        if (cpolicy == CONVERT_POLICY_SATURATE)
+        {
+            for (; j < roiw16; j += 16)
+            {
+                internal::prefetch(src + j);
+                int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
+                uint8x16_t v_dst = vcombine_u8(vqmovun_s16(v_src0), vqmovun_s16(v_src1));
+                vst1q_u8(dst + j, v_dst);
+            }
+            for (; j < roiw8; j += 8)
+            {
+                int16x8_t v_src = vld1q_s16(src + j);
+                vst1_u8(dst + j, vqmovun_s16(v_src));
+            }
+
+            for (; j < size.width; j++)
+            {
+                dst[j] = internal::saturate_cast<u8>(src[j]);
+            }
+        }
+        else // CONVERT_POLICY_WRAP
+        {
+            for (; j < roiw16; j += 16)
+            {
+                internal::prefetch(src + j);
+                int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
+                int8x16_t v_dst = vcombine_s8(vmovn_s16(v_src0), vmovn_s16(v_src1));
+                vst1q_u8(dst + j, vreinterpretq_u8_s8(v_dst));
+            }
+            for (; j < roiw8; j += 8)
+            {
+                int16x8_t v_src = vld1q_s16(src + j);
+                vst1_u8(dst + j, vreinterpret_u8_s8(vmovn_s16(v_src)));
+            }
+
+            for (; j < size.width; j++)
+            {
+                dst[j] = (u8)src[j];
+            }
+        }
+    }
+}
+
+typedef void (* lshiftConstFunc)(const Size2D &size,
+                                 const u8 * srcBase, ptrdiff_t srcStride,
+                                 s16 * dstBase, ptrdiff_t dstStride);
+
+typedef void (* rshiftConstFunc)(const Size2D &size,
+                                 const s16 * srcBase, ptrdiff_t srcStride,
+                                 u8 * dstBase, ptrdiff_t dstStride,
+                                 CONVERT_POLICY cpolicy);
+
+} // namespace
+
+#endif
+
+void lshift(const Size2D &size,
+            const u8 * srcBase, ptrdiff_t srcStride,
+            s16 * dstBase, ptrdiff_t dstStride,
+            u32 shift)
+{
+    internal::assertSupportedConfiguration();
+
+#ifdef CAROTENE_NEON
+    if (shift >= 16u)
+    {
+        for (size_t i = 0; i < size.height; ++i)
+        {
+            s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
+            std::memset(dst, 0, sizeof(s16) * size.width);
+        }
+        return;
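    // Illustrative note, not part of the original patch: the funcs[16]
    // tables below exist because the vshlq_n_s16 / vshrq_n_s16 intrinsics
    // require their shift count to be a compile-time constant, so a runtime
    // shift cannot be passed straight through. The generic pattern is:
    //
    //     template <int N> void kernel(...);       // N becomes an immediate
    //     typedef void (*Fn)(...);
    //     static const Fn table[16] = { kernel<0>, /* ... */ kernel<15> };
    //     table[runtimeShift](...);                // one indirect call
    //
    // trading a single indirect call for sixteen instantiations of the loop.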
+    }
+
+    // this ugly construction is needed to avoid:
+    // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
+    // return (int16x8_t)__builtin_neon_vshl_nv8hi (__a, __b, 1);
+
+    lshiftConstFunc funcs[16] =
+    {
+        lshiftConst<0>,
+        lshiftConst<1>,
+        lshiftConst<2>,
+        lshiftConst<3>,
+        lshiftConst<4>,
+        lshiftConst<5>,
+        lshiftConst<6>,
+        lshiftConst<7>,
+        lshiftConst<8>,
+        lshiftConst<9>,
+        lshiftConst<10>,
+        lshiftConst<11>,
+        lshiftConst<12>,
+        lshiftConst<13>,
+        lshiftConst<14>,
+        lshiftConst<15>
+    }, func = funcs[shift];
+
+    func(size, srcBase, srcStride, dstBase, dstStride);
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)shift;
+#endif
+}
+
+void rshift(const Size2D &size,
+            const s16 * srcBase, ptrdiff_t srcStride,
+            u8 * dstBase, ptrdiff_t dstStride,
+            u32 shift, CONVERT_POLICY cpolicy)
+{
+    internal::assertSupportedConfiguration();
+
+#ifdef CAROTENE_NEON
+    if (shift >= 16)
+    {
+        if (cpolicy == CONVERT_POLICY_WRAP)
+        {
+            size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
+            size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
+            int16x8_t v_zero = vdupq_n_s16(0);
+
+            for (size_t i = 0; i < size.height; ++i)
+            {
+                const s16 * src = internal::getRowPtr(srcBase, srcStride, i);
+                u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
+                size_t j = 0;
+
+                for (; j < roiw16; j += 16)
+                {
+                    internal::prefetch(src + j);
+                    int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8);
+                    uint8x16_t v_dst = vcombine_u8(vmovn_u16(vcltq_s16(v_src0, v_zero)),
+                                                   vmovn_u16(vcltq_s16(v_src1, v_zero)));
+                    vst1q_u8(dst + j, v_dst);
+                }
+                for (; j < roiw8; j += 8)
+                {
+                    int16x8_t v_src = vld1q_s16(src + j);
+                    vst1_u8(dst + j, vmovn_u16(vcltq_s16(v_src, v_zero)));
+                }
+
+                for (; j < size.width; j++)
+                {
+                    dst[j] = src[j] >= 0 ? 0 : 255;
+                }
+            }
+        }
+        else
+        {
+            for (size_t i = 0; i < size.height; ++i)
+            {
+                u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
+                std::memset(dst, 0, sizeof(u8) * size.width);
+            }
+        }
+        return;
+    }
+
+    // this ugly construction is needed to avoid:
+    // /usr/lib/gcc/arm-linux-gnueabihf/4.8/include/arm_neon.h:3581:59: error: argument must be a constant
+    // return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);
+
+    rshiftConstFunc funcs[16] =
+    {
+        rshiftConst<0>,
+        rshiftConst<1>,
+        rshiftConst<2>,
+        rshiftConst<3>,
+        rshiftConst<4>,
+        rshiftConst<5>,
+        rshiftConst<6>,
+        rshiftConst<7>,
+        rshiftConst<8>,
+        rshiftConst<9>,
+        rshiftConst<10>,
+        rshiftConst<11>,
+        rshiftConst<12>,
+        rshiftConst<13>,
+        rshiftConst<14>,
+        rshiftConst<15>
+    }, func = funcs[shift];
+
+    func(size, srcBase, srcStride, dstBase, dstStride, cpolicy);
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)shift;
+    (void)cpolicy;
+#endif
+}
+
+} // namespace CAROTENE_NS
diff --git a/3rdparty/carotene/src/convert_scale.cpp b/3rdparty/carotene/src/convert_scale.cpp
new file mode 100644
index 0000000000..50c110b3ee
--- /dev/null
+++ b/3rdparty/carotene/src/convert_scale.cpp
@@ -0,0 +1,2498 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ * License Agreement
+ * For Open Source Computer Vision Library
+ * (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
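convert_scale.cpp below generates the convertScale() family: every element is mapped through dst = saturate_cast<T2>(src * alpha + beta), with the round-to-nearest bias folded into the vshift term ((f32)beta + 0.5f) so the truncating NEON float-to-int convert rounds correctly. A scalar reference of the intended mapping, assuming a u8 destination (the clamp absorbs the floor-versus-truncation difference for negative intermediates):

```
#include <algorithm>
#include <cmath>
#include <cstdint>

static uint8_t convertScaleRef(uint8_t src, double alpha, double beta)
{
    double v = src * alpha + beta;
    int r = (int)std::floor(v + 0.5);               // round half up, like "+ 0.5f" then vcvt
    return (uint8_t)std::min(255, std::max(0, r));  // like the vqmovun/vqmovn saturation
}
```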
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +#define CVTS_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW) \ + void convertScale(const Size2D &_size, \ + const T1 * srcBase, ptrdiff_t srcStride, \ + T2 * dstBase, ptrdiff_t dstStride, \ + f64 alpha, f64 beta) \ + { \ + internal::assertSupportedConfiguration(); \ + Size2D size(_size); \ + if (srcStride == dstStride && \ + srcStride == (ptrdiff_t)(size.width)) \ + { \ + size.width *= size.height; \ + size.height = 1; \ + } \ + const ptrdiff_t sstep = srcStride / sizeof(T1); \ + const ptrdiff_t dstep = dstStride / sizeof(T2); \ + const size_t w = size.width & ~(SIMD_SIZE-1); \ + if (size.width >= SIMD_SIZE) \ + { \ + const T1* _src = srcBase; \ + T2* _dst = dstBase; \ + CVTINIT \ + for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \ + CVTROW \ + } \ + if(w < size.width) \ + { \ + const T1* _src = srcBase; \ + T2* _dst = dstBase; \ + for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \ + for(size_t i = w; i < size.width; i++ ) \ + _dst[i] = internal::saturate_cast(_src[i]*alpha + beta); \ + } \ + } + +#define CVTS_FUNC1(T1, SIMD_SIZE, CVTSINIT, CVTSROW) \ + void convertScale(const Size2D &_size, \ + const T1 * srcBase, ptrdiff_t srcStride, \ + T1 * dstBase, ptrdiff_t dstStride, \ + f64 alpha, f64 beta) \ + { \ + internal::assertSupportedConfiguration(); \ + Size2D size(_size); \ + if (srcStride == dstStride && \ + srcStride == (ptrdiff_t)(size.width)) \ + { \ + size.width *= size.height; \ + size.height = 1; \ + } \ + const ptrdiff_t sstep = srcStride / sizeof(T1); \ + const ptrdiff_t dstep = dstStride / sizeof(T1); \ + const size_t w = size.width & ~(SIMD_SIZE-1); \ + if (size.width >= SIMD_SIZE) \ + { \ + const T1* _src = srcBase; \ + T1* _dst = dstBase; \ + CVTSINIT \ + for (ptrdiff_t h = size.height; h--; _src += sstep, _dst += dstep ) \ + CVTSROW \ + } \ + if(w < size.width) \ + { \ + const T1* _src = srcBase; \ + T1* _dst = dstBase; \ + for (ptrdiff_t h = size.height; h--; _src += 
sstep, _dst += dstep ) \ + for(size_t i = w; i < size.width; i++ ) \ + _dst[i] = internal::saturate_cast(_src[i]*alpha + beta); \ + } \ + } + +#else + +#define CVTS_FUNC(T1, T2, SIMD_SIZE, CVTINIT, CVTROW) \ + void convertScale(const Size2D &, \ + const T1 *, ptrdiff_t, \ + T2 *, ptrdiff_t, \ + f64, f64) \ + { \ + internal::assertSupportedConfiguration(); \ + } + +#define CVTS_FUNC1(T1, SIMD_SIZE, CVTSINIT, CVTSROW) \ + void convertScale(const Size2D &, \ + const T1 *, ptrdiff_t, \ + T1 *, ptrdiff_t, \ + f64, f64) \ + { \ + internal::assertSupportedConfiguration(); \ + } + +#endif + +#if defined(__GNUC__) && defined(__arm__) +CVTS_FUNC1(u8, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.u8 q3, d4 \n\t" + "vmovl.u8 q4, d5 \n\t" + "vmovl.u16 q5, d6 \n\t" + "vmovl.u16 q6, d7 \n\t" + "vmovl.u16 q7, d8 \n\t" + "vmovl.u16 q8, d9 \n\t" + "vcvt.f32.u32 q9, q5 \n\t" + "vcvt.f32.u32 q10, q6 \n\t" + "vcvt.f32.u32 q11, q7 \n\t" + "vcvt.f32.u32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vqmovun.s32 d22, q7 \n\t" + "vqmovun.s32 d23, q8 \n\t" + "vqmovun.s32 d24, q9 \n\t" + "vqmovun.s32 d25, q10 \n\t" + "vqmovn.u16 d26, q11 \n\t" + "vqmovn.u16 d27, q12 \n\t" + "vst1.8 {d26-d27}, [%[dst1]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC1(u8, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint8x16_t vline = vld1q_u8(_src + i); + uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline)); + uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline)); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16)); + uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16)); + uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32); + float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32); + int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32); + uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)); + uint16x8_t 
vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)); + vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16))); + } +}) +#endif + +#if defined(__GNUC__) && defined(__arm__) +CVTS_FUNC(u8, s8, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.u8 q3, d4 \n\t" + "vmovl.u8 q4, d5 \n\t" + "vmovl.u16 q5, d6 \n\t" + "vmovl.u16 q6, d7 \n\t" + "vmovl.u16 q7, d8 \n\t" + "vmovl.u16 q8, d9 \n\t" + "vcvt.f32.u32 q9, q5 \n\t" + "vcvt.f32.u32 q10, q6 \n\t" + "vcvt.f32.u32 q11, q7 \n\t" + "vcvt.f32.u32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vqmovn.s32 d22, q7 \n\t" + "vqmovn.s32 d23, q8 \n\t" + "vqmovn.s32 d24, q9 \n\t" + "vqmovn.s32 d25, q10 \n\t" + "vqmovn.s16 d26, q11 \n\t" + "vqmovn.s16 d27, q12 \n\t" + "vst1.8 {d26-d27}, [%[dst1]] \n\t" + : //no output + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(u8, s8, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint8x16_t vline = vld1q_u8(_src + i); + uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline)); + uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline)); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16)); + uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16)); + uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32); + float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32); + int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32); + int16x8_t vRes1_u16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)); + int16x8_t vRes2_u16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32)); + vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_u16), vqmovn_s16(vRes2_u16))); + } +}) +#endif + +#if defined(__GNUC__) && defined(__arm__) +CVTS_FUNC(u8, u16, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 
16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.u8 q3, d4 \n\t" + "vmovl.u8 q4, d5 \n\t" + "vmovl.u16 q5, d6 \n\t" + "vmovl.u16 q6, d7 \n\t" + "vmovl.u16 q7, d8 \n\t" + "vmovl.u16 q8, d9 \n\t" + "vcvt.f32.u32 q9, q5 \n\t" + "vcvt.f32.u32 q10, q6 \n\t" + "vcvt.f32.u32 q11, q7 \n\t" + "vcvt.f32.u32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vqmovun.s32 d22, q7 \n\t" + "vqmovun.s32 d23, q8 \n\t" + "vqmovun.s32 d24, q9 \n\t" + "vqmovun.s32 d25, q10 \n\t" + "vst1.16 {d22-d23}, [%[dst1]] \n\t" + "vst1.16 {d24-d25}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 8), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(u8, u16, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint8x16_t vline = vld1q_u8(_src + i); + uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline)); + uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline)); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16)); + uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16)); + uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32); + float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32); + int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32); + vst1q_u16(_dst + i + 0, vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32))); + vst1q_u16(_dst + i + 8, vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32))); + } +}) +#endif + +#if defined(__GNUC__) && defined(__arm__) +CVTS_FUNC(u8, s16, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.u8 q3, d4 \n\t" + "vmovl.u8 q4, d5 \n\t" + "vmovl.u16 q5, d6 \n\t" + "vmovl.u16 q6, d7 \n\t" + "vmovl.u16 q7, d8 \n\t" + "vmovl.u16 q8, d9 \n\t" + "vcvt.f32.u32 q9, q5 \n\t" + "vcvt.f32.u32 q10, q6 \n\t" + "vcvt.f32.u32 q11, q7 \n\t" + "vcvt.f32.u32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 
\n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vqmovn.s32 d22, q7 \n\t" + "vqmovn.s32 d23, q8 \n\t" + "vqmovn.s32 d24, q9 \n\t" + "vqmovn.s32 d25, q10 \n\t" + "vst1.16 {d22-d23}, [%[dst1]] \n\t" + "vst1.16 {d24-d25}, [%[dst2]] \n\t" + : //no output + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 8), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(u8, s16, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint8x16_t vline = vld1q_u8(_src + i); + uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline)); + uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline)); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16)); + uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16)); + uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32); + float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32); + int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32); + vst1q_s16(_dst + i + 0, vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32))); + vst1q_s16(_dst + i + 8, vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32))); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(u8, s32, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.u8 q3, d4 \n\t" + "vmovl.u8 q4, d5 \n\t" + "vmovl.u16 q5, d6 \n\t" + "vmovl.u16 q6, d7 \n\t" + "vmovl.u16 q7, d8 \n\t" + "vmovl.u16 q8, d9 \n\t" + "vcvt.f32.u32 q9, q5 \n\t" + "vcvt.f32.u32 q10, q6 \n\t" + "vcvt.f32.u32 q11, q7 \n\t" + "vcvt.f32.u32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vst1.32 {d14-d15}, [%[dst1]] \n\t" + "vst1.32 {d16-d17}, [%[dst2]] \n\t" + "vst1.32 {d18-d19}, [%[dst3]] \n\t" + "vst1.32 {d20-d21}, [%[dst4]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + [dst3] 
"r" (_dst + i + 8), + [dst4] "r" (_dst + i + 12), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10", + "d11","d12","d13","d14","d15","d16","d17", + "d18","d19","d20","d21","d22","d23","d24", + "d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(u8, s32, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint8x16_t vline = vld1q_u8(_src + i); + uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline)); + uint16x8_t vline2_u16 = vmovl_u8(vget_high_u8(vline)); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16)); + uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16)); + uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32); + float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int32x4_t vline3_s32 = vcvtq_s32_f32(vline3_f32); + int32x4_t vline4_s32 = vcvtq_s32_f32(vline4_f32); + vst1q_s32(_dst + i + 0, vline1_s32); + vst1q_s32(_dst + i + 4, vline2_s32); + vst1q_s32(_dst + i + 8, vline3_s32); + vst1q_s32(_dst + i + 12, vline4_s32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(u8, f32, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.u8 q3, d4 \n\t" + "vmovl.u8 q4, d5 \n\t" + "vmovl.u16 q5, d6 \n\t" + "vmovl.u16 q6, d7 \n\t" + "vmovl.u16 q7, d8 \n\t" + "vmovl.u16 q8, d9 \n\t" + "vcvt.f32.u32 q9, q5 \n\t" + "vcvt.f32.u32 q10, q6 \n\t" + "vcvt.f32.u32 q11, q7 \n\t" + "vcvt.f32.u32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vst1.32 {d6-d7}, [%[dst1]] \n\t" + "vst1.32 {d8-d9}, [%[dst2]] \n\t" + "vst1.32 {d10-d11}, [%[dst3]] \n\t" + "vst1.32 {d12-d13}, [%[dst4]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + [dst3] "r" (_dst + i + 8), + [dst4] "r" (_dst + i + 12), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10", + "d11","d12","d13","d14","d15","d16","d17", + "d18","d19","d20","d21","d22","d23","d24", + "d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(u8, f32, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + uint8x16_t vline = vld1q_u8(_src + i); + uint16x8_t vline1_u16 = vmovl_u8(vget_low_u8 (vline)); + uint16x8_t vline2_u16 = 
vmovl_u8(vget_high_u8(vline)); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline1_u16)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline1_u16)); + uint32x4_t vline3_u32 = vmovl_u16(vget_low_u16 (vline2_u16)); + uint32x4_t vline4_u32 = vmovl_u16(vget_high_u16(vline2_u16)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + float32x4_t vline3_f32 = vcvtq_f32_u32(vline3_u32); + float32x4_t vline4_f32 = vcvtq_f32_u32(vline4_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + vst1q_f32(_dst + i + 0, vline1_f32); + vst1q_f32(_dst + i + 4, vline2_f32); + vst1q_f32(_dst + i + 8, vline3_f32); + vst1q_f32(_dst + i + 12, vline4_f32); + } +}) +#endif + +#if defined(__GNUC__) && defined(__arm__) +CVTS_FUNC(s8, u8, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.s8 q3, d4 \n\t" + "vmovl.s8 q4, d5 \n\t" + "vmovl.s16 q5, d6 \n\t" + "vmovl.s16 q6, d7 \n\t" + "vmovl.s16 q7, d8 \n\t" + "vmovl.s16 q8, d9 \n\t" + "vcvt.f32.s32 q9, q5 \n\t" + "vcvt.f32.s32 q10, q6 \n\t" + "vcvt.f32.s32 q11, q7 \n\t" + "vcvt.f32.s32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vqmovun.s32 d22, q7 \n\t" + "vqmovun.s32 d23, q8 \n\t" + "vqmovun.s32 d24, q9 \n\t" + "vqmovun.s32 d25, q10 \n\t" + "vqmovn.u16 d26, q11 \n\t" + "vqmovn.u16 d27, q12 \n\t" + "vst1.8 {d26-d27}, [%[dst1]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(s8, u8, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline = vld1q_s8(_src + i); + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline)); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16)); + int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16)); + int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32); + float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, 
vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + vline3_s32 = vcvtq_s32_f32(vline3_f32); + vline4_s32 = vcvtq_s32_f32(vline4_f32); + uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)); + uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)); + vst1q_u8(_dst + i, vcombine_u8(vqmovn_u16(vRes1_u16), vqmovn_u16(vRes2_u16))); + } +}) +#endif + +#if defined(__GNUC__) && defined(__arm__) +CVTS_FUNC1(s8, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.s8 q3, d4 \n\t" + "vmovl.s8 q4, d5 \n\t" + "vmovl.s16 q5, d6 \n\t" + "vmovl.s16 q6, d7 \n\t" + "vmovl.s16 q7, d8 \n\t" + "vmovl.s16 q8, d9 \n\t" + "vcvt.f32.s32 q9, q5 \n\t" + "vcvt.f32.s32 q10, q6 \n\t" + "vcvt.f32.s32 q11, q7 \n\t" + "vcvt.f32.s32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vqmovn.s32 d22, q7 \n\t" + "vqmovn.s32 d23, q8 \n\t" + "vqmovn.s32 d24, q9 \n\t" + "vqmovn.s32 d25, q10 \n\t" + "vqmovn.s16 d26, q11 \n\t" + "vqmovn.s16 d27, q12 \n\t" + "vst1.8 {d26-d27}, [%[dst1]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC1(s8, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline = vld1q_s8(_src + i); + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline)); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16)); + int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16)); + int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32); + float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + vline3_s32 = vcvtq_s32_f32(vline3_f32); + vline4_s32 = vcvtq_s32_f32(vline4_f32); + int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)); + int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), 
vqmovn_s32(vline4_s32)); + vst1q_s8(_dst + i, vcombine_s8(vqmovn_s16(vRes1_s16), vqmovn_s16(vRes2_s16))); + } +}) +#endif + +#if defined(__GNUC__) && defined(__arm__) +CVTS_FUNC(s8, u16, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.s8 q3, d4 \n\t" + "vmovl.s8 q4, d5 \n\t" + "vmovl.s16 q5, d6 \n\t" + "vmovl.s16 q6, d7 \n\t" + "vmovl.s16 q7, d8 \n\t" + "vmovl.s16 q8, d9 \n\t" + "vcvt.f32.s32 q9, q5 \n\t" + "vcvt.f32.s32 q10, q6 \n\t" + "vcvt.f32.s32 q11, q7 \n\t" + "vcvt.f32.s32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vqmovun.s32 d22, q7 \n\t" + "vqmovun.s32 d23, q8 \n\t" + "vqmovun.s32 d24, q9 \n\t" + "vqmovun.s32 d25, q10 \n\t" + "vst1.16 {d22-d23}, [%[dst1]] \n\t" + "vst1.16 {d24-d25}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 8), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(s8, u16, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline = vld1q_s8(_src + i); + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline)); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16)); + int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16)); + int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32); + float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + vline3_s32 = vcvtq_s32_f32(vline3_f32); + vline4_s32 = vcvtq_s32_f32(vline4_f32); + uint16x8_t vRes1_u16 = vcombine_u16(vqmovun_s32(vline1_s32), vqmovun_s32(vline2_s32)); + uint16x8_t vRes2_u16 = vcombine_u16(vqmovun_s32(vline3_s32), vqmovun_s32(vline4_s32)); + vst1q_u16(_dst + i + 0, vRes1_u16); + vst1q_u16(_dst + i + 8, vRes2_u16); + } +}) +#endif + +#if defined(__GNUC__) && defined(__arm__) +CVTS_FUNC(s8, s16, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] 
\n\t" + "vmovl.s8 q3, d4 \n\t" + "vmovl.s8 q4, d5 \n\t" + "vmovl.s16 q5, d6 \n\t" + "vmovl.s16 q6, d7 \n\t" + "vmovl.s16 q7, d8 \n\t" + "vmovl.s16 q8, d9 \n\t" + "vcvt.f32.s32 q9, q5 \n\t" + "vcvt.f32.s32 q10, q6 \n\t" + "vcvt.f32.s32 q11, q7 \n\t" + "vcvt.f32.s32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vqmovn.s32 d22, q7 \n\t" + "vqmovn.s32 d23, q8 \n\t" + "vqmovn.s32 d24, q9 \n\t" + "vqmovn.s32 d25, q10 \n\t" + "vst1.16 {d22-d23}, [%[dst1]] \n\t" + "vst1.16 {d24-d25}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 8), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(s8, s16, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline = vld1q_s8(_src + i); + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline)); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16)); + int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16)); + int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32); + float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + vline3_s32 = vcvtq_s32_f32(vline3_f32); + vline4_s32 = vcvtq_s32_f32(vline4_f32); + int16x8_t vRes1_s16 = vcombine_s16(vqmovn_s32(vline1_s32), vqmovn_s32(vline2_s32)); + int16x8_t vRes2_s16 = vcombine_s16(vqmovn_s32(vline3_s32), vqmovn_s32(vline4_s32)); + vst1q_s16(_dst + i + 0, vRes1_s16); + vst1q_s16(_dst + i + 8, vRes2_s16); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s8, s32, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.s8 q3, d4 \n\t" + "vmovl.s8 q4, d5 \n\t" + "vmovl.s16 q5, d6 \n\t" + "vmovl.s16 q6, d7 \n\t" + "vmovl.s16 q7, d8 \n\t" + "vmovl.s16 q8, d9 \n\t" + "vcvt.f32.s32 q9, q5 \n\t" + "vcvt.f32.s32 q10, q6 \n\t" + "vcvt.f32.s32 q11, q7 \n\t" + "vcvt.f32.s32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, 
q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vcvt.s32.f32 q7, q3 \n\t" + "vcvt.s32.f32 q8, q4 \n\t" + "vcvt.s32.f32 q9, q5 \n\t" + "vcvt.s32.f32 q10, q6 \n\t" + "vst1.32 {d14-d15}, [%[dst1]] \n\t" + "vst1.32 {d16-d17}, [%[dst2]] \n\t" + "vst1.32 {d18-d19}, [%[dst3]] \n\t" + "vst1.32 {d20-d21}, [%[dst4]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + [dst3] "r" (_dst + i + 8), + [dst4] "r" (_dst + i + 12), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10", + "d11","d12","d13","d14","d15","d16","d17", + "d18","d19","d20","d21","d22","d23","d24", + "d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(s8, s32, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline = vld1q_s8(_src + i); + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline)); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16)); + int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16)); + int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32); + float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + vline3_s32 = vcvtq_s32_f32(vline3_f32); + vline4_s32 = vcvtq_s32_f32(vline4_f32); + vst1q_s32(_dst + i + 0, vline1_s32); + vst1q_s32(_dst + i + 4, vline2_s32); + vst1q_s32(_dst + i + 8, vline3_s32); + vst1q_s32(_dst + i + 12, vline4_s32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s8, f32, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src]] \n\t" + "vmovl.s8 q3, d4 \n\t" + "vmovl.s8 q4, d5 \n\t" + "vmovl.s16 q5, d6 \n\t" + "vmovl.s16 q6, d7 \n\t" + "vmovl.s16 q7, d8 \n\t" + "vmovl.s16 q8, d9 \n\t" + "vcvt.f32.s32 q9, q5 \n\t" + "vcvt.f32.s32 q10, q6 \n\t" + "vcvt.f32.s32 q11, q7 \n\t" + "vcvt.f32.s32 q12, q8 \n\t" + "vmul.f32 q13, q9, q0 \n\t" + "vmul.f32 q14, q10, q0 \n\t" + "vmul.f32 q15, q11, q0 \n\t" + "vmul.f32 q2, q12, q0 \n\t" + "vadd.f32 q3, q13, q1 \n\t" + "vadd.f32 q4, q14, q1 \n\t" + "vadd.f32 q5, q15, q1 \n\t" + "vadd.f32 q6, q2, q1 \n\t" + "vst1.32 {d6-d7}, [%[dst1]] \n\t" + "vst1.32 {d8-d9}, [%[dst2]] \n\t" + "vst1.32 {d10-d11}, [%[dst3]] \n\t" + "vst1.32 {d12-d13}, [%[dst4]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + [dst3] "r" (_dst + i + 8), + [dst4] "r" (_dst + i + 12), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10", + "d11","d12","d13","d14","d15","d16","d17", + "d18","d19","d20","d21","d22","d23","d24", + 
"d25","d26","d27","d28","d29","d30","d31" + ); + } +}) +#else +CVTS_FUNC(s8, f32, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 16) + { + internal::prefetch(_src + i); + int8x16_t vline = vld1q_s8(_src + i); + int16x8_t vline1_s16 = vmovl_s8(vget_low_s8 (vline)); + int16x8_t vline2_s16 = vmovl_s8(vget_high_s8(vline)); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline1_s16)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline1_s16)); + int32x4_t vline3_s32 = vmovl_s16(vget_low_s16 (vline2_s16)); + int32x4_t vline4_s32 = vmovl_s16(vget_high_s16(vline2_s16)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + float32x4_t vline3_f32 = vcvtq_f32_s32(vline3_s32); + float32x4_t vline4_f32 = vcvtq_f32_s32(vline4_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline3_f32 = vmulq_f32(vline3_f32, vscale); + vline4_f32 = vmulq_f32(vline4_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline3_f32 = vaddq_f32(vline3_f32, vshift); + vline4_f32 = vaddq_f32(vline4_f32, vshift); + vst1q_f32(_dst + i + 0, vline1_f32); + vst1q_f32(_dst + i + 4, vline2_f32); + vst1q_f32(_dst + i + 8, vline3_f32); + vst1q_f32(_dst + i + 12, vline4_f32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(u16, u8, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src1]] \n\t" + "vmovl.u16 q3, d4 \n\t" + "vmovl.u16 q4, d5 \n\t" + "vcvt.f32.u32 q5, q3 \n\t" + "vcvt.f32.u32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vqmovn.s32 d26, q11 \n\t" + "vqmovn.s32 d27, q12 \n\t" + "vqmovun.s16 d28, q13 \n\t" + "vst1.8 {d28}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28" + ); + } +}) +#else +CVTS_FUNC(u16, u8, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + uint16x8_t vline = vld1q_u16(_src + i); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2)); + vst1_u8(_dst + i, vRes); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(u16, s8, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, 
+{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src1]] \n\t" + "vmovl.u16 q3, d4 \n\t" + "vmovl.u16 q4, d5 \n\t" + "vcvt.f32.u32 q5, q3 \n\t" + "vcvt.f32.u32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vqmovn.s32 d26, q11 \n\t" + "vqmovn.s32 d27, q12 \n\t" + "vqmovn.s16 d28, q13 \n\t" + "vst1.8 {d28}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28" + ); + } +}) +#else +CVTS_FUNC(u16, s8, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + uint16x8_t vline = vld1q_u16(_src + i); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2)); + vst1_s8(_dst + i, vRes); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC1(u16, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d4-d5}, [%[src]] \n\t" + "vmovl.u16 q3, d4 \n\t" + "vmovl.u16 q4, d5 \n\t" + "vcvt.f32.u32 q5, q3 \n\t" + "vcvt.f32.u32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vqmovun.s32 d26, q11 \n\t" + "vqmovun.s32 d27, q12 \n\t" + "vst1.16 {d26-d27}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vshift), "w" (vscale) + : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27" + ); + } +}) +#else +CVTS_FUNC1(u16, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + uint16x8_t vline = vld1q_u16(_src + i); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + uint16x4_t vRes1 = vqmovun_s32(vline1_s32); + uint16x4_t vRes2 = vqmovun_s32(vline2_s32); + 
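+        // vqmovun_s32 saturates each signed 32-bit lane back to u16; the two halves
+        // are recombined below so the result goes out in a single 128-bit store.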
vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(u16, s16, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d4-d5}, [%[src]] \n\t" + "vmovl.u16 q3, d4 \n\t" + "vmovl.u16 q4, d5 \n\t" + "vcvt.f32.u32 q5, q3 \n\t" + "vcvt.f32.u32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vqmovn.s32 d26, q11 \n\t" + "vqmovn.s32 d27, q12 \n\t" + "vst1.16 {d26-d27}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vshift), "w" (vscale) + : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27" + ); + } +}) +#else +CVTS_FUNC(u16, s16, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + uint16x8_t vline = vld1q_u16(_src + i); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(u16, s32, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d4-d5}, [%[src]] \n\t" + "vmovl.u16 q3, d4 \n\t" + "vmovl.u16 q4, d5 \n\t" + "vcvt.f32.u32 q5, q3 \n\t" + "vcvt.f32.u32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vst1.32 {d22-d23}, [%[dst1]] \n\t" + "vst1.32 {d24-d25}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i), + [dst2] "r" (_dst + i + 4), + "w" (vshift), "w" (vscale) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25" + ); + } +}) +#else +CVTS_FUNC(u16, s32, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + uint16x8_t vline = vld1q_u16(_src + i); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + int32x4_t 
vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + vst1q_s32(_dst + i + 0, vline1_s32); + vst1q_s32(_dst + i + 4, vline2_s32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(u16, f32, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d4-d5}, [%[src]] \n\t" + "vmovl.u16 q3, d4 \n\t" + "vmovl.u16 q4, d5 \n\t" + "vcvt.f32.u32 q5, q3 \n\t" + "vcvt.f32.u32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vst1.32 {d18-d19}, [%[dst1]] \n\t" + "vst1.32 {d20-d21}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21" + ); + } +}) +#else +CVTS_FUNC(u16, f32, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + uint16x8_t vline = vld1q_u16(_src + i); + uint32x4_t vline1_u32 = vmovl_u16(vget_low_u16 (vline)); + uint32x4_t vline2_u32 = vmovl_u16(vget_high_u16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_u32(vline1_u32); + float32x4_t vline2_f32 = vcvtq_f32_u32(vline2_u32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vst1q_f32(_dst + i + 0, vline1_f32); + vst1q_f32(_dst + i + 4, vline2_f32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s16, u8, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src1]] \n\t" + "vmovl.s16 q3, d4 \n\t" + "vmovl.s16 q4, d5 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vcvt.f32.s32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vqmovn.s32 d26, q11 \n\t" + "vqmovn.s32 d27, q12 \n\t" + "vqmovun.s16 d28, q13 \n\t" + "vst1.8 {d28}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28" + ); + } +}) +#else +CVTS_FUNC(s16, u8, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline = vld1q_s16(_src + i); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = 
vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + uint8x8_t vRes = vqmovun_s16(vcombine_s16(vRes1, vRes2)); + vst1_u8(_dst + i, vRes); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s16, s8, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.8 {d4-d5}, [%[src1]] \n\t" + "vmovl.s16 q3, d4 \n\t" + "vmovl.s16 q4, d5 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vcvt.f32.s32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vqmovn.s32 d26, q11 \n\t" + "vqmovn.s32 d27, q12 \n\t" + "vqmovn.s16 d28, q13 \n\t" + "vst1.8 {d28}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28" + ); + } +}) +#else +CVTS_FUNC(s16, s8, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline = vld1q_s16(_src + i); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2)); + vst1_s8(_dst + i, vRes); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s16, u16, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d4-d5}, [%[src]] \n\t" + "vmovl.s16 q3, d4 \n\t" + "vmovl.s16 q4, d5 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vcvt.f32.s32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vqmovun.s32 d26, q11 \n\t" + "vqmovun.s32 d27, q12 \n\t" + "vst1.16 {d26-d27}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27" + ); + } +}) +#else +CVTS_FUNC(s16, u16, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline = vld1q_s16(_src + i); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, 
vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + uint16x4_t vRes1 = vqmovun_s32(vline1_s32); + uint16x4_t vRes2 = vqmovun_s32(vline2_s32); + vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC1(s16, 16, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d4-d5}, [%[src]] \n\t" + "vmovl.s16 q3, d4 \n\t" + "vmovl.s16 q4, d5 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vcvt.f32.s32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vqmovn.s32 d26, q11 \n\t" + "vqmovn.s32 d27, q12 \n\t" + "vst1.16 {d26-d27}, [%[dst]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst] "r" (_dst + i + 0), + "w" (vshift), "w" (vscale) + : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27" + ); + } +}) +#else +CVTS_FUNC1(s16, 16, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline = vld1q_s16(_src + i); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s16, s32, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d4-d5}, [%[src]] \n\t" + "vmovl.s16 q3, d4 \n\t" + "vmovl.s16 q4, d5 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vcvt.f32.s32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vcvt.s32.f32 q12, q10 \n\t" + "vst1.32 {d22-d23}, [%[dst1]] \n\t" + "vst1.32 {d24-d25}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25" + ); + } +}) +#else +CVTS_FUNC(s16, s32, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline = vld1q_s16(_src + i); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline)); + 
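+        // Both s16 halves are now sign-extended to s32; the scale/shift runs in f32
+        // and the lanes are converted back (with truncation, hence the +0.5f in vshift).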
float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + vst1q_s32(_dst + i + 0, vline1_s32); + vst1q_s32(_dst + i + 4, vline2_s32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s16, f32, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.16 {d4-d5}, [%[src]] \n\t" + "vmovl.s16 q3, d4 \n\t" + "vmovl.s16 q4, d5 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vcvt.f32.s32 q6, q4 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vmul.f32 q8, q6, q0 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vadd.f32 q10, q8, q1 \n\t" + "vst1.32 {d18-d19}, [%[dst1]] \n\t" + "vst1.32 {d20-d21}, [%[dst2]] \n\t" + : /*no output*/ + : [src] "r" (_src + i), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21" + ); + } +}) +#else +CVTS_FUNC(s16, f32, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int16x8_t vline = vld1q_s16(_src + i); + int32x4_t vline1_s32 = vmovl_s16(vget_low_s16 (vline)); + int32x4_t vline2_s32 = vmovl_s16(vget_high_s16(vline)); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vst1q_f32(_dst + i + 0, vline1_f32); + vst1q_f32(_dst + i + 4, vline2_f32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s32, u8, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vcvt.f32.s32 q4, q2 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vmul.f32 q6, q4, q0 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vadd.f32 q8, q6, q1 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vcvt.s32.f32 q10, q8 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vqmovun.s32 d24, q10 \n\t" + "vqmovun.s32 d25, q11 \n\t" + "vqmovn.u16 d26, q12 \n\t" + "vst1.8 {d26}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26" + ); + } +}) +#else +CVTS_FUNC(s32, u8, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i + 0); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = 
vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + uint16x4_t vRes1 = vqmovun_s32(vline1_s32); + uint16x4_t vRes2 = vqmovun_s32(vline2_s32); + uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2)); + vst1_u8(_dst + i, vRes); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s32, s8, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vcvt.f32.s32 q4, q2 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vmul.f32 q6, q4, q0 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vadd.f32 q8, q6, q1 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vcvt.s32.f32 q10, q8 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vqmovn.s32 d24, q10 \n\t" + "vqmovn.s32 d25, q11 \n\t" + "vqmovn.s16 d26, q12 \n\t" + "vst1.8 {d26}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26" + ); + } +}) +#else +CVTS_FUNC(s32, s8, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i + 0); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2)); + vst1_s8(_dst + i, vRes); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s32, u16, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vcvt.f32.s32 q4, q2 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vmul.f32 q6, q4, q0 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vadd.f32 q8, q6, q1 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vcvt.s32.f32 q10, q8 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vqmovun.s32 d24, q10 \n\t" + "vqmovun.s32 d25, q11 \n\t" + "vst1.16 {d24-d25}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25" + ); + } +}) +#else +CVTS_FUNC(s32, u16, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i + 0); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + float32x4_t vline1_f32 = 
vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + uint16x4_t vRes1 = vqmovun_s32(vline1_s32); + uint16x4_t vRes2 = vqmovun_s32(vline2_s32); + vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s32, s16, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vcvt.f32.s32 q4, q2 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vmul.f32 q6, q4, q0 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vadd.f32 q8, q6, q1 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vcvt.s32.f32 q10, q8 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vqmovn.s32 d24, q10 \n\t" + "vqmovn.s32 d25, q11 \n\t" + "vst1.8 {d24-d25}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25" + ); + } +}) +#else +CVTS_FUNC(s32, s16, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i + 0); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC1(s32, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vcvt.f32.s32 q4, q2 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vmul.f32 q6, q4, q0 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vadd.f32 q8, q6, q1 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vcvt.s32.f32 q10, q8 \n\t" + "vcvt.s32.f32 q11, q9 \n\t" + "vst1.32 {d20-d21}, [%[dst1]] \n\t" + "vst1.32 {d22-d23}, [%[dst2]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23" + ); + } +}) +#else +CVTS_FUNC1(s32, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i + 0); + int32x4_t vline2_s32 = vld1q_s32(_src + i 
+ 4); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vline1_s32 = vcvtq_s32_f32(vline1_f32); + vline2_s32 = vcvtq_s32_f32(vline2_f32); + vst1q_s32(_dst + i + 0, vline1_s32); + vst1q_s32(_dst + i + 4, vline2_s32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(s32, f32, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vcvt.f32.s32 q4, q2 \n\t" + "vcvt.f32.s32 q5, q3 \n\t" + "vmul.f32 q6, q4, q0 \n\t" + "vmul.f32 q7, q5, q0 \n\t" + "vadd.f32 q8, q6, q1 \n\t" + "vadd.f32 q9, q7, q1 \n\t" + "vst1.32 {d16-d17}, [%[dst1]] \n\t" + "vst1.32 {d18-d19}, [%[dst2]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i), + [src2] "r" (_src + i + 4), + [dst1] "r" (_dst + i), + [dst2] "r" (_dst + i + 4), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" + ); + } +}) +#else +CVTS_FUNC(s32, f32, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + int32x4_t vline1_s32 = vld1q_s32(_src + i + 0); + int32x4_t vline2_s32 = vld1q_s32(_src + i + 4); + float32x4_t vline1_f32 = vcvtq_f32_s32(vline1_s32); + float32x4_t vline2_f32 = vcvtq_f32_s32(vline2_s32); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vst1q_f32(_dst + i + 0, vline1_f32); + vst1q_f32(_dst + i + 4, vline2_f32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(f32, u8, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)((1 << 16)*alpha)); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)((1 << 16)*beta)); + register uint32x4_t vmask asm ("q2") = vdupq_n_u32(1<<16);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d6-d7}, [%[src1]] \n\t" + "vld1.32 {d8-d9}, [%[src2]] \n\t" + "vmul.f32 q5, q3, q0 \n\t" + "vmul.f32 q6, q4, q0 \n\t" + "vadd.f32 q7, q5, q1 \n\t" + "vadd.f32 q8, q6, q1 \n\t" + "vcvt.u32.f32 q9, q7 \n\t" + "vcvt.u32.f32 q10, q8 \n\t" + "vbic q11, q2, q6 \n\t" + "vbic q12, q2, q7 \n\t" + "vshr.u32 q13, q11, #16 \n\t" + "vshr.u32 q14, q12, #16 \n\t" + "vqsub.u32 q7, q9, q13 \n\t" + "vqsub.u32 q8, q10, q14 \n\t" + "vqrshrn.u32 d22, q7, #16 \n\t" + "vqrshrn.u32 d23, q8, #16 \n\t" + "vqmovn.u16 d30, q11 \n\t" + "vst1.8 {d30}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vscale), "w" (vshift), "w" (vmask) + : "d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29","d30" + ); + } +}) +#else +CVTS_FUNC(f32, u8, 8, + float32x4_t vscale = vdupq_n_f32((f32)((1 << 16)*alpha)); + float32x4_t vshift = vdupq_n_f32((f32)((1 << 16)*beta)); + uint32x4_t vmask = vdupq_n_u32(1<<16);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline1_f32 = vld1q_f32(_src + i 
+ 0); + float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); + + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + float32x4_t vline1Shifted_f32 = vaddq_f32(vline1_f32, vshift); + float32x4_t vline2Shifted_f32 = vaddq_f32(vline2_f32, vshift); + uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1Shifted_f32); + uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2Shifted_f32); + uint32x4_t vline1Mask = vbicq_u32(vmask, vreinterpretq_u32_f32(vline2_f32)); + uint32x4_t vline2Mask = vbicq_u32(vmask, vreinterpretq_u32_f32(vline1Shifted_f32)); + vline1Mask = vshrq_n_u32(vline1Mask, 16); + vline2Mask = vshrq_n_u32(vline2Mask, 16); + vline1_u32 = vqsubq_u32(vline1_u32, vline1Mask); + vline2_u32 = vqsubq_u32(vline2_u32, vline2Mask); + uint16x4_t vRes1 = vqrshrn_n_u32(vline1_u32, 16); + uint16x4_t vRes2 = vqrshrn_n_u32(vline2_u32, 16); + uint8x8_t vRes = vqmovn_u16(vcombine_u16(vRes1, vRes2)); + + vst1_u8(_dst + i, vRes); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(f32, s8, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vmul.f32 q4, q2, q0 \n\t" + "vmul.f32 q5, q3, q0 \n\t" + "vadd.f32 q6, q4, q1 \n\t" + "vadd.f32 q7, q5, q1 \n\t" + "vcvt.s32.f32 q8, q6 \n\t" + "vcvt.s32.f32 q9, q7 \n\t" + "vqmovn.s32 d14, q8 \n\t" + "vqmovn.s32 d15, q9 \n\t" + "vqmovn.s16 d16, q7 \n\t" + "vst1.8 {d16}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" + ); + } +}) +#else +CVTS_FUNC(f32, s8, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline1_f32 = vld1q_f32(_src + i + 0); + float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + int8x8_t vRes = vqmovn_s16(vcombine_s16(vRes1, vRes2)); + vst1_s8(_dst + i, vRes); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(f32, u16, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vmul.f32 q4, q2, q0 \n\t" + "vmul.f32 q5, q3, q0 \n\t" + "vadd.f32 q6, q4, q1 \n\t" + "vadd.f32 q7, q5, q1 \n\t" + "vcvt.u32.f32 q8, q6 \n\t" + "vcvt.u32.f32 q9, q7 \n\t" + "vqmovn.u32 d8, q8 \n\t" + "vqmovn.u32 d9, q9 \n\t" + "vst1.16 {d8-d9}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" + ); + } +}) +#else +CVTS_FUNC(f32, u16, 8, + float32x4_t vscale = 
vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline1_f32 = vld1q_f32(_src + i + 0); + float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + uint32x4_t vline1_u32 = vcvtq_u32_f32(vline1_f32); + uint32x4_t vline2_u32 = vcvtq_u32_f32(vline2_f32); + uint16x4_t vRes1 = vqmovn_u32(vline1_u32); + uint16x4_t vRes2 = vqmovn_u32(vline2_u32); + vst1q_u16(_dst + i, vcombine_u16(vRes1, vRes2)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(f32, s16, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vmul.f32 q4, q2, q0 \n\t" + "vmul.f32 q5, q3, q0 \n\t" + "vadd.f32 q6, q4, q1 \n\t" + "vadd.f32 q7, q5, q1 \n\t" + "vcvt.s32.f32 q8, q6 \n\t" + "vcvt.s32.f32 q9, q7 \n\t" + "vqmovn.s32 d8, q8 \n\t" + "vqmovn.s32 d9, q9 \n\t" + "vst1.16 {d8-d9}, [%[dst]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst] "r" (_dst + i), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" + ); + } +}) +#else +CVTS_FUNC(f32, s16, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline1_f32 = vld1q_f32(_src + i + 0); + float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + int16x4_t vRes1 = vqmovn_s32(vline1_s32); + int16x4_t vRes2 = vqmovn_s32(vline2_s32); + vst1q_s16(_dst + i, vcombine_s16(vRes1, vRes2)); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC(f32, s32, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vmul.f32 q4, q2, q0 \n\t" + "vmul.f32 q5, q3, q0 \n\t" + "vadd.f32 q6, q4, q1 \n\t" + "vadd.f32 q7, q5, q1 \n\t" + "vcvt.s32.f32 q4, q6 \n\t" + "vcvt.s32.f32 q5, q7 \n\t" + "vst1.32 {d8-d9}, [%[dst1]] \n\t" + "vst1.32 {d10-d11}, [%[dst2]] \n\t" + : //no output + : [src1] "r" (_src + i), + [src2] "r" (_src + i + 4), + [dst1] "r" (_dst + i), + [dst2] "r" (_dst + i + 4), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15" + ); + } +}) +#else +CVTS_FUNC(f32, s32, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta + 0.5f);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline1_f32 = vld1q_f32(_src + i + 0); + float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, 
vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + int32x4_t vline1_s32 = vcvtq_s32_f32(vline1_f32); + int32x4_t vline2_s32 = vcvtq_s32_f32(vline2_f32); + vst1q_s32(_dst + i + 0, vline1_s32); + vst1q_s32(_dst + i + 4, vline2_s32); + } +}) +#endif + +#if __GNUC_MINOR__ < 7 +CVTS_FUNC1(f32, 8, + register float32x4_t vscale asm ("q0") = vdupq_n_f32((f32)alpha); + register float32x4_t vshift asm ("q1") = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + __asm__ ( + "vld1.32 {d4-d5}, [%[src1]] \n\t" + "vld1.32 {d6-d7}, [%[src2]] \n\t" + "vmul.f32 q4, q2, q0 \n\t" + "vmul.f32 q5, q3, q0 \n\t" + "vadd.f32 q6, q4, q1 \n\t" + "vadd.f32 q7, q5, q1 \n\t" + "vst1.32 {d12-d13}, [%[dst1]] \n\t" + "vst1.32 {d14-d15}, [%[dst2]] \n\t" + : /*no output*/ + : [src1] "r" (_src + i + 0), + [src2] "r" (_src + i + 4), + [dst1] "r" (_dst + i + 0), + [dst2] "r" (_dst + i + 4), + "w" (vscale), "w" (vshift) + : "d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" + ); + } +}) +#else +CVTS_FUNC1(f32, 8, + float32x4_t vscale = vdupq_n_f32((f32)alpha); + float32x4_t vshift = vdupq_n_f32((f32)beta);, +{ + for (size_t i = 0; i < w; i += 8) + { + internal::prefetch(_src + i); + float32x4_t vline1_f32 = vld1q_f32(_src + i + 0); + float32x4_t vline2_f32 = vld1q_f32(_src + i + 4); + vline1_f32 = vmulq_f32(vline1_f32, vscale); + vline2_f32 = vmulq_f32(vline2_f32, vscale); + vline1_f32 = vaddq_f32(vline1_f32, vshift); + vline2_f32 = vaddq_f32(vline2_f32, vshift); + vst1q_f32(_dst + i + 0, vline1_f32); + vst1q_f32(_dst + i + 4, vline2_f32); + } +}) +#endif + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/convolution.cpp b/3rdparty/carotene/src/convolution.cpp new file mode 100644 index 0000000000..498d7ad883 --- /dev/null +++ b/3rdparty/carotene/src/convolution.cpp @@ -0,0 +1,340 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "saturate_cast.hpp" + +namespace CAROTENE_NS { + +bool isConvolutionSupported(const Size2D &size, const Size2D &ksize, + BORDER_MODE border) +{ + return isSupportedConfiguration() && size.width >= 8 && + (border == BORDER_MODE_CONSTANT || + border == BORDER_MODE_REPLICATE) && + (ksize.width == 3) && (ksize.height == 3); +} + +#ifdef CAROTENE_NEON + +namespace { + +template <int shift> +int32x4_t vshrq_s32(int32x4_t value) +{ + return vshrq_n_s32(value, shift); +} + +template <> +int32x4_t vshrq_s32<0>(int32x4_t value) +{ + return value; +} + +} // namespace + +typedef int32x4_t (* vshrq_s32_func)(int32x4_t value); + +#endif + +void convolution(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue, + const Size2D & ksize, s16 * kernelBase, u32 scale) +{ + internal::assertSupportedConfiguration(isConvolutionSupported(size, ksize, border)); +#ifdef CAROTENE_NEON + const uint8x8_t v_zero_u8 = vdup_n_u8(0); + const uint8x8_t v_border = vdup_n_u8(borderValue); + const int32x4_t v_zero_s32 = vdupq_n_s32(0); + + uint8x8_t tprev[3] = { v_zero_u8, v_zero_u8, v_zero_u8 }, + tcurr[3] = { v_zero_u8, v_zero_u8, v_zero_u8 }, + tnext[3] = { v_zero_u8, v_zero_u8, v_zero_u8 }; + uint8x8_t t0 = v_zero_u8, t1 = v_zero_u8, t2 = v_zero_u8; + + ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height; + static const vshrq_s32_func vshrq_s32_a[33] = + { + vshrq_s32<0>, + vshrq_s32<1>, + vshrq_s32<2>, + vshrq_s32<3>, + vshrq_s32<4>, + vshrq_s32<5>, + vshrq_s32<6>, + vshrq_s32<7>, + vshrq_s32<8>, + vshrq_s32<9>, + vshrq_s32<10>, + vshrq_s32<11>, + vshrq_s32<12>, + vshrq_s32<13>, + vshrq_s32<14>, + vshrq_s32<15>, + vshrq_s32<16>, + vshrq_s32<17>, + vshrq_s32<18>, + vshrq_s32<19>, + vshrq_s32<20>, + vshrq_s32<21>, + vshrq_s32<22>, + vshrq_s32<23>, + vshrq_s32<24>, + vshrq_s32<25>, + vshrq_s32<26>, + vshrq_s32<27>, + vshrq_s32<28>, + vshrq_s32<29>, + vshrq_s32<30>, + vshrq_s32<31>, + vshrq_s32<32> + }; + vshrq_s32_func vshrq_s32_p = vshrq_s32_a[scale]; + + for (ptrdiff_t y = 0; y < height; ++y) + { + const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max(y - 1, 0)); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y); + const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1)); + u8 * drow = internal::getRowPtr(dstBase, dstStride, y); + + u8 prevx[3] = { 0, 0, 0 }, + currx[3] = { 0, 0, 0 }, + nextx[3] = { 0, 0, 0 }; + ptrdiff_t x = 0; + const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8); + + // perform vertical convolution + for ( ; x <= bwidth; x += 8) + { + internal::prefetch(srow0 + x); + internal::prefetch(srow1 + x); + internal::prefetch(srow2 + x); + + uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x); + uint8x8_t x1 = vld1_u8(srow1 + x); + uint8x8_t x2 = !srow2 ?
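+ /*
+  * Why the vshrq_s32_a table above exists: vshrq_n_s32 requires its shift count
+  * to be a compile-time immediate, while `scale` is a runtime argument.
+  * Instantiating vshrq_s32<0>..vshrq_s32<32> and indexing the function-pointer
+  * table converts the runtime value into a constant-immediate shift. A
+  * hypothetical standalone use (illustration only):
+  *
+  *     int32x4_t shifted = vshrq_s32_a[scale](value);  // scale == 2 behaves as vshrq_n_s32(value, 2)
+  */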
v_border : vld1_u8(srow2 + x); + + // calculate values for plain CPU part below if needed + if (x + 8 >= bwidth) + { + ptrdiff_t x3 = x == width ? width - 1 : x; + ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max(x3 - 1, 0); + + if (border == BORDER_MODE_CONSTANT && x4 < 0) + prevx[0] = prevx[1] = prevx[2] = borderValue; + else + { + prevx[0] = srow0 ? srow0[x4] : borderValue; + prevx[1] = srow1[x4] ; + prevx[2] = srow2 ? srow2[x4] : borderValue; + } + + currx[0] = srow0 ? srow0[x3] : borderValue; + currx[1] = srow1[x3] ; + currx[2] = srow2 ? srow2[x3] : borderValue; + } + + // make shift + if (x) + { + tprev[0] = tcurr[0]; + tcurr[0] = tnext[0]; + + tprev[1] = tcurr[1]; + tcurr[1] = tnext[1]; + + tprev[2] = tcurr[2]; + tcurr[2] = tnext[2]; + } + + tnext[0] = x0; + tnext[1] = x1; + tnext[2] = x2; + + // make extrapolation for the first elements + if (!x) + { + // make border + if (border == BORDER_MODE_CONSTANT) + tcurr[0] = tcurr[1] = tcurr[2] = v_border; + else if (border == BORDER_MODE_REPLICATE) + { + tcurr[0] = vdup_n_u8(vget_lane_u8(tnext[0], 0)); + tcurr[1] = vdup_n_u8(vget_lane_u8(tnext[1], 0)); + tcurr[2] = vdup_n_u8(vget_lane_u8(tnext[2], 0)); + } + + continue; + } + + int32x4_t v_dst0 = v_zero_s32, v_dst1 = v_zero_s32; + + { + // combine 3 "shifted" vectors + t0 = vext_u8(tprev[0], tcurr[0], 7); + t1 = tcurr[0]; + t2 = vext_u8(tcurr[0], tnext[0], 1); + + int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2)); + + v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[8]); + v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[7]); + v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[6]); + + v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[8]); + v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[7]); + v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[6]); + } + + { + // combine 3 "shifted" vectors + t0 = vext_u8(tprev[1], tcurr[1], 7); + t1 = tcurr[1]; + t2 = vext_u8(tcurr[1], tnext[1], 1); + + int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2)); + + v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[5]); + v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[4]); + v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[3]); + + v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[5]); + v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[4]); + v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[3]); + } + + { + // combine 3 "shifted" vectors + t0 = vext_u8(tprev[2], tcurr[2], 7); + t1 = tcurr[2]; + t2 = vext_u8(tcurr[2], tnext[2], 1); + + int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2)); + + v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[2]); + v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[1]); + v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[0]); + + v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[2]); + v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[1]); + v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[0]); + } + + + // make scale + v_dst0 = vshrq_s32_p(v_dst0); + v_dst1 = 
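+ /*
+  * Indexing note (inferred from the accumulation above and the scalar tail
+  * below): kernelBase is read back to front, i.e. kernelBase[8] multiplies the
+  * top-left neighbour, so the routine computes a true (flipped-kernel)
+  * convolution:
+  *
+  *     dst(x, y) = ( sum over dy, dx in {0, 1, 2} of
+  *                   src(x + dx - 1, y + dy - 1) * kernelBase[(2 - dy) * 3 + (2 - dx)] ) >> scale
+  */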
vshrq_s32_p(v_dst1); + + // and add them + vst1_u8(drow + x - 8, vqmovn_u16(vcombine_u16(vqmovun_s32(v_dst0), + vqmovun_s32(v_dst1)))); + } + + x -= 8; + if (x == width) + --x; + + for ( ; x < width; ++x) + { + // make extrapolation for the last elements + if (x + 1 >= width) + { + if (border == BORDER_MODE_CONSTANT) + { + nextx[0] = borderValue; + nextx[1] = borderValue; + nextx[2] = borderValue; + } + else if (border == BORDER_MODE_REPLICATE) + { + nextx[0] = srow0[x]; + nextx[1] = srow1[x]; + nextx[2] = srow2[x]; + } + } + else + { + nextx[0] = srow0 ? srow0[x + 1] : borderValue; + nextx[1] = srow1[x + 1] ; + nextx[2] = srow2 ? srow2[x + 1] : borderValue; + } + + s32 val = 0; + for (s32 _y = 0; _y < 3; ++_y) + val += prevx[_y] * kernelBase[(2 - _y) * 3 + 2] + + currx[_y] * kernelBase[(2 - _y) * 3 + 1] + + nextx[_y] * kernelBase[(2 - _y) * 3 + 0]; + + drow[x] = internal::saturate_cast<u8>(val >> scale); + + // make shift + prevx[0] = currx[0]; + currx[0] = nextx[0]; + + prevx[1] = currx[1]; + currx[1] = nextx[1]; + + prevx[2] = currx[2]; + currx[2] = nextx[2]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)border; + (void)borderValue; + (void)ksize; + (void)kernelBase; + (void)scale; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/count_nonzero.cpp b/3rdparty/carotene/src/count_nonzero.cpp new file mode 100644 index 0000000000..be87767cbd --- /dev/null +++ b/3rdparty/carotene/src/count_nonzero.cpp @@ -0,0 +1,430 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ */ + +#include "common.hpp" + +#include <limits> + +namespace CAROTENE_NS { + +s32 countNonZero(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw16 = size.width & ~15u; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u8* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + + #define COUNTNONZERO8U_BLOCK_SIZE (16*255) + uint8x16_t vc1 = vmovq_n_u8(1); + for (; i < roiw16;) + { + size_t lim = std::min(i + COUNTNONZERO8U_BLOCK_SIZE, size.width) - 16; + uint8x16_t vs = vmovq_n_u8(0); + + for (; i <= lim; i+= 16) + { + internal::prefetch(src + i); + uint8x16_t vln = vld1q_u8(src + i); + uint8x16_t vnz = vminq_u8(vln, vc1); + vs = vaddq_u8(vs, vnz); + } + + uint32x4_t vs4 = vpaddlq_u16(vpaddlq_u8(vs)); + uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4)); + + s32 s[2]; + vst1_u32((u32*)s, vs2); + + if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 2GB of non-zeros... + { + return 0x7fFFffFF; + } + result += (s[0] += s[1]); + if (s[0] < 0 || result < 0) + { + return 0x7fFFffFF; + } + } + for (; i < size.width; i++) + result += (src[i] != 0)?1:0; + if (result < 0)//saturate in case of overflow ~ 2GB of non-zeros... + { + return 0x7fFFffFF; + } + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 countNonZero(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw8 = size.width & ~7u; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u16* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + + #define COUNTNONZERO16U_BLOCK_SIZE (8*(256*256-1)) + uint16x8_t vc1 = vmovq_n_u16(1); + for (; i < roiw8;) + { + size_t lim = std::min(i + COUNTNONZERO16U_BLOCK_SIZE, size.width) - 8; + uint16x8_t vs = vmovq_n_u16(0); + + for (; i <= lim; i+= 8) + { + internal::prefetch(src + i); + uint16x8_t vln = vld1q_u16(src + i); + uint16x8_t vnz = vminq_u16(vln, vc1); + vs = vaddq_u16(vs, vnz); + } + + uint32x4_t vs4 = vpaddlq_u16(vs); + uint32x2_t vs2 = vadd_u32(vget_low_u32(vs4), vget_high_u32(vs4)); + + s32 s[2]; + vst1_u32((u32*)s, vs2); + + if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 4GB of non-zeros... + { + return 0x7fFFffFF; + } + result += (s[0] += s[1]); + if (s[0] < 0 || result < 0) + { + return 0x7fFFffFF; + } + } + for (; i < size.width; i++) + result += (src[i] != 0)?1:0; + if (result < 0)//saturate in case of overflow ~ 4GB of non-zeros...
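+ // Worked bound for the block sizes used by these kernels: each u8 lane of `vs`
+ // can count at most 255 ones before wrapping, so a block may span
+ // 16 lanes * 255 iterations = 16*255 elements between widening reductions
+ // (COUNTNONZERO8U_BLOCK_SIZE). A u16 lane can count 256*256 - 1 = 65535 ones,
+ // giving 8*(256*256-1) for COUNTNONZERO16U_BLOCK_SIZE.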
+ { + return 0x7fFFffFF; + } + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 countNonZero(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width & ~3u; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u32* src = (const u32*)internal::getRowPtr( srcBase, srcStride, k); + u32 i = 0; + + uint32x4_t vc1 = vmovq_n_u32(1); + uint32x4_t vs = vmovq_n_u32(0); + + for (; i < roiw4; i += 4 ) + { + internal::prefetch(src + i); + uint32x4_t vln = vld1q_u32(src + i); + uint32x4_t vnz = vminq_u32(vln, vc1); + vs = vqaddq_u32(vs, vnz); + } + + uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs)); + + s32 s[2]; + vst1_u32((u32*)s, vs2); + + if (s[0] < 0 || s[1] < 0)//saturate in case of overflow ~ 8GB of non-zeros... + { + return 0x7fFFffFF; + } + result += (s[0] += s[1]); + if (s[0] < 0 || result < 0) + { + return 0x7fFFffFF; + } + + for (; i < size.width; i++) + result += (src[i] != 0)?1:0; + if (result < 0)//saturate in case of overflow ~ 8GB of non-zeros... + { + return 0x7fFFffFF; + } + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 countNonZero(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width & ~3u; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const f32* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + + float32x4_t vc0 = vmovq_n_f32(0); + int32x4_t vs = vmovq_n_s32(0); + + for (; i < roiw4; i += 4 ) + { + internal::prefetch(src + i); + float32x4_t vln = vld1q_f32(src + i); + int32x4_t vnz = vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(vln, vc0))); + vs = vqaddq_s32(vs, vnz); + } + + int32x2_t vs2 = vqneg_s32(vqadd_s32(vget_low_s32(vs), vget_high_s32(vs))); + + int s[2]; + vst1_s32(s, vs2); + + result += (s[0] += s[1]); + if (s[0] < 0 || result < 0)//case of overflow ~ 8GB of non-zeros... 
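+ // How the f32 lane test above counts: vceqq_f32 yields all-ones for lanes equal
+ // to 0.0f and vmvnq_u32 inverts it, so every non-zero lane contributes the s32
+ // value -1 to `vs`; vqneg_s32 then flips the saturated negative totals back to
+ // positive counts. For a hypothetical row { 0.0f, 1.5f, -2.0f, 0.0f } the
+ // accumulator holds { 0, -1, -1, 0 } and the final count is 2.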
+ { + return 0x7fFFffFF; + } + + for (; i < size.width; i++) + result += (src[i] < std::numeric_limits<f32>::min() && src[i] > -std::numeric_limits<f32>::min())?0:1; + + if (result < 0) + { + return 0x7fFFffFF; + } + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 countNonZero(const Size2D &_size, + const f64 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw8 = size.width & ~7u; + size_t roiw4 = size.width & ~3u; + size_t roiw2 = size.width & ~1u; + uint64x2_t vmask1 = vdupq_n_u64(0x7fFFffFFffFFffFFULL); //will treat denormals as non-zero + uint32x4_t vc0 = vmovq_n_u32(0); + + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const f64* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + + int32x2_t vs1 = vmov_n_s32(0); + int32x2_t vs2 = vmov_n_s32(0); + int32x2_t vs3 = vmov_n_s32(0); + int32x2_t vs4 = vmov_n_s32(0); + + for (; i < roiw8; i += 8 ) + { + internal::prefetch(src + i + 6); + uint64x2_t vln1 = vld1q_u64((const u64*)(src + i)); + uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2)); + uint64x2_t vln3 = vld1q_u64((const u64*)(src + i + 4)); + uint64x2_t vln4 = vld1q_u64((const u64*)(src + i + 6)); + + uint64x2_t vm1 = vandq_u64(vln1, vmask1); + uint64x2_t vm2 = vandq_u64(vln2, vmask1); + uint64x2_t vm3 = vandq_u64(vln3, vmask1); + uint64x2_t vm4 = vandq_u64(vln4, vmask1); + + uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0); + uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0); + uint32x4_t vequ3 = vceqq_u32(vreinterpretq_u32_u64(vm3), vc0); + uint32x4_t vequ4 = vceqq_u32(vreinterpretq_u32_u64(vm4), vc0); + + uint32x4_t vlx1 = vmvnq_u32(vequ1); + uint32x4_t vlx2 = vmvnq_u32(vequ2); + uint32x4_t vlx3 = vmvnq_u32(vequ3); + uint32x4_t vlx4 = vmvnq_u32(vequ4); + + int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1))); + int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2))); + int32x2_t vnz3 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx3), vget_high_u32(vlx3))); + int32x2_t vnz4 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx4), vget_high_u32(vlx4))); + + vs1 = vqadd_s32(vs1, vnz1); + vs2 = vqadd_s32(vs2, vnz2); + vs3 = vqadd_s32(vs3, vnz3); + vs4 = vqadd_s32(vs4, vnz4); + } + + if (i < roiw4) + { + internal::prefetch(src + i + 2); + uint64x2_t vln1 = vld1q_u64((const u64*)(src + i)); + uint64x2_t vln2 = vld1q_u64((const u64*)(src + i + 2)); + + uint64x2_t vm1 = vandq_u64(vln1, vmask1); + uint64x2_t vm2 = vandq_u64(vln2, vmask1); + + uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0); + uint32x4_t vequ2 = vceqq_u32(vreinterpretq_u32_u64(vm2), vc0); + + uint32x4_t vlx1 = vmvnq_u32(vequ1); + uint32x4_t vlx2 = vmvnq_u32(vequ2); + + int32x2_t vnz1 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1))); + int32x2_t vnz2 = vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx2), vget_high_u32(vlx2))); + + vs1 = vqadd_s32(vs1, vnz1); + vs2 = vqadd_s32(vs2, vnz2); + i += 4; + } + + if (i < roiw2) + { + internal::prefetch(src + i); + uint64x2_t vln1 = vld1q_u64((const u64*)(src + i)); + + uint64x2_t vm1 = vandq_u64(vln1, vmask1); + + uint32x4_t vequ1 = vceqq_u32(vreinterpretq_u32_u64(vm1), vc0); + + uint32x4_t vlx1 = vmvnq_u32(vequ1); + + int32x2_t vnz1 =
vreinterpret_s32_u32(vpmax_u32(vget_low_u32(vlx1), vget_high_u32(vlx1))); + + vs1 = vqadd_s32(vs1, vnz1); + i += 2; + } + + vs1 = vqadd_s32(vs1, vs2); + vs3 = vqadd_s32(vs3, vs4); + vs1 = vqadd_s32(vs1, vs3); + int32x2_t vsneg = vqneg_s32(vs1); + + s32 s[2]; + vst1_s32(s, vsneg); + + result += (s[0] += s[1]); + if (s[0] < 0 || result < 0)//case of overflow ~ 16GB of non-zeros... + { + return 0x7fFFffFF; + } + + for (; i < size.width; i++) + result += (src[i] < std::numeric_limits<f64>::min() && src[i] > -std::numeric_limits<f64>::min())?0:1; + if (result < 0) + { + return 0x7fFFffFF; + } + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/div.cpp b/3rdparty/carotene/src/div.cpp new file mode 100644 index 0000000000..9c03202a83 --- /dev/null +++ b/3rdparty/carotene/src/div.cpp @@ -0,0 +1,694 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2016, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage.
+ */ + +#include "common.hpp" +#include "vtransform.hpp" + +#include <cmath> +#include <cstring> +#include <cfloat> +#include <limits> + +namespace CAROTENE_NS { + +namespace { + +#ifdef CAROTENE_NEON + +template <typename T> +inline T divSaturateQ(const T &v1, const T &v2, const float scale) +{ + return internal::vcombine(internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_low(v1)), + internal::vmovl(internal::vget_low(v2)), scale)), + internal::vqmovn(divSaturateQ(internal::vmovl(internal::vget_high(v1)), + internal::vmovl(internal::vget_high(v2)), scale)) + ); +} +template <> +inline int32x4_t divSaturateQ(const int32x4_t &v1, const int32x4_t &v2, const float scale) +{ return vcvtq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); } +template <> +inline uint32x4_t divSaturateQ(const uint32x4_t &v1, const uint32x4_t &v2, const float scale) +{ return vcvtq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); } + +template <typename T> +inline T divSaturate(const T &v1, const T &v2, const float scale) +{ + return internal::vqmovn(divSaturateQ(internal::vmovl(v1), internal::vmovl(v2), scale)); +} +template <> +inline int32x2_t divSaturate(const int32x2_t &v1, const int32x2_t &v2, const float scale) +{ return vcvt_s32_f32(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2)))); } +template <> +inline uint32x2_t divSaturate(const uint32x2_t &v1, const uint32x2_t &v2, const float scale) +{ return vcvt_u32_f32(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2)))); } + + +template <typename T> +inline T divWrapQ(const T &v1, const T &v2, const float scale) +{ + return internal::vcombine(internal::vmovn(divWrapQ(internal::vmovl(internal::vget_low(v1)), + internal::vmovl(internal::vget_low(v2)), scale)), + internal::vmovn(divWrapQ(internal::vmovl(internal::vget_high(v1)), + internal::vmovl(internal::vget_high(v2)), scale)) + ); +} +template <> +inline int32x4_t divWrapQ(const int32x4_t &v1, const int32x4_t &v2, const float scale) +{ return vcvtq_s32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_s32(v1), scale), internal::vrecpq_f32(vcvtq_f32_s32(v2)))); } +template <> +inline uint32x4_t divWrapQ(const uint32x4_t &v1, const uint32x4_t &v2, const float scale) +{ return vcvtq_u32_f32(vmulq_f32(vmulq_n_f32(vcvtq_f32_u32(v1), scale), internal::vrecpq_f32(vcvtq_f32_u32(v2)))); } + +template <typename T> +inline T divWrap(const T &v1, const T &v2, const float scale) +{ + return internal::vmovn(divWrapQ(internal::vmovl(v1), internal::vmovl(v2), scale)); +} +template <> +inline int32x2_t divWrap(const int32x2_t &v1, const int32x2_t &v2, const float scale) +{ return vcvt_s32_f32(vmul_f32(vmul_n_f32(vcvt_f32_s32(v1), scale), internal::vrecp_f32(vcvt_f32_s32(v2)))); } +template <> +inline uint32x2_t divWrap(const uint32x2_t &v1, const uint32x2_t &v2, const float scale) +{ return vcvt_u32_f32(vmul_f32(vmul_n_f32(vcvt_f32_u32(v1), scale), internal::vrecp_f32(vcvt_f32_u32(v2)))); }
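+ /*
+  * Shape of the helpers above (scalar model, an illustration rather than code
+  * used by the library): 8- and 16-bit vectors are widened recursively until the
+  * quotient can be formed as a float reciprocal multiply, then narrowed back
+  * with saturation (divSaturate*) or modulo wrapping (divWrap*). Per element,
+  * roughly:
+  *
+  *     template <typename T>
+  *     T div_one(T a, T b, f32 scale)  // hypothetical scalar equivalent
+  *     {
+  *         return internal::saturate_cast<T>(scale * (f32)a * (1.0f / (f32)b));
+  *     }
+  *
+  * internal::vrecpq_f32 / internal::vrecp_f32 are reciprocal approximations
+  * defined elsewhere in the library (not shown here), so the last bit of the
+  * quotient may differ from an exact IEEE division.
+  */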
+inline uint8x16_t vtstq(const uint8x16_t & v0, const uint8x16_t & v1) { return vtstq_u8 (v0, v1); } +inline uint16x8_t vtstq(const uint16x8_t & v0, const uint16x8_t & v1) { return vtstq_u16(v0, v1); } +inline uint32x4_t vtstq(const uint32x4_t & v0, const uint32x4_t & v1) { return vtstq_u32(v0, v1); } +inline int8x16_t vtstq(const int8x16_t & v0, const int8x16_t & v1) { return vreinterpretq_s8_u8 (vtstq_s8 (v0, v1)); } +inline int16x8_t vtstq(const int16x8_t & v0, const int16x8_t & v1) { return vreinterpretq_s16_u16(vtstq_s16(v0, v1)); } +inline int32x4_t vtstq(const int32x4_t & v0, const int32x4_t & v1) { return vreinterpretq_s32_u32(vtstq_s32(v0, v1)); } + +inline uint8x8_t vtst(const uint8x8_t & v0, const uint8x8_t & v1) { return vtst_u8 (v0, v1); } +inline uint16x4_t vtst(const uint16x4_t & v0, const uint16x4_t & v1) { return vtst_u16(v0, v1); } +inline uint32x2_t vtst(const uint32x2_t & v0, const uint32x2_t & v1) { return vtst_u32(v0, v1); } +inline int8x8_t vtst(const int8x8_t & v0, const int8x8_t & v1) { return vreinterpret_s8_u8 (vtst_s8 (v0, v1)); } +inline int16x4_t vtst(const int16x4_t & v0, const int16x4_t & v1) { return vreinterpret_s16_u16(vtst_s16(v0, v1)); } +inline int32x2_t vtst(const int32x2_t & v0, const int32x2_t & v1) { return vreinterpret_s32_u32(vtst_s32(v0, v1)); } +#endif + +template <typename T> +void div(const Size2D &size, + const T * src0Base, ptrdiff_t src0Stride, + const T * src1Base, ptrdiff_t src1Stride, + T * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + internal::assertSupportedConfiguration(); + +#ifdef CAROTENE_NEON + typedef typename internal::VecTraits<T>::vec128 vec128; + typedef typename internal::VecTraits<T>::vec64 vec64; + + if (scale == 0.0f || + (std::numeric_limits<T>::is_integer && + (scale * std::numeric_limits<T>::max()) < 1.0f && + (scale * std::numeric_limits<T>::max()) > -1.0f)) + { + for (size_t y = 0; y < size.height; ++y) + { + T * dst = internal::getRowPtr(dstBase, dstStride, y); + std::memset(dst, 0, sizeof(T) * size.width); + } + return; + } + + const size_t step128 = 16 / sizeof(T); + size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0; + const size_t step64 = 8 / sizeof(T); + size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const T * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const T * src1 = internal::getRowPtr(src1Base, src1Stride, i); + T * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + for (; j < roiw128; j += step128) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + + vec128 v_src0 = internal::vld1q(src0 + j); + vec128 v_src1 = internal::vld1q(src1 + j); + + vec128 v_mask = vtstq(v_src1,v_src1); + internal::vst1q(dst + j, internal::vandq(v_mask, divSaturateQ(v_src0, v_src1, scale))); + } + for (; j < roiw64; j += step64) + { + vec64 v_src0 = internal::vld1(src0 + j); + vec64 v_src1 = internal::vld1(src1 + j); + + vec64 v_mask = vtst(v_src1,v_src1); + internal::vst1(dst + j, internal::vand(v_mask,divSaturate(v_src0, v_src1, scale))); + } + for (; j < size.width; j++) + { + dst[j] = src1[j] ? internal::saturate_cast<T>(scale * src0[j] / src1[j]) : 0; + } + } + else // CONVERT_POLICY_WRAP + { + for (; j < roiw128; j += step128) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + + vec128 v_src0 = internal::vld1q(src0 + j); + vec128 v_src1 = internal::vld1q(src1 + j); + + vec128 v_mask = vtstq(v_src1,v_src1); + internal::vst1q(dst + j, internal::vandq(v_mask, divWrapQ(v_src0, v_src1, scale))); + } + for (; j < roiw64; j += step64) + { + vec64 v_src0 = internal::vld1(src0 + j); + vec64 v_src1 = internal::vld1(src1 + j); + + vec64 v_mask = vtst(v_src1,v_src1); + internal::vst1(dst + j, internal::vand(v_mask,divWrap(v_src0, v_src1, scale))); + } + for (; j < size.width; j++) + { + dst[j] = src1[j] ? (T)((s32)trunc(scale * src0[j] / src1[j])) : 0;
+ } + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)cpolicy; + (void)scale; +#endif +} + +#ifdef CAROTENE_NEON + +template <typename T> +inline T recipSaturateQ(const T &v2, const float scale) +{ + return internal::vcombine(internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_low(v2)), scale)), + internal::vqmovn(recipSaturateQ(internal::vmovl(internal::vget_high(v2)), scale)) + ); +} +template <> +inline int32x4_t recipSaturateQ(const int32x4_t &v2, const float scale) +{ return vcvtq_s32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_s32(v2)), scale)); } +template <> +inline uint32x4_t recipSaturateQ(const uint32x4_t &v2, const float scale) +{ return vcvtq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_u32(v2)), scale)); } + +template <typename T> +inline T recipSaturate(const T &v2, const float scale) +{ + return internal::vqmovn(recipSaturateQ(internal::vmovl(v2), scale)); +} +template <> +inline int32x2_t recipSaturate(const int32x2_t &v2, const float scale) +{ return vcvt_s32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_s32(v2)), scale)); } +template <> +inline uint32x2_t recipSaturate(const uint32x2_t &v2, const float scale) +{ return vcvt_u32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_u32(v2)), scale)); } + + +template <typename T> +inline T recipWrapQ(const T &v2, const float scale) +{ + return internal::vcombine(internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_low(v2)), scale)), + internal::vmovn(recipWrapQ(internal::vmovl(internal::vget_high(v2)), scale)) + ); +} +template <> +inline int32x4_t recipWrapQ(const int32x4_t &v2, const float scale) +{ return vcvtq_s32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_s32(v2)), scale)); } +template <> +inline uint32x4_t recipWrapQ(const uint32x4_t &v2, const float scale) +{ return vcvtq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(vcvtq_f32_u32(v2)), scale)); } + +template <typename T> +inline T recipWrap(const T &v2, const float scale) +{ + return internal::vmovn(recipWrapQ(internal::vmovl(v2), scale)); +} +template <> +inline int32x2_t recipWrap(const int32x2_t &v2, const float scale) +{ return vcvt_s32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_s32(v2)), scale)); } +template <> +inline uint32x2_t recipWrap(const uint32x2_t &v2, const float scale) +{ return vcvt_u32_f32(vmul_n_f32(internal::vrecp_f32(vcvt_f32_u32(v2)), scale)); } +#endif
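+ /*
+  * CONVERT_POLICY note (worked example, values chosen for illustration): with
+  * CONVERT_POLICY_SATURATE an out-of-range result clamps through the vqmovn
+  * narrowing chain, e.g. a quotient of 300 stored as s8 becomes 127; with
+  * CONVERT_POLICY_WRAP it is narrowed modulo 2^N through vmovn, so 300 becomes
+  * (s8)300 == 44. In both policies a zero divisor produces 0, because
+  * vtst(v_src1, v_src1) is all-zero exactly in the zero lanes and the result is
+  * ANDed with that mask.
+  */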
+ +template <typename T> +void recip(const Size2D &size, + const T * src1Base, ptrdiff_t src1Stride, + T * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + internal::assertSupportedConfiguration(); + +#ifdef CAROTENE_NEON + typedef typename internal::VecTraits<T>::vec128 vec128; + typedef typename internal::VecTraits<T>::vec64 vec64; + + if (scale == 0.0f || + (std::numeric_limits<T>::is_integer && + scale < 1.0f && + scale > -1.0f)) + { + for (size_t y = 0; y < size.height; ++y) + { + T * dst = internal::getRowPtr(dstBase, dstStride, y); + std::memset(dst, 0, sizeof(T) * size.width); + } + return; + } + + const size_t step128 = 16 / sizeof(T); + size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0; + const size_t step64 = 8 / sizeof(T); + size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const T * src1 = internal::getRowPtr(src1Base, src1Stride, i); + T * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + for (; j < roiw128; j += step128) + { + internal::prefetch(src1 + j); + + vec128 v_src1 = internal::vld1q(src1 + j); + + vec128 v_mask = vtstq(v_src1,v_src1); + internal::vst1q(dst + j, internal::vandq(v_mask, recipSaturateQ(v_src1, scale))); + } + for (; j < roiw64; j += step64) + { + vec64 v_src1 = internal::vld1(src1 + j); + + vec64 v_mask = vtst(v_src1,v_src1); + internal::vst1(dst + j, internal::vand(v_mask, recipSaturate(v_src1, scale))); + } + for (; j < size.width; j++) + { + dst[j] = src1[j] ? internal::saturate_cast<T>(scale / src1[j]) : 0; + } + } + else // CONVERT_POLICY_WRAP + { + for (; j < roiw128; j += step128) + { + internal::prefetch(src1 + j); + + vec128 v_src1 = internal::vld1q(src1 + j); + + vec128 v_mask = vtstq(v_src1,v_src1); + internal::vst1q(dst + j, internal::vandq(v_mask, recipWrapQ(v_src1, scale))); + } + for (; j < roiw64; j += step64) + { + vec64 v_src1 = internal::vld1(src1 + j); + + vec64 v_mask = vtst(v_src1,v_src1); + internal::vst1(dst + j, internal::vand(v_mask, recipWrap(v_src1, scale))); + } + for (; j < size.width; j++) + { + dst[j] = src1[j] ? (T)((s32)trunc(scale / src1[j])) : 0; + } + } + } +#else + (void)size; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)cpolicy; + (void)scale; +#endif +} + +} + +void div(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + u8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + div<u8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy); +} + +void div(const Size2D &size, + const s8 * src0Base, ptrdiff_t src0Stride, + const s8 * src1Base, ptrdiff_t src1Stride, + s8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + div<s8>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy); +} + +void div(const Size2D &size, + const u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + div<u16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy); +} + +void div(const Size2D &size, + const s16 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + div<s16>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy); +} + +void div(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + div<s32>(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy); +} + +void div(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride, + f32 scale) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (scale == 0.0f) + { + for (size_t y = 0; y < size.height; ++y) + { + f32 * dst = internal::getRowPtr(dstBase, dstStride, y); + std::memset(dst, 0, sizeof(f32) * size.width); + } + return; + } + + float32x4_t v_zero = vdupq_n_f32(0.0f); + + size_t roiw128
= size.width >= 3 ? size.width - 3 : 0; + size_t roiw64 = size.width >= 1 ? size.width - 1 : 0; + + if (std::fabs(scale - 1.0f) < FLT_EPSILON) + { + for (size_t i = 0; i < size.height; ++i) + { + const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + f32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw128; j += 4) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + + float32x4_t v_src0 = vld1q_f32(src0 + j); + float32x4_t v_src1 = vld1q_f32(src1 + j); + + uint32x4_t v_mask = vceqq_f32(v_src1,v_zero); + vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32( + vreinterpretq_u32_f32(vmulq_f32(v_src0, internal::vrecpq_f32(v_src1))), v_mask))); + } + + for (; j < roiw64; j += 2) + { + float32x2_t v_src0 = vld1_f32(src0 + j); + float32x2_t v_src1 = vld1_f32(src1 + j); + + uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero)); + vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32( + vreinterpret_u32_f32(vmul_f32(v_src0, internal::vrecp_f32(v_src1))), v_mask))); + } + + for (; j < size.width; j++) + { + dst[j] = src1[j] ? src0[j] / src1[j] : 0.0f; + } + } + } + else + { + for (size_t i = 0; i < size.height; ++i) + { + const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + f32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw128; j += 4) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + + float32x4_t v_src0 = vld1q_f32(src0 + j); + float32x4_t v_src1 = vld1q_f32(src1 + j); + + uint32x4_t v_mask = vceqq_f32(v_src1,v_zero); + vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32( + vreinterpretq_u32_f32(vmulq_f32(vmulq_n_f32(v_src0, scale), + internal::vrecpq_f32(v_src1))), v_mask))); + } + + for (; j < roiw64; j += 2) + { + float32x2_t v_src0 = vld1_f32(src0 + j); + float32x2_t v_src1 = vld1_f32(src1 + j); + + uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero)); + vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32( + vreinterpret_u32_f32(vmul_f32(vmul_n_f32(v_src0, scale), + internal::vrecp_f32(v_src1))), v_mask))); + } + + for (; j < size.width; j++) + { + dst[j] = src1[j] ? 
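+ // Note on this f32 path: the quotient is formed as src0 * vrecp(src1) rather
+ // than an IEEE divide (internal::vrecpq_f32 is a reciprocal approximation
+ // defined elsewhere in the library), and the vceq/vbic pair zeroes lanes whose
+ // divisor is 0.0f instead of returning +/-inf, matching the scalar tail's
+ // `src1[j] ? ... : 0.0f`.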
src0[j] * scale / src1[j] : 0.0f; + } + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)scale; +#endif +} + +void reciprocal(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + recip<u8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy); +} + +void reciprocal(const Size2D &size, + const s8 * srcBase, ptrdiff_t srcStride, + s8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + recip<s8>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy); +} + +void reciprocal(const Size2D &size, + const u16 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + recip<u16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy); +} + +void reciprocal(const Size2D &size, + const s16 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + recip<s16>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy); +} + +void reciprocal(const Size2D &size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + recip<s32>(size, srcBase, srcStride, dstBase, dstStride, scale, cpolicy); +} + +void reciprocal(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase, ptrdiff_t dstStride, + f32 scale) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (scale == 0.0f) + { + for (size_t y = 0; y < size.height; ++y) + { + f32 * dst = internal::getRowPtr(dstBase, dstStride, y); + std::memset(dst, 0, sizeof(f32) * size.width); + } + return; + } + + float32x4_t v_zero = vdupq_n_f32(0.0f); + + size_t roiw128 = size.width >= 3 ? size.width - 3 : 0; + size_t roiw64 = size.width >= 1 ? size.width - 1 : 0; + + if (std::fabs(scale - 1.0f) < FLT_EPSILON) + { + for (size_t i = 0; i < size.height; ++i) + { + const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i); + f32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw128; j += 4) + { + internal::prefetch(src1 + j); + + float32x4_t v_src1 = vld1q_f32(src1 + j); + + uint32x4_t v_mask = vceqq_f32(v_src1,v_zero); + vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32( + vreinterpretq_u32_f32(internal::vrecpq_f32(v_src1)), v_mask))); + } + + for (; j < roiw64; j += 2) + { + float32x2_t v_src1 = vld1_f32(src1 + j); + + uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero)); + vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32( + vreinterpret_u32_f32(internal::vrecp_f32(v_src1)), v_mask))); + } + + for (; j < size.width; j++) + { + dst[j] = src1[j] ?
1.0f / src1[j] : 0; + } + } + } + else + { + for (size_t i = 0; i < size.height; ++i) + { + const f32 * src1 = internal::getRowPtr(srcBase, srcStride, i); + f32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw128; j += 4) + { + internal::prefetch(src1 + j); + + float32x4_t v_src1 = vld1q_f32(src1 + j); + + uint32x4_t v_mask = vceqq_f32(v_src1,v_zero); + vst1q_f32(dst + j, vreinterpretq_f32_u32(vbicq_u32( + vreinterpretq_u32_f32(vmulq_n_f32(internal::vrecpq_f32(v_src1), + scale)),v_mask))); + } + + for (; j < roiw64; j += 2) + { + float32x2_t v_src1 = vld1_f32(src1 + j); + + uint32x2_t v_mask = vceq_f32(v_src1,vget_low_f32(v_zero)); + vst1_f32(dst + j, vreinterpret_f32_u32(vbic_u32( + vreinterpret_u32_f32(vmul_n_f32(internal::vrecp_f32(v_src1), + scale)), v_mask))); + } + + for (; j < size.width; j++) + { + dst[j] = src1[j] ? scale / src1[j] : 0; + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)scale; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/dot_product.cpp b/3rdparty/carotene/src/dot_product.cpp new file mode 100644 index 0000000000..1759ea7cd5 --- /dev/null +++ b/3rdparty/carotene/src/dot_product.cpp @@ -0,0 +1,260 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */ + +#include "common.hpp" + +namespace CAROTENE_NS { + +f64 dotProduct(const Size2D &_size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == src1Stride && + src0Stride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + +// It is possible to accumulate up to 66051 uchar multiplication results in uint32 without overflow +// We process 16 elements and accumulate two new elements per step. So we could handle 66051/2*16 elements +#define DOT_UINT_BLOCKSIZE 66050*8 + f64 result = 0.0; + for (size_t row = 0; row < size.height; ++row) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, row); + const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, row); + + size_t i = 0; + uint64x2_t ws = vmovq_n_u64(0); + + while(i + 16 <= size.width) + { + size_t lim = std::min(i + DOT_UINT_BLOCKSIZE, size.width) - 16; + + uint32x4_t s1 = vmovq_n_u32(0); + uint32x4_t s2 = vmovq_n_u32(0); + + for (; i <= lim; i += 16) + { + internal::prefetch(src0 + i); + internal::prefetch(src1 + i); + + uint8x16_t vs1 = vld1q_u8(src0 + i); + uint8x16_t vs2 = vld1q_u8(src1 + i); + + uint16x8_t vdot1 = vmull_u8(vget_low_u8(vs1), vget_low_u8(vs2)); + uint16x8_t vdot2 = vmull_u8(vget_high_u8(vs1), vget_high_u8(vs2)); + + s1 = vpadalq_u16(s1, vdot1); + s2 = vpadalq_u16(s2, vdot2); + } + + ws = vpadalq_u32(ws, s1); + ws = vpadalq_u32(ws, s2); + } + + if(i + 8 <= size.width) + { + uint8x8_t vs1 = vld1_u8(src0 + i); + uint8x8_t vs2 = vld1_u8(src1 + i); + + ws = vpadalq_u32(ws, vpaddlq_u16(vmull_u8(vs1, vs2))); + i += 8; + } + + result += (double)vget_lane_u64(vadd_u64(vget_low_u64(ws), vget_high_u64(ws)), 0); + + for (; i < size.width; ++i) + result += s32(src0[i]) * s32(src1[i]); + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0; +#endif +} + +f64 dotProduct(const Size2D &_size, + const s8 * src0Base, ptrdiff_t src0Stride, + const s8 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == src1Stride && + src0Stride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + +// It is possible to accumulate up to 131071 schar multiplication results in sint32 without overflow +// We process 16 elements and accumulate two new elements per step. 
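/* each product has magnitude at most 128*128 = 16384, and 2^31 / 16384 = 131072, hence the 131071 bound */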
So we could handle 131071/2*16 elements +#define DOT_INT_BLOCKSIZE 131070*8 + f64 result = 0.0; + for (size_t row = 0; row < size.height; ++row) + { + const s8 * src0 = internal::getRowPtr(src0Base, src0Stride, row); + const s8 * src1 = internal::getRowPtr(src1Base, src1Stride, row); + + size_t i = 0; + int64x2_t ws = vmovq_n_s64(0); + + while(i + 16 <= size.width) + { + size_t lim = std::min(i + DOT_INT_BLOCKSIZE, size.width) - 16; + + int32x4_t s1 = vmovq_n_s32(0); + int32x4_t s2 = vmovq_n_s32(0); + + for (; i <= lim; i += 16) + { + internal::prefetch(src0 + i); + internal::prefetch(src1 + i); + + int8x16_t vs1 = vld1q_s8(src0 + i); + int8x16_t vs2 = vld1q_s8(src1 + i); + + int16x8_t vdot1 = vmull_s8(vget_low_s8(vs1), vget_low_s8(vs2)); + int16x8_t vdot2 = vmull_s8(vget_high_s8(vs1), vget_high_s8(vs2)); + + s1 = vpadalq_s16(s1, vdot1); + s2 = vpadalq_s16(s2, vdot2); + } + + ws = vpadalq_s32(ws, s1); + ws = vpadalq_s32(ws, s2); + } + + if(i + 8 <= size.width) + { + int8x8_t vs1 = vld1_s8(src0 + i); + int8x8_t vs2 = vld1_s8(src1 + i); + + ws = vpadalq_s32(ws, vpaddlq_s16(vmull_s8(vs1, vs2))); + i += 8; + } + + result += (double)vget_lane_s64(vadd_s64(vget_low_s64(ws), vget_high_s64(ws)), 0); + + for (; i < size.width; ++i) + result += s32(src0[i]) * s32(src1[i]); + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0; +#endif +} + +f64 dotProduct(const Size2D &_size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == src1Stride && + src0Stride == (ptrdiff_t)(size.width * sizeof(f32))) + { + size.width *= size.height; + size.height = 1; + } + +#define DOT_FLOAT_BLOCKSIZE (1 << 13) + f64 result = 0.0; + for (size_t row = 0; row < size.height; ++row) + { + const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, row); + const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, row); + + size_t i = 0; + while(i + 4 <= size.width) + { + size_t lim = std::min(i + DOT_FLOAT_BLOCKSIZE, size.width) - 4; + float32x4_t v_sum = vdupq_n_f32(0.0f); + + for( ; i <= lim; i += 4 ) + { + internal::prefetch(src0 + i); + internal::prefetch(src1 + i); + v_sum = vmlaq_f32(v_sum, vld1q_f32(src0 + i), vld1q_f32(src1 + i)); + } + + float32x2_t vres = vpadd_f32(vget_low_f32(v_sum),vget_high_f32(v_sum)); + result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1); + } + + if(i + 2 <= size.width) + { + float32x2_t vres = vmul_f32(vld1_f32(src0 + i), vld1_f32(src1 + i)); + result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1); + i += 2; + } + + for (; i < size.width; ++i) + result += src0[i] * src1[i]; + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/fast.cpp b/3rdparty/carotene/src/fast.cpp new file mode 100644 index 0000000000..9506c1b6be --- /dev/null +++ b/3rdparty/carotene/src/fast.cpp @@ -0,0 +1,428 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + + +/* This is FAST corner detector, contributed to OpenCV by the author, Edward Rosten. + Below is the original copyright and the references */ + +/* +Copyright (c) 2006, 2008 Edward Rosten +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + *Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + *Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + *Neither the name of the University of Cambridge nor the names of + its contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* +The references are: + * Machine learning for high-speed corner detection, + E. Rosten and T. Drummond, ECCV 2006 + * Faster and better: A machine learning approach to corner detection + E. Rosten, R. Porter and T. 
Drummond, PAMI, 2009 +*/ + +#include "common.hpp" + +#include +#include + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON +namespace +{ + +void makeOffsets(ptrdiff_t pixel[], ptrdiff_t row_stride) +{ + pixel[0] = 0 + row_stride * 3; + pixel[1] = 1 + row_stride * 3; + pixel[2] = 2 + row_stride * 2; + pixel[3] = 3 + row_stride * 1; + pixel[4] = 3 + row_stride * 0; + pixel[5] = 3 + row_stride * -1; + pixel[6] = 2 + row_stride * -2; + pixel[7] = 1 + row_stride * -3; + pixel[8] = 0 + row_stride * -3; + pixel[9] = -1 + row_stride * -3; + pixel[10] = -2 + row_stride * -2; + pixel[11] = -3 + row_stride * -1; + pixel[12] = -3 + row_stride * 0; + pixel[13] = -3 + row_stride * 1; + pixel[14] = -2 + row_stride * 2; + pixel[15] = -1 + row_stride * 3; +} + +u8 cornerScore(const u8* ptr, const ptrdiff_t pixel[]) +{ + const s32 K = 8, N = 16 + K + 1; + s32 k, v = ptr[0]; + s16 d[(N + 7) & ~7]; + for( k = 0; k < N; k++ ) + d[k] = (s16)(v - ptr[pixel[k]]); + + int16x8_t q0 = vdupq_n_s16((s16)(-1000)); + int16x8_t q1 = vdupq_n_s16((s16)(1000)); + + int16x8_t d0_7 = vld1q_s16(d + 0); + int16x8_t d8_15 = vld1q_s16(d + 8); + int16x8_t d16_23 = vld1q_s16(d + 16); + int16x8_t d24 = vld1q_s16(d + 24); + + //k == 0 + int16x8_t v0k0 = vextq_s16(d0_7, d8_15, 1); + int16x8_t v1k0 = vextq_s16(d0_7, d8_15, 2); + int16x8_t ak0 = vminq_s16(v0k0, v1k0); + int16x8_t bk0 = vmaxq_s16(v0k0, v1k0); + + v0k0 = vextq_s16(d0_7, d8_15, 3); + ak0 = vminq_s16(ak0, v0k0); + bk0 = vmaxq_s16(bk0, v0k0); + + v1k0 = vextq_s16(d0_7, d8_15, 4); + ak0 = vminq_s16(ak0, v1k0); + bk0 = vmaxq_s16(bk0, v1k0); + + v0k0 = vextq_s16(d0_7, d8_15, 5); + ak0 = vminq_s16(ak0, v0k0); + bk0 = vmaxq_s16(bk0, v0k0); + + v1k0 = vextq_s16(d0_7, d8_15, 6); + ak0 = vminq_s16(ak0, v1k0); + bk0 = vmaxq_s16(bk0, v1k0); + + v0k0 = vextq_s16(d0_7, d8_15, 7); + ak0 = vminq_s16(ak0, v0k0); + bk0 = vmaxq_s16(bk0, v0k0); + + ak0 = vminq_s16(ak0, d8_15); + bk0 = vmaxq_s16(bk0, d8_15); + + q0 = vmaxq_s16(q0, vminq_s16(ak0, d0_7)); + q1 = vminq_s16(q1, vmaxq_s16(bk0, d0_7)); + + v1k0 = vextq_s16(d8_15, d16_23, 1); + q0 = vmaxq_s16(q0, vminq_s16(ak0, v1k0)); + q1 = vminq_s16(q1, vmaxq_s16(bk0, v1k0)); + + //k == 8 + int16x8_t v0k8 = v1k0; + int16x8_t v1k8 = vextq_s16(d8_15, d16_23, 2); + int16x8_t ak8 = vminq_s16(v0k8, v1k8); + int16x8_t bk8 = vmaxq_s16(v0k8, v1k8); + + v0k8 = vextq_s16(d8_15, d16_23, 3); + ak8 = vminq_s16(ak8, v0k8); + bk8 = vmaxq_s16(bk8, v0k8); + + v1k8 = vextq_s16(d8_15, d16_23, 4); + ak8 = vminq_s16(ak8, v1k8); + bk8 = vmaxq_s16(bk8, v1k8); + + v0k8 = vextq_s16(d8_15, d16_23, 5); + ak8 = vminq_s16(ak8, v0k8); + bk8 = vmaxq_s16(bk8, v0k8); + + v1k8 = vextq_s16(d8_15, d16_23, 6); + ak8 = vminq_s16(ak8, v1k8); + bk8 = vmaxq_s16(bk8, v1k8); + + v0k8 = vextq_s16(d8_15, d16_23, 7); + ak8 = vminq_s16(ak8, v0k8); + bk8 = vmaxq_s16(bk8, v0k8); + + ak8 = vminq_s16(ak8, d16_23); + bk8 = vmaxq_s16(bk8, d16_23); + + q0 = vmaxq_s16(q0, vminq_s16(ak8, d8_15)); + q1 = vminq_s16(q1, vmaxq_s16(bk8, d8_15)); + + v1k8 = vextq_s16(d16_23, d24, 1); + q0 = vmaxq_s16(q0, vminq_s16(ak8, v1k8)); + q1 = vminq_s16(q1, vmaxq_s16(bk8, v1k8)); + + //fin + int16x8_t q = vmaxq_s16(q0, vsubq_s16(vmovq_n_s16(0), q1)); + int16x4_t q2 = vmax_s16(vget_low_s16(q), vget_high_s16(q)); + int32x4_t q2w = vmovl_s16(q2); + int32x2_t q4 = vmax_s32(vget_low_s32(q2w), vget_high_s32(q2w)); + int32x2_t q8 = vmax_s32(q4, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(q4), 32))); + + return (u8)(vget_lane_s32(q8, 0) - 1); +} + +} //namespace +#endif + +void FAST(const Size2D &size, + u8 *srcBase, 
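/* 8-bit grayscale input */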
ptrdiff_t srcStride, + KeypointStore *keypoints, + u8 threshold, bool nonmax_suppression) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + //keypoints.clear(); + + const s32 K = 8, N = 16 + K + 1; + ptrdiff_t i, j, k, pixel[N]; + makeOffsets(pixel, srcStride); + for(k = 16; k < N; k++) + pixel[k] = pixel[k - 16]; + + uint8x16_t delta = vdupq_n_u8(128); + uint8x16_t t = vdupq_n_u8(threshold); + uint8x16_t K16 = vdupq_n_u8((u8)K); + + u8 threshold_tab[512]; + for( i = -255; i <= 255; i++ ) + threshold_tab[i+255] = (u8)(i < -threshold ? 1 : i > threshold ? 2 : 0); + + std::vector _buf((size.width+16)*3*(sizeof(ptrdiff_t) + sizeof(u8)) + 128); + u8* buf[3]; + buf[0] = &_buf[0]; buf[1] = buf[0] + size.width; buf[2] = buf[1] + size.width; + ptrdiff_t* cpbuf[3]; + cpbuf[0] = (ptrdiff_t*)internal::alignPtr(buf[2] + size.width, sizeof(ptrdiff_t)) + 1; + cpbuf[1] = cpbuf[0] + size.width + 1; + cpbuf[2] = cpbuf[1] + size.width + 1; + memset(buf[0], 0, size.width*3); + + for(i = 3; i < (ptrdiff_t)size.height-2; i++) + { + const u8* ptr = internal::getRowPtr(srcBase, srcStride, i) + 3; + u8* curr = buf[(i - 3)%3]; + ptrdiff_t* cornerpos = cpbuf[(i - 3)%3]; + memset(curr, 0, size.width); + ptrdiff_t ncorners = 0; + + if( i < (ptrdiff_t)size.height - 3 ) + { + j = 3; + + for(; j < (ptrdiff_t)size.width - 16 - 3; j += 16, ptr += 16) + { + internal::prefetch(ptr); + internal::prefetch(ptr + pixel[0]); + internal::prefetch(ptr + pixel[2]); + + uint8x16_t v0 = vld1q_u8(ptr); + int8x16_t v1 = vreinterpretq_s8_u8(veorq_u8(vqsubq_u8(v0, t), delta)); + int8x16_t v2 = vreinterpretq_s8_u8(veorq_u8(vqaddq_u8(v0, t), delta)); + + int8x16_t x0 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[0]), delta)); + int8x16_t x1 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[4]), delta)); + int8x16_t x2 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[8]), delta)); + int8x16_t x3 = vreinterpretq_s8_u8(vsubq_u8(vld1q_u8(ptr + pixel[12]), delta)); + + uint8x16_t m0 = vandq_u8(vcgtq_s8(x0, v2), vcgtq_s8(x1, v2)); + uint8x16_t m1 = vandq_u8(vcgtq_s8(v1, x0), vcgtq_s8(v1, x1)); + m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x1, v2), vcgtq_s8(x2, v2))); + m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x1), vcgtq_s8(v1, x2))); + m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x2, v2), vcgtq_s8(x3, v2))); + m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x2), vcgtq_s8(v1, x3))); + m0 = vorrq_u8(m0, vandq_u8(vcgtq_s8(x3, v2), vcgtq_s8(x0, v2))); + m1 = vorrq_u8(m1, vandq_u8(vcgtq_s8(v1, x3), vcgtq_s8(v1, x0))); + m0 = vorrq_u8(m0, m1); + + u64 mask[2]; + vst1q_u64(mask, vreinterpretq_u64_u8(m0)); + + if( mask[0] == 0 ) + { + if (mask[1] != 0) + { + j -= 8; + ptr -= 8; + } + continue; + } + + uint8x16_t c0 = vmovq_n_u8(0); + uint8x16_t c1 = vmovq_n_u8(0); + uint8x16_t max0 = vmovq_n_u8(0); + uint8x16_t max1 = vmovq_n_u8(0); + for( k = 0; k < N; k++ ) + { + int8x16_t x = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(ptr + pixel[k]), delta)); + m0 = vcgtq_s8(x, v2); + m1 = vcgtq_s8(v1, x); + + c0 = vandq_u8(vsubq_u8(c0, m0), m0); + c1 = vandq_u8(vsubq_u8(c1, m1), m1); + + max0 = vmaxq_u8(max0, c0); + max1 = vmaxq_u8(max1, c1); + } + + max0 = vmaxq_u8(max0, max1); + u8 m[16]; + vst1q_u8(m, vcgtq_u8(max0, K16)); + + for( k = 0; k < 16; ++k ) + if(m[k]) + { + cornerpos[ncorners++] = j+k; + if(nonmax_suppression) + curr[j+k] = cornerScore(ptr+k, pixel); + } + } + + for( ; j < (s32)size.width - 3; j++, ptr++ ) + { + s32 v = ptr[0]; + const u8* tab = &threshold_tab[0] - v + 255; + s32 d = tab[ptr[pixel[0]]] | tab[ptr[pixel[8]]]; + + if( d == 0 ) + 
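/* a run of 9 contiguous circle pixels must include pixel 0 or 8, so if both match the centre no corner is possible */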
continue; + + d &= tab[ptr[pixel[2]]] | tab[ptr[pixel[10]]]; + d &= tab[ptr[pixel[4]]] | tab[ptr[pixel[12]]]; + d &= tab[ptr[pixel[6]]] | tab[ptr[pixel[14]]]; + + if( d == 0 ) + continue; + + d &= tab[ptr[pixel[1]]] | tab[ptr[pixel[9]]]; + d &= tab[ptr[pixel[3]]] | tab[ptr[pixel[11]]]; + d &= tab[ptr[pixel[5]]] | tab[ptr[pixel[13]]]; + d &= tab[ptr[pixel[7]]] | tab[ptr[pixel[15]]]; + + if( d & 1 ) + { + s32 vt = v - threshold, count = 0; + + for( k = 0; k < N; k++ ) + { + s32 x = ptr[pixel[k]]; + if(x < vt) + { + if( ++count > K ) + { + cornerpos[ncorners++] = j; + if(nonmax_suppression) + curr[j] = cornerScore(ptr, pixel); + break; + } + } + else + count = 0; + } + } + + if( d & 2 ) + { + s32 vt = v + threshold, count = 0; + + for( k = 0; k < N; k++ ) + { + s32 x = ptr[pixel[k]]; + if(x > vt) + { + if( ++count > K ) + { + cornerpos[ncorners++] = j; + if(nonmax_suppression) + curr[j] = cornerScore(ptr, pixel); + break; + } + } + else + count = 0; + } + } + } + } + + cornerpos[-1] = ncorners; + + if( i == 3 ) + continue; + + const u8* prev = buf[(i - 4 + 3)%3]; + const u8* pprev = buf[(i - 5 + 3)%3]; + cornerpos = cpbuf[(i - 4 + 3)%3]; + ncorners = cornerpos[-1]; + + for( k = 0; k < ncorners; k++ ) + { + j = cornerpos[k]; + s32 score = prev[j]; + if( !nonmax_suppression || + (score > prev[j+1] && score > prev[j-1] && + score > pprev[j-1] && score > pprev[j] && score > pprev[j+1] && + score > curr[j-1] && score > curr[j] && score > curr[j+1]) ) + { + keypoints->push((f32)j, (f32)(i-1), 7.f, -1, (f32)score); + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)keypoints; + (void)threshold; + (void)nonmax_suppression; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/fill_minmaxloc.cpp b/3rdparty/carotene/src/fill_minmaxloc.cpp new file mode 100644 index 0000000000..fdf0e35d03 --- /dev/null +++ b/3rdparty/carotene/src/fill_minmaxloc.cpp @@ -0,0 +1,442 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +template +void process(const T * src, size_t j0, size_t j1, size_t i, + T minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + T maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity) +{ + for (size_t j = j0; j < j1; ++j) + { + T val = src[j]; + + if (val == maxVal) + { + if (maxLocCount < maxLocCapacity) + { + maxLocPtr[maxLocCount] = j; + maxLocPtr[maxLocCount + 1] = i; + } + maxLocCount += 2; + } + + if (val == minVal) + { + if (minLocCount < minLocCapacity) + { + minLocPtr[minLocCount] = j; + minLocPtr[minLocCount + 1] = i; + } + minLocCount += 2; + } + } +} + +} // namespace + +#endif + +void fillMinMaxLocs(const Size2D & size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + u8 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + uint8x16_t v_maxval16 = vdupq_n_u8(maxVal), v_minval16 = vdupq_n_u8(minVal); + uint8x8_t v_maxval8 = vdup_n_u8(maxVal), v_minval8 = vdup_n_u8(minVal); + + u64 mask[2] = { 0ul }; + + minLocCapacity <<= 1; + maxLocCapacity <<= 1; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + size_t j = 0; + + for ( ; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint8x16_t v_src = vld1q_u8(src + j); + + uint8x16_t v_maxmask = vceqq_u8(v_src, v_maxval16); + uint8x16_t v_minmask = vceqq_u8(v_src, v_minval16); + uint8x16_t v_mask = vorrq_u8(v_maxmask, v_minmask); + + vst1q_u8((u8 *)&mask[0], v_mask); + + if (mask[0]) + process(src, j, j + 8, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + if (mask[1]) + process(src, j + 8, j + 16, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + for ( ; j < roiw8; j += 8) + { + uint8x8_t v_src = vld1_u8(src + j); + + uint8x8_t v_maxmask = vceq_u8(v_src, v_maxval8); + uint8x8_t v_minmask = vceq_u8(v_src, v_minval8); + uint8x8_t v_mask = vorr_u8(v_maxmask, v_minmask); + + vst1_u8((u8 *)&mask[0], v_mask); + + if (mask[0]) + process(src, j, j + 8, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + process(src, j, size.width, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + minLocCount >>= 1; + maxLocCount >>= 1; +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minLocPtr; + (void)minLocCount; + (void)minLocCapacity; + (void)maxVal; + (void)maxLocPtr; + (void)maxLocCount; + (void)maxLocCapacity; +#endif +} + +void fillMinMaxLocs(const Size2D & size, + const u16 * srcBase, ptrdiff_t 
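/* row stride in bytes */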
srcStride, + u16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + u16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + uint16x8_t v_maxval8 = vdupq_n_u16(maxVal), + v_minval8 = vdupq_n_u16(minVal); + u64 mask[2] = { 0ul }; + + minLocCapacity <<= 1; + maxLocCapacity <<= 1; + + for (size_t i = 0; i < size.height; ++i) + { + const u16 * src = internal::getRowPtr(srcBase, srcStride, i); + size_t j = 0; + + for ( ; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8); + + uint16x8_t v_mask0 = vorrq_u16(vceqq_u16(v_src0, v_maxval8), vceqq_u16(v_src0, v_minval8)); + uint16x8_t v_mask1 = vorrq_u16(vceqq_u16(v_src1, v_maxval8), vceqq_u16(v_src1, v_minval8)); + + vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1))); + + if (mask[0]) + process(src, j, j + 8, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + if (mask[1]) + process(src, j + 8, j + 16, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + for ( ; j < roiw8; j += 8) + { + internal::prefetch(src + j); + uint16x8_t v_src = vld1q_u16(src + j); + + uint16x8_t v_maxmask = vceqq_u16(v_src, v_maxval8); + uint16x8_t v_minmask = vceqq_u16(v_src, v_minval8); + uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask); + + vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask)); + + if (mask[0]) + process(src, j, j + 8, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + process(src, j, size.width, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + minLocCount >>= 1; + maxLocCount >>= 1; +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minLocPtr; + (void)minLocCount; + (void)minLocCapacity; + (void)maxVal; + (void)maxLocPtr; + (void)maxLocCount; + (void)maxLocCapacity; +#endif +} + +void fillMinMaxLocs(const Size2D & size, + const s16 * srcBase, ptrdiff_t srcStride, + s16 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + s16 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? 
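/* largest start index that still leaves a full 8-lane load in range */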
size.width - 7 : 0; + + int16x8_t v_maxval8 = vdupq_n_s16(maxVal), + v_minval8 = vdupq_n_s16(minVal); + u64 mask[2] = { 0ul }; + + minLocCapacity <<= 1; + maxLocCapacity <<= 1; + + for (size_t i = 0; i < size.height; ++i) + { + const s16 * src = internal::getRowPtr(srcBase, srcStride, i); + size_t j = 0; + + for ( ; j < roiw16; j += 16) + { + internal::prefetch(src + j); + int16x8_t v_src0 = vld1q_s16(src + j), v_src1 = vld1q_s16(src + j + 8); + + uint16x8_t v_mask0 = vorrq_u16(vceqq_s16(v_src0, v_maxval8), vceqq_s16(v_src0, v_minval8)); + uint16x8_t v_mask1 = vorrq_u16(vceqq_s16(v_src1, v_maxval8), vceqq_s16(v_src1, v_minval8)); + + vst1q_u8((u8 *)&mask[0], vcombine_u8(vmovn_u16(v_mask0), vmovn_u16(v_mask1))); + + if (mask[0]) + process(src, j, j + 8, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + if (mask[1]) + process(src, j + 8, j + 16, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + for ( ; j < roiw8; j += 8) + { + internal::prefetch(src + j); + int16x8_t v_src = vld1q_s16(src + j); + + uint16x8_t v_maxmask = vceqq_s16(v_src, v_maxval8); + uint16x8_t v_minmask = vceqq_s16(v_src, v_minval8); + uint16x8_t v_mask = vorrq_u16(v_maxmask, v_minmask); + + vst1_u8((u8 *)&mask[0], vmovn_u16(v_mask)); + + if (mask[0]) + process(src, j, j + 8, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + process(src, j, size.width, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + minLocCount >>= 1; + maxLocCount >>= 1; +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minLocPtr; + (void)minLocCount; + (void)minLocCapacity; + (void)maxVal; + (void)maxLocPtr; + (void)maxLocCount; + (void)maxLocCapacity; +#endif +} + +void fillMinMaxLocs(const Size2D & size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + s32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw8 = size.width >= 7 ? 
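/* the guard keeps size.width - 7 from wrapping when the row is narrow */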
size.width - 7 : 0; + + int32x4_t v_maxval4 = vdupq_n_s32(maxVal), + v_minval4 = vdupq_n_s32(minVal); + u64 mask = 0ul; + + minLocCapacity <<= 1; + maxLocCapacity <<= 1; + + for (size_t i = 0; i < size.height; ++i) + { + const s32 * src = internal::getRowPtr(srcBase, srcStride, i); + size_t j = 0; + + for ( ; j < roiw8; j += 8) + { + internal::prefetch(src + j); + int32x4_t v_src0 = vld1q_s32(src + j), v_src1 = vld1q_s32(src + j + 4); + + uint32x4_t v_mask0 = vorrq_u32(vceqq_s32(v_src0, v_maxval4), vceqq_s32(v_src0, v_minval4)); + uint32x4_t v_mask1 = vorrq_u32(vceqq_s32(v_src1, v_maxval4), vceqq_s32(v_src1, v_minval4)); + + vst1_u8((u8 *)&mask, vmovn_u16(vcombine_u16(vmovn_u32(v_mask0), vmovn_u32(v_mask1)))); + + if (mask) + process(src, j, j + 8, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + process(src, j, size.width, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + minLocCount >>= 1; + maxLocCount >>= 1; +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minLocPtr; + (void)minLocCount; + (void)minLocCapacity; + (void)maxVal; + (void)maxLocPtr; + (void)maxLocCount; + (void)maxLocCapacity; +#endif +} + +void fillMinMaxLocs(const Size2D & size, + const u32 * srcBase, ptrdiff_t srcStride, + u32 minVal, size_t * minLocPtr, s32 & minLocCount, s32 minLocCapacity, + u32 maxVal, size_t * maxLocPtr, s32 & maxLocCount, s32 maxLocCapacity) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + uint32x4_t v_maxval4 = vdupq_n_u32(maxVal), + v_minval4 = vdupq_n_u32(minVal); + u64 mask = 0ul; + + minLocCapacity <<= 1; + maxLocCapacity <<= 1; + + for (size_t i = 0; i < size.height; ++i) + { + const u32 * src = internal::getRowPtr(srcBase, srcStride, i); + size_t j = 0; + + for ( ; j < roiw8; j += 8) + { + internal::prefetch(src + j); + uint32x4_t v_src0 = vld1q_u32(src + j), v_src1 = vld1q_u32(src + j + 4); + + uint32x4_t v_mask0 = vorrq_u32(vceqq_u32(v_src0, v_maxval4), vceqq_u32(v_src0, v_minval4)); + uint32x4_t v_mask1 = vorrq_u32(vceqq_u32(v_src1, v_maxval4), vceqq_u32(v_src1, v_minval4)); + + vst1_u8((u8 *)&mask, vmovn_u16(vcombine_u16(vmovn_u32(v_mask0), vmovn_u32(v_mask1)))); + + if (mask) + process(src, j, j + 8, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + process(src, j, size.width, i, + minVal, minLocPtr, minLocCount, minLocCapacity, + maxVal, maxLocPtr, maxLocCount, maxLocCapacity); + } + + minLocCount >>= 1; + maxLocCount >>= 1; +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minLocPtr; + (void)minLocCount; + (void)minLocCapacity; + (void)maxVal; + (void)maxLocPtr; + (void)maxLocCount; + (void)maxLocCapacity; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/flip.cpp b/3rdparty/carotene/src/flip.cpp new file mode 100644 index 0000000000..339398dd92 --- /dev/null +++ b/3rdparty/carotene/src/flip.cpp @@ -0,0 +1,222 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. 
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "vtransform.hpp" + +#include + +namespace CAROTENE_NS { + +bool isFlipSupported(FLIP_MODE flipMode, u32 elemSize) +{ + bool supportedElemSize = (elemSize == 1) || (elemSize == 2) || (elemSize == 3) || (elemSize == 4); + return isSupportedConfiguration() && + ((supportedElemSize && ((flipMode == FLIP_BOTH_MODE) || (flipMode == FLIP_HORIZONTAL_MODE))) || + (flipMode == FLIP_VERTICAL_MODE)); +} + +#ifdef CAROTENE_NEON + +namespace { + +template +void flip(const Size2D & size, + const void * srcBase, ptrdiff_t srcStride, + void * dstBase, ptrdiff_t dstStride, + FLIP_MODE flipMode) +{ + using namespace internal; + + typedef typename VecTraits::vec128 vec128; + typedef typename VecTraits::vec64 vec64; + + u32 step_base = 16 / sizeof(T), step_tail = 8 / sizeof(T); + size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0; + size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const T * src = getRowPtr((const T *)srcBase, srcStride, i); + T * dst = getRowPtr((T *)dstBase, dstStride, (flipMode & FLIP_VERTICAL_MODE) != 0 ? 
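/* FLIP_VERTICAL selects the mirrored destination row */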
size.height - i - 1 : i); + size_t js = 0, jd = size.width; + + for (; js < roiw_base; js += step_base, jd -= step_base) + { + prefetch(src + js); + + vec128 v_src = vld1q(src + js); + vec128 v_dst = vrev64q(v_src); + v_dst = vcombine(vget_high(v_dst), vget_low(v_dst)); + vst1q(dst + jd - step_base, v_dst); + } + for (; js < roiw_tail; js += step_tail, jd -= step_tail) + { + vec64 v_src = vld1(src + js); + vst1(dst + jd - step_tail, vrev64(v_src)); + } + + for (--jd; js < size.width; ++js, --jd) + dst[jd] = src[js]; + } +} + +template +void flip3(const Size2D & size, + const void * srcBase, ptrdiff_t srcStride, + void * dstBase, ptrdiff_t dstStride, + FLIP_MODE flipMode) +{ + using namespace internal; + +#ifndef ANDROID + typedef typename VecTraits::vec128 vec128; +#endif + typedef typename VecTraits::vec64 vec64; + +#ifndef ANDROID + u32 step_base = 16 / sizeof(T), step_base3 = step_base * 3; + size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0; +#endif + u32 step_tail = 8 / sizeof(T), step_tail3 = step_tail * 3; + size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const T * src = getRowPtr((const T *)srcBase, srcStride, i); + T * dst = getRowPtr((T *)dstBase, dstStride, (flipMode & FLIP_VERTICAL_MODE) != 0 ? size.height - i - 1 : i); + size_t j = 0, js = 0, jd = size.width * 3; + +#ifndef ANDROID + for (; j < roiw_base; j += step_base, js += step_base3, jd -= step_base3) + { + prefetch(src + js); + + vec128 v_src = vld3q(src + js), v_dst; + v_src.val[0] = vrev64q(v_src.val[0]); + v_src.val[1] = vrev64q(v_src.val[1]); + v_src.val[2] = vrev64q(v_src.val[2]); + + v_dst.val[0] = vcombine(vget_high(v_src.val[0]), vget_low(v_src.val[0])); + v_dst.val[1] = vcombine(vget_high(v_src.val[1]), vget_low(v_src.val[1])); + v_dst.val[2] = vcombine(vget_high(v_src.val[2]), vget_low(v_src.val[2])); + + vst3q(dst + jd - step_base3, v_dst); + } +#endif // ANDROID + + for (; j < roiw_tail; j += step_tail, js += step_tail3, jd -= step_tail3) + { + vec64 v_src = vld3(src + js), v_dst; + v_dst.val[0] = vrev64(v_src.val[0]); + v_dst.val[1] = vrev64(v_src.val[1]); + v_dst.val[2] = vrev64(v_src.val[2]); + + vst3(dst + jd - step_tail3, v_dst); + } + + for (jd -= 3; j < size.width; ++j, js += 3, jd -= 3) + { + dst[jd] = src[js]; + dst[jd + 1] = src[js + 1]; + dst[jd + 2] = src[js + 2]; + } + } +} + +typedef void (* flipFunc)(const Size2D &size, + const void * srcBase, ptrdiff_t srcStride, + void * dstBase, ptrdiff_t dstStride, + FLIP_MODE flipMode); + +} // namespace + +#endif + +void flip(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + FLIP_MODE flipMode, u32 elemSize) +{ + internal::assertSupportedConfiguration(isFlipSupported(flipMode, elemSize)); +#ifdef CAROTENE_NEON + + if (flipMode == FLIP_VERTICAL_MODE) + { + for (size_t y = 0; y < size.height; ++y) + { + const u8 * src_row = internal::getRowPtr(srcBase, srcStride, y); + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, size.height - y - 1); + + std::memcpy(dst_row, src_row, elemSize * size.width); + } + return; + } + + flipFunc func = NULL; + + if (elemSize == (u32)sizeof(u8)) + func = &flip; + if (elemSize == (u32)sizeof(u16)) + func = &flip; + if (elemSize == (u32)sizeof(u32)) + func = &flip; + if (elemSize == (u32)sizeof(u8) * 3) + func = &flip3; + + if (func == NULL) + return; + + func(size, + srcBase, srcStride, + dstBase, dstStride, + flipMode); + +#else + (void)size; + 
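/* the void casts silence unused-argument warnings when NEON is compiled out */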
(void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)flipMode; + (void)elemSize; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/gaussian_blur.cpp b/3rdparty/carotene/src/gaussian_blur.cpp new file mode 100644 index 0000000000..069373e419 --- /dev/null +++ b/3rdparty/carotene/src/gaussian_blur.cpp @@ -0,0 +1,1059 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "saturate_cast.hpp" +#include "separable_filter.hpp" + +namespace CAROTENE_NS { + +bool isGaussianBlur3x3Supported(const Size2D &size, BORDER_MODE border) +{ + return isSupportedConfiguration() && size.width >= 8 && + (border == BORDER_MODE_CONSTANT || + border == BORDER_MODE_REPLICATE); +} + +void gaussianBlur3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue) +{ + internal::assertSupportedConfiguration(isGaussianBlur3x3Supported(size, border)); +#ifdef CAROTENE_NEON + const uint16x8_t v_border_x4 = vdupq_n_u16(borderValue << 2); + const uint16x8_t v_zero = vdupq_n_u16(0); + const uint8x8_t v_border = vdup_n_u8(borderValue); + + uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero; + uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero; + + ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height; + + for (ptrdiff_t y = 0; y < height; ++y) + { + const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? 
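/* a NULL row pointer stands for the constant border */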
NULL : internal::getRowPtr(srcBase, srcStride, std::max(y - 1, 0)); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y); + const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1)); + u8 * drow = internal::getRowPtr(dstBase, dstStride, y); + + s16 prevx = 0, currx = 0, nextx = 0; + ptrdiff_t x = 0; + const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8); + + // perform vertical convolution + for ( ; x <= bwidth; x += 8) + { + internal::prefetch(srow0 + x); + internal::prefetch(srow1 + x); + internal::prefetch(srow2 + x); + + uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x); + uint8x8_t x1 = vld1_u8(srow1 + x); + uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x); + + // calculate values for plain CPU part below if needed + if (x + 8 >= bwidth) + { + ptrdiff_t x3 = x == width ? width - 1 : x; + ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max(x3 - 1, 0); + + if (border == BORDER_MODE_CONSTANT && x4 < 0) + prevx = borderValue; + else + prevx = (srow2 ? srow2[x4] : borderValue) + (srow1[x4] << 1) + (srow0 ? srow0[x4] : borderValue); + + currx = (srow2 ? srow2[x3] : borderValue) + (srow1[x3] << 1) + (srow0 ? srow0[x3] : borderValue); + } + + // make shift + if (x) + { + tprev = tcurr; + tcurr = tnext; + } + + // and calculate next value + tnext = vaddq_u16(vaddl_u8(x0, x2), vshll_n_u8(x1, 1)); + + // make extrapolation for the first elements + if (!x) + { + // make border + if (border == BORDER_MODE_CONSTANT) + tcurr = v_border_x4; + else if (border == BORDER_MODE_REPLICATE) + tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0)); + + continue; + } + + // combine 3 "shifted" vectors + t0 = vextq_u16(tprev, tcurr, 7); + t1 = tcurr; + t2 = vextq_u16(tcurr, tnext, 1); + + // and add them + t0 = vqaddq_u16(vshlq_n_u16(t1, 1), vqaddq_u16(t0, t2)); + vst1_u8(drow + x - 8, vshrn_n_u16(t0, 4)); + } + + x -= 8; + if (x == width) + --x; + + for ( ; x < width; ++x) + { + // make extrapolation for the last elements + if (x + 1 >= width) + { + if (border == BORDER_MODE_CONSTANT) + nextx = borderValue << 2; + else if (border == BORDER_MODE_REPLICATE) + nextx = srow2[x] + (srow1[x] << 1) + srow0[x]; + } + else + nextx = (srow2 ? srow2[x + 1] : borderValue) + + (srow1[x + 1] << 1) + + (srow0 ? 
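/* absent rows contribute borderValue */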
srow0[x + 1] : borderValue); + + f32 val = (prevx + (currx << 1) + nextx) >> 4; + drow[x] = internal::saturate_cast((s32)val); + + // make shift + prevx = currx; + currx = nextx; + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; +#endif +} + +bool isGaussianBlur3x3MarginSupported(const Size2D &size, BORDER_MODE border, Margin borderMargin) +{ + return isSeparableFilter3x3Supported(size, border, 0, 0, borderMargin); +} + +void gaussianBlur3x3Margin(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue, Margin borderMargin) +{ + internal::assertSupportedConfiguration(isGaussianBlur3x3MarginSupported(size, border, borderMargin)); +#ifdef CAROTENE_NEON + internal::sepFilter3x3::process( + size, srcBase, srcStride, dstBase, dstStride, + 0, 0, border, borderValue, borderMargin); +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; +#endif +} + +bool isGaussianBlur5x5Supported(const Size2D &size, s32 cn, BORDER_MODE border) +{ + return isSupportedConfiguration() && + cn > 0 && cn <= 4 && + size.width >= 8 && size.height >= 2 && + (border == BORDER_MODE_CONSTANT || + border == BORDER_MODE_REFLECT101 || + border == BORDER_MODE_REFLECT || + border == BORDER_MODE_REPLICATE || + border == BORDER_MODE_WRAP); +} + +void gaussianBlur5x5(const Size2D &size, s32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, u8 borderValue, Margin borderMargin) +{ + internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType)); +#ifdef CAROTENE_NEON + size_t colsn = size.width * cn; + + std::vector _tmp; + u8 *tmp = 0; + if (borderType == BORDER_MODE_CONSTANT) + { + _tmp.assign(colsn + 4*cn, borderValue); + tmp = &_tmp[cn << 1]; + } + + ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + + //1-line buffer + std::vector _buf(cn * (size.width + 4) + 32 / sizeof(u16)); + u16* lane = internal::alignPtr(&_buf[cn << 1], 32); + + if (borderType == BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lane[-cn+k] = borderValue; + lane[-cn-cn+k] = borderValue; + lane[colsn+k] = borderValue; + lane[colsn+cn+k] = borderValue; + } + + uint8x8_t vc6u8 = vmov_n_u8(6); + uint16x8_t vc6u16 = vmovq_n_u16(6); + uint16x8_t vc4u16 = vmovq_n_u16(4); + + for (size_t i = 0; i < size.height; ++i) + { + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + //vertical convolution + ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom); + + const u8* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top 
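/* rows interpolated outside the margin fall back to the borderValue scratch line tmp */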
? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp; + const u8* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp; + const u8* ln2 = internal::getRowPtr(srcBase, srcStride, i); + const u8* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp; + const u8* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp; + + size_t x = 0; + for (; x <= colsn - 8; x += 8) + { + internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2)); + uint8x8_t v0 = vld1_u8(ln0+x); + uint8x8_t v1 = vld1_u8(ln1+x); + uint8x8_t v2 = vld1_u8(ln2+x); + uint8x8_t v3 = vld1_u8(ln3+x); + uint8x8_t v4 = vld1_u8(ln4+x); + + uint16x8_t v = vaddl_u8(v0, v4); + uint16x8_t v13 = vaddl_u8(v1, v3); + + v = vmlal_u8(v, v2, vc6u8); + v = vmlaq_u16(v, v13, vc4u16); + + vst1q_u16(lane + x, v); + } + for (; x < colsn; ++x) + lane[x] = ln0[x] + ln4[x] + u16(4) * (ln1[x] + ln3[x]) + u16(6) * ln2[x]; + + //left&right borders + if (borderType != BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lane[-cn+k] = lane[idx_l1 + k]; + lane[-cn-cn+k] = lane[idx_l2 + k]; + + lane[colsn+k] = lane[idx_r1 + k]; + lane[colsn+cn+k] = lane[idx_r2 + k]; + } + + //horizontal convolution + x = 0; + switch(cn) + { + case 1: + for (; x <= colsn - 8; x += 8) + { + internal::prefetch(lane + x); + + uint16x8_t lane0 = vld1q_u16(lane + x - 2); + uint16x8_t lane4 = vld1q_u16(lane + x + 2); + uint16x8_t lane1 = vld1q_u16(lane + x - 1); + uint16x8_t lane3 = vld1q_u16(lane + x + 1); + uint16x8_t lane2 = vld1q_u16(lane + x + 0); + + uint16x8_t ln04 = vaddq_u16(lane0, lane4); + uint16x8_t ln13 = vaddq_u16(lane1, lane3); + + uint16x8_t ln042 = vmlaq_u16(ln04, lane2, vc6u16); + uint16x8_t lsw = vmlaq_u16(ln042, ln13, vc4u16); + + uint8x8_t ls = vrshrn_n_u16(lsw, 8); + + vst1_u8(dst + x, ls); + } + break; + case 2: + for (; x <= colsn - 8*2; x += 8*2) + { + internal::prefetch(lane + x); + + u16* lidx0 = lane + x - 2*2; + u16* lidx1 = lane + x - 1*2; + u16* lidx3 = lane + x + 1*2; + u16* lidx4 = lane + x + 2*2; +#if __GNUC_MINOR__ < 7 + __asm__ __volatile__ ( + "vld2.16 {d0, d2}, [%[in0]]! \n\t" + "vld2.16 {d1, d3}, [%[in0]] \n\t" + "vld2.16 {d8, d10}, [%[in4]]! \n\t" + "vld2.16 {d9, d11}, [%[in4]] \n\t" + "vadd.i16 q0, q4 \n\t" + "vadd.i16 q1, q5 \n\t" + "vld2.16 {d16, d18}, [%[in1]]! \n\t" + "vld2.16 {d17, d19}, [%[in1]] \n\t" + "vld2.16 {d8, d10}, [%[in3]]! 
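 @ +1 tap: channel 0 into q4, channel 1 into q5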
\n\t" + "vld2.16 {d9, d11}, [%[in3]] \n\t" + "vadd.i16 q4, q8 \n\t" + "vadd.i16 q5, q9 \n\t" + "vld2.16 {d16, d18}, [%[in2]] \n\t" + "vld2.16 {d17, d19}, [%[in22]] \n\t" + "vmla.i16 q0, q4, %q[c4] \n\t" + "vmla.i16 q1, q5, %q[c4] \n\t" + "vmla.i16 q0, q8, %q[c6] \n\t" + "vmla.i16 q1, q9, %q[c6] \n\t" + "vrshrn.u16 d8, q0, #8 \n\t" + "vrshrn.u16 d9, q1, #8 \n\t" + "vst2.8 {d8-d9}, [%[out]] \n\t" + : [in0] "=r" (lidx0), + [in1] "=r" (lidx1), + [in3] "=r" (lidx3), + [in4] "=r" (lidx4) + : [out] "r" (dst + x), + "0" (lidx0), + "1" (lidx1), + "2" (lidx3), + "3" (lidx4), + [in2] "r" (lane + x), + [in22] "r" (lane + x + 4*2), + [c4] "w" (vc4u16), [c6] "w" (vc6u16) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23" + ); +#else + uint16x8x2_t vLane0 = vld2q_u16(lidx0); + uint16x8x2_t vLane1 = vld2q_u16(lidx1); + uint16x8x2_t vLane2 = vld2q_u16(lane + x); + uint16x8x2_t vLane3 = vld2q_u16(lidx3); + uint16x8x2_t vLane4 = vld2q_u16(lidx4); + + uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]); + uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane4.val[1]); + + uint16x8_t vSum_4_8 = vaddq_u16(vLane1.val[0], vLane3.val[0]); + uint16x8_t vSum_5_9 = vaddq_u16(vLane1.val[1], vLane3.val[1]); + + vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_4_8, vc4u16); + vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_5_9, vc4u16); + vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16); + vSum_1_5 = vmlaq_u16(vSum_1_5, vLane2.val[1], vc6u16); + + uint8x8x2_t vRes; + vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8); + vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8); + vst2_u8(dst + x, vRes); +#endif + } + break; + case 3: + for (; x <= colsn - 8*3; x += 8*3) + { + internal::prefetch(lane + x); + + u16* lidx0 = lane + x - 2*3; + u16* lidx1 = lane + x - 1*3; + u16* lidx3 = lane + x + 1*3; + u16* lidx4 = lane + x + 2*3; +#if defined(__GNUC__) && defined(__arm__) + __asm__ __volatile__ ( + "vld3.16 {d0, d2, d4}, [%[in0]]! \n\t" + "vld3.16 {d1, d3, d5}, [%[in0]] \n\t" + "vld3.16 {d8, d10, d12}, [%[in4]]! \n\t" + "vld3.16 {d9, d11, d13}, [%[in4]] \n\t" + "vadd.i16 q0, q4 \n\t" + "vadd.i16 q1, q5 \n\t" + "vadd.i16 q2, q6 \n\t" + "vld3.16 {d16, d18, d20}, [%[in1]]! \n\t" + "vld3.16 {d17, d19, d21}, [%[in1]] \n\t" + "vld3.16 {d8, d10, d12}, [%[in3]]! 
\n\t" + "vld3.16 {d9, d11, d13}, [%[in3]] \n\t" + "vadd.i16 q4, q8 \n\t" + "vadd.i16 q5, q9 \n\t" + "vadd.i16 q6, q10 \n\t" + "vld3.16 {d16, d18, d20}, [%[in2]] \n\t" + "vld3.16 {d17, d19, d21}, [%[in22]] \n\t" + "vmla.i16 q0, q4, %q[c4] \n\t" + "vmla.i16 q1, q5, %q[c4] \n\t" + "vmla.i16 q2, q6, %q[c4] \n\t" + "vmla.i16 q0, q8, %q[c6] \n\t" + "vmla.i16 q1, q9, %q[c6] \n\t" + "vmla.i16 q2, q10, %q[c6] \n\t" + "vrshrn.u16 d8, q0, #8 \n\t" + "vrshrn.u16 d9, q1, #8 \n\t" + "vrshrn.u16 d10, q2, #8 \n\t" + "vst3.8 {d8-d10}, [%[out]] \n\t" + : [in0] "=r" (lidx0), + [in1] "=r" (lidx1), + [in3] "=r" (lidx3), + [in4] "=r" (lidx4) + : [out] "r" (dst + x), + "0" (lidx0), + "1" (lidx1), + "2" (lidx3), + "3" (lidx4), + [in2] "r" (lane + x), + [in22] "r" (lane + x + 4*3), + [c4] "w" (vc4u16), [c6] "w" (vc6u16) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23" + ); +#else + uint16x8x3_t vLane0 = vld3q_u16(lidx0); + uint16x8x3_t vLane1 = vld3q_u16(lidx1); + uint16x8x3_t vLane2 = vld3q_u16(lane + x); + uint16x8x3_t vLane3 = vld3q_u16(lidx3); + uint16x8x3_t vLane4 = vld3q_u16(lidx4); + + uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]); + uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane4.val[1]); + uint16x8_t vSum_2_6 = vaddq_u16(vLane0.val[2], vLane4.val[2]); + + uint16x8_t vSum_3_1 = vaddq_u16(vLane3.val[0], vLane1.val[0]); + uint16x8_t vSum_4_2 = vaddq_u16(vLane3.val[1], vLane1.val[1]); + uint16x8_t vSum_5_6 = vaddq_u16(vLane3.val[2], vLane1.val[2]); + + vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_3_1, vc4u16); + vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_4_2, vc4u16); + vSum_2_6 = vmlaq_u16(vSum_2_6, vSum_5_6, vc4u16); + + vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16); + vSum_1_5 = vmlaq_u16(vSum_1_5, vLane2.val[1], vc6u16); + vSum_2_6 = vmlaq_u16(vSum_2_6, vLane2.val[2], vc6u16); + + uint8x8x3_t vRes; + vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8); + vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8); + vRes.val[2] = vrshrn_n_u16(vSum_2_6, 8); + + vst3_u8(dst + x, vRes); +#endif + } + break; + case 4: + for (; x <= colsn - 8*4; x += 8*4) + { + internal::prefetch(lane + x); + internal::prefetch(lane + x + 16); + + u16* lidx0 = lane + x - 2*4; + u16* lidx1 = lane + x - 1*4; + u16* lidx3 = lane + x + 1*4; + u16* lidx4 = lane + x + 2*4; +#if defined(__GNUC__) && defined(__arm__) + __asm__ __volatile__ ( + "vld4.16 {d0, d2, d4, d6}, [%[in0]]! \n\t" + "vld4.16 {d1, d3, d5, d7}, [%[in0]] \n\t" + "vld4.16 {d8, d10, d12, d14}, [%[in4]]! \n\t" + "vld4.16 {d9, d11, d13, d15}, [%[in4]] \n\t" + "vadd.i16 q0, q4 \n\t" + "vadd.i16 q1, q5 \n\t" + "vadd.i16 q2, q6 \n\t" + "vadd.i16 q3, q7 \n\t" + "vld4.16 {d16, d18, d20, d22}, [%[in1]]! \n\t" + "vld4.16 {d17, d19, d21, d23}, [%[in1]] \n\t" + "vld4.16 {d8, d10, d12, d14}, [%[in3]]! 
\n\t" + "vld4.16 {d9, d11, d13, d15}, [%[in3]] \n\t" + "vadd.i16 q4, q8 \n\t" + "vadd.i16 q5, q9 \n\t" + "vadd.i16 q6, q10 \n\t" + "vadd.i16 q7, q11 \n\t" + "vld4.16 {d16, d18, d20, d22}, [%[in2],:256] \n\t" + "vld4.16 {d17, d19, d21, d23}, [%[in22],:256] \n\t" + "vmla.i16 q0, q4, %q[c4] \n\t" + "vmla.i16 q1, q5, %q[c4] \n\t" + "vmla.i16 q2, q6, %q[c4] \n\t" + "vmla.i16 q3, q7, %q[c4] \n\t" + "vmla.i16 q0, q8, %q[c6] \n\t" + "vmla.i16 q1, q9, %q[c6] \n\t" + "vmla.i16 q2, q10, %q[c6] \n\t" + "vmla.i16 q3, q11, %q[c6] \n\t" + "vrshrn.u16 d8, q0, #8 \n\t" + "vrshrn.u16 d9, q1, #8 \n\t" + "vrshrn.u16 d10, q2, #8 \n\t" + "vrshrn.u16 d11, q3, #8 \n\t" + "vst4.8 {d8-d11}, [%[out]] \n\t" + : [in0] "=r" (lidx0), + [in1] "=r" (lidx1), + [in3] "=r" (lidx3), + [in4] "=r" (lidx4) + : [out] "r" (dst + x), + "0" (lidx0), + "1" (lidx1), + "2" (lidx3), + "3" (lidx4), + [in2] "r" (lane + x), + [in22] "r" (lane + x + 4*4), + [c4] "w" (vc4u16), [c6] "w" (vc6u16) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23" + ); +#else + uint16x8x4_t vLane0 = vld4q_u16(lidx0); + uint16x8x4_t vLane2 = vld4q_u16(lidx4); + uint16x8x4_t vLane4 = vld4q_u16(lidx1); + uint16x8x4_t vLane6 = vld4q_u16(lidx3); + uint16x8x4_t vLane8 = vld4q_u16(lane + x); + + uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane2.val[0]); + uint16x8_t vSum_1_5 = vaddq_u16(vLane0.val[1], vLane2.val[1]); + uint16x8_t vSum_2_6 = vaddq_u16(vLane0.val[2], vLane2.val[2]); + uint16x8_t vSum_3_7 = vaddq_u16(vLane0.val[3], vLane2.val[3]); + + uint16x8_t vSum_4_8 = vaddq_u16(vLane4.val[0], vLane6.val[0]); + uint16x8_t vSum_5_9 = vaddq_u16(vLane4.val[1], vLane6.val[1]); + uint16x8_t vSum_6_10 = vaddq_u16(vLane4.val[2], vLane6.val[2]); + uint16x8_t vSum_7_11 = vaddq_u16(vLane4.val[3], vLane6.val[3]); + + vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_4_8, vc4u16); + vSum_1_5 = vmlaq_u16(vSum_1_5, vSum_5_9, vc4u16); + vSum_2_6 = vmlaq_u16(vSum_2_6, vSum_6_10, vc4u16); + vSum_3_7 = vmlaq_u16(vSum_3_7, vSum_7_11, vc4u16); + + vSum_0_4 = vmlaq_u16(vSum_0_4, vLane8.val[0], vc6u16); + vSum_1_5 = vmlaq_u16(vSum_1_5, vLane8.val[1], vc6u16); + vSum_2_6 = vmlaq_u16(vSum_2_6, vLane8.val[2], vc6u16); + vSum_3_7 = vmlaq_u16(vSum_3_7, vLane8.val[3], vc6u16); + + uint8x8x4_t vRes; + vRes.val[0] = vrshrn_n_u16(vSum_0_4, 8); + vRes.val[1] = vrshrn_n_u16(vSum_1_5, 8); + vRes.val[2] = vrshrn_n_u16(vSum_2_6, 8); + vRes.val[3] = vrshrn_n_u16(vSum_3_7, 8); + + vst4_u8(dst + x, vRes); +#endif + } + break; + } + for (s32 h = 0; h < cn; ++h) + { + u16* ln = lane + h; + u8* dt = dst + h; + for (size_t k = x; k < colsn; k += cn) + { + dt[k] = (u8)((ln[k-2*cn] + ln[k+2*cn] + + u16(4) * (ln[k-cn] + ln[k+cn]) + + u16(6) * ln[k] + (1 << 7)) >> 8); + } + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; + (void)borderMargin; +#endif +} + +void gaussianBlur5x5(const Size2D &size, s32 cn, + const u16 * srcBase, ptrdiff_t srcStride, + u16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, u16 borderValue, Margin borderMargin) +{ + internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType)); +#ifdef CAROTENE_NEON + size_t colsn = size.width * cn; + + std::vector _tmp; + u16 *tmp = 0; + if (borderType == BORDER_MODE_CONSTANT) + { + _tmp.assign(colsn + 4*cn, borderValue); + tmp = &_tmp[cn << 1]; + } + + ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t 
idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + + //1-line buffer + std::vector _buf(cn * (size.width + 4) + 32 / sizeof(u32)); + u32* lane = internal::alignPtr(&_buf[cn << 1], 32); + + if (borderType == BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lane[-cn+k] = borderValue; + lane[-cn-cn+k] = borderValue; + lane[colsn+k] = borderValue; + lane[colsn+cn+k] = borderValue; + } + + uint16x4_t vc6u16 = vmov_n_u16(6); + uint32x4_t vc6u32 = vmovq_n_u32(6); + uint32x4_t vc4u32 = vmovq_n_u32(4); + + for (size_t i = 0; i < size.height; ++i) + { + u16* dst = internal::getRowPtr(dstBase, dstStride, i); + //vertical convolution + ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom); + + const u16* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp; + const u16* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp; + const u16* ln2 = internal::getRowPtr(srcBase, srcStride, i); + const u16* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp; + const u16* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? 
internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp; + + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2)); + uint16x4_t v0 = vld1_u16(ln0+x); + uint16x4_t v1 = vld1_u16(ln1+x); + uint16x4_t v2 = vld1_u16(ln2+x); + uint16x4_t v3 = vld1_u16(ln3+x); + uint16x4_t v4 = vld1_u16(ln4+x); + + uint32x4_t v = vaddl_u16(v0, v4); + uint32x4_t v13 = vaddl_u16(v1, v3); + + v = vmlal_u16(v, v2, vc6u16); + v = vmlaq_u32(v, v13, vc4u32); + + vst1q_u32(lane + x, v); + } + for (; x < colsn; ++x) + lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x]; + + //left&right borders + if (borderType != BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lane[-cn+k] = lane[idx_l1 + k]; + lane[-cn-cn+k] = lane[idx_l2 + k]; + + lane[colsn+k] = lane[idx_r1 + k]; + lane[colsn+cn+k] = lane[idx_r2 + k]; + } + + //horizontal convolution + x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(lane + x); + + uint32x4_t lane0 = vld1q_u32(lane + x - 2); + uint32x4_t lane4 = vld1q_u32(lane + x + 2); + uint32x4_t lane1 = vld1q_u32(lane + x - 1); + uint32x4_t lane3 = vld1q_u32(lane + x + 1); + uint32x4_t lane2 = vld1q_u32(lane + x + 0); + + uint32x4_t ln04 = vaddq_u32(lane0, lane4); + uint32x4_t ln13 = vaddq_u32(lane1, lane3); + + uint32x4_t ln042 = vmlaq_u32(ln04, lane2, vc6u32); + uint32x4_t lsw = vmlaq_u32(ln042, ln13, vc4u32); + + uint16x4_t ls = vrshrn_n_u32(lsw, 8); + + vst1_u16(dst + x, ls); + } + for (s32 h = 0; h < cn; ++h) + { + u32* ln = lane + h; + u16* dt = dst + h; + for (size_t k = x; k < colsn; k += cn) + { + dt[k] = (u16)((ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k] + (1<<7))>>8); + } + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; + (void)borderMargin; +#endif +} + +void gaussianBlur5x5(const Size2D &size, s32 cn, + const s16 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, s16 borderValue, Margin borderMargin) +{ + internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType)); +#ifdef CAROTENE_NEON + size_t colsn = size.width * cn; + + std::vector _tmp; + s16 *tmp = 0; + if (borderType == BORDER_MODE_CONSTANT) + { + _tmp.assign(colsn + 4*cn, borderValue); + tmp = &_tmp[cn << 1]; + } + + ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + + //1-line buffer + std::vector _buf(cn * (size.width + 4) + 32 / sizeof(s32)); + s32* lane = internal::alignPtr(&_buf[cn << 1], 32); + + if (borderType == BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lane[-cn+k] = borderValue; + lane[-cn-cn+k] = borderValue; + lane[colsn+k] = borderValue; + lane[colsn+cn+k] = borderValue; + } + + int16x4_t vc6s16 = vmov_n_s16(6); + int32x4_t vc6s32 = vmovq_n_s32(6); + int32x4_t vc4s32 = vmovq_n_s32(4); + + for (size_t i = 0; i < size.height; ++i) + { + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + //vertical convolution + ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, 
borderMargin.bottom); + ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom); + + const s16* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp; + const s16* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp; + const s16* ln2 = internal::getRowPtr(srcBase, srcStride, i); + const s16* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp; + const s16* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp; + + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2)); + int16x4_t v0 = vld1_s16(ln0+x); + int16x4_t v1 = vld1_s16(ln1+x); + int16x4_t v2 = vld1_s16(ln2+x); + int16x4_t v3 = vld1_s16(ln3+x); + int16x4_t v4 = vld1_s16(ln4+x); + + int32x4_t v = vaddl_s16(v0, v4); + int32x4_t v13 = vaddl_s16(v1, v3); + + v = vmlal_s16(v, v2, vc6s16); + v = vmlaq_s32(v, v13, vc4s32); + + vst1q_s32(lane + x, v); + } + for (; x < colsn; ++x) + lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x]; + + //left&right borders + if (borderType != BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lane[-cn+k] = lane[idx_l1 + k]; + lane[-cn-cn+k] = lane[idx_l2 + k]; + + lane[colsn+k] = lane[idx_r1 + k]; + lane[colsn+cn+k] = lane[idx_r2 + k]; + } + + //horizontal convolution + x = 0; + switch(cn) + { + case 1: + case 2: + case 3: + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(lane + x); + + int32x4_t lane0 = vld1q_s32(lane + x - 2); + int32x4_t lane4 = vld1q_s32(lane + x + 2); + int32x4_t lane1 = vld1q_s32(lane + x - 1); + int32x4_t lane3 = vld1q_s32(lane + x + 1); + int32x4_t lane2 = vld1q_s32(lane + x + 0); + + int32x4_t ln04 = vaddq_s32(lane0, lane4); + int32x4_t ln13 = vaddq_s32(lane1, lane3); + + int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32); + int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32); + + int16x4_t ls = vrshrn_n_s32(lsw, 8); + + vst1_s16(dst + x, ls); + } + break; + case 4: +/* for (; x <= colsn - 4*4; x += 4*4) + { + internal::prefetch(lane + x); + internal::prefetch(lane + x + 16); + + ptrdiff_t* lidx0 = lane + x - 2*4; + ptrdiff_t* lidx1 = lane + x - 1*4; + ptrdiff_t* lidx3 = lane + x + 1*4; + ptrdiff_t* lidx4 = lane + x + 2*4; + + __asm__ __volatile__ ( + "vld4.32 {d0, d2, d4, d6}, [%[in0]]! \n\t" + "vld4.32 {d1, d3, d5, d7}, [%[in0]] \n\t" + "vld4.32 {d8, d10, d12, d14}, [%[in4]]! \n\t" + "vld4.32 {d9, d11, d13, d15}, [%[in4]] \n\t" + "vadd.i32 q0, q4 \n\t" + "vadd.i32 q1, q5 \n\t" + "vadd.i32 q2, q6 \n\t" + "vadd.i32 q3, q7 \n\t" + "vld4.32 {d16, d18, d20, d22}, [%[in1]]! \n\t" + "vld4.32 {d17, d19, d21, d23}, [%[in1]] \n\t" + "vld4.32 {d8, d10, d12, d14}, [%[in3]]! 
\n\t" + "vld4.32 {d9, d11, d13, d15}, [%[in3]] \n\t" + "vadd.i32 q4, q8 \n\t" + "vadd.i32 q5, q9 \n\t" + "vadd.i32 q6, q10 \n\t" + "vadd.i32 q7, q11 \n\t" + "vld4.32 {d16, d18, d20, d22}, [%[in2],:256] \n\t" + "vld4.32 {d17, d19, d21, d23}, [%[in22],:256] \n\t" + "vmla.i32 q0, q4, %q[c4] \n\t" + "vmla.i32 q1, q5, %q[c4] \n\t" + "vmla.i32 q2, q6, %q[c4] \n\t" + "vmla.i32 q3, q7, %q[c4] \n\t" + "vmla.i32 q0, q8, %q[c6] \n\t" + "vmla.i32 q1, q9, %q[c6] \n\t" + "vmla.i32 q2, q10, %q[c6] \n\t" + "vmla.i32 q3, q11, %q[c6] \n\t" + "vrshrn.i32 d8, q0, #8 \n\t" + "vrshrn.i32 d9, q1, #8 \n\t" + "vrshrn.i32 d10, q2, #8 \n\t" + "vrshrn.i32 d11, q3, #8 \n\t" + "vst4.16 {d8-d11}, [%[out]] \n\t" + : [in0] "=r" (lidx0), + [in1] "=r" (lidx1), + [in3] "=r" (lidx3), + [in4] "=r" (lidx4) + : [out] "r" (dst + x), + "0" (lidx0), + "1" (lidx1), + "2" (lidx3), + "3" (lidx4), + [in2] "r" (lane + x), + [in22] "r" (lane + x + 4*2), + [c4] "w" (vc4s32), [c6] "w" (vc6s32) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23" + ); +*/ + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(lane + x); + + int32x4_t lane0 = vld1q_s32(lane + x - 2); + int32x4_t lane4 = vld1q_s32(lane + x + 2); + int32x4_t lane1 = vld1q_s32(lane + x - 1); + int32x4_t lane3 = vld1q_s32(lane + x + 1); + int32x4_t lane2 = vld1q_s32(lane + x + 0); + + int32x4_t ln04 = vaddq_s32(lane0, lane4); + int32x4_t ln13 = vaddq_s32(lane1, lane3); + + int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32); + int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32); + + int16x4_t ls = vrshrn_n_s32(lsw, 8); + + vst1_s16(dst + x, ls); + } + break; + } + for (s32 h = 0; h < cn; ++h) + { + s32* ln = lane + h; + s16* dt = dst + h; + for (size_t k = x; k < colsn; k += cn) + { + dt[k] = (s16)((ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k] + (1<<7))>>8); + } + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; + (void)borderMargin; +#endif +} + +void gaussianBlur5x5(const Size2D &size, s32 cn, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderType, s32 borderValue, Margin borderMargin) +{ + internal::assertSupportedConfiguration(isGaussianBlur5x5Supported(size, cn, borderType)); +#ifdef CAROTENE_NEON + size_t colsn = size.width * cn; + + std::vector _tmp; + s32 *tmp = 0; + if (borderType == BORDER_MODE_CONSTANT) + { + _tmp.assign(colsn + 4*cn, borderValue); + tmp = &_tmp[cn << 1]; + } + + ptrdiff_t idx_l1 = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_l2 = internal::borderInterpolate(-2, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r1 = internal::borderInterpolate(size.width + 0, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + ptrdiff_t idx_r2 = internal::borderInterpolate(size.width + 1, size.width, borderType, borderMargin.left, borderMargin.right) * cn; + + //1-line buffer + std::vector _buf(cn * (size.width + 4) + 32 / sizeof(s32)); + s32* lane = internal::alignPtr(&_buf[cn << 1], 32); + + if (borderType == BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lane[-cn+k] = borderValue; + lane[-cn-cn+k] = borderValue; + lane[colsn+k] = borderValue; + lane[colsn+cn+k] = borderValue; + } + + int32x4_t vc6s32 = vmovq_n_s32(6); + int32x4_t vc4s32 = vmovq_n_s32(4); + + for (size_t i = 0; i < size.height; ++i) + { + s32* dst = 
internal::getRowPtr(dstBase, dstStride, i); + //vertical convolution + ptrdiff_t idx_rm2 = internal::borderInterpolate(i - 2, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom); + ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom); + + const s32* ln0 = idx_rm2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm2) : tmp; + const s32* ln1 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp; + const s32* ln2 = internal::getRowPtr(srcBase, srcStride, i); + const s32* ln3 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp; + const s32* ln4 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp; + + size_t x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2)); + int32x4_t v0 = vld1q_s32(ln0+x); + int32x4_t v1 = vld1q_s32(ln1+x); + int32x4_t v2 = vld1q_s32(ln2+x); + int32x4_t v3 = vld1q_s32(ln3+x); + int32x4_t v4 = vld1q_s32(ln4+x); + + int32x4_t v = vaddq_s32(v0, v4); + int32x4_t v13 = vaddq_s32(v1, v3); + + v = vmlaq_s32(v, v2, vc6s32); + v = vmlaq_s32(v, v13, vc4s32); + + vst1q_s32(lane + x, v); + } + for (; x < colsn; ++x) + lane[x] = ln0[x] + ln4[x] + 4*(ln1[x] + ln3[x]) + 6*ln2[x]; + + //left&right borders + if (borderType != BORDER_MODE_CONSTANT) + for (s32 k = 0; k < cn; ++k) + { + lane[-cn+k] = lane[idx_l1 + k]; + lane[-cn-cn+k] = lane[idx_l2 + k]; + + lane[colsn+k] = lane[idx_r1 + k]; + lane[colsn+cn+k] = lane[idx_r2 + k]; + } + + //horizontal convolution + x = 0; + for (; x <= colsn - 4; x += 4) + { + internal::prefetch(lane + x); + + int32x4_t lane0 = vld1q_s32(lane + x - 2); + int32x4_t lane4 = vld1q_s32(lane + x + 2); + int32x4_t lane1 = vld1q_s32(lane + x - 1); + int32x4_t lane3 = vld1q_s32(lane + x + 1); + int32x4_t lane2 = vld1q_s32(lane + x + 0); + + int32x4_t ln04 = vaddq_s32(lane0, lane4); + int32x4_t ln13 = vaddq_s32(lane1, lane3); + + int32x4_t ln042 = vmlaq_s32(ln04, lane2, vc6s32); + int32x4_t lsw = vmlaq_s32(ln042, ln13, vc4s32); + + vst1q_s32(dst + x, lsw); + } + for (s32 h = 0; h < cn; ++h) + { + s32* ln = lane + h; + s32* dt = dst + h; + for (size_t k = x; k < colsn; k += cn) + { + dt[k] = ln[k-2*cn] + ln[k+2*cn] + 4*(ln[k-cn] + ln[k+cn]) + 6*ln[k]; + } + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; + (void)borderMargin; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/in_range.cpp b/3rdparty/carotene/src/in_range.cpp new file mode 100644 index 0000000000..b79a237e39 --- /dev/null +++ b/3rdparty/carotene/src/in_range.cpp @@ -0,0 +1,195 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. 
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+
+#include "vtransform.hpp"
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+inline void vnst(u8* dst, uint8x16_t v1, uint8x16_t v2) { vst1q_u8(dst, v1); vst1q_u8(dst+16, v2); }
+inline void vnst(u8* dst, uint16x8_t v1, uint16x8_t v2) { vst1q_u8(dst, vcombine_u8(vmovn_u16(v1), vmovn_u16(v2))); }
+inline void vnst(u8* dst, uint32x4_t v1, uint32x4_t v2) { vst1_u8(dst, vmovn_u16(vcombine_u16(vmovn_u32(v1), vmovn_u32(v2)))); }
+
+template <typename T, size_t elsize> struct vtail
+{
+    static inline void inRange(const T *, const T *, const T *,
+                               u8 *, size_t &, size_t)
+    {
+        //do nothing since there couldn't be enough data
+    }
+};
+template <typename T> struct vtail<T, 2>
+{
+    static inline void inRange(const T * src, const T * rng1, const T * rng2,
+                               u8 * dst, size_t &x, size_t width)
+    {
+        typedef typename internal::VecTraits<T>::vec128 vec128;
+        typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
+        //There are no more than 15 elements in the tail, so we can handle an 8-element vector at most once
+        if( x + 8 < width)
+        {
+            vec128 vs = internal::vld1q( src + x);
+            vec128 vr1 = internal::vld1q(rng1 + x);
+            vec128 vr2 = internal::vld1q(rng2 + x);
+            uvec128 vd = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
+            internal::vst1(dst + x, internal::vmovn(vd));
+            x+=8;
+        }
+    }
+};
+template <typename T> struct vtail<T, 1>
+{
+    static inline void inRange(const T * src, const T * rng1, const T * rng2,
+                               u8 * dst, size_t &x, size_t width)
+    {
+        typedef typename internal::VecTraits<T>::vec128 vec128;
+        typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
+        typedef typename internal::VecTraits<T>::vec64 vec64;
+        typedef typename internal::VecTraits<T>::unsign::vec64 uvec64;
+        //There are no more than 31 elements in the tail, so we can handle 16+8, 16, or 8 elements at once
+        if( x + 16 < width)
+        {
+            vec128 vs = internal::vld1q( src + x);
+            vec128 vr1 = internal::vld1q(rng1 + x);
+            vec128 vr2 = internal::vld1q(rng2 + x);
+            uvec128 vd = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
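+            // for 8-bit element types the comparison mask is already the final u8 result, so it can be stored without narrowing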
+            internal::vst1q(dst + x, vd);
+            x+=16;
+        }
+        if( x + 8 < width)
+        {
+            vec64 vs = internal::vld1( src + x);
+            vec64 vr1 = internal::vld1(rng1 + x);
+            vec64 vr2 = internal::vld1(rng2 + x);
+            uvec64 vd = internal::vand(internal::vcge(vs, vr1), internal::vcge(vr2, vs));
+            internal::vst1(dst + x, vd);
+            x+=8;
+        }
+    }
+};
+
+template <typename T>
+inline void inRangeCheck(const Size2D &_size,
+                         const T * srcBase, ptrdiff_t srcStride,
+                         const T * rng1Base, ptrdiff_t rng1Stride,
+                         const T * rng2Base, ptrdiff_t rng2Stride,
+                         u8 * dstBase, ptrdiff_t dstStride)
+{
+    typedef typename internal::VecTraits<T>::vec128 vec128;
+    typedef typename internal::VecTraits<T>::unsign::vec128 uvec128;
+
+    Size2D size(_size);
+    if (srcStride == dstStride &&
+        srcStride == rng1Stride &&
+        srcStride == rng2Stride &&
+        srcStride == (ptrdiff_t)(size.width))
+    {
+        size.width *= size.height;
+        size.height = 1;
+    }
+    const size_t width = size.width & ~( 32/sizeof(T) - 1 );
+
+    for(size_t j = 0; j < size.height; ++j)
+    {
+        const T * src = internal::getRowPtr( srcBase, srcStride, j);
+        const T * rng1 = internal::getRowPtr(rng1Base, rng1Stride, j);
+        const T * rng2 = internal::getRowPtr(rng2Base, rng2Stride, j);
+        u8 * dst = internal::getRowPtr( dstBase, dstStride, j);
+        size_t i = 0;
+        for( ; i < width; i += 32/sizeof(T) )
+        {
+            internal::prefetch(src + i);
+            internal::prefetch(rng1 + i);
+            internal::prefetch(rng2 + i);
+
+            vec128 vs = internal::vld1q( src + i);
+            vec128 vr1 = internal::vld1q(rng1 + i);
+            vec128 vr2 = internal::vld1q(rng2 + i);
+            uvec128 vd1 = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
+            vs = internal::vld1q( src + i + 16/sizeof(T));
+            vr1 = internal::vld1q(rng1 + i + 16/sizeof(T));
+            vr2 = internal::vld1q(rng2 + i + 16/sizeof(T));
+            uvec128 vd2 = internal::vandq(internal::vcgeq(vs, vr1), internal::vcgeq(vr2, vs));
+            vnst(dst + i, vd1, vd2);
+        }
+        vtail<T, sizeof(T)>::inRange(src, rng1, rng2, dst, i, size.width);
+        for( ; i < size.width; i++ )
+            dst[i] = (u8)(-(rng1[i] <= src[i] && src[i] <= rng2[i]));
+    }
+}
+
+}
+
+#define INRANGEFUNC(T) \
+void inRange(const Size2D &_size, \
+             const T * srcBase, ptrdiff_t srcStride, \
+             const T * rng1Base, ptrdiff_t rng1Stride, \
+             const T * rng2Base, ptrdiff_t rng2Stride, \
+             u8 * dstBase, ptrdiff_t dstStride) \
+{ \
+    internal::assertSupportedConfiguration(); \
+    inRangeCheck(_size, srcBase, srcStride, \
+                 rng1Base, rng1Stride, rng2Base, rng2Stride, \
+                 dstBase, dstStride); \
+}
+#else
+#define INRANGEFUNC(T) \
+void inRange(const Size2D &, \
+             const T *, ptrdiff_t, \
+             const T *, ptrdiff_t, \
+             const T *, ptrdiff_t, \
+             u8 *, ptrdiff_t) \
+{ \
+    internal::assertSupportedConfiguration(); \
+}
+#endif
+
+INRANGEFUNC(u8)
+INRANGEFUNC(s8)
+INRANGEFUNC(u16)
+INRANGEFUNC(s16)
+INRANGEFUNC(s32)
+INRANGEFUNC(f32)
+
+} // namespace CAROTENE_NS
diff --git a/3rdparty/carotene/src/integral.cpp b/3rdparty/carotene/src/integral.cpp
new file mode 100644
index 0000000000..56c919500e
--- /dev/null
+++ b/3rdparty/carotene/src/integral.cpp
@@ -0,0 +1,238 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ * License Agreement
+ * For Open Source Computer Vision Library
+ * (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +namespace CAROTENE_NS { + +void integral(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u32 * sumBase, ptrdiff_t sumStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint32x4_t v_zero = vmovq_n_u32(0u); + + // the first iteration + const u8 * src = internal::getRowPtr(srcBase, srcStride, 0); + u32 * sum = internal::getRowPtr(sumBase, sumStride, 0); + + uint32x4_t prev = v_zero; + size_t j = 0u; + + for ( ; j + 7 < size.width; j += 8) + { + internal::prefetch(sum + j); + internal::prefetch(src + j); + + uint8x8_t el8shr0 = vld1_u8(src + j); + uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8)); + uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16)); + uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24)); + + uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2); + uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3); + + uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03); + uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8)); + + uint32x4_t vsuml = vaddw_u16(prev, vget_low_u16(el8)); + uint32x4_t vsumh = vaddw_u16(prev, el4h); + + vst1q_u32(sum + j, vsuml); + vst1q_u32(sum + j + 4, vsumh); + + prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3)); + } + + for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j) + sum[j] = (v += src[j]); + + // the others + for (size_t i = 1; i < size.height ; ++i) + { + src = internal::getRowPtr(srcBase, srcStride, i); + u32 * prevSum = internal::getRowPtr(sumBase, sumStride, i - 1); + sum = internal::getRowPtr(sumBase, sumStride, i); + + prev = v_zero; + j = 0u; + + for ( ; j + 7 < size.width; j += 8) + { + internal::prefetch(sum + j); + internal::prefetch(src + j); + + uint32x4_t vsuml = vld1q_u32(prevSum + j); + uint32x4_t vsumh = vld1q_u32(prevSum + j + 4); + + uint8x8_t el8shr0 = vld1_u8(src + j); + uint8x8_t el8shr1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 8)); + 
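// together with the 16- and 24-bit shifts just below, this one-lane shift lets every 16-bit lane accumulate x[i-3]+x[i-2]+x[i-1]+x[i], i.e. a running sum built entirely in registers +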
uint8x8_t el8shr2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 16)); + uint8x8_t el8shr3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(el8shr0), 24)); + + vsuml = vaddq_u32(vsuml, prev); + vsumh = vaddq_u32(vsumh, prev); + + uint16x8_t el8shr12 = vaddl_u8(el8shr1, el8shr2); + uint16x8_t el8shr03 = vaddl_u8(el8shr0, el8shr3); + + uint16x8_t el8 = vaddq_u16(el8shr12, el8shr03); + uint16x4_t el4h = vadd_u16(vget_low_u16(el8), vget_high_u16(el8)); + + vsuml = vaddw_u16(vsuml, vget_low_u16(el8)); + vsumh = vaddw_u16(vsumh, el4h); + + vst1q_u32(sum + j, vsuml); + vst1q_u32(sum + j + 4, vsumh); + + prev = vaddw_u16(prev, vdup_lane_u16(el4h, 3)); + } + + for (u32 v = vgetq_lane_u32(prev, 3); j < size.width; ++j) + sum[j] = (v += src[j]) + prevSum[j]; + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)sumBase; + (void)sumStride; +#endif +} + +void sqrIntegral(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + f64 * sqsumBase, ptrdiff_t sqsumStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint16x8_t v_zero8 = vmovq_n_u16(0u); + + // the first iteration + const u8 * src = internal::getRowPtr(srcBase, srcStride, 0); + f64 * sqsum = internal::getRowPtr(sqsumBase, sqsumStride, 0); + + double prev = 0.; + size_t j = 0u; + + for ( ; j + 7 < size.width; j += 8) + { + internal::prefetch(sqsum + j); + internal::prefetch(src + j); + + uint8x8_t vsrc = vld1_u8(src + j); + + uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc); + uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7); + + uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1)); + uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1)); + + uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h); + + uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l)); + uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l)); + uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h)); + + u32 buf[8]; + vst1_u32(buf, vget_low_u32(el8shr01l)); + vst1_u32(buf+2, el2l); + vst1_u32(buf+4, el2hl); + vst1_u32(buf+6, el2hh); + for(u32 k=0; k < 8; k++) + sqsum[j+k] = prev + buf[k]; + prev += buf[7]; + } + + for (; j < size.width; ++j) + sqsum[j] = (prev += src[j]*src[j]); + + // the others + for (size_t i = 1; i < size.height ; ++i) + { + src = internal::getRowPtr(srcBase, srcStride, i); + f64 * prevSqSum = internal::getRowPtr(sqsumBase, sqsumStride, i - 1); + sqsum = internal::getRowPtr(sqsumBase, sqsumStride, i); + + prev = 0.; + j = 0u; + + for ( ; j + 7 < size.width; j += 8) + { + internal::prefetch(sqsum + j); + internal::prefetch(src + j); + + uint8x8_t vsrc = vld1_u8(src + j); + + uint16x8_t el8shr0 = vmull_u8(vsrc, vsrc); + uint16x8_t el8shr1 = vextq_u16(v_zero8, el8shr0, 7); + + uint32x4_t el8shr01l = vaddl_u16(vget_low_u16(el8shr0), vget_low_u16(el8shr1)); + uint32x4_t el8shr01h = vaddl_u16(vget_high_u16(el8shr0), vget_high_u16(el8shr1)); + + uint32x4_t el4h = vaddq_u32(el8shr01l, el8shr01h); + + uint32x2_t el2l = vadd_u32(vget_low_u32(el8shr01l), vget_high_u32(el8shr01l)); + uint32x2_t el2hl = vadd_u32(vget_low_u32(el4h), vget_high_u32(el8shr01l)); + uint32x2_t el2hh = vadd_u32(vget_low_u32(el4h), vget_high_u32(el4h)); + + u32 buf[8]; + vst1_u32(buf, vget_low_u32(el8shr01l)); + vst1_u32(buf+2, el2l); + vst1_u32(buf+4, el2hl); + vst1_u32(buf+6, el2hh); + for(u32 k=0; k < 8; k++) + sqsum[j+k] = prev + prevSqSum[j+k] + buf[k]; + prev += buf[7]; + } + + for (; j < size.width; ++j) + sqsum[j] = 
(prev += src[j]*src[j]) + prevSqSum[j]; + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)sqsumBase; + (void)sqsumStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/intrinsics.hpp b/3rdparty/carotene/src/intrinsics.hpp new file mode 100644 index 0000000000..062a3f897b --- /dev/null +++ b/3rdparty/carotene/src/intrinsics.hpp @@ -0,0 +1,112 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */
+
+#ifndef CAROTENE_INTRINSICS_HPP
+#define CAROTENE_INTRINSICS_HPP
+
+#include <carotene/definitions.hpp>
+
+#include <arm_neon.h>
+
+namespace CAROTENE_NS { namespace internal {
+
+/////////////// Custom NEON intrinsics ///////////////////
+
+// calculate reciprocal value
+
+inline float32x4_t vrecpq_f32(float32x4_t val)
+{
+    float32x4_t reciprocal = vrecpeq_f32(val);
+    reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
+    reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
+    return reciprocal;
+}
+
+inline float32x2_t vrecp_f32(float32x2_t val)
+{
+    float32x2_t reciprocal = vrecpe_f32(val);
+    reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal);
+    reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal);
+    return reciprocal;
+}
+
+// calculate sqrt value
+
+inline float32x4_t vrsqrtq_f32(float32x4_t val)
+{
+    float32x4_t e = vrsqrteq_f32(val);
+    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e);
+    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e);
+    return e;
+}
+
+inline float32x2_t vrsqrt_f32(float32x2_t val)
+{
+    float32x2_t e = vrsqrte_f32(val);
+    e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e);
+    e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e);
+    return e;
+}
+
+inline float32x4_t vsqrtq_f32(float32x4_t val)
+{
+    return vrecpq_f32(vrsqrtq_f32(val));
+}
+
+inline float32x2_t vsqrt_f32(float32x2_t val)
+{
+    return vrecp_f32(vrsqrt_f32(val));
+}
+
+// table lookup with the table in a 128-bit register
+
+inline uint8x8_t vqtbl1_u8 (uint8x16_t a, uint8x8_t b)
+{
+#ifdef __aarch64__
+    // AArch64 supports this natively
+    return ::vqtbl1_u8(a, b);
+#else
+    union { uint8x16_t v; uint8x8x2_t w; } u = { a };
+    return vtbl2_u8(u.w, b);
+#endif
+}
+
+} }
+
+#endif
diff --git a/3rdparty/carotene/src/laplacian.cpp b/3rdparty/carotene/src/laplacian.cpp
new file mode 100644
index 0000000000..b9148de1b4
--- /dev/null
+++ b/3rdparty/carotene/src/laplacian.cpp
@@ -0,0 +1,713 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ * License Agreement
+ * For Open Source Computer Vision Library
+ * (3-clause BSD License)
+ *
+ * Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "saturate_cast.hpp" + +#include + +namespace CAROTENE_NS { + +bool isLaplacian3x3Supported(const Size2D &size, BORDER_MODE border) +{ + return isSupportedConfiguration() && size.width >= 8 && + (border == BORDER_MODE_CONSTANT || + border == BORDER_MODE_REPLICATE); +} + +void Laplacian3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue) +{ + internal::assertSupportedConfiguration(isLaplacian3x3Supported(size, border)); +#ifdef CAROTENE_NEON + const uint16x8_t v_border_x3 = vdupq_n_u16(borderValue * 3); + const uint16x8_t v_zero = vdupq_n_u16(0); + const uint8x8_t v_border = vdup_n_u8(borderValue); + + uint8x8_t vsub; + uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero; + uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero; + + ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height; + + for (ptrdiff_t y = 0; y < height; ++y) + { + const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max(y - 1, 0)); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y); + const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1)); + u8 * drow = internal::getRowPtr(dstBase, dstStride, y); + + s16 prevx = 0, currx = 0, nextx = 0; + ptrdiff_t x = 0; + const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8); + + // perform vertical convolution + for ( ; x <= bwidth; x += 8) + { + internal::prefetch(srow0 + x); + internal::prefetch(srow1 + x); + internal::prefetch(srow2 + x); + + uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x); + uint8x8_t x1 = vld1_u8(srow1 + x); + uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x); + + // calculate values for plain CPU part below if needed + if (x + 8 >= bwidth) + { + ptrdiff_t x3 = x == width ? width - 1 : x; + ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max(x3 - 1, 0); + + if (border == BORDER_MODE_CONSTANT && x4 < 0) + prevx = borderValue; + else + prevx = (srow2 ? srow2[x4] : borderValue) + srow1[x4] + (srow0 ? srow0[x4] : borderValue); + + currx = (srow2 ? srow2[x3] : borderValue) + srow1[x3] + (srow0 ? 
srow0[x3] : borderValue);
+            }
+
+            // make shift
+            if (x)
+            {
+                tprev = tcurr;
+                tcurr = tnext;
+            }
+
+            // and calculate next value
+            tnext = vaddw_u8(vaddl_u8(x0, x1), x2);
+
+            // make extrapolation for the first elements
+            if (!x)
+            {
+                // make border
+                if (border == BORDER_MODE_CONSTANT)
+                    tcurr = v_border_x3;
+                else if (border == BORDER_MODE_REPLICATE)
+                    tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0));
+
+                vsub = x1;
+
+                continue;
+            }
+
+            // combine 3 "shifted" vectors
+            t0 = vextq_u16(tprev, tcurr, 7);
+            t1 = tcurr;
+            t2 = vextq_u16(tcurr, tnext, 1);
+
+            // and add them
+            t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));
+
+            int16x8_t tt0 = vsubq_s16(vreinterpretq_s16_u16(t0),
+                                      vreinterpretq_s16_u16(vaddw_u8(vshll_n_u8(vsub, 3), vsub)));
+            uint8x8_t it0 = vqmovun_s16(tt0);
+            vst1_u8(drow + x - 8, it0);
+
+            vsub = x1;
+        }
+
+        x -= 8;
+        if (x == width)
+            --x;
+
+        for ( ; x < width; ++x)
+        {
+            // make extrapolation for the last elements
+            if (x + 1 >= width)
+            {
+                if (border == BORDER_MODE_CONSTANT)
+                    nextx = borderValue * 3;
+                else if (border == BORDER_MODE_REPLICATE)
+                    nextx = srow2[x] + srow1[x] + srow0[x];
+            }
+            else
+            {
+                nextx = (srow2 ? srow2[x + 1] : borderValue) +
+                        srow1[x + 1] +
+                        (srow0 ? srow0[x + 1] : borderValue);
+            }
+
+            s32 val = (prevx + currx + nextx) - 9 * srow1[x];
+            drow[x] = internal::saturate_cast<u8>((s32)val);
+
+            // make shift
+            prevx = currx;
+            currx = nextx;
+        }
+    }
+#else
+    (void)size;
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)border;
+    (void)borderValue;
+#endif
+}
+
+bool isLaplacianOpenCVSupported(const Size2D &size, BORDER_MODE border)
+{
+    return isSupportedConfiguration() &&
+           size.width >= 8 && size.height >= 1 &&
+           (border == BORDER_MODE_CONSTANT ||
+            border == BORDER_MODE_REFLECT ||
+            border == BORDER_MODE_REFLECT101 ||
+            border == BORDER_MODE_REPLICATE);
+}
+
+void Laplacian1OpenCV(const Size2D &size,
+                      const u8 * srcBase, ptrdiff_t srcStride,
+                      s16 * dstBase, ptrdiff_t dstStride,
+                      BORDER_MODE border, u8 borderValue)
+{
+    internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
+#ifdef CAROTENE_NEON
+    ptrdiff_t rows = size.height, cols = size.width;
+
+    std::vector<u8> _tmp;
+    u8 *tmp = 0;
+    if (border == BORDER_MODE_CONSTANT)
+    {
+        _tmp.assign(cols + 4,borderValue);
+        tmp = &_tmp[2];
+    }
+
+    for( ptrdiff_t y = 0; y < rows; y++ )
+    {
+        const u8* v0 = 0;
+        const u8* v1 = internal::getRowPtr(srcBase, srcStride, y);
+        const u8* v2 = 0;
+        // make border
+        if (border == BORDER_MODE_REFLECT101) {
+            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1);
+            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
+        } else if (border == BORDER_MODE_CONSTANT) {
+            v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
+            v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
+        } else {
+            v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
+            v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
+        }
+        s16* drow = internal::getRowPtr(dstBase, dstStride, y);
+
+        int16x8_t tcurr = vmovq_n_s16(0x0);
+        int16x8_t tnext = vmovq_n_s16(0x0);
+        int16x8_t t0, t2;
+        uint8x8_t xx0 = vmov_n_u8(0x0);
+        uint8x8_t xx1 = vmov_n_u8(0x0);
+        uint8x8_t xx2 = vmov_n_u8(0x0);
+        ptrdiff_t x = 0;
+        const ptrdiff_t bcols = y + 2 < rows ? 
cols : (cols - 8); + for( ; x <= bcols; x += 8 ) + { + internal::prefetch(v0 + x); + internal::prefetch(v1 + x); + internal::prefetch(v2 + x); + + uint8x8_t x0 = vld1_u8(v0 + x); + uint8x8_t x1 = vld1_u8(v1 + x); + uint8x8_t x2 = vld1_u8(v2 + x); + + if(x) { + xx0 = xx1; + xx1 = xx2; + } else { + xx1 = x1; + // make border + if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) + { + xx1 = vset_lane_u8(vget_lane_u8(x1, 0),x1, 7); + } + else if (border == BORDER_MODE_CONSTANT) + { + xx1 = vset_lane_u8(borderValue, x1, 7); + } + else if (border == BORDER_MODE_REFLECT101) + { + xx1 = vset_lane_u8(vget_lane_u8(x1, 1),x1, 7); + } + } + xx2 = x1; + + if(x) { + tcurr = tnext; + } + tnext = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x0, x2)), + vreinterpretq_s16_u16(vshll_n_u8(x1, 2))); + + if(!x) { + tcurr = tnext; + continue; + } + t0 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx0, xx1, 7))); + t2 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx1, xx2, 1))); + t0 = vaddq_s16(vqaddq_s16(t0, t2), tcurr); + + vst1q_s16(drow + x - 8, t0); + } + + x -= 8; + if(x == cols){ + x--; + } + + for( ; x < cols; x++ ) + { + s16 nextx; + s16 prevx; + // make border + if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) + { + prevx = x == 0 ? v1[0] : v1[x-1]; + nextx = x == cols-1 ? v1[x] : v1[x+1]; + } + else if (border == BORDER_MODE_REFLECT101) + { + prevx = x == 0 ? v1[1] : v1[x-1]; + nextx = x == cols-1 ? v1[x-1] : v1[x+1]; + } + else //if (border == BORDER_MODE_CONSTANT) + { + prevx = x == 0 ? borderValue : v1[x-1]; + nextx = x == cols-1 ? borderValue : v1[x+1]; + } + *(drow+x) = prevx + nextx - 4*v1[x] + v0[x] + v2[x]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)border; + (void)borderValue; +#endif +} + +void Laplacian3OpenCV(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue) +{ + internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border)); +#ifdef CAROTENE_NEON + ptrdiff_t rows = size.height, cols = size.width; + + std::vector _tmp; + u8 *tmp = 0; + if (border == BORDER_MODE_CONSTANT) + { + _tmp.assign(cols + 4,borderValue); + tmp = &_tmp[2]; + } + + for( ptrdiff_t y = 0; y < rows; y++ ) + { + const u8* v0 = 0; + const u8* v1 = internal::getRowPtr(srcBase, srcStride, y); + const u8* v2 = 0; + // make border + if (border == BORDER_MODE_REFLECT101) { + v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1); + v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0); + } else if (border == BORDER_MODE_CONSTANT) { + v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp; + v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp; + } else { + v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0); + v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0); + } + s16* drow = internal::getRowPtr(dstBase, dstStride, y); + + int16x8_t tprev = vmovq_n_s16(0x0); + int16x8_t tcurr = vmovq_n_s16(0x0); + int16x8_t tnext = vmovq_n_s16(0x0); + int16x8_t tc = vmovq_n_s16(0x0); + int16x8_t t0, t2, tcnext; + ptrdiff_t x = 0; + const ptrdiff_t bcols = y + 2 < rows ? 
cols : (cols - 8); + for( ; x <= bcols; x += 8 ) + { + internal::prefetch(v0 + x); + internal::prefetch(v1 + x); + internal::prefetch(v2 + x); + + uint8x8_t x0 = vld1_u8(v0 + x); + uint8x8_t x1 = vld1_u8(v1 + x); + uint8x8_t x2 = vld1_u8(v2 + x); + tcnext = vreinterpretq_s16_u16(vshll_n_u8(x1, 2)); + + if(x) { + tprev = tcurr; + tcurr = tnext; + } + tnext = vreinterpretq_s16_u16(vaddl_u8(x0, x2)); + + if(!x) { + tcurr = tnext; + tc = tcnext; + + // make border + if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) + { + tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 0),tcurr, 7); + } + else if (border == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_s16(borderValue, tcurr, 7); + } + else if (border == BORDER_MODE_REFLECT101) + { + tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 1),tcurr, 7); + } + continue; + } + + t0 = vextq_s16(tprev, tcurr, 7); + t2 = vextq_s16(tcurr, tnext, 1); + + t0 = vsubq_s16(vqaddq_s16(t0, t2), tc); + tc = tcnext; + + t0 = vshlq_n_s16(t0, 1); + vst1q_s16(drow + x - 8, t0); + } + x -= 8; + if(x == cols){ + x--; + } + + for( ; x < cols; x++ ) + { + s16 nextx, nextx2; + s16 prevx, prevx2; + // make border + if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) + { + prevx = x == 0 ? v0[0] : v0[x-1]; + prevx2 = x == 0 ? v2[0] : v2[x-1]; + nextx = x == cols-1 ? v0[x] : v0[x+1]; + nextx2 = x == cols-1 ? v2[x] : v2[x+1]; + } + else if (border == BORDER_MODE_REFLECT101) + { + prevx = x == 0 ? v0[1] : v0[x-1]; + prevx2 = x == 0 ? v2[1] : v2[x-1]; + nextx = x == cols-1 ? v0[x-1] : v0[x+1]; + nextx2 = x == cols-1 ? v2[x-1] : v2[x+1]; + } + else //if (border == BORDER_MODE_CONSTANT) + { + prevx = x == 0 ? borderValue : v0[x-1]; + prevx2 = x == 0 ? borderValue : v2[x-1]; + nextx = x == cols-1 ? borderValue : v0[x+1]; + nextx2 = x == cols-1 ? borderValue : v2[x+1]; + } + s16 res = prevx + nextx - 4*v1[x] + prevx2 + nextx2; + *(drow+x) = 2*res; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)border; + (void)borderValue; +#endif +} + +void Laplacian5OpenCV(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s16 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue) +{ + internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border)); +#ifdef CAROTENE_NEON + ptrdiff_t rows = size.height, cols = size.width; + + std::vector _tmp; + u8 *tmp = 0; + if (border == BORDER_MODE_CONSTANT) + { + _tmp.assign(cols + 4,borderValue); + tmp = &_tmp[2]; + } + + for( ptrdiff_t y = 0; y < rows; y++ ) + { + const u8* v0 = 0; + const u8* v1 = 0; + const u8* v2 = internal::getRowPtr(srcBase, srcStride, y); + const u8* v3 = 0; + const u8* v4 = 0; + // make border + if (border == BORDER_MODE_REPLICATE) { + v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 0); + v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0); + v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0); + v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 0 ? rows-1 : 0); + } else if (border == BORDER_MODE_REFLECT) { + v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 1 ? 1-y : 0); + v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0); + v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0); + v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 1 ? 
2*rows-(y+3) : 0); + } else if (border == BORDER_MODE_REFLECT101) { + v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 2-y ? 2-y : 0); ///check + v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : rows > 1 ? 1 : 0); + v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0); + v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 2 ? 2*rows-(y+4) : 0);///bad if rows=2 y=1 rows - 4 + (2,1) + } else if (border == BORDER_MODE_CONSTANT) { + v0 = y > 1 ? internal::getRowPtr(srcBase, srcStride, y-2) : tmp; + v1 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp; + v3 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp; + v4 = y < rows-2 ? internal::getRowPtr(srcBase, srcStride, y+2) : tmp; + } + s16* drow = internal::getRowPtr(dstBase, dstStride, y); + + int16x8_t tnext, tc, t0; + int16x8_t tnext2, tnext3; + int16x8_t tnext1Old, tnext2Old, tnext3Old; + int16x8_t tnext4OldOldOld, tnext5OldOldOld; + + int16x8_t tcurr1 = vmovq_n_s16(0x0); + int16x8_t tnext1 = vmovq_n_s16(0x0); + int16x8_t tprev1 = vmovq_n_s16(0x0); + int16x8_t tpprev1 = vmovq_n_s16(0x0); + int16x8_t tppprev1 = vmovq_n_s16(0x0); + + int16x8_t tnext4Old = vmovq_n_s16(0x0); + int16x8_t tnext5Old = vmovq_n_s16(0x0); + int16x8_t tnext1OldOld = vmovq_n_s16(0x0); + int16x8_t tnext2OldOld = vmovq_n_s16(0x0); + int16x8_t tnext3OldOld = vmovq_n_s16(0x0); + int16x8_t tnext4OldOld = vmovq_n_s16(0x0); + int16x8_t tnext5OldOld = vmovq_n_s16(0x0); + + // do vertical convolution + ptrdiff_t x = 0; + const ptrdiff_t bcols = y + 3 < rows ? cols : (cols - 8); + for( ; x <= bcols; x += 8 ) + { + internal::prefetch(v0 + x); + internal::prefetch(v1 + x); + internal::prefetch(v2 + x); + internal::prefetch(v3 + x); + internal::prefetch(v4 + x); + + uint8x8_t x0 = vld1_u8(v0 + x); + uint8x8_t x1 = vld1_u8(v1 + x); + uint8x8_t x2 = vld1_u8(v2 + x); + uint8x8_t x3 = vld1_u8(v3 + x); + uint8x8_t x4 = vld1_u8(v4 + x); + if(x) { + tcurr1 = tnext1; + } + + tnext4OldOldOld = tnext4Old; + tnext5OldOldOld = tnext5Old; + tnext1Old = tnext1OldOld; + tnext2Old = tnext2OldOld; + tnext3Old = tnext3OldOld; + tnext4Old = tnext4OldOld; + tnext5Old = tnext5OldOld; + + tnext3 = vreinterpretq_s16_u16(vaddq_u16(vaddl_u8(x3, x2),vaddl_u8(x2, x1))); + tnext3 = vshlq_n_s16(tnext3, 1); + + tc = vreinterpretq_s16_u16(vsubl_u8(x4, x2)); + tnext = vreinterpretq_s16_u16(vsubl_u8(x2, x0)); + tnext2 = vsubq_s16(tc, tnext); + + tnext1 = vaddq_s16(tnext3, tnext2); + // tnext1 = x0 + 2*x1 + 2*x2 + 2*x3 + x4 + + tnext2 = vshlq_n_s16(tnext2, 1); + // tnext2 = 2*x4 - 4*x2 + 2*x0 + + tnext3 = vsubq_s16(tnext2, vshlq_n_s16(tnext3, 1)); + // tnext3 = 2*x0 - 4*x1 - 12*x2 - 4*x3 + 2*x4 + + tnext1OldOld = tnext1; + tnext2OldOld = tnext2; + tnext3OldOld = tnext3; + tnext4OldOld = tnext2; + tnext5OldOld = tnext1; + + if(x) { + tnext1 = vextq_s16(tnext1Old, tnext1, 2); + tcurr1 = vextq_s16(tnext2Old, tnext2, 1); + tprev1 = tnext3Old; + + if(x!=8) { + tpprev1 = vextq_s16(tnext4OldOldOld, tnext4Old, 7); + tppprev1 = vextq_s16(tnext5OldOldOld, tnext5Old, 6); + } + } + + if(!x) { + // make border + if (border == BORDER_MODE_REPLICATE) { + tpprev1 = vextq_s16(tnext2, tnext2, 7); + tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0); + + tprev1 = vextq_s16(tnext1, tnext1, 6); + tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 0); + tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1); + } else if (border == BORDER_MODE_REFLECT) { + tpprev1 = vextq_s16(tnext2, tnext2, 7); + tpprev1 = 
vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0); + + tprev1 = vextq_s16(tnext1, tnext1, 6); + tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 0); + tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1); + } else if (border == BORDER_MODE_REFLECT101) { + tpprev1 = vextq_s16(tnext2, tnext2, 7); + tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 2),tpprev1, 0); + + tprev1 = vextq_s16(tnext1, tnext1, 6); + tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 1); + tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 4),tprev1, 0); + } else if (border == BORDER_MODE_CONSTANT) { + tpprev1 = vextq_s16(tnext2, tnext2, 7); + tpprev1 = vsetq_lane_s16(borderValue, tpprev1, 0); + + tprev1 = vextq_s16(tnext1, tnext1, 6); + tprev1 = vsetq_lane_s16(borderValue, tprev1, 0); + tprev1 = vsetq_lane_s16(borderValue, tprev1, 1); + } + tppprev1 = tprev1; + continue; + } + + t0 = vaddq_s16(vaddq_s16(vqaddq_s16(tcurr1, tprev1), vqaddq_s16(tpprev1, tppprev1)), tnext1); + t0 = vaddq_s16(t0, t0); + vst1q_s16(drow + x - 8, t0); + } + x -= 8; + if(x >= cols - 1) + x = cols-2; + + s16 pprevx = 0; + s16 prevx = 0; + s16 nextx = 0; + s16 nnextx = 0; + + for( ; x < cols; x++ ) + { + if (x == 0) { + // make border + if (border == BORDER_MODE_REPLICATE) { + pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0]; + prevx = 2*v0[0] - 4*v2[0] + 2*v4[0]; + } else if (border == BORDER_MODE_REFLECT) { + pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1]; + prevx = 2*v0[0] - 4*v2[0] + 2*v4[0]; + } else if (border == BORDER_MODE_REFLECT101) { + pprevx = v0[2] + 2*v1[2] + 2*v2[2] + 2*v3[2] + v4[2]; + prevx = 2*v0[1] - 4*v2[1] + 2*v4[1]; + } else if (border == BORDER_MODE_CONSTANT) { + pprevx = 8 * borderValue; + prevx = 0; + } + } else if (x == 1) { + // make border + if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) { + pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0]; + } else if (border == BORDER_MODE_REFLECT101) { + pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1]; + } else if (border == BORDER_MODE_CONSTANT) { + pprevx = 8 * borderValue; + } + prevx = 2*v0[0] - 4*v2[0] + 2*v4[0]; + } else { + pprevx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2]; + prevx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1]; + } + s16 currx = 2*v0[x] - 4*v1[x] - 12*v2[x] - 4*v3[x] + 2*v4[x]; + if (x == cols-1) { + // make border + if (border == BORDER_MODE_REPLICATE) { + nextx = 2*v0[x] - 4*v2[x] + 2*v4[x]; + nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x]; + } else if (border == BORDER_MODE_REFLECT) { + nextx = 2*v0[x] - 4*v2[x] + 2*v4[x]; + nnextx = v0[x-1] + 2*v1[x-1] + 2*v2[x-1] + 2*v3[x-1] + v4[x-1]; + } else if (border == BORDER_MODE_REFLECT101) { + nextx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1]; + nnextx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2]; + } else if (border == BORDER_MODE_CONSTANT) { + nextx = 0; + nnextx = 8 * borderValue; + } + } else if (x == cols-2) { + // make border + if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) { + nnextx = v0[x+1] + 2*v1[x+1] + 2*v2[x+1] + 2*v3[x+1] + v4[x+1]; + } else if (border == BORDER_MODE_REFLECT101) { + nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x]; + } else if (border == BORDER_MODE_CONSTANT) { + nnextx = 8 * borderValue; + } + nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1]; + } else { + nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1]; + nnextx = v0[x+2] + 2*v1[x+2] + 2*v2[x+2] + 2*v3[x+2] + v4[x+2]; + } + s16 res = pprevx + prevx + currx + nextx + nnextx; + *(drow+x) = 2*res; + } + } +#else + (void)size; + (void)srcBase; 
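+    // the void-casts only mark the arguments as used when the NEON path is compiled out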
+ (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)border; + (void)borderValue; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/magnitude.cpp b/3rdparty/carotene/src/magnitude.cpp new file mode 100644 index 0000000000..cd9d82bf6c --- /dev/null +++ b/3rdparty/carotene/src/magnitude.cpp @@ -0,0 +1,160 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+#include <cmath>
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+struct Magnitude
+{
+    typedef s16 type;
+
+    void operator() (const int16x8_t & v_src0, const int16x8_t & v_src1,
+                     int16x8_t & v_dst) const
+    {
+        int16x4_t v_src0_p = vget_low_s16(v_src0), v_src1_p = vget_low_s16(v_src1);
+        float32x4_t v_sqr0 = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0_p, v_src0_p)),
+                                       vcvtq_f32_s32(vmull_s16(v_src1_p, v_src1_p)));
+        v_src0_p = vget_high_s16(v_src0);
+        v_src1_p = vget_high_s16(v_src1);
+        float32x4_t v_sqr1 = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0_p, v_src0_p)),
+                                       vcvtq_f32_s32(vmull_s16(v_src1_p, v_src1_p)));
+
+        int32x4_t v_sqrt0 = vcvtq_s32_f32(internal::vsqrtq_f32(v_sqr0));
+        int32x4_t v_sqrt1 = vcvtq_s32_f32(internal::vsqrtq_f32(v_sqr1));
+
+        v_dst = vcombine_s16(vqmovn_s32(v_sqrt0), vqmovn_s32(v_sqrt1));
+    }
+
+    void operator() (const int16x4_t & v_src0, const int16x4_t & v_src1,
+                     int16x4_t & v_dst) const
+    {
+        float32x4_t v_tmp = vaddq_f32(vcvtq_f32_s32(vmull_s16(v_src0, v_src0)),
+                                      vcvtq_f32_s32(vmull_s16(v_src1, v_src1)));
+        int32x4_t v_sqrt = vcvtq_s32_f32(internal::vsqrtq_f32(v_tmp));
+        v_dst = vqmovn_s32(v_sqrt);
+    }
+
+    void operator() (const short * src0, const short * src1, short * dst) const
+    {
+        f32 src0val = (f32)src0[0], src1val = (f32)src1[0];
+        dst[0] = internal::saturate_cast<s16>((s32)sqrtf(src0val * src0val + src1val * src1val));
+    }
+};
+
+struct MagnitudeF32
+{
+    typedef f32 type;
+
+    void operator() (const float32x4_t & v_src0, const float32x4_t & v_src1,
+                     float32x4_t & v_dst) const
+    {
+        v_dst = internal::vsqrtq_f32(vaddq_f32(vmulq_f32(v_src0, v_src0), vmulq_f32(v_src1, v_src1)));
+    }
+
+    void operator() (const float32x2_t & v_src0, const float32x2_t & v_src1,
+                     float32x2_t & v_dst) const
+    {
+        v_dst = internal::vsqrt_f32(vadd_f32(vmul_f32(v_src0, v_src0), vmul_f32(v_src1, v_src1)));
+    }
+
+    void operator() (const f32 * src0, const f32 * src1, f32 * dst) const
+    {
+        dst[0] = sqrtf(src0[0] * src0[0] + src1[0] * src1[0]);
+    }
+};
+
+} // namespace
+
+#endif
+
+void magnitude(const Size2D &size,
+               const s16 * src0Base, ptrdiff_t src0Stride,
+               const s16 * src1Base, ptrdiff_t src1Stride,
+               s16 * dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride,
+                         Magnitude());
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+void magnitude(const Size2D &size,
+               const f32 * src0Base, ptrdiff_t src0Stride,
+               const f32 * src1Base, ptrdiff_t src1Stride,
+               f32 * dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride,
+                         MagnitudeF32());
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+} // namespace CAROTENE_NS
diff --git a/3rdparty/carotene/src/meanstddev.cpp b/3rdparty/carotene/src/meanstddev.cpp
new file mode 100644
index 0000000000..a847493429
--- /dev/null
+++ b/3rdparty/carotene/src/meanstddev.cpp
@@ -0,0 +1,163 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +#include + +namespace CAROTENE_NS { + +void meanStdDev(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + f32 * pMean, f32 * pStdDev) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + f64 fsum = 0.0f, fsqsum = 0.0f; + sqsum(size, srcBase, srcStride, &fsum, &fsqsum, 1); + + // calc mean and stddev + f64 itotal = 1.0 / size.total(); + f64 mean = fsum * itotal; + f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0)); + + if (pMean) + *pMean = mean; + if (pStdDev) + *pStdDev = stddev; +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)pMean; + (void)pStdDev; +#endif +} + +void meanStdDev(const Size2D &size, + const u16 * srcBase, ptrdiff_t srcStride, + f32 * pMean, f32 * pStdDev) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t blockSize0 = 1 << 10, roiw4 = size.width & ~3; + f64 fsum = 0.0f, fsqsum = 0.0f; + + f32 arsum[8]; + uint32x4_t v_zero = vdupq_n_u32(0u), v_sum; + float32x4_t v_zero_f = vdupq_n_f32(0.0f), v_sqsum; + + for (size_t i = 0; i < size.height; ++i) + { + const u16 * src = internal::getRowPtr(srcBase, srcStride, i); + size_t j = 0u; + + while (j < roiw4) + { + size_t blockSize = std::min(roiw4 - j, blockSize0) + j; + v_sum = v_zero; + v_sqsum = v_zero_f; + + for ( ; j + 16 < blockSize ; j += 16) + { + internal::prefetch(src + j); + uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8); + + // 0 + uint32x4_t v_srclo = vmovl_u16(vget_low_u16(v_src0)); + uint32x4_t v_srchi = vmovl_u16(vget_high_u16(v_src0)); + v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi)); + float32x4_t v_srclo_f = vcvtq_f32_u32(v_srclo); + float32x4_t v_srchi_f = vcvtq_f32_u32(v_srchi); + v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f); + 
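+ // The unrolled body accumulates two quantities per iteration: v_sum
+ // collects the widened u32 lane sums, while v_sqsum collects the squared
+ // values in f32 via multiply-accumulate (vmlaq_f32). The low and high
+ // halves of each uint16x8_t are widened and handled identically.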
v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f); + + // 1 + v_srclo = vmovl_u16(vget_low_u16(v_src1)); + v_srchi = vmovl_u16(vget_high_u16(v_src1)); + v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi)); + v_srclo_f = vcvtq_f32_u32(v_srclo); + v_srchi_f = vcvtq_f32_u32(v_srchi); + v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f); + v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f); + } + + for ( ; j < blockSize; j += 4) + { + uint32x4_t v_src = vmovl_u16(vld1_u16(src + j)); + float32x4_t v_src_f = vcvtq_f32_u32(v_src); + v_sum = vaddq_u32(v_sum, v_src); + v_sqsum = vmlaq_f32(v_sqsum, v_src_f, v_src_f); + } + + vst1q_f32(arsum, vcvtq_f32_u32(v_sum)); + vst1q_f32(arsum + 4, v_sqsum); + + fsum += (f64)arsum[0] + arsum[1] + arsum[2] + arsum[3]; + fsqsum += (f64)arsum[4] + arsum[5] + arsum[6] + arsum[7]; + } + + // collect a few last elements in the current row + for ( ; j < size.width; ++j) + { + f32 srcval = src[j]; + fsum += srcval; + fsqsum += srcval * srcval; + } + } + + // calc mean and stddev + f64 itotal = 1.0 / size.total(); + f64 mean = fsum * itotal; + f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0)); + + if (pMean) + *pMean = mean; + if (pStdDev) + *pStdDev = stddev; +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)pMean; + (void)pStdDev; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/median_filter.cpp b/3rdparty/carotene/src/median_filter.cpp new file mode 100644 index 0000000000..8c5d08b7ee --- /dev/null +++ b/3rdparty/carotene/src/median_filter.cpp @@ -0,0 +1,227 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */ + +#include "common.hpp" + +/* + * The code here is based on the code in + * , which is in public domain. + * See also . + */ + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON +namespace { + + uint8x16_t getLeftReplicate(uint8x16_t r, u32 cn) + { + u8 buf[16+8]; + vst1q_u8(buf+cn, r); + for (u32 i = 0; i < cn; ++i) buf[i] = buf[cn+i]; + return vld1q_u8(buf); + } + + uint8x8_t getRightReplicate(uint8x8_t r, u32 cn) + { + u8 buf[8+8]; + vst1_u8(buf, r); + for (u32 i = 0; i < cn; ++i) buf[8+i] = buf[8-cn+i]; + return vld1_u8(buf+cn); + } + +} // namespace + +//o------^-------^-----------------------------o 0 +// | | +//o--^---v---^---|-------^---------------------o 1 +// | | | | +//o--v-------v---|-------|-^-------^-------^---o 2 +// | | | | | +//o------^-------v-----^-|-|-------|-------|---o 3 +// | | | | | | +//o--^---v---^-----^---|-v-|---^---v---^---v---o 4 +// | | | | | | | +//o--v-------v---^-|---|---v---|-------|-------o 5 +// | | | | | +//o------^-------|-|---v-------|-------v-------o 6 +// | | | | +//o--^---v---^---|-v-----------v---------------o 7 +// | | | +//o--v-------v---v-----------------------------o 8 + +#define ELT(num, level) v ## num ## _lv ## level +#define PIX_SORT(a, alvl, b, blvl, newlvl) \ + PIX_MIN(a, alvl, b, blvl, newlvl); \ + PIX_MAX(a, alvl, b, blvl, newlvl); + +#define SORT9 \ + PIX_SORT(1, 00, 2, 00, 01); \ + PIX_SORT(4, 00, 5, 00, 02); \ + PIX_SORT(7, 00, 8, 00, 03); \ + PIX_SORT(0, 00, 1, 01, 04); \ + PIX_SORT(3, 00, 4, 02, 05); \ + PIX_SORT(6, 00, 7, 03, 06); \ + PIX_SORT(1, 04, 2, 01, 07); \ + PIX_SORT(4, 05, 5, 02, 08); \ + PIX_SORT(7, 06, 8, 03, 09); \ + PIX_MAX (0, 04, 3, 05, 10); \ + PIX_MIN (5, 08, 8, 09, 11); \ + PIX_SORT(4, 08, 7, 09, 12); \ + PIX_MAX (3, 10, 6, 06, 13); \ + PIX_MAX (1, 07, 4, 12, 14); \ + PIX_MIN (2, 07, 5, 11, 15); \ + PIX_MIN (4, 14, 7, 12, 16); \ + PIX_SORT(4, 16, 2, 15, 17); \ + PIX_MAX (6, 13, 4, 17, 18); \ + PIX_MIN (4, 18, 2, 17, 19); + +#endif + +bool isMedianFilter3x3Supported(const Size2D &size, u32 numChannels) +{ + return isSupportedConfiguration() && size.width >= 16 + numChannels && numChannels <= 8; +} + +void medianFilter3x3(const Size2D &size, u32 numChannels, + const u8 *srcBase, ptrdiff_t srcStride, + const Margin &srcMargin, + u8 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(isMedianFilter3x3Supported(size, numChannels)); +#ifdef CAROTENE_NEON + u32 cn = numChannels; + size_t colsn = size.width * cn; + + for (size_t i = 0; i < size.height; ++i) { + const u8* psrc1 = internal::getRowPtr(srcBase, srcStride, i); + const u8* psrc0 = i == 0 && srcMargin.top == 0 ? psrc1 : psrc1 - srcStride; + const u8* psrc2 = i + 1 == size.height && srcMargin.bottom == 0 ? psrc1 : psrc1 + srcStride; + u8* pdst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + { + uint8x16_t v3_lv00 = vld1q_u8(psrc0); + uint8x16_t v4_lv00 = vld1q_u8(psrc1); + uint8x16_t v5_lv00 = vld1q_u8(psrc2); + uint8x16_t v6_lv00 = vld1q_u8(psrc0 + cn); + uint8x16_t v7_lv00 = vld1q_u8(psrc1 + cn); + uint8x16_t v8_lv00 = vld1q_u8(psrc2 + cn); + uint8x16_t v0_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc0 - cn) : getLeftReplicate(v3_lv00, cn); + uint8x16_t v1_lv00 = srcMargin.left > 0 ? vld1q_u8(psrc1 - cn) : getLeftReplicate(v4_lv00, cn); + uint8x16_t v2_lv00 = srcMargin.left > 0 ? 
vld1q_u8(psrc2 - cn) : getLeftReplicate(v5_lv00, cn); + + goto medianBlur3x3_mainBody; + + for (; j < colsn - 16; j += 16) { + internal::prefetch(psrc0 + j); + internal::prefetch(psrc1 + j); + internal::prefetch(psrc2 + j); + + v0_lv00 = vld1q_u8(psrc0 + j - cn); + v1_lv00 = vld1q_u8(psrc1 + j - cn); + v2_lv00 = vld1q_u8(psrc2 + j - cn); + v3_lv00 = vld1q_u8(psrc0 + j); + v4_lv00 = vld1q_u8(psrc1 + j); + v5_lv00 = vld1q_u8(psrc2 + j); + v6_lv00 = vld1q_u8(psrc0 + j + cn); + v7_lv00 = vld1q_u8(psrc1 + j + cn); + v8_lv00 = vld1q_u8(psrc2 + j + cn); + +medianBlur3x3_mainBody: + +#define PIX_MIN(a, alvl, b, blvl, newlvl) uint8x16_t ELT(a, newlvl) = vminq_u8(ELT(a, alvl), ELT(b, blvl)) +#define PIX_MAX(a, alvl, b, blvl, newlvl) uint8x16_t ELT(b, newlvl) = vmaxq_u8(ELT(a, alvl), ELT(b, blvl)) + SORT9; +#undef PIX_MAX +#undef PIX_MIN + + vst1q_u8(pdst + j, v4_lv19); + } + } + + { + size_t k = colsn - 8; + uint8x8_t v0_lv00 = vld1_u8(psrc0 + k - cn); + uint8x8_t v1_lv00 = vld1_u8(psrc1 + k - cn); + uint8x8_t v2_lv00 = vld1_u8(psrc2 + k - cn); + uint8x8_t v3_lv00 = vld1_u8(psrc0 + k); + uint8x8_t v4_lv00 = vld1_u8(psrc1 + k); + uint8x8_t v5_lv00 = vld1_u8(psrc2 + k); + uint8x8_t v6_lv00 = srcMargin.right > 0 ? vld1_u8(psrc0 + k + cn) : getRightReplicate(v3_lv00, cn); + uint8x8_t v7_lv00 = srcMargin.right > 0 ? vld1_u8(psrc1 + k + cn) : getRightReplicate(v4_lv00, cn); + uint8x8_t v8_lv00 = srcMargin.right > 0 ? vld1_u8(psrc2 + k + cn) : getRightReplicate(v5_lv00, cn); + + goto medianBlur3x3_tailBody; + + for (; k >= j - 8; k -= 8) { + v0_lv00 = vld1_u8(psrc0 + k - cn); + v1_lv00 = vld1_u8(psrc1 + k - cn); + v2_lv00 = vld1_u8(psrc2 + k - cn); + v3_lv00 = vld1_u8(psrc0 + k); + v4_lv00 = vld1_u8(psrc1 + k); + v5_lv00 = vld1_u8(psrc2 + k); + v6_lv00 = vld1_u8(psrc0 + k + cn); + v7_lv00 = vld1_u8(psrc1 + k + cn); + v8_lv00 = vld1_u8(psrc2 + k + cn); + +medianBlur3x3_tailBody: + +#define PIX_MIN(a, alvl, b, blvl, newlvl) uint8x8_t ELT(a, newlvl) = vmin_u8(ELT(a, alvl), ELT(b, blvl)) +#define PIX_MAX(a, alvl, b, blvl, newlvl) uint8x8_t ELT(b, newlvl) = vmax_u8(ELT(a, alvl), ELT(b, blvl)) + SORT9; +#undef PIX_MAX +#undef PIX_MIN + + vst1_u8(pdst + k, v4_lv19); + } + } + } +#else + (void)size; + (void)numChannels; + (void)srcBase; + (void)srcStride; + (void)srcMargin; + (void)dstBase; + (void)dstStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/min_max.cpp b/3rdparty/carotene/src/min_max.cpp new file mode 100644 index 0000000000..d6f4017841 --- /dev/null +++ b/3rdparty/carotene/src/min_max.cpp @@ -0,0 +1,139 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ *
+ * * Neither the names of the copyright holders nor the names of the contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include <algorithm>
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+template <typename T>
+struct Min
+{
+    typedef T type;
+
+    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
+                     const typename internal::VecTraits<T>::vec128 & v_src1,
+                     typename internal::VecTraits<T>::vec128 & v_dst) const
+    {
+        v_dst = internal::vminq(v_src0, v_src1);
+    }
+
+    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
+                     const typename internal::VecTraits<T>::vec64 & v_src1,
+                     typename internal::VecTraits<T>::vec64 & v_dst) const
+    {
+        v_dst = internal::vmin(v_src0, v_src1);
+    }
+
+    void operator() (const T * src0, const T * src1, T * dst) const
+    {
+        dst[0] = std::min(src0[0], src1[0]);
+    }
+};
+
+template <typename T>
+struct Max
+{
+    typedef T type;
+
+    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
+                     const typename internal::VecTraits<T>::vec128 & v_src1,
+                     typename internal::VecTraits<T>::vec128 & v_dst) const
+    {
+        v_dst = internal::vmaxq(v_src0, v_src1);
+    }
+
+    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
+                     const typename internal::VecTraits<T>::vec64 & v_src1,
+                     typename internal::VecTraits<T>::vec64 & v_dst) const
+    {
+        v_dst = internal::vmax(v_src0, v_src1);
+    }
+
+    void operator() (const T * src0, const T * src1, T * dst) const
+    {
+        dst[0] = std::max(src0[0], src1[0]);
+    }
+};
+
+} // namespace
+
+#define IMPL_OP(fun, op, type)                            \
+void fun(const Size2D &size,                              \
+         const type * src0Base, ptrdiff_t src0Stride,     \
+         const type * src1Base, ptrdiff_t src1Stride,     \
+         type * dstBase, ptrdiff_t dstStride)             \
+{                                                         \
+    internal::assertSupportedConfiguration();             \
+    internal::vtransform(size,                            \
+                         src0Base, src0Stride,            \
+                         src1Base, src1Stride,            \
+                         dstBase, dstStride, op<type>()); \
+}
+
+#else
+
+#define IMPL_OP(fun, op, type)                \
+void fun(const Size2D &,                      \
+         const type *, ptrdiff_t,             \
+         const type *, ptrdiff_t,             \
+         type *, ptrdiff_t)                   \
+{                                             \
+    internal::assertSupportedConfiguration(); \
+}
+
+#endif
+
+#define IMPL_MINMAX(type) IMPL_OP(min, Min, type) IMPL_OP(max, Max, type)
+
+IMPL_MINMAX(u8)
+IMPL_MINMAX(s8)
+IMPL_MINMAX(u16)
+IMPL_MINMAX(s16)
+IMPL_MINMAX(u32)
+IMPL_MINMAX(s32)
+IMPL_MINMAX(f32)
+
+} // namespace CAROTENE_NS
diff --git a/3rdparty/carotene/src/minmaxloc.cpp b/3rdparty/carotene/src/minmaxloc.cpp
new file mode 100644
index 0000000000..a7f30bc4f8
--- /dev/null
+++ b/3rdparty/carotene/src/minmaxloc.cpp
@@ -0,0 +1,1340 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "vtransform.hpp" + +#include + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +template +void minMaxVals(const Size2D &size, + const T * srcBase, ptrdiff_t srcStride, + T * pMinVal, T * pMaxVal) +{ + using namespace internal; + + typedef typename VecTraits::vec128 vec128; + typedef typename VecTraits::vec64 vec64; + + u32 step_base = 32 / sizeof(T), step_tail = 8 / sizeof(T); + size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0; + size_t roiw_tail = size.width >= (step_tail - 1) ? 
size.width - step_tail + 1 : 0; + + T maxVal = std::numeric_limits::min(); + T minVal = std::numeric_limits::max(); + vec128 v_min_base = vdupq_n(minVal), v_max_base = vdupq_n(maxVal); + vec64 v_min_tail = vdup_n(minVal), v_max_tail = vdup_n(maxVal); + + for (size_t i = 0; i < size.height; ++i) + { + const T * src = getRowPtr(srcBase, srcStride, i); + size_t j = 0; + + for (; j < roiw_base; j += step_base) + { + prefetch(src + j); + vec128 v_src0 = vld1q(src + j), v_src1 = vld1q(src + j + 16 / sizeof(T)); + v_min_base = vminq(v_min_base, v_src0); + v_max_base = vmaxq(v_max_base, v_src0); + v_min_base = vminq(v_min_base, v_src1); + v_max_base = vmaxq(v_max_base, v_src1); + } + for (; j < roiw_tail; j += step_tail) + { + vec64 v_src0 = vld1(src + j); + v_min_tail = vmin(v_min_tail, v_src0); + v_max_tail = vmax(v_max_tail, v_src0); + } + + for (; j < size.width; j++) + { + T srcval = src[j]; + minVal = std::min(srcval, minVal); + maxVal = std::max(srcval, maxVal); + } + } + + // collect min & max values + T ar[16 / sizeof(T)]; + vst1q(ar, vcombine(vmin(v_min_tail, vmin(vget_low(v_min_base), vget_high(v_min_base))), + vmax(v_max_tail, vmax(vget_low(v_max_base), vget_high(v_max_base))))); + + for (size_t x = 0; x < 8u / sizeof(T); ++x) + { + minVal = std::min(minVal, ar[x]); + maxVal = std::max(maxVal, ar[x + 8 / sizeof(T)]); + } + + if (pMaxVal) + *pMaxVal = maxVal; + if (pMinVal) + *pMinVal = minVal; +} + +} // namespace + +#endif + +void minMaxVals(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * pMinVal, u8 * pMaxVal) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minMaxVals(size, + srcBase, srcStride, + pMinVal, pMaxVal); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)pMinVal; + (void)pMaxVal; +#endif +} + +void minMaxVals(const Size2D &size, + const s16 * srcBase, ptrdiff_t srcStride, + s16 * pMinVal, s16 * pMaxVal) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minMaxVals(size, + srcBase, srcStride, + pMinVal, pMaxVal); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)pMinVal; + (void)pMaxVal; +#endif +} + +void minMaxVals(const Size2D &size, + const u16 * srcBase, ptrdiff_t srcStride, + u16 * pMinVal, u16 * pMaxVal) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minMaxVals(size, + srcBase, srcStride, + pMinVal, pMaxVal); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)pMinVal; + (void)pMaxVal; +#endif +} + +void minMaxVals(const Size2D &size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 * pMinVal, s32 * pMaxVal) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minMaxVals(size, + srcBase, srcStride, + pMinVal, pMaxVal); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)pMinVal; + (void)pMaxVal; +#endif +} + +void minMaxVals(const Size2D &size, + const u32 * srcBase, ptrdiff_t srcStride, + u32 * pMinVal, u32 * pMaxVal) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minMaxVals(size, + srcBase, srcStride, + pMinVal, pMaxVal); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)pMinVal; + (void)pMaxVal; +#endif +} + +void minMaxLoc(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 &minVal, size_t &minCol, size_t &minRow, + f32 &maxVal, size_t &maxCol, size_t &maxRow) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minVal = srcBase[0]; + minCol = 0; + minRow = 0; + maxVal = srcBase[0]; + maxCol = 0; + maxRow = 0; + for(size_t l = 
0, i = 0; l < size.height; ++l, i = 0) + { + const f32 * src = internal::getRowPtr( srcBase, srcStride, l); + if (size.width >= 16) + { + u32 tmp0123[4] = { 0, 1, 2, 3 }; + uint32x4_t c4 = vdupq_n_u32(4); + +#if SIZE_MAX > UINT32_MAX + size_t boundAll = size.width - (4 - 1); + for(size_t b = 0; i < boundAll; b = i) + { + size_t bound = std::min(boundAll, b + 0xffffFFFC); +#else + { + size_t bound = size.width - (4 - 1); +#endif + uint32x4_t lineIdxOffset = vld1q_u32(tmp0123); + float32x4_t n_min = vdupq_n_f32(minVal); + uint32x4_t n_minIdx = vdupq_n_u32(0xffffFFFC); + float32x4_t n_max = vdupq_n_f32(maxVal); + uint32x4_t n_maxIdx = vdupq_n_u32(0xffffFFFC); + + for(; i < bound; i+=4) + { + internal::prefetch(src + i); + float32x4_t line = vld1q_f32(src + i); + + uint32x4_t minmask = vcltq_f32(line, n_min); + uint32x4_t maxmask = vcgtq_f32(line, n_max); + + n_min = vbslq_f32(minmask, line, n_min); + n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx); + n_max = vbslq_f32(maxmask, line, n_max); + n_maxIdx = vbslq_u32(maxmask, lineIdxOffset, n_maxIdx); + + // idx[] +=4 + lineIdxOffset = vaddq_u32(lineIdxOffset, c4); + } + + f32 fmin[4], fmax[4]; + u32 fminIdx[4], fmaxIdx[4]; + + vst1q_f32(fmin, n_min); + vst1q_f32(fmax, n_max); + + vst1q_u32(fminIdx, n_minIdx); + vst1q_u32(fmaxIdx, n_maxIdx); + + size_t minIdx = fminIdx[0]; + size_t maxIdx = fmaxIdx[0]; + minVal = fmin[0]; + maxVal = fmax[0]; + + for (s32 j = 1; j < 4; ++j) + { + f32 minval = fmin[j]; + f32 maxval = fmax[j]; + if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx)) + { + minIdx = fminIdx[j]; + minVal = minval; + } + if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx)) + { + maxIdx = fmaxIdx[j]; + maxVal = maxval; + } + } + if(minIdx < 0xffffFFFC) + { +#if SIZE_MAX > UINT32_MAX + minCol = b + minIdx; +#else + minCol = minIdx; +#endif + minRow = l; + } + if(maxIdx < 0xffffFFFC) + { +#if SIZE_MAX > UINT32_MAX + maxCol = b + maxIdx; +#else + maxCol = maxIdx; +#endif + maxRow = l; + } + } + } + for(; i < size.width; ++i ) + { + float val = src[i]; + if( val < minVal ) + { + minVal = val; + minCol = i; + minRow = l; + } + else if( val > maxVal ) + { + maxVal = val; + maxCol = i; + maxRow = l; + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minCol; + (void)minRow; + (void)maxVal; + (void)maxCol; + (void)maxRow; +#endif +} + +void minMaxLoc(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + const u8 * maskBase, ptrdiff_t maskStride, + f32 &minVal, size_t &minCol, size_t &minRow, + f32 &maxVal, size_t &maxCol, size_t &maxRow) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minVal = std::numeric_limits::max(); + minCol = size.width; + minRow = size.height; + maxVal = -std::numeric_limits::max(); + maxCol = size.width; + maxRow = size.height; + for(size_t l = 0, i = 0; l < size.height; ++l, i = 0) + { + const f32 * src = internal::getRowPtr( srcBase, srcStride, l); + const u8 * mask = internal::getRowPtr( maskBase, maskStride, l); + if (size.width >= 16) + { + u32 tmp0123[4] = { 0, 1, 2, 3 }; + uint32x4_t uOne = vdupq_n_u32(1); + uint32x4_t c4 = vdupq_n_u32(4); + +#if SIZE_MAX > UINT32_MAX + size_t boundAll = size.width - (4 - 1); + for(size_t b = 0; i < boundAll; b = i) + { + size_t bound = std::min(boundAll, b + 0xffffFFFC); +#else + { + size_t bound = size.width - (4 - 1); +#endif + uint32x4_t lineIdxOffset = vld1q_u32(tmp0123); + float32x4_t n_min = vdupq_n_f32(minVal); + uint32x4_t n_minIdx = vdupq_n_u32(0xffffFFFC); + 
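+ // 0xffffFFFC acts as a "not updated" sentinel: offsets inside a block are
+ // always smaller than it, so after the vector loop a lane index below the
+ // sentinel means that lane actually observed a new extremum.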
float32x4_t n_max = vdupq_n_f32(maxVal); + uint32x4_t n_maxIdx = vdupq_n_u32(0xffffFFFC); + + for(; i < bound; i+=4) + { + internal::prefetch(src + i); + internal::prefetch(mask + i); + float32x4_t line = vld1q_f32(src + i); + uint8x8_t maskLine = vld1_u8(mask + i); + + uint32x4_t maskLine4 = vmovl_u16(vget_low_u16(vmovl_u8(maskLine))); + maskLine4 = vcgeq_u32(maskLine4, uOne); + + uint32x4_t minmask = vcltq_f32(line, n_min); + uint32x4_t maxmask = vcgtq_f32(line, n_max); + + minmask = vandq_u32(minmask, maskLine4); + maxmask = vandq_u32(maxmask, maskLine4); + + n_min = vbslq_f32(minmask, line, n_min); + n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx); + n_max = vbslq_f32(maxmask, line, n_max); + n_maxIdx = vbslq_u32(maxmask, lineIdxOffset, n_maxIdx); + + // idx[] +=4 + lineIdxOffset = vaddq_u32(lineIdxOffset, c4); + } + + f32 fmin[4], fmax[4]; + u32 fminIdx[4], fmaxIdx[4]; + + vst1q_f32(fmin, n_min); + vst1q_f32(fmax, n_max); + + vst1q_u32(fminIdx, n_minIdx); + vst1q_u32(fmaxIdx, n_maxIdx); + + size_t minIdx = fminIdx[0]; + size_t maxIdx = fmaxIdx[0]; + minVal = fmin[0]; + maxVal = fmax[0]; + + for (s32 j = 1; j < 4; ++j) + { + f32 minval = fmin[j]; + f32 maxval = fmax[j]; + if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx)) + { + minIdx = fminIdx[j]; + minVal = minval; + } + if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx)) + { + maxIdx = fmaxIdx[j]; + maxVal = maxval; + } + } + if(minIdx < 0xffffFFFC) + { +#if SIZE_MAX > UINT32_MAX + minCol = b + minIdx; +#else + minCol = minIdx; +#endif + minRow = l; + } + if(maxIdx < 0xffffFFFC) + { +#if SIZE_MAX > UINT32_MAX + maxCol = b + maxIdx; +#else + maxCol = maxIdx; +#endif + maxRow = l; + } + } + } + for(; i < size.width; i++ ) + { + if (!mask[i]) + continue; + f32 val = src[i]; + if( val < minVal ) + { + minVal = val; + minCol = i; + minRow = l; + } + if( val > maxVal ) + { + maxVal = val; + maxCol = i; + maxRow = l; + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)maskBase; + (void)maskStride; + (void)minVal; + (void)minCol; + (void)minRow; + (void)maxVal; + (void)maxCol; + (void)maxRow; +#endif +} + +void minMaxLoc(const Size2D &size, + const s32 * srcBase, ptrdiff_t srcStride, + s32 &minVal, size_t &minCol, size_t &minRow, + s32 &maxVal, size_t &maxCol, size_t &maxRow) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minVal = srcBase[0]; + minCol = 0; + minRow = 0; + maxVal = srcBase[0]; + maxCol = 0; + maxRow = 0; + for(size_t l = 0, i = 0; l < size.height; ++l, i = 0) + { + const s32 * src = internal::getRowPtr( srcBase, srcStride, l); + if (size.width >= 16) + { + u32 tmp0123[4] = { 0, 1, 2, 3 }; + uint32x4_t c4 = vdupq_n_u32(4); + +#if SIZE_MAX > UINT32_MAX + size_t boundAll = size.width - (4 - 1); + for(size_t b = 0; i < boundAll; b = i) + { + size_t bound = std::min(boundAll, b + 0xffffFFFC); +#else + { + size_t bound = size.width - (4 - 1); +#endif + uint32x4_t lineIdxOffset = vld1q_u32(tmp0123); + int32x4_t n_min = vdupq_n_s32(minVal); + uint32x4_t n_minIdx = vdupq_n_u32(0xffffFFFC); + int32x4_t n_max = vdupq_n_s32(maxVal); + uint32x4_t n_maxIdx = vdupq_n_u32(0xffffFFFC); + + for(; i < bound; i+=4 ) + { + internal::prefetch(src + i); + int32x4_t line = vld1q_s32(src + i); + + uint32x4_t minmask = vcltq_s32(line, n_min); + uint32x4_t maxmask = vcgtq_s32(line, n_max); + + n_min = vbslq_s32(minmask, line, n_min); + n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx); + n_max = vbslq_s32(maxmask, line, n_max); + n_maxIdx = vbslq_u32(maxmask, 
lineIdxOffset, n_maxIdx); + + // idx[] +=4 + lineIdxOffset = vaddq_u32(lineIdxOffset, c4); + } + + s32 fmin[4], fmax[4]; + u32 fminIdx[4], fmaxIdx[4]; + + vst1q_s32(fmin, n_min); + vst1q_s32(fmax, n_max); + + vst1q_u32(fminIdx, n_minIdx); + vst1q_u32(fmaxIdx, n_maxIdx); + + size_t minIdx = fminIdx[0]; + size_t maxIdx = fmaxIdx[0]; + minVal = fmin[0]; + maxVal = fmax[0]; + + for (s32 j = 1; j < 4; ++j) + { + s32 minval = fmin[j]; + s32 maxval = fmax[j]; + if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx)) + { + minIdx = fminIdx[j]; + minVal = minval; + } + if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx)) + { + maxIdx = fmaxIdx[j]; + maxVal = maxval; + } + } + if(minIdx < 0xffffFFFC) + { +#if SIZE_MAX > UINT32_MAX + minCol = b + minIdx; +#else + minCol = minIdx; +#endif + minRow = l; + } + if(maxIdx < 0xffffFFFC) + { +#if SIZE_MAX > UINT32_MAX + maxCol = b + maxIdx; +#else + maxCol = maxIdx; +#endif + maxRow = l; + } + } + } + for(; i < size.width; ++i ) + { + s32 val = src[i]; + if( val < minVal ) + { + minVal = val; + minCol = i; + minRow = l; + } + else if( val > maxVal ) + { + maxVal = val; + maxCol = i; + maxRow = l; + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minCol; + (void)minRow; + (void)maxVal; + (void)maxCol; + (void)maxRow; +#endif +} + +void minMaxLoc(const Size2D &size, + const s16 * srcBase, ptrdiff_t srcStride, + s16 &minVal, size_t &minCol, size_t &minRow, + s16 &maxVal, size_t &maxCol, size_t &maxRow) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minVal = srcBase[0]; + minCol = 0; + minRow = 0; + maxVal = srcBase[0]; + maxCol = 0; + maxRow = 0; + for(size_t l = 0, i = 0; l < size.height; ++l, i = 0) + { + const s16 * src = internal::getRowPtr( srcBase, srcStride, l); + if (size.width >= 32) + { + u32 tmp0123[4] = { 0, 1, 2, 3 }; + uint32x4_t c8 = vdupq_n_u32(8); + +#if SIZE_MAX > UINT32_MAX + size_t boundAll = size.width - (8 - 1); + for(size_t b = 0; i < boundAll; b = i) + { + size_t bound = std::min(boundAll, b + 0xffffFFF8); +#else + { + size_t bound = size.width - (8 - 1); +#endif + uint32x4_t lineIdxOffset = vld1q_u32(tmp0123); + int16x8_t n_min = vdupq_n_s16(minVal); + uint32x4_t n_minIdxl = vdupq_n_u32(0xffffFFF8); + uint32x4_t n_minIdxh = vdupq_n_u32(0xffffFFF8); + int16x8_t n_max = vdupq_n_s16(maxVal); + uint32x4_t n_maxIdxl = vdupq_n_u32(0xffffFFF8); + uint32x4_t n_maxIdxh = vdupq_n_u32(0xffffFFF8); + + for(; i < bound; i+=8 ) + { + internal::prefetch(src + i); + int16x8_t line = vld1q_s16(src + i); + + uint16x8_t minmask = vcltq_s16(line, n_min); + uint16x8_t maxmask = vcgtq_s16(line, n_max); + + n_min = vbslq_s16(minmask, line, n_min); + uint16x4_t minml = vget_low_u16(minmask); + uint16x4_t minmh = vget_high_u16(minmask); + uint32x4_t minml2 = vmovl_u16(minml); + uint32x4_t minmh2 = vmovl_u16(minmh); + minml2 = vqshlq_n_u32(minml2, 31); + minmh2 = vqshlq_n_u32(minmh2, 31); + n_minIdxl = vbslq_u32(minml2, lineIdxOffset, n_minIdxl); + n_minIdxh = vbslq_u32(minmh2, lineIdxOffset, n_minIdxh); + + n_max = vbslq_s16(maxmask, line, n_max); + uint16x4_t maxml = vget_low_u16(maxmask); + uint16x4_t maxmh = vget_high_u16(maxmask); + uint32x4_t maxml2 = vmovl_u16(maxml); + uint32x4_t maxmh2 = vmovl_u16(maxmh); + maxml2 = vqshlq_n_u32(maxml2, 31); + maxmh2 = vqshlq_n_u32(maxmh2, 31); + n_maxIdxl = vbslq_u32(maxml2, lineIdxOffset, n_maxIdxl); + n_maxIdxh = vbslq_u32(maxmh2, lineIdxOffset, n_maxIdxh); + + // idx[] +=8 + lineIdxOffset = vaddq_u32(lineIdxOffset, c8); + } + 
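+ // The same 32-bit offset vector was reused for both the low and the high
+ // four lanes of each int16x8_t, so the offsets recorded for the high half
+ // are 4 elements too small; the adjustment below adds that difference back.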
+ // fix high part of indexes + uint32x4_t c4 = vdupq_n_u32((int32_t) 4); + n_minIdxh = vaddq_u32(n_minIdxh, c4); + n_maxIdxh = vaddq_u32(n_maxIdxh, c4); + + s16 fmin[8], fmax[8]; + u32 fminIdx[8], fmaxIdx[8]; + + vst1q_s16(fmin, n_min); + vst1q_s16(fmax, n_max); + vst1q_u32(fminIdx+0, n_minIdxl); + vst1q_u32(fmaxIdx+0, n_maxIdxl); + vst1q_u32(fminIdx+4, n_minIdxh); + vst1q_u32(fmaxIdx+4, n_maxIdxh); + + size_t minIdx = fminIdx[0]; + size_t maxIdx = fmaxIdx[0]; + minVal = fmin[0]; + maxVal = fmax[0]; + + for (s32 j = 1; j < 8; ++j) + { + s16 minval = fmin[j]; + s16 maxval = fmax[j]; + if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx)) + { + minIdx = fminIdx[j]; + minVal = minval; + } + if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx)) + { + maxIdx = fmaxIdx[j]; + maxVal = maxval; + } + } + if(minIdx < 0xffffFFF8) + { +#if SIZE_MAX > UINT32_MAX + minCol = b + minIdx; +#else + minCol = minIdx; +#endif + minRow = l; + } + if(maxIdx < 0xffffFFF8) + { +#if SIZE_MAX > UINT32_MAX + maxCol = b + maxIdx; +#else + maxCol = maxIdx; +#endif + maxRow = l; + } + } + } + for(; i < size.width; ++i ) + { + short val = src[i]; + if( val < minVal ) + { + minVal = val; + minCol = i; + minRow = l; + } + else if( val > maxVal ) + { + maxVal = val; + maxCol = i; + maxRow = l; + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minCol; + (void)minRow; + (void)maxVal; + (void)maxCol; + (void)maxRow; +#endif +} + +void minMaxLoc(const Size2D &size, + const u16 * srcBase, ptrdiff_t srcStride, + u16 &minVal, size_t &minCol, size_t &minRow, + u16 &maxVal, size_t &maxCol, size_t &maxRow) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minVal = srcBase[0]; + minCol = 0; + minRow = 0; + maxVal = srcBase[0]; + maxCol = 0; + maxRow = 0; + for(size_t l = 0, i = 0; l < size.height; ++l, i = 0) + { + const u16 * src = internal::getRowPtr( srcBase, srcStride, l); + if (size.width >= 32) + { + u32 tmp0123[4] = { 0, 1, 2, 3 }; + uint32x4_t c8 = vdupq_n_u32(8); + +#if SIZE_MAX > UINT32_MAX + size_t boundAll = size.width - (8 - 1); + for(size_t b = 0; i < boundAll; b = i) + { + size_t bound = std::min(boundAll, b + 0xffffFFF8); +#else + { + size_t bound = size.width - (8 - 1); +#endif + uint32x4_t lineIdxOffset = vld1q_u32(tmp0123); + uint16x8_t n_min = vdupq_n_u16(minVal); + uint32x4_t n_minIdxl = vdupq_n_u32(0xffffFFF8); + uint32x4_t n_minIdxh = vdupq_n_u32(0xffffFFF8); + uint16x8_t n_max = vdupq_n_u16(maxVal); + uint32x4_t n_maxIdxl = vdupq_n_u32(0xffffFFF8); + uint32x4_t n_maxIdxh = vdupq_n_u32(0xffffFFF8); + + for(; i < bound; i+=8 ) + { + internal::prefetch(src + i); + uint16x8_t line = vld1q_u16(src + i); + + uint16x8_t minmask = vcltq_u16(line, n_min); + uint16x8_t maxmask = vcgtq_u16(line, n_max); + + n_min = vbslq_u16(minmask, line, n_min); + uint16x4_t minml = vget_low_u16(minmask); + uint16x4_t minmh = vget_high_u16(minmask); + uint32x4_t minml2 = vmovl_u16(minml); + uint32x4_t minmh2 = vmovl_u16(minmh); + minml2 = vqshlq_n_u32(minml2, 31); + minmh2 = vqshlq_n_u32(minmh2, 31); + n_minIdxl = vbslq_u32(minml2, lineIdxOffset, n_minIdxl); + n_minIdxh = vbslq_u32(minmh2, lineIdxOffset, n_minIdxh); + + n_max = vbslq_u16(maxmask, line, n_max); + uint16x4_t maxml = vget_low_u16(maxmask); + uint16x4_t maxmh = vget_high_u16(maxmask); + uint32x4_t maxml2 = vmovl_u16(maxml); + uint32x4_t maxmh2 = vmovl_u16(maxmh); + maxml2 = vqshlq_n_u32(maxml2, 31); + maxmh2 = vqshlq_n_u32(maxmh2, 31); + n_maxIdxl = vbslq_u32(maxml2, lineIdxOffset, 
n_maxIdxl); + n_maxIdxh = vbslq_u32(maxmh2, lineIdxOffset, n_maxIdxh); + + // idx[] +=8 + lineIdxOffset = vaddq_u32(lineIdxOffset, c8); + } + + // fix high part of indexes + uint32x4_t c4 = vdupq_n_u32(4); + n_minIdxh = vaddq_u32(n_minIdxh, c4); + n_maxIdxh = vaddq_u32(n_maxIdxh, c4); + + u16 fmin[8], fmax[8]; + u32 fminIdx[8], fmaxIdx[8]; + + vst1q_u16(fmin, n_min); + vst1q_u16(fmax, n_max); + vst1q_u32(fminIdx+0, n_minIdxl); + vst1q_u32(fmaxIdx+0, n_maxIdxl); + vst1q_u32(fminIdx+4, n_minIdxh); + vst1q_u32(fmaxIdx+4, n_maxIdxh); + + size_t minIdx = fminIdx[0]; + size_t maxIdx = fmaxIdx[0]; + minVal = fmin[0]; + maxVal = fmax[0]; + + for (s32 j = 1; j < 8; ++j) + { + u16 minval = fmin[j]; + u16 maxval = fmax[j]; + if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx)) + { + minIdx = fminIdx[j]; + minVal = minval; + } + if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx)) + { + maxIdx = fmaxIdx[j]; + maxVal = maxval; + } + } + if(minIdx < 0xffffFFF8) + { +#if SIZE_MAX > UINT32_MAX + minCol = b + minIdx; +#else + minCol = minIdx; +#endif + minRow = l; + } + if(maxIdx < 0xffffFFF8) + { +#if SIZE_MAX > UINT32_MAX + maxCol = b + maxIdx; +#else + maxCol = maxIdx; +#endif + maxRow = l; + } + } + } + for(; i < size.width; ++i ) + { + u16 val = src[i]; + if( val < minVal ) + { + minVal = val; + minCol = i; + minRow = l; + } + else if( val > maxVal ) + { + maxVal = val; + maxCol = i; + maxRow = l; + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minCol; + (void)minRow; + (void)maxVal; + (void)maxCol; + (void)maxRow; +#endif +} + +#ifdef CAROTENE_NEON +namespace { + +void minMaxLocBlock(const u8 * src, u32 len, + u8 &minVal, u16 &minIdx, + u8 &maxVal, u16 &maxIdx) +{ + u16 tmp0123[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; + + uint8x16_t n_min = vdupq_n_u8(src[0]); + uint16x8_t n_minIdxl = vdupq_n_u16(0); + uint16x8_t n_minIdxh = vdupq_n_u16(0); + uint8x16_t n_max = vdupq_n_u8(src[0]); + uint16x8_t n_maxIdxl = vdupq_n_u16(0); + uint16x8_t n_maxIdxh = vdupq_n_u16(0); + uint16x8_t c16 = vdupq_n_u16(16); + uint16x8_t lineIdxOffset = vld1q_u16(tmp0123); + + s32 i = 0; + s32 bound = len - (16 - 1); + for(; i < bound; i+=16 ) + { + internal::prefetch(src + i); + uint8x16_t line = vld1q_u8(src + i); + + uint8x16_t minmask = vcltq_u8(line, n_min); + uint8x16_t maxmask = vcgtq_u8(line, n_max); + + n_min = vbslq_u8(minmask, line, n_min); + uint8x8_t minml = vget_low_u8(minmask); + uint8x8_t minmh = vget_high_u8(minmask); + uint16x8_t minml2 = vmovl_u8(minml); + uint16x8_t minmh2 = vmovl_u8(minmh); + minml2 = vqshlq_n_u16(minml2, 15); + minmh2 = vqshlq_n_u16(minmh2, 15); + n_minIdxl = vbslq_u16(minml2, lineIdxOffset, n_minIdxl); + n_minIdxh = vbslq_u16(minmh2, lineIdxOffset, n_minIdxh); + + n_max = vbslq_u8(maxmask, line, n_max); + uint8x8_t maxml = vget_low_u8(maxmask); + uint8x8_t maxmh = vget_high_u8(maxmask); + uint16x8_t maxml2 = vmovl_u8(maxml); + uint16x8_t maxmh2 = vmovl_u8(maxmh); + maxml2 = vqshlq_n_u16(maxml2, 15); + maxmh2 = vqshlq_n_u16(maxmh2, 15); + n_maxIdxl = vbslq_u16(maxml2, lineIdxOffset, n_maxIdxl); + n_maxIdxh = vbslq_u16(maxmh2, lineIdxOffset, n_maxIdxh); + + // idx[] +=16 + lineIdxOffset = vaddq_u16(lineIdxOffset, c16); + } + + // fix high part of indexes + uint16x8_t c8 = vdupq_n_u16(8); + n_minIdxh = vaddq_u16(n_minIdxh, c8); + n_maxIdxh = vaddq_u16(n_maxIdxh, c8); + + u8 fmin[16], fmax[16]; + u16 fminIdx[16], fmaxIdx[16]; + /*{ + uint8x8_t min_low = vget_low_u8(n_min); + uint8x8_t min_high = vget_high_u8(n_min); + uint8x8_t 
max_low = vget_low_u8(n_max); + uint8x8_t max_high = vget_high_u8(n_max); + + uint8x8_t minmask = vclt_u8(min_low, min_high); + uint8x8_t maxmask = vcgt_u8(max_low, max_high); + + uint8x8_t min2 = vbsl_u8(minmask, min_low, min_high); + uint8x8_t max2 = vbsl_u8(maxmask, max_low, max_high); + + uint16x8_t minidxmask = vmovl_u8(minmask); + uint16x8_t maxidxmask = vmovl_u8(maxmask); + minidxmask = vqshlq_n_u16(minidxmask, 15); + maxidxmask = vqshlq_n_u16(maxidxmask, 15); + + uint16x8_t n_minIdx = vbslq_u16(minidxmask, n_minIdxl, n_minIdxh); + uint16x8_t n_maxIdx = vbslq_u16(maxidxmask, n_maxIdxl, n_maxIdxh); + + vst1_u8((uint8_t*)fmin, min2); + vst1_u8((uint8_t*)fmax, max2); + + vst1q_u16((uint16_t*)(fminIdx), n_minIdx); + vst1q_u16((uint16_t*)(fmaxIdx), n_maxIdx); + }*/ + + vst1q_u8(fmin, n_min); + vst1q_u8(fmax, n_max); + vst1q_u16(fminIdx+0, n_minIdxl); + vst1q_u16(fmaxIdx+0, n_maxIdxl); + vst1q_u16(fminIdx+8, n_minIdxh); + vst1q_u16(fmaxIdx+8, n_maxIdxh); + + minIdx = fminIdx[0]; + maxIdx = fmaxIdx[0]; + minVal = fmin[0]; + maxVal = fmax[0]; + + for (s32 j = 1; j < 16; ++j) + { + u8 minval = fmin[j]; + u8 maxval = fmax[j]; + if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx)) + { + minIdx = fminIdx[j]; + minVal = minval; + } + if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx)) + { + maxIdx = fmaxIdx[j]; + maxVal = maxval; + } + } + + for(; i < (s32)len; ++i ) + { + u8 val = src[i]; + if( val < minVal ) + { + minVal = val; + minIdx = (u16)i; + } + else if( val > maxVal ) + { + maxVal = val; + maxIdx = (u16)i; + } + } +} + +void minMaxLocBlock(const s8 * src, u32 len, + s8 &minVal, u16 &minIdx, + s8 &maxVal, u16 &maxIdx) +{ + u16 tmp0123[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; + + int8x16_t n_min = vdupq_n_s8(src[0]); + uint16x8_t n_minIdxl = vdupq_n_u16(0); + uint16x8_t n_minIdxh = vdupq_n_u16(0); + int8x16_t n_max = vdupq_n_s8(src[0]); + uint16x8_t n_maxIdxl = vdupq_n_u16(0); + uint16x8_t n_maxIdxh = vdupq_n_u16(0); + uint16x8_t c16 = vdupq_n_u16(16); + uint16x8_t lineIdxOffset = vld1q_u16(tmp0123); + + s32 i = 0; + s32 bound = len - (16 - 1); + for(; i < bound; i+=16 ) + { + internal::prefetch(src + i); + int8x16_t line = vld1q_s8(src + i); + + uint8x16_t minmask = vcltq_s8(line, n_min); + uint8x16_t maxmask = vcgtq_s8(line, n_max); + + n_min = vbslq_s8(minmask, line, n_min); + uint8x8_t minml = vget_low_u8(minmask); + uint8x8_t minmh = vget_high_u8(minmask); + uint16x8_t minml2 = vmovl_u8(minml); + uint16x8_t minmh2 = vmovl_u8(minmh); + minml2 = vqshlq_n_u16(minml2, 15); + minmh2 = vqshlq_n_u16(minmh2, 15); + n_minIdxl = vbslq_u16(minml2, lineIdxOffset, n_minIdxl); + n_minIdxh = vbslq_u16(minmh2, lineIdxOffset, n_minIdxh); + + n_max = vbslq_s8(maxmask, line, n_max); + uint8x8_t maxml = vget_low_u8(maxmask); + uint8x8_t maxmh = vget_high_u8(maxmask); + uint16x8_t maxml2 = vmovl_u8(maxml); + uint16x8_t maxmh2 = vmovl_u8(maxmh); + maxml2 = vqshlq_n_u16(maxml2, 15); + maxmh2 = vqshlq_n_u16(maxmh2, 15); + n_maxIdxl = vbslq_u16(maxml2, lineIdxOffset, n_maxIdxl); + n_maxIdxh = vbslq_u16(maxmh2, lineIdxOffset, n_maxIdxh); + + // idx[] +=16 + lineIdxOffset = vaddq_u16(lineIdxOffset, c16); + } + + // fix high part of indexes + uint16x8_t c8 = vdupq_n_u16(8); + n_minIdxh = vaddq_u16(n_minIdxh, c8); + n_maxIdxh = vaddq_u16(n_maxIdxh, c8); + + s8 fmin[16], fmax[16]; + u16 fminIdx[16], fmaxIdx[16]; + + vst1q_s8(fmin, n_min); + vst1q_s8(fmax, n_max); + vst1q_u16(fminIdx+0, n_minIdxl); + vst1q_u16(fmaxIdx+0, n_maxIdxl); + vst1q_u16(fminIdx+8, 
n_minIdxh); + vst1q_u16(fmaxIdx+8, n_maxIdxh); + + minIdx = fminIdx[0]; + maxIdx = fmaxIdx[0]; + minVal = fmin[0]; + maxVal = fmax[0]; + + for (s32 j = 1; j < 16; ++j) + { + s8 minval = fmin[j]; + s8 maxval = fmax[j]; + if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx)) + { + minIdx = fminIdx[j]; + minVal = minval; + } + if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx)) + { + maxIdx = fmaxIdx[j]; + maxVal = maxval; + } + } + + for(; i < (s32)len; ++i ) + { + s8 val = src[i]; + if( val < minVal ) + { + minVal = val; + minIdx = (u16)i; + } + else if( val > maxVal ) + { + maxVal = val; + maxIdx = (u16)i; + } + } +} + +} // namespace +#endif // CAROTENE_NEON + +#define USHORT_BLOCK_MAX_SIZE (1 << 16) + +void minMaxLoc(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 &minVal, size_t &minCol, size_t &minRow, + u8 &maxVal, size_t &maxCol, size_t &maxRow) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minVal = srcBase[0]; + minCol = 0; + minRow = 0; + maxVal = srcBase[0]; + maxCol = 0; + maxRow = 0; + for(size_t l = 0; l < size.height; ++l) + { + const u8 * src = internal::getRowPtr( srcBase, srcStride, l); + if (size.width > 128) + { + for(size_t blockStart = 0; blockStart < size.width; blockStart += USHORT_BLOCK_MAX_SIZE) + { + u8 locMinVal, locMaxVal; + u16 locMinIdx, locMaxIdx; + size_t tail = size.width - blockStart; + minMaxLocBlock(src + blockStart, tail < USHORT_BLOCK_MAX_SIZE ? tail : USHORT_BLOCK_MAX_SIZE, + locMinVal, locMinIdx, locMaxVal, locMaxIdx); + + if (locMinVal == 0 && locMaxVal == 255) + { + minCol = blockStart + locMinIdx; + maxCol = blockStart + locMaxIdx; + minRow = l; + maxRow = l; + minVal = 0; + maxVal = 255; + return; + } + else + { + if (locMinVal < minVal) + { + minCol = blockStart + locMinIdx; + minRow = l; + minVal = locMinVal; + } + if (locMaxVal > maxVal) + { + maxCol = blockStart + locMaxIdx; + maxRow = l; + maxVal = locMaxVal; + } + } + } + } + else + { + for(size_t i = 0; i < size.width; ++i ) + { + u8 val = src[i]; + if( val < minVal ) + { + minVal = val; + minCol = i; + minRow = l; + } + else if( val > maxVal ) + { + maxVal = val; + maxCol = i; + maxRow = l; + } + } + } + + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minCol; + (void)minRow; + (void)maxVal; + (void)maxCol; + (void)maxRow; +#endif +} + +void minMaxLoc(const Size2D &size, + const s8 * srcBase, ptrdiff_t srcStride, + s8 &minVal, size_t &minCol, size_t &minRow, + s8 &maxVal, size_t &maxCol, size_t &maxRow) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + minVal = srcBase[0]; + minCol = 0; + minRow = 0; + maxVal = srcBase[0]; + maxCol = 0; + maxRow = 0; + for(size_t l = 0; l < size.height; ++l) + { + const s8 * src = internal::getRowPtr( srcBase, srcStride, l); + if (size.width > 128) + { + for(size_t blockStart = 0; blockStart < size.width; blockStart += USHORT_BLOCK_MAX_SIZE) + { + s8 locMinVal, locMaxVal; + u16 locMinIdx, locMaxIdx; + size_t tail = size.width - blockStart; + minMaxLocBlock(src + blockStart, tail < USHORT_BLOCK_MAX_SIZE ? 
tail : USHORT_BLOCK_MAX_SIZE, + locMinVal, locMinIdx, locMaxVal, locMaxIdx); + + if (locMinVal == -128 && locMaxVal == 127) + { + minCol = blockStart + locMinIdx; + maxCol = blockStart + locMaxIdx; + minRow = l; + maxRow = l; + minVal = -128; + maxVal = 127; + return; + } + else + { + if (locMinVal < minVal) + { + minCol = blockStart + locMinIdx; + minRow = l; + minVal = locMinVal; + } + if (locMaxVal > maxVal) + { + maxCol = blockStart + locMaxIdx; + maxRow = l; + maxVal = locMaxVal; + } + } + } + } + else + { + for(size_t i = 0; i < size.width; ++i ) + { + s8 val = src[i]; + if( val < minVal ) + { + minVal = val; + minRow = l; + minCol = i; + } + else if( val > maxVal ) + { + maxVal = val; + maxRow = l; + maxCol = i; + } + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)minVal; + (void)minCol; + (void)minRow; + (void)maxVal; + (void)maxCol; + (void)maxRow; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/morph.cpp b/3rdparty/carotene/src/morph.cpp new file mode 100644 index 0000000000..bcc6aa7e06 --- /dev/null +++ b/3rdparty/carotene/src/morph.cpp @@ -0,0 +1,728 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */ + +#include "common.hpp" + +#include +#include +#include +#include + +namespace CAROTENE_NS { + +bool isMorph3x3Supported(const Size2D &size, BORDER_MODE border) +{ + return isSupportedConfiguration() && size.width >= 16 && + (border == BORDER_MODE_CONSTANT || + border == BORDER_MODE_REPLICATE); +} + +#ifdef CAROTENE_NEON + +namespace { + +struct ErodeVecOp +{ + ErodeVecOp():borderValue(0){} + + ErodeVecOp(BORDER_MODE border, u8 borderValue_) : + borderValue(borderValue_) + { + if (border == BORDER_MODE_REPLICATE) + borderValue = std::numeric_limits::max(); + } + + inline uint8x16_t operator()(uint8x16_t a, uint8x16_t b) const + { + return vminq_u8(a, b); + } + + inline uint8x8_t operator()(uint8x8_t a, uint8x8_t b) const + { + return vmin_u8(a, b); + } + + inline u8 operator()(u8 a, u8 b) const + { + return std::min(a, b); + } + + u8 borderValue; +}; + +struct DilateVecOp +{ + DilateVecOp():borderValue(0){} + + DilateVecOp(BORDER_MODE border, u8 borderValue_) : + borderValue(borderValue_) + { + if (border == BORDER_MODE_REPLICATE) + borderValue = std::numeric_limits::min(); + } + + inline uint8x16_t operator()(uint8x16_t a, uint8x16_t b) const + { + return vmaxq_u8(a, b); + } + + inline uint8x8_t operator()(uint8x8_t a, uint8x8_t b) const + { + return vmax_u8(a, b); + } + + inline u8 operator()(u8 a, u8 b) const + { + return std::max(a, b); + } + + u8 borderValue; +}; + +template +void morph3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, const VecOp & vop) +{ + u8 borderValue = vop.borderValue; + ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height; + + const uint8x16_t v_zero = vdupq_n_u8(0); + const uint8x16_t v_border = vdupq_n_u8(borderValue); + + uint8x16_t tprev = v_zero, tcurr = v_zero, tnext = v_zero; + uint8x16_t t0 = v_zero, t1 = v_zero, t2 = v_zero; + + for (ptrdiff_t y = 0; y < height; ++y) + { + const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max(y - 1, 0)); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y); + const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1)); + u8 * drow = internal::getRowPtr(dstBase, dstStride, y); + + u8 prevx = 0, currx = 0, nextx = 0; + ptrdiff_t x = 0; + const ptrdiff_t bwidth = y + 2 < height ? width : (width - 16); + + // perform vertical convolution + for ( ; x <= bwidth; x += 16) + { + internal::prefetch(srow0 + x); + internal::prefetch(srow1 + x); + internal::prefetch(srow2 + x); + + uint8x16_t x0 = !srow0 ? v_border : vld1q_u8(srow0 + x); + uint8x16_t x1 = vld1q_u8(srow1 + x); + uint8x16_t x2 = !srow2 ? v_border : vld1q_u8(srow2 + x); + + // calculate values for plain CPU part below if needed + if (x + 16 >= bwidth) + { + ptrdiff_t x3 = x == width ? width - 1 : x; + ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max(x3 - 1, 0); + + if (border == BORDER_MODE_CONSTANT && x4 < 0) + prevx = borderValue; + else + prevx = vop(srow1[x4], + vop(srow2 ? srow2[x4] : borderValue, + srow0 ? srow0[x4] : borderValue)); + + currx = vop(srow2 ? srow2[x3] : borderValue, vop(srow1[x3], srow0 ? 
srow0[x3] : borderValue)); + } + + // make shift + if (x) + { + tprev = tcurr; + tcurr = tnext; + } + + // and calculate next value + tnext = vop(vop(x0, x1), x2); + + // make extrapolation for the first elements + if (!x) + { + // make border + if (border == BORDER_MODE_CONSTANT) + tcurr = v_border; + else if (border == BORDER_MODE_REPLICATE) + tcurr = vdupq_n_u8(vgetq_lane_u8(tnext, 0)); + + continue; + } + + // combine 3 "shifted" vectors + t0 = vextq_u8(tprev, tcurr, 15); + t1 = tcurr; + t2 = vextq_u8(tcurr, tnext, 1); + + // and add them + t0 = vop(t0, vop(t1, t2)); + + vst1q_u8(drow + x - 16, t0); + } + + x -= 16; + if (x == width) + --x; + + for ( ; x < width; ++x) + { + // make extrapolation for the last elements + if (x + 1 >= width) + { + if (border == BORDER_MODE_CONSTANT) + nextx = borderValue; + else if (border == BORDER_MODE_REPLICATE) + nextx = vop(srow2[x], vop(srow1[x], srow0[x])); + } + else + nextx = vop(vop(srow2 ? srow2[x + 1] : borderValue, + srow0 ? srow0[x + 1] : borderValue), + srow1[x + 1]); + + drow[x] = vop(prevx, vop(currx, nextx)); + + // make shift + prevx = currx; + currx = nextx; + } + } +} + +} // namespace + +#endif + +void erode3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue) +{ + internal::assertSupportedConfiguration(isMorph3x3Supported(size, border)); +#ifdef CAROTENE_NEON + morph3x3(size, + srcBase, srcStride, + dstBase, dstStride, + border, ErodeVecOp(border, borderValue)); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)border; + (void)borderValue; +#endif +} + +void dilate3x3(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue) +{ + internal::assertSupportedConfiguration(isMorph3x3Supported(size, border)); +#ifdef CAROTENE_NEON + morph3x3(size, + srcBase, srcStride, + dstBase, dstStride, + border, DilateVecOp(border, borderValue)); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)border; + (void)borderValue; +#endif +} + +#ifdef CAROTENE_NEON +namespace { + +template +void MorphRow(const u8* src, u8* dst, size_t width, s32 cn, size_t ksize) +{ + size_t i, j, k; + size_t width16 = (width & -16) * cn; + size_t width8 = (width & -8) * cn; + width *= cn; + + if (ksize == 1) + { + for (i = 0; i < width; i++) + dst[i] = src[i]; + return; + } + + ksize = ksize*cn; + VecUpdate updateOp; + switch(cn) + { + case 1: + for (i = 0; i < width16; i += 16) + { + const u8* sptr = src + i; + uint8x16_t s = vld1q_u8(sptr); + internal::prefetch(sptr); + + for( k = 1; k < ksize; ++k) + s = updateOp(s, vld1q_u8(sptr + k)); + + vst1q_u8(dst + i, s); + } + + for (; i < width8; i += 8) + { + const u8* sptr = src + i; + uint8x8_t s = vld1_u8(sptr); + internal::prefetch(sptr); + + for( k = 1; k < ksize; ++k) + s = updateOp(s, vld1_u8(sptr + k)); + + vst1_u8(dst + i, s); + } + break; + default: + for (i = 0; i < width16; i += 16) + { + uint8x16_t s = vld1q_u8(src + i); + internal::prefetch(src + i); + + for (k = cn; k < ksize; k += cn) + s = updateOp(s, vld1q_u8(src + i + k)); + + vst1q_u8(dst + i, s); + } + + for (; i < width8; i += 8) + { + uint8x8_t s = vld1_u8(src + i); + internal::prefetch(src + i); + + for (k = cn; k < ksize; k += cn) + s = updateOp(s, vld1_u8(src + i + k)); + + vst1_u8(dst + i, s); + } + break; + } + + ptrdiff_t i0 = i; + for( k = 0; k < (size_t)cn; k++, src++, dst++ ) + { 
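+        // Scalar tail for the trailing columns the vector loops above did not
+        // cover. Each pass of the first loop below produces two outputs at once:
+        // dst[i] and dst[i+cn] share the reduction over the overlapping samples
+        // s[cn], s[2*cn], ..., s[ksize-cn], so the inner j-loop runs once for
+        // both results and only the end points differ (s[0] for dst[i],
+        // s[ksize] for dst[i+cn]).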
+ for( i = i0; i <= width - cn*2; i += cn*2 ) + { + const u8* s = src + i; + u8 m = s[cn]; + for( j = cn*2; j < ksize; j += cn ) + m = updateOp(m, s[j]); + dst[i] = updateOp(m, s[0]); + dst[i+cn] = updateOp(m, s[j]); + } + + for( ; i < width; i += cn ) + { + const u8* s = src + i; + u8 m = s[0]; + for( j = cn; j < ksize; j += cn ) + m = updateOp(m, s[j]); + dst[i] = m; + } + } +} + +template +void MorphColumn(const u8** src, u8* dst, ptrdiff_t dststep, size_t count, size_t width, size_t ksize) +{ + size_t i, k; + size_t width32 = width & -32; + VecUpdate updateOp; + + uint8x16_t x0,x1,s0,s1; + if (ksize == 3) + { + for (; count > 1; count -= 2, dst += dststep * 2, src += 2) + { + for (i = 0; i < width32; i += 32) + { + const u8* sptr = src[1] + i; + s0 = vld1q_u8(sptr); + s1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + + sptr = src[2] + i; + x0 = vld1q_u8(sptr); + x1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + + s0 = updateOp(s0, x0); + s1 = updateOp(s1, x1); + + sptr = src[0] + i; + x0 = vld1q_u8(sptr); + x1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + + vst1q_u8(dst+i, updateOp(s0, x0)); + vst1q_u8(dst+i+16, updateOp(s1, x1)); + + sptr = src[3] + i; + x0 = vld1q_u8(sptr); + x1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + vst1q_u8(dst + dststep + i, updateOp(s0, x0)); + vst1q_u8(dst + dststep + i + 16, updateOp(s1, x1)); + + } + for(; i < width; i++ ) + { + u8 s = src[1][i]; + + for( k = 2; k < ksize; k++ ) + s = updateOp(s, src[k][i]); + + dst[i] = updateOp(s, src[0][i]); + dst[i+dststep] = updateOp(s, src[k][i]); + } + } + } + else if (ksize > 1) + for (; count > 1; count -= 2, dst += dststep*2, src += 2) + { + for (i = 0; i < width32; i += 32) + { + const u8* sptr = src[1] + i; + s0 = vld1q_u8(sptr); + s1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + for (k = 2; k < ksize; k++) + { + sptr = src[k] + i; + x0 = vld1q_u8(sptr); + x1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + + s0 = updateOp(s0, x0); + s1 = updateOp(s1, x1); + } + + sptr = src[0] + i; + x0 = vld1q_u8(sptr); + x1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + + vst1q_u8(dst+i, updateOp(s0, x0)); + vst1q_u8(dst+i+16, updateOp(s1, x1)); + + sptr = src[k] + i; + x0 = vld1q_u8(sptr); + x1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + vst1q_u8(dst + dststep + i, updateOp(s0, x0)); + vst1q_u8(dst + dststep + i + 16, updateOp(s1, x1)); + } + for(; i < width; i++ ) + { + u8 s = src[1][i]; + + for( k = 2; k < ksize; k++ ) + s = updateOp(s, src[k][i]); + + dst[i] = updateOp(s, src[0][i]); + dst[i+dststep] = updateOp(s, src[k][i]); + } + } + + for (; count > 0; count--, dst += dststep, src++) + { + for (i = 0; i < width32; i += 32) + { + const u8* sptr = src[0] + i; + s0 = vld1q_u8(sptr); + s1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + + for (k = 1; k < ksize; k++) + { + sptr = src[k] + i; + x0 = vld1q_u8(sptr); + x1 = vld1q_u8(sptr + 16); + internal::prefetch(sptr); + s0 = updateOp(s0, x0); + s1 = updateOp(s1, x1); + } + + vst1q_u8(dst + i, s0); + vst1q_u8(dst + i + 16, s1); + } + for(; i < width; i++ ) + { + u8 s = src[0][i]; + for( k = 1; k < ksize; k++ ) + s = updateOp(s, src[k][i]); + dst[i] = s; + } + } +} + +template +inline void morphology(const Size2D &ssize, u32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + const Size2D &ksize, + size_t anchorX, size_t anchorY, + BORDER_MODE rowBorderType, BORDER_MODE columnBorderType, + const u8 * borderValues, Margin borderMargin) +{ + //Temporary buffers common for all 
iterations + std::vector _srcRow(cn*(ssize.width + ksize.width - 1)); + u8* srcRow = &_srcRow[0]; + + size_t bufRows = std::max(ksize.height + 3, std::max(anchorY, ksize.height-anchorY-1)*2+1); + std::vector _rows(bufRows); + u8** rows = &_rows[0]; + + // adjust swidthcn so that the used part of buffers stays compact in memory + ptrdiff_t swidthcn = cn*((ssize.width + 15) & -16);// cn * (aligned ssize.width size) + std::vector _ringBuf(swidthcn*bufRows+16); + u8 * ringBuf = internal::alignPtr(&_ringBuf[0], 16); + + size_t borderLength = std::max(ksize.width - 1, 1) * cn; + std::vector _borderTab(borderLength); + ptrdiff_t * borderTab = &_borderTab[0]; + + std::vector _constBorderValue; + std::vector _constBorderRow; + u8 * constBorderValue = NULL; + u8 * constBorderRow = NULL; + if( rowBorderType == BORDER_MODE_CONSTANT || columnBorderType == BORDER_MODE_CONSTANT ) + { + _constBorderValue.resize(borderLength); + constBorderValue = &_constBorderValue[0]; + size_t i; + for(i = 0; i < cn; i++) + constBorderValue[i] = borderValues[i]; + for(; i < borderLength; i++) + constBorderValue[i] = constBorderValue[i-cn]; + + if( columnBorderType == BORDER_MODE_CONSTANT ) + { + _constBorderRow.resize(cn*(ssize.width + ksize.width - 1 + 16)); + constBorderRow = internal::alignPtr(&_constBorderRow[0], 16); + size_t N = (ssize.width + ksize.width - 1)*cn; + for( i = 0; i < N; i += borderLength ) + { + size_t n = std::min( borderLength, N - i ); + for(size_t j = 0; j < n; j++) + srcRow[i+j] = constBorderValue[j]; + } + MorphRow(srcRow, constBorderRow, ssize.width, cn, ksize.width); + } + } + + Size2D wholeSize(ssize.width + borderMargin.left + borderMargin.right, + ssize.height + borderMargin.top + borderMargin.bottom); + + ptrdiff_t dx1 = std::max(anchorX - (ptrdiff_t)borderMargin.left, 0); + ptrdiff_t dx2 = std::max((ptrdiff_t)ksize.width - anchorX - 1 - (ptrdiff_t)borderMargin.right, 0); + // recompute border tables + if( dx1 > 0 || dx2 > 0 ) + { + if( rowBorderType == BORDER_MODE_CONSTANT ) + { + memcpy( srcRow, &constBorderValue[0], dx1*cn ); + memcpy( srcRow + (ssize.width + ksize.width - 1 - dx2)*cn, &constBorderValue[0], dx2*cn ); + } + else + { + ptrdiff_t xofs1 = std::min(borderMargin.left, anchorX) - borderMargin.left; + + ptrdiff_t wholeWidth = wholeSize.width; + + ptrdiff_t i, j; + for( i = 0; i < dx1; i++ ) + { + ptrdiff_t p0 = (internal::borderInterpolate(i-dx1, wholeWidth, rowBorderType) + xofs1)*cn; + for( j = 0; j < (ptrdiff_t)cn; j++ ) + borderTab[i*cn + j] = p0 + j; + } + + for( i = 0; i < dx2; i++ ) + { + ptrdiff_t p0 = (internal::borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1)*cn; + for( j = 0; j < (ptrdiff_t)cn; j++ ) + borderTab[(i + dx1)*cn + j] = p0 + j; + } + } + } + + ptrdiff_t startY, startY0, endY, rowCount; + startY = startY0 = std::max(borderMargin.top - anchorY, 0); + endY = std::min(borderMargin.top + ssize.height + ksize.height - anchorY - 1, wholeSize.height); + + const u8* src = srcBase + (startY - borderMargin.top)*srcStride; + u8* dst = dstBase; + + ptrdiff_t width = ssize.width, kwidth = ksize.width; + ptrdiff_t kheight = ksize.height, ay = anchorY; + ptrdiff_t width1 = ssize.width + kwidth - 1; + ptrdiff_t xofs1 = std::min(borderMargin.left, anchorX); + bool makeBorder = (dx1 > 0 || dx2 > 0) && rowBorderType != BORDER_MODE_CONSTANT; + ptrdiff_t dy = 0, i = 0; + + src -= xofs1*cn; + ptrdiff_t count = endY - startY; + + rowCount = 0; + for(;; dst += dstStride*i, dy += i) + { + ptrdiff_t dcount = bufRows - ay - startY - rowCount + borderMargin.top; 
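+        // dcount is, roughly, how many fresh input rows must be pushed through
+        // the row filter into the ring buffer before the next batch of output
+        // rows can be completed: on the first pass it fills the buffer up to
+        // the kernel anchor, and in the steady state it tops the buffer up by
+        // bufRows - kheight + 1 rows per iteration.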
+ dcount = dcount > 0 ? dcount : bufRows - kheight + 1; + dcount = std::min(dcount, count); + count -= dcount; + for( ; dcount-- > 0; src += srcStride ) + { + ptrdiff_t bi = (startY - startY0 + rowCount) % bufRows; + u8* brow = ringBuf + bi*swidthcn; + + if( (size_t)(++rowCount) > bufRows ) + { + --rowCount; + ++startY; + } + + memcpy( srcRow + dx1*cn, src, (width1 - dx2 - dx1)*cn ); + + if( makeBorder ) + { + for( i = 0; i < (ptrdiff_t)(dx1*cn); i++ ) + srcRow[i] = src[borderTab[i]]; + for( i = 0; i < (ptrdiff_t)(dx2*cn); i++ ) + srcRow[i + (width1 - dx2)*cn] = src[borderTab[i+dx1*cn]]; + } + + MorphRow(srcRow, brow, width, cn, ksize.width); + } + + ptrdiff_t max_i = std::min(bufRows, ssize.height - dy + (kheight - 1)); + for( i = 0; i < max_i; i++ ) + { + ptrdiff_t srcY = internal::borderInterpolate(dy + i + borderMargin.top - ay, + wholeSize.height, columnBorderType); + if( srcY < 0 ) // can happen only with constant border type + rows[i] = constBorderRow; + else + { + if( srcY >= startY + rowCount ) + break; + ptrdiff_t bi = (srcY - startY0) % bufRows; + rows[i] = ringBuf + bi*swidthcn; + } + } + if( i < kheight ) + break; + i -= kheight - 1; + MorphColumn((const u8**)rows, dst, dstStride, i, ssize.width*cn, ksize.height); + } +} + +} // namespace +#endif // CAROTENE_NEON + +void erode(const Size2D &ssize, u32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + const Size2D &ksize, + size_t anchorX, size_t anchorY, + BORDER_MODE rowBorderType, BORDER_MODE columnBorderType, + const u8 * borderValues, Margin borderMargin) +{ + internal::assertSupportedConfiguration(ssize.width > 0 && ssize.height > 0 && + anchorX < ksize.width && anchorY < ksize.height); +#ifdef CAROTENE_NEON + morphology(ssize, cn, srcBase, srcStride, dstBase, dstStride, + ksize, anchorX, anchorY, rowBorderType, columnBorderType, + borderValues, borderMargin); +#else + (void)cn; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)rowBorderType; + (void)columnBorderType; + (void)borderValues; + (void)borderMargin; +#endif +} + +void dilate(const Size2D &ssize, u32 cn, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + const Size2D &ksize, + size_t anchorX, size_t anchorY, + BORDER_MODE rowBorderType, BORDER_MODE columnBorderType, + const u8 * borderValues, Margin borderMargin) +{ + internal::assertSupportedConfiguration(ssize.width > 0 && ssize.height > 0 && + anchorX < ksize.width && anchorY < ksize.height); +#ifdef CAROTENE_NEON + morphology(ssize, cn, srcBase, srcStride, dstBase, dstStride, + ksize, anchorX, anchorY, rowBorderType, columnBorderType, + borderValues, borderMargin); +#else + (void)cn; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)rowBorderType; + (void)columnBorderType; + (void)borderValues; + (void)borderMargin; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/mul.cpp b/3rdparty/carotene/src/mul.cpp new file mode 100644 index 0000000000..3bbbfc50aa --- /dev/null +++ b/3rdparty/carotene/src/mul.cpp @@ -0,0 +1,1572 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2016, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. 
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+#include <cfloat>
+#include <climits>
+#include <cmath>
+#include <cstring>
+#include <limits>
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+bool isIntegerScale(f32 scale)
+{
+    return std::fabs(scale - static_cast<s32>(scale)) < FLT_EPSILON;
+}
+
+template <int shift>
+void mulu8(const Size2D &size,
+           const u8 * src0Base, ptrdiff_t src0Stride,
+           const u8 * src1Base, ptrdiff_t src1Stride,
+           u8 * dstBase, ptrdiff_t dstStride,
+           CONVERT_POLICY cpolicy)
+{
+    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
+    size_t roiw8 = size.width >= 7 ?
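+                      // size.width is unsigned, so a bare "size.width - 7" (or
+                      // "- 15" above) would wrap around for very narrow images;
+                      // the guard clamps the vectorized bound to 0 and leaves
+                      // all of the work to the scalar tail loop instead.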
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1)); + uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1)); + + v_dst0 = vshrq_n_u16(v_dst0, shift); + v_dst1 = vshrq_n_u16(v_dst1, shift); + + vst1q_u8(dst + j, vcombine_u8(vqmovn_u16(v_dst0), vqmovn_u16(v_dst1))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_dst = vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j)); + vst1_u8(dst + j, vqmovn_u16(vshrq_n_u16(v_dst, shift))); + } + + for (; j < size.width; j++) + { + u16 val = (u16)src0[j] * (u16)src1[j]; + dst[j] = internal::saturate_cast(val >> shift); + } + } + else // CONVERT_POLICY_WRAP + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1)); + uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1)); + + v_dst0 = vshrq_n_u16(v_dst0, shift); + v_dst1 = vshrq_n_u16(v_dst1, shift); + + vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_dst = vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j)); + vst1_u8(dst + j, vmovn_u16(vshrq_n_u16(v_dst, shift))); + } + + for (; j < size.width; j++) + { + u16 val = (u16)src0[j] * (u16)src1[j]; + dst[j] = (u8)(val >> shift); + } + } + } +} + +template +void muls16(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY cpolicy) +{ + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + uint16x8_t v_32767 = vdupq_n_u16(0x7FFF); + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + s16 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1)); + uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1)); + + v_dst0 = vshrq_n_u16(v_dst0, shift); + v_dst1 = vshrq_n_u16(v_dst1, shift); + + vst1q_s16(dst + j, vreinterpretq_s16_u16(vminq_u16(v_32767, v_dst0))); + vst1q_s16(dst + j + 8, vreinterpretq_s16_u16(vminq_u16(v_32767, v_dst1))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_dst = vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j)); + v_dst = vshrq_n_u16(v_dst, shift); + vst1q_s16(dst + j, vreinterpretq_s16_u16(vminq_u16(v_32767, v_dst))); + } + + for (; j < size.width; j++) + { + u16 val = (u16)src0[j] * (u16)src1[j]; + dst[j] = internal::saturate_cast(val >> shift); + } + } + else // CONVERT_POLICY_WRAP + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1)); + uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1)); + + v_dst0 = vshrq_n_u16(v_dst0, shift); + v_dst1 = vshrq_n_u16(v_dst1, shift); + + vst1q_s16(dst + j, vreinterpretq_s16_u16(v_dst0)); + vst1q_s16(dst + j + 8, vreinterpretq_s16_u16(v_dst1)); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_dst = vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j)); + v_dst = vshrq_n_u16(v_dst, shift); + vst1q_s16(dst + j, vreinterpretq_s16_u16(v_dst)); + } + + for (; j < size.width; j++) + { + u16 val = (u16)src0[j] * (u16)src1[j]; + dst[j] = (s16)(val >> shift); + } + } + } +} + +typedef void (* mulFuncu8)(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + u8 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY cpolicy); + +typedef void (* mulFuncs16)(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY cpolicy); + +} // namespace + +#endif + +void mul(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + u8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + internal::assertSupportedConfiguration(); + +#ifdef CAROTENE_NEON + if ((scale * UCHAR_MAX * UCHAR_MAX) < 1.0f) + { + for (size_t y = 0; y < size.height; ++y) + { + u8 * dst = internal::getRowPtr(dstBase, dstStride, y); + std::memset(dst, 0, sizeof(u8) * size.width); + } + return; + } + + s32 iscale = static_cast(scale), exp = 0; + f32 significand = frexp(scale, &exp); + bool is_integer_scale = isIntegerScale(scale), + is_power_of_2 = (significand == 0.5f) && (exp <= 0); + exp = -exp + 1; + + if (is_power_of_2) + { + static const mulFuncu8 funcs[16] = + { + NULL, + mulu8<1>, + mulu8<2>, + mulu8<3>, + mulu8<4>, + mulu8<5>, + mulu8<6>, + mulu8<7>, + mulu8<8>, + mulu8<9>, + mulu8<10>, + mulu8<11>, + mulu8<12>, + mulu8<13>, + mulu8<14>, + mulu8<15> + }; + + 
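+        // frexp() above decomposed scale as significand * 2^e with significand
+        // in [0.5, 1), so scale is an exact power of two not above 0.5 precisely
+        // when significand == 0.5 and e <= 0. Multiplying by such a scale is a
+        // right shift by 1 - e, the value stored back into exp: e.g.
+        // scale = 0.25f gives frexp -> (0.5, -1), hence funcs[2], a mulu8<2>
+        // kernel that fuses the two-bit shift into the vector loop.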
mulFuncu8 func = funcs[exp]; + + func(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + cpolicy); + + return; + } + + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1)); + uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1)); + + vst1q_u8(dst + j, vcombine_u8(vqmovn_u16(v_dst0), vqmovn_u16(v_dst1))); + } + for (; j < roiw8; j += 8) + { + vst1_u8(dst + j, vqmovn_u16(vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j)))); + } + + for (; j < size.width; j++) + { + u16 val = (u16)src0[j] * (u16)src1[j]; + dst[j] = internal::saturate_cast(val); + } + } + else // generic case using floats + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + + uint8x16_t v_src0 = vld1q_u8(src0 + j); + uint8x16_t v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0)); + uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1)); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale); + v_src0_p = vmovl_u8(vget_high_u8(v_src0)); + v_src1_p = vmovl_u8(vget_high_u8(v_src1)); + float32x4_t v_dst2f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale); + float32x4_t v_dst3f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale); + uint16x8_t v_dst0u = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst0f)), + vqmovn_u32(vcvtq_u32_f32(v_dst1f))); + uint16x8_t v_dst1u = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst2f)), + vqmovn_u32(vcvtq_u32_f32(v_dst3f))); + vst1q_u8(dst + j, vcombine_u8(vqmovn_u16(v_dst0u), vqmovn_u16(v_dst1u))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_src0 = vmovl_u8(vld1_u8(src0 + j)); + uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))), scale); + uint16x8_t v_dstu = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(v_dst0f)), + vqmovn_u32(vcvtq_u32_f32(v_dst1f))); + vst1_u8(dst + j, vqmovn_u16(v_dstu)); + } + + for (; j < size.width; j++) + { + f32 fval = (f32)src0[j] * (f32)src1[j] * scale; + dst[j] = internal::saturate_cast((s32)trunc(fval)); + } + } + } + else // CONVERT_POLICY_WRAP + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = 
vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1)); + uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1)); + + vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1))); + } + for (; j < roiw8; j += 8) + { + vst1_u8(dst + j, vmovn_u16(vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j)))); + } + + for (; j < size.width; j++) + { + u16 val = (u16)src0[j] * (u16)src1[j]; + dst[j] = (u8)(val); + } + } + else // generic case using floats + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + uint8x16_t v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0)); + uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1)); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale); + v_src0_p = vmovl_u8(vget_high_u8(v_src0)); + v_src1_p = vmovl_u8(vget_high_u8(v_src1)); + float32x4_t v_dst2f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale); + float32x4_t v_dst3f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale); + uint16x8_t v_dst0u = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)), + vmovn_u32(vcvtq_u32_f32(v_dst1f))); + uint16x8_t v_dst1u = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst2f)), + vmovn_u32(vcvtq_u32_f32(v_dst3f))); + vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst0u), vmovn_u16(v_dst1u))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_src0 = vmovl_u8(vld1_u8(src0 + j)); + uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))), scale); + uint16x8_t v_dstu = vcombine_u16(vmovn_u32(vcvtq_u32_f32(v_dst0f)), + vmovn_u32(vcvtq_u32_f32(v_dst1f))); + vst1_u8(dst + j, vmovn_u16(v_dstu)); + } + + for (; j < size.width; j++) + { + f32 fval = (f32)src0[j] * (f32)src1[j] * scale; + dst[j] = (u8)(s32)trunc(fval); + } + } + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)cpolicy; + (void)scale; +#endif +} + +void mul(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (((scale * UCHAR_MAX * UCHAR_MAX) < 1.0f) && (scale >= 0)) + { + for (size_t y = 0; y < size.height; ++y) + { + s16 * dst = internal::getRowPtr(dstBase, dstStride, y); + std::memset(dst, 0, sizeof(s16) * size.width); + } + return; + } + + s32 iscale = static_cast(scale), exp = 0; + f32 significand = frexp(scale, &exp); + bool is_integer_scale = isIntegerScale(scale), + is_power_of_2 = (significand == 0.5f) && (exp <= 0); + exp = -exp + 1; + + if (is_power_of_2) + { + static const mulFuncs16 funcs[16] = + { + NULL, + 
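+            // slot 0 (shift == 0, i.e. scale == 1) is never reached: frexp(1.0f)
+            // yields exponent 1, which fails the exp <= 0 power-of-two test, so
+            // scale == 1 goes down the plain integer path instead; and scales
+            // below 2^-15 never get here either, having already hit the
+            // memset-to-zero early out (scale * 255 * 255 < 1) above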
muls16<1>, + muls16<2>, + muls16<3>, + muls16<4>, + muls16<5>, + muls16<6>, + muls16<7>, + muls16<8>, + muls16<9>, + muls16<10>, + muls16<11>, + muls16<12>, + muls16<13>, + muls16<14>, + muls16<15> + }; + + mulFuncs16 func = funcs[exp]; + + func(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + cpolicy); + + return; + } + + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + uint16x8_t v_32767 = vdupq_n_u16(0x7FFF); + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + s16 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1)); + uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1)); + + vst1q_s16(dst + j, vreinterpretq_s16_u16(vminq_u16(v_32767, v_dst0))); + vst1q_s16(dst + j +8, vreinterpretq_s16_u16(vminq_u16(v_32767, v_dst1))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_dst = vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j)); + vst1q_s16(dst + j, vreinterpretq_s16_u16(vminq_u16(v_32767, v_dst))); + } + + for (; j < size.width; j++) + { + u16 val = (u16)src0[j] * (u16)src1[j]; + dst[j] = internal::saturate_cast(val); + } + } + else // generic case using floats + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + uint8x16_t v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0)); + uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1)); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale); + vst1q_s16(dst + j, vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst0f)), + vqmovn_s32(vcvtq_s32_f32(v_dst1f)))); + + v_src0_p = vmovl_u8(vget_high_u8(v_src0)); + v_src1_p = vmovl_u8(vget_high_u8(v_src1)); + v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale); + v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale); + vst1q_s16(dst + j + 8, vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst0f)), + vqmovn_s32(vcvtq_s32_f32(v_dst1f)))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_src0 = vmovl_u8(vld1_u8(src0 + j)); + uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))), scale); + vst1q_s16(dst + j, vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst0f)), + vqmovn_s32(vcvtq_s32_f32(v_dst1f)))); + } + + for (; j < size.width; j++) + { + f32 fval = (f32)src0[j] * (f32)src1[j] * scale; + dst[j] = 
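+                             // the scaled float product is truncated toward
+                             // zero first and only then saturated, so e.g. with
+                             // scale = 1.5f, 200 * 200 -> 60000.0 truncates to
+                             // 60000 and saturates to the s16 maximum 32767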
internal::saturate_cast((s32)trunc(fval)); + } + } + } + else // CONVERT_POLICY_WRAP + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j), v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_dst0 = vmull_u8(vget_low_u8(v_src0), vget_low_u8(v_src1)); + uint16x8_t v_dst1 = vmull_u8(vget_high_u8(v_src0), vget_high_u8(v_src1)); + + vst1q_s16(dst + j, vreinterpretq_s16_u16(v_dst0)); + vst1q_s16(dst + j + 8, vreinterpretq_s16_u16(v_dst1)); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_dst = vmull_u8(vld1_u8(src0 + j), vld1_u8(src1 + j)); + vst1q_s16(dst + j, vreinterpretq_s16_u16(v_dst)); + } + + for (; j < size.width; j++) + { + u16 val = (u16)src0[j] * (u16)src1[j]; + dst[j] = (s16)(val); + } + } + else // generic case using floats + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + uint8x16_t v_src1 = vld1q_u8(src1 + j); + + uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0)); + uint16x8_t v_src1_p = vmovl_u8(vget_low_u8(v_src1)); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale); + vst1q_s16(dst + j, vcombine_s16(vmovn_s32(vcvtq_s32_f32(v_dst0f)), + vmovn_s32(vcvtq_s32_f32(v_dst1f)))); + + v_src0_p = vmovl_u8(vget_high_u8(v_src0)); + v_src1_p = vmovl_u8(vget_high_u8(v_src1)); + v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1_p)))), scale); + v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1_p)))), scale); + vst1q_s16(dst + j + 8, vcombine_s16(vmovn_s32(vcvtq_s32_f32(v_dst0f)), + vmovn_s32(vcvtq_s32_f32(v_dst1f)))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_src0 = vmovl_u8(vld1_u8(src0 + j)); + uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))), scale); + vst1q_s16(dst + j, vcombine_s16(vmovn_s32(vcvtq_s32_f32(v_dst0f)), + vmovn_s32(vcvtq_s32_f32(v_dst1f)))); + } + + for (; j < size.width; j++) + { + f32 fval = (f32)src0[j] * (f32)src1[j] * scale; + dst[j] = (s16)(s32)trunc(fval); + } + } + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)cpolicy; + (void)scale; +#endif +} + +void mul(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + internal::assertSupportedConfiguration(); + +#ifdef CAROTENE_NEON + if (scale == 0.0f) + { + for (size_t y = 0; y < size.height; ++y) + { + s16 * dst = internal::getRowPtr(dstBase, dstStride, y); + std::memset(dst, 0, sizeof(s16) * size.width); + } + return; + } + + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + bool is_integer_scale = isIntegerScale(scale); + s32 iscale = static_cast(scale); + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + s16 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + + int16x8_t v_src0_p = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0))); + int16x8_t v_src1_p = vld1q_s16(src1 + j); + int16x8_t v_dst = vcombine_s16(vqmovn_s32(vmull_s16(vget_low_s16(v_src0_p), vget_low_s16(v_src1_p))), + vqmovn_s32(vmull_s16(vget_high_s16(v_src0_p), vget_high_s16(v_src1_p)))); + vst1q_s16(dst + j, v_dst); + + v_src0_p = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0))); + v_src1_p = vld1q_s16(src1 + j + 8); + v_dst = vcombine_s16(vqmovn_s32(vmull_s16(vget_low_s16(v_src0_p), vget_low_s16(v_src1_p))), + vqmovn_s32(vmull_s16(vget_high_s16(v_src0_p), vget_high_s16(v_src1_p)))); + vst1q_s16(dst + j + 8, v_dst); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vld1q_u8(src0 + j)))); + int16x8_t v_src1 = vld1q_s16(src1 + j); + int16x8_t v_dst = vcombine_s16(vqmovn_s32(vmull_s16(vget_low_s16(v_src0), vget_low_s16(v_src1))), + vqmovn_s32(vmull_s16(vget_high_s16(v_src0), vget_high_s16(v_src1)))); + vst1q_s16(dst + j, v_dst); + } + + for (; j < size.width; j++) + { + s32 val = (s32)src0[j] * (s32)src1[j]; + dst[j] = internal::saturate_cast(val); + } + } + else // generic case using floats + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + + uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0)); + int16x8_t v_src1_p = vld1q_s16(src1 + j); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1_p)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1_p)))), scale); + vst1q_s16(dst + j, vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst0f)), + vqmovn_s32(vcvtq_s32_f32(v_dst1f)))); + + v_src0_p = vmovl_u8(vget_high_u8(v_src0)); + v_src1_p = vld1q_s16(src1 + j + 8); + v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1_p)))), scale); + v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1_p)))), scale); + vst1q_s16(dst + j + 8, vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst0f)), + vqmovn_s32(vcvtq_s32_f32(v_dst1f)))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_src0 = vmovl_u8(vld1_u8(src0 + j)); + int16x8_t v_src1 = vld1q_s16(src1 + j); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1)))), scale); + vst1q_s16(dst + j, vcombine_s16(vqmovn_s32(vcvtq_s32_f32(v_dst0f)), + vqmovn_s32(vcvtq_s32_f32(v_dst1f)))); + } + + for (; j < size.width; j++) + { + f32 fval = (f32)src0[j] * 
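+                             // (the integer path above widens the u8 operand
+                             // with vmovl_u8 and bit-reinterprets the result as
+                             // s16; that is lossless because the widened lanes
+                             // are at most 255, comfortably inside s16, after
+                             // which the signed vmull_s16 / vqmovn_s32 pair
+                             // applies)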
(f32)src1[j] * scale; + dst[j] = internal::saturate_cast((s32)trunc(fval)); + } + } + } + else // CONVERT_POLICY_WRAP + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + + int16x8_t v_src0_p = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0))); + int16x8_t v_src1_p = vld1q_s16(src1 + j); + int16x8_t v_dst = vcombine_s16(vmovn_s32(vmull_s16(vget_low_s16(v_src0_p), vget_low_s16(v_src1_p))), + vmovn_s32(vmull_s16(vget_high_s16(v_src0_p), vget_high_s16(v_src1_p)))); + vst1q_s16(dst + j, v_dst); + + v_src0_p = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0))); + v_src1_p = vld1q_s16(src1 + j + 8); + v_dst = vcombine_s16(vmovn_s32(vmull_s16(vget_low_s16(v_src0_p), vget_low_s16(v_src1_p))), + vmovn_s32(vmull_s16(vget_high_s16(v_src0_p), vget_high_s16(v_src1_p)))); + vst1q_s16(dst + j + 8, v_dst); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vld1q_u8(src0 + j)))); + int16x8_t v_src1 = vld1q_s16(src1 + j); + int16x8_t v_dst = vcombine_s16(vmovn_s32(vmull_s16(vget_low_s16(v_src0), vget_low_s16(v_src1))), + vmovn_s32(vmull_s16(vget_high_s16(v_src0), vget_high_s16(v_src1)))); + vst1q_s16(dst + j, v_dst); + } + + for (; j < size.width; j++) + { + s32 val = (s32)src0[j] * (s32)src1[j]; + dst[j] = (s16)(val); + } + } + else // generic case using floats + { + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src0 = vld1q_u8(src0 + j); + + uint16x8_t v_src0_p = vmovl_u8(vget_low_u8(v_src0)); + int16x8_t v_src1_p = vld1q_s16(src1 + j); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1_p)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1_p)))), scale); + vst1q_s16(dst + j, vcombine_s16(vmovn_s32(vcvtq_s32_f32(v_dst0f)), + vmovn_s32(vcvtq_s32_f32(v_dst1f)))); + + v_src0_p = vmovl_u8(vget_high_u8(v_src0)); + v_src1_p = vld1q_s16(src1 + j + 8); + v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0_p))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1_p)))), scale); + v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0_p))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1_p)))), scale); + vst1q_s16(dst + j + 8, vcombine_s16(vmovn_s32(vcvtq_s32_f32(v_dst0f)), + vmovn_s32(vcvtq_s32_f32(v_dst1f)))); + } + for (; j < roiw8; j += 8) + { + uint16x8_t v_src0 = vmovl_u8(vld1_u8(src0 + j)); + int16x8_t v_src1 = vld1q_s16(src1 + j); + float32x4_t v_dst0f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1)))), scale); + float32x4_t v_dst1f = vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1)))), scale); + vst1q_s16(dst + j, vcombine_s16(vmovn_s32(vcvtq_s32_f32(v_dst0f)), + vmovn_s32(vcvtq_s32_f32(v_dst1f)))); + } + + for (; j < size.width; j++) + { + f32 fval = (f32)src0[j] * (f32)src1[j] * scale; + dst[j] = (s16)(s32)trunc(fval); + } + } + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)cpolicy; + (void)scale; +#endif +} + +namespace { + +#ifdef CAROTENE_NEON + +template +inline T mulSaturateQ(const 
T &v1, const T &v2, const float scale)
+{
+    return internal::vcombine(internal::vqmovn(mulSaturateQ(internal::vmovl(internal::vget_low(v1)),
+                                                            internal::vmovl(internal::vget_low(v2)), scale)),
+                              internal::vqmovn(mulSaturateQ(internal::vmovl(internal::vget_high(v1)),
+                                                            internal::vmovl(internal::vget_high(v2)), scale))
+                             );
+}
+template <>
+inline int32x4_t mulSaturateQ(const int32x4_t &v1, const int32x4_t &v2, const float scale)
+{ return vcvtq_s32_f32(vmulq_n_f32(vmulq_f32(vcvtq_f32_s32(v1), vcvtq_f32_s32(v2)), scale)); }
+template <>
+inline uint32x4_t mulSaturateQ(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
+{ return vcvtq_u32_f32(vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(v1), vcvtq_f32_u32(v2)), scale)); }
+
+template <typename T>
+inline T mulSaturate(const T &v1, const T &v2, const float scale)
+{
+    return internal::vqmovn(mulSaturateQ(internal::vmovl(v1), internal::vmovl(v2), scale));
+}
+template <>
+inline int32x2_t mulSaturate(const int32x2_t &v1, const int32x2_t &v2, const float scale)
+{ return vcvt_s32_f32(vmul_n_f32(vmul_f32(vcvt_f32_s32(v1), vcvt_f32_s32(v2)), scale)); }
+template <>
+inline uint32x2_t mulSaturate(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
+{ return vcvt_u32_f32(vmul_n_f32(vmul_f32(vcvt_f32_u32(v1), vcvt_f32_u32(v2)), scale)); }
+
+
+template <typename T>
+inline T mulWrapQ(const T &v1, const T &v2, const float scale)
+{
+    return internal::vcombine(internal::vmovn(mulWrapQ(internal::vmovl(internal::vget_low(v1)),
+                                                       internal::vmovl(internal::vget_low(v2)), scale)),
+                              internal::vmovn(mulWrapQ(internal::vmovl(internal::vget_high(v1)),
+                                                       internal::vmovl(internal::vget_high(v2)), scale))
+                             );
+}
+template <>
+inline int32x4_t mulWrapQ(const int32x4_t &v1, const int32x4_t &v2, const float scale)
+{ return vcvtq_s32_f32(vmulq_n_f32(vmulq_f32(vcvtq_f32_s32(v1), vcvtq_f32_s32(v2)), scale)); }
+template <>
+inline uint32x4_t mulWrapQ(const uint32x4_t &v1, const uint32x4_t &v2, const float scale)
+{ return vcvtq_u32_f32(vmulq_n_f32(vmulq_f32(vcvtq_f32_u32(v1), vcvtq_f32_u32(v2)), scale)); }
+
+template <typename T>
+inline T mulWrap(const T &v1, const T &v2, const float scale)
+{
+    return internal::vmovn(mulWrapQ(internal::vmovl(v1), internal::vmovl(v2), scale));
+}
+template <>
+inline int32x2_t mulWrap(const int32x2_t &v1, const int32x2_t &v2, const float scale)
+{ return vcvt_s32_f32(vmul_n_f32(vmul_f32(vcvt_f32_s32(v1), vcvt_f32_s32(v2)), scale)); }
+template <>
+inline uint32x2_t mulWrap(const uint32x2_t &v1, const uint32x2_t &v2, const float scale)
+{ return vcvt_u32_f32(vmul_n_f32(vmul_f32(vcvt_f32_u32(v1), vcvt_f32_u32(v2)), scale)); }
+
+
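+// NEON shift intrinsics such as vshrq_n_u16() require the shift amount to be a
+// compile-time immediate, so it cannot be passed as an ordinary run-time
+// argument. The thin template <int n> wrappers below lift the immediate into a
+// template parameter, which lets mulShift stay generic over the element type
+// and the shift while each instantiation still resolves to a single
+// shift-by-immediate instruction.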
+template <int n> inline uint8x16_t vshrq_n(const uint8x16_t & v0) { return vshrq_n_u8 (v0, n); }
+template <int n> inline int8x16_t  vshrq_n(const int8x16_t & v0)  { return vshrq_n_s8 (v0, n); }
+template <int n> inline uint16x8_t vshrq_n(const uint16x8_t & v0) { return vshrq_n_u16(v0, n); }
+template <int n> inline int16x8_t  vshrq_n(const int16x8_t & v0)  { return vshrq_n_s16(v0, n); }
+template <int n> inline uint32x4_t vshrq_n(const uint32x4_t & v0) { return vshrq_n_u32(v0, n); }
+template <int n> inline int32x4_t  vshrq_n(const int32x4_t & v0)  { return vshrq_n_s32(v0, n); }
+template <int n> inline uint64x2_t vshrq_n(const uint64x2_t & v0) { return vshrq_n_u64(v0, n); }
+template <int n> inline int64x2_t  vshrq_n(const int64x2_t & v0)  { return vshrq_n_s64(v0, n); }
+
+template <int n> inline uint8x8_t  vshr_n(const uint8x8_t & v0)  { return vshr_n_u8 (v0, n); }
+template <int n> inline int8x8_t   vshr_n(const int8x8_t & v0)   { return vshr_n_s8 (v0, n); }
+template <int n> inline uint16x4_t vshr_n(const uint16x4_t & v0) { return vshr_n_u16(v0, n); }
+template <int n> inline int16x4_t  vshr_n(const int16x4_t & v0)  { return vshr_n_s16(v0, n); }
+template <int n> inline uint32x2_t vshr_n(const uint32x2_t & v0) { return vshr_n_u32(v0, n); }
+template <int n> inline int32x2_t  vshr_n(const int32x2_t & v0)  { return vshr_n_s32(v0, n); }
+template <int n> inline uint64x1_t vshr_n(const uint64x1_t & v0) { return vshr_n_u64(v0, n); }
+template <int n> inline int64x1_t  vshr_n(const int64x1_t & v0)  { return vshr_n_s64(v0, n); }
+
+template <int n> inline uint8x16_t vrshrq_n(const uint8x16_t & v0) { return vrshrq_n_u8 (v0, n); }
+template <int n> inline int8x16_t  vrshrq_n(const int8x16_t & v0)  { return vrshrq_n_s8 (v0, n); }
+template <int n> inline uint16x8_t vrshrq_n(const uint16x8_t & v0) { return vrshrq_n_u16(v0, n); }
+template <int n> inline int16x8_t  vrshrq_n(const int16x8_t & v0)  { return vrshrq_n_s16(v0, n); }
+template <int n> inline uint32x4_t vrshrq_n(const uint32x4_t & v0) { return vrshrq_n_u32(v0, n); }
+template <int n> inline int32x4_t  vrshrq_n(const int32x4_t & v0)  { return vrshrq_n_s32(v0, n); }
+template <int n> inline uint64x2_t vrshrq_n(const uint64x2_t & v0) { return vrshrq_n_u64(v0, n); }
+template <int n> inline int64x2_t  vrshrq_n(const int64x2_t & v0)  { return vrshrq_n_s64(v0, n); }
+
+template <int n> inline uint8x8_t  vrshr_n(const uint8x8_t & v0)  { return vrshr_n_u8 (v0, n); }
+template <int n> inline int8x8_t   vrshr_n(const int8x8_t & v0)   { return vrshr_n_s8 (v0, n); }
+template <int n> inline uint16x4_t vrshr_n(const uint16x4_t & v0) { return vrshr_n_u16(v0, n); }
+template <int n> inline int16x4_t  vrshr_n(const int16x4_t & v0)  { return vrshr_n_s16(v0, n); }
+template <int n> inline uint32x2_t vrshr_n(const uint32x2_t & v0) { return vrshr_n_u32(v0, n); }
+template <int n> inline int32x2_t  vrshr_n(const int32x2_t & v0)  { return vrshr_n_s32(v0, n); }
+template <int n> inline uint64x1_t vrshr_n(const uint64x1_t & v0) { return vrshr_n_u64(v0, n); }
+template <int n> inline int64x1_t  vrshr_n(const int64x1_t & v0)  { return vrshr_n_s64(v0, n); }
+
+template <typename T, typename WT, int shift>
+void mulShift(const Size2D &size,
+              const T * src0Base, ptrdiff_t src0Stride,
+              const T * src1Base, ptrdiff_t src1Stride,
+              T * dstBase, ptrdiff_t dstStride,
+              CONVERT_POLICY cpolicy)
+{
+    typedef typename internal::VecTraits<T>::vec128 vec128;
+    typedef typename internal::VecTraits<WT>::vec128 wvec128;
+    typedef typename internal::VecTraits<T>::vec64 vec64;
+    const size_t step128 = 16 / sizeof(T);
+    size_t roiw128 = size.width >= (step128 - 1) ? size.width - step128 + 1 : 0;
+    const size_t step64 = 8 / sizeof(T);
+    size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0;
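+    // The mask / bic / subtract sequence below amounts to round-half-to-even:
+    // vbicq(v_mask, v) extracts bit <shift> of ~v, i.e. it is non-zero exactly
+    // when the truncated result would be even; that bit is shifted down and
+    // subtracted from the product before vrshrq_n adds 2^(shift-1) and shifts.
+    // Ties therefore round down onto even results and up from odd ones: with
+    // shift == 1, both 5 (2.5) and 7 (3.5) end up at the even values 2 and 4.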
+
+    wvec128 v_mask = internal::vdupq_n((WT)(1<<shift));
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const T * src0 = internal::getRowPtr(src0Base, src0Stride, i);
+        const T * src1 = internal::getRowPtr(src1Base, src1Stride, i);
+        T * dst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t j = 0;
+
+        if (cpolicy == CONVERT_POLICY_SATURATE)
+        {
+            for (; j < roiw128; j += step128)
+            {
+                internal::prefetch(src0 + j);
+                internal::prefetch(src1 + j);
+                vec128 v_src0 = internal::vld1q(src0 + j), v_src1 = internal::vld1q(src1 + j);
+                wvec128 v_mul0 = internal::vmull( internal::vget_low(v_src0),  internal::vget_low(v_src1));
+                wvec128 v_mul1 = internal::vmull(internal::vget_high(v_src0), internal::vget_high(v_src1));
+
+                vec64 v_res0 = internal::vqmovn(vrshrq_n<shift>(internal::vqsubq(v_mul0, vshrq_n<shift>(internal::vbicq(v_mask, v_mul0)) )));
+                vec64 v_res1 = internal::vqmovn(vrshrq_n<shift>(internal::vqsubq(v_mul1, vshrq_n<shift>(internal::vbicq(v_mask, v_mul1)) )));
+
+                internal::vst1q(dst + j, internal::vcombine(v_res0, v_res1));
+            }
+            for (; j < roiw64; j += step64)
+            {
+                wvec128 v_mul = internal::vmull(internal::vld1(src0 + j), internal::vld1(src1 + j));
+                vec64 v_res = internal::vqmovn(vrshrq_n<shift>(internal::vqsubq(v_mul, vshrq_n<shift>(internal::vbicq(v_mask, v_mul)) )));
+                internal::vst1(dst + j, v_res);
+            }
+
+            for (; j < size.width; j++)
+            {
+                WT val = (WT)src0[j] * (WT)src1[j];
+                dst[j] = internal::saturate_cast<T>((val - (((1<<shift) & ~val) >> shift) + (1<<(shift-1))) >> shift);
+            }
+        }
+        else // CONVERT_POLICY_WRAP
+        {
+            for (; j < roiw128; j += step128)
+            {
+                internal::prefetch(src0 + j);
+                internal::prefetch(src1 + j);
+                vec128 v_src0 = internal::vld1q(src0 + j), v_src1 = internal::vld1q(src1 + j);
+                wvec128 v_mul0 = internal::vmull( internal::vget_low(v_src0),  internal::vget_low(v_src1));
+                wvec128 v_mul1 = internal::vmull(internal::vget_high(v_src0), internal::vget_high(v_src1));
+
+                vec64 v_res0 = internal::vmovn(vrshrq_n<shift>(internal::vqsubq(v_mul0, vshrq_n<shift>(internal::vbicq(v_mask, v_mul0)) )));
+                vec64 v_res1 = internal::vmovn(vrshrq_n<shift>(internal::vqsubq(v_mul1, vshrq_n<shift>(internal::vbicq(v_mask, v_mul1)) )));
+
+                internal::vst1q(dst + j, internal::vcombine(v_res0, v_res1));
+            }
+            for (; j < roiw64; j += step64)
+            {
+                wvec128 v_mul = internal::vmull(internal::vld1(src0 + j), internal::vld1(src1 + j));
+                vec64 v_res = internal::vmovn(vrshrq_n<shift>(internal::vqsubq(v_mul, vshrq_n<shift>(internal::vbicq(v_mask, v_mul)) )));
+                internal::vst1(dst + j, v_res);
+            }
+
+            for (; j < size.width; j++)
+            {
+                WT val = (WT)src0[j] * (WT)src1[j];
+                dst[j] = (T)((val - (((1<<shift) & ~val) >> shift) + (1<<(shift-1))) >> shift);
+            }
+        }
+    }
+}
+#endif
+
+template <typename T, typename WT>
+void mul(const Size2D &size,
+         const T * src0Base, ptrdiff_t src0Stride,
+         const T * src1Base, ptrdiff_t src1Stride,
+         T * dstBase, ptrdiff_t dstStride,
+         f32 scale,
+         CONVERT_POLICY cpolicy)
+{
+    internal::assertSupportedConfiguration();
+
+#ifdef CAROTENE_NEON
+    typedef typename internal::VecTraits<T>::vec128 vec128;
+
+    typedef void (* mulFunc)(const Size2D &size,
+                             const T * src0Base, ptrdiff_t src0Stride,
+                             const T * src1Base, ptrdiff_t src1Stride,
+                             T * dstBase, ptrdiff_t dstStride,
+                             CONVERT_POLICY cpolicy);
+
+    if (scale == 0.0f ||
+        (std::numeric_limits<T>::is_integer &&
+         (scale * std::numeric_limits<T>::max() * std::numeric_limits<T>::max()) < 1.0f &&
+         (scale * std::numeric_limits<T>::max() * std::numeric_limits<T>::max()) > -1.0f))
+    {
+        for (size_t y = 0; y < size.height; ++y)
+        {
+            T * dst = internal::getRowPtr(dstBase, dstStride, y);
+            std::memset(dst, 0, sizeof(T) * size.width);
+        }
+        return;
+    }
+
+    s32 iscale = static_cast<s32>(scale), exp = 0;
+    f32 significand = frexp(scale, &exp);
+    bool is_integer_scale = isIntegerScale(scale),
+         is_power_of_2 = (significand == 0.5f) && (exp <= 0);
+    exp = -exp + 1;
+
+    if (is_power_of_2)
+    {
+        static const mulFunc funcs[16] =
+        {
+            NULL,
+            mulShift<T, WT, 1>,
+            mulShift<T, WT, 2>,
+            mulShift<T, WT, 3>,
+            mulShift<T, WT, 4>,
+            mulShift<T, WT, 5>,
+            mulShift<T, WT, 6>,
+            mulShift<T, WT, 7>,
+            mulShift<T, WT, 8>,
+            mulShift<T, WT, 9>,
+            mulShift<T, WT, 10>,
+            mulShift<T, WT, 11>,
+            mulShift<T, WT, 12>,
+            mulShift<T, WT, 13>,
+            mulShift<T, WT, 14>,
+            mulShift<T, WT, 15>
+        };
+
+        mulFunc func = funcs[exp];
+
+        func(size,
+             src0Base, src0Stride,
+             src1Base, src1Stride,
+             dstBase, dstStride,
+             cpolicy);
+
+        return;
+    }
+
+    const size_t step128 = 16 / sizeof(T);
+    size_t roiw128 = size.width >= (step128 - 1) ?
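+                          // (as in the u8 kernels, the memset early-out above
+                          // covers every integer case where |scale| * max * max
+                          // falls below 1, since each scaled product would then
+                          // truncate to zero anyway)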
size.width - step128 + 1 : 0; + const size_t step64 = 8 / sizeof(T); + size_t roiw64 = size.width >= (step64 - 1) ? size.width - step64 + 1 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const T * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const T * src1 = internal::getRowPtr(src1Base, src1Stride, i); + T * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw128; j += step128) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + vec128 v_src0 = internal::vld1q(src0 + j), v_src1 = internal::vld1q(src1 + j); + internal::vst1q(dst + j, internal::vcombine( + internal::vqmovn(internal::vmull(internal::vget_low(v_src0), + internal::vget_low(v_src1))), + internal::vqmovn(internal::vmull(internal::vget_high(v_src0), + internal::vget_high(v_src1))) + ) + ); + } + for (; j < roiw64; j += step64) + { + internal::vst1(dst + j, internal::vqmovn(internal::vmull(internal::vld1(src0 + j), + internal::vld1(src1 + j)))); + } + + for (; j < size.width; j++) + { + WT val = (WT)src0[j] * (WT)src1[j]; + dst[j] = internal::saturate_cast(val); + } + } + else // generic case using floats + { + for (; j < roiw128; j += step128) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + internal::vst1q(dst + j, mulSaturateQ(internal::vld1q(src0 + j), + internal::vld1q(src1 + j), scale)); + } + for (; j < roiw64; j += step64) + { + internal::vst1(dst + j, mulSaturate(internal::vld1(src0 + j), + internal::vld1(src1 + j), scale)); + } + + for (; j < size.width; j++) + { + f32 fval = (f32)src0[j] * (f32)src1[j] * scale; + dst[j] = internal::saturate_cast(fval); + } + } + } + else // CONVERT_POLICY_WRAP + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw128; j += step128) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + vec128 v_src0 = internal::vld1q(src0 + j), v_src1 = internal::vld1q(src1 + j); + internal::vst1q(dst + j, internal::vcombine( + internal::vmovn(internal::vmull(internal::vget_low(v_src0), + internal::vget_low(v_src1))), + internal::vmovn(internal::vmull(internal::vget_high(v_src0), + internal::vget_high(v_src1))) + ) + ); + } + for (; j < roiw64; j += step64) + { + internal::vst1(dst + j, internal::vmovn(internal::vmull(internal::vld1(src0 + j), + internal::vld1(src1 + j)))); + } + + for (; j < size.width; j++) + { + WT val = (WT)src0[j] * (WT)src1[j]; + dst[j] = (T)(val); + } + } + else // generic case using floats + { + for (; j < roiw128; j += step128) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + internal::vst1q(dst + j, mulWrapQ(internal::vld1q(src0 + j), + internal::vld1q(src1 + j), scale)); + } + for (; j < roiw64; j += step64) + { + internal::vst1(dst + j, mulWrap(internal::vld1(src0 + j), + internal::vld1(src1 + j), scale)); + } + + for (; j < size.width; j++) + { + f32 fval = (f32)src0[j] * (f32)src1[j] * scale; + dst[j] = (T)((s32)trunc(fval)); + } + } + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)cpolicy; + (void)scale; +#endif +} + +} + +void mul(const Size2D &size, + const s8 * src0Base, ptrdiff_t src0Stride, + const s8 * src1Base, ptrdiff_t src1Stride, + s8 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + mul(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy); +} + +void mul(const Size2D &size, + const 
u16 * src0Base, ptrdiff_t src0Stride, + const u16 * src1Base, ptrdiff_t src1Stride, + u16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + mul(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy); +} + +void mul(const Size2D &size, + const s16 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + s16 * dstBase, ptrdiff_t dstStride, + f32 scale, + CONVERT_POLICY cpolicy) +{ + mul(size, src0Base, src0Stride, src1Base, src1Stride, dstBase, dstStride, scale, cpolicy); +} + +void mul(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride, + f64 scale, + CONVERT_POLICY cpolicy) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + typedef void (* mulFunc)(const Size2D &size, + const s32 * src0Base, ptrdiff_t src0Stride, + const s32 * src1Base, ptrdiff_t src1Stride, + s32 * dstBase, ptrdiff_t dstStride, + CONVERT_POLICY cpolicy); + + if (!std::isnormal(scale) || + ((scale * std::numeric_limits::max() * std::numeric_limits::max()) < 1.0f && + (scale * std::numeric_limits::max() * std::numeric_limits::max()) > -1.0f)) + { + for (size_t y = 0; y < size.height; ++y) + { + s32 * dst = internal::getRowPtr(dstBase, dstStride, y); + std::memset(dst, 0, sizeof(s32) * size.width); + } + return; + } + + s32 iscale = static_cast(scale), exp = 0; + f64 significand = frexp(scale, &exp); + bool is_integer_scale = isIntegerScale(scale), + is_power_of_2 = (significand == 0.5) && (exp <= 0); + exp = -exp + 1; + + if (is_power_of_2) + { + static const mulFunc funcs[16] = + { + NULL, + mulShift, + mulShift, + mulShift, + mulShift, + mulShift, + mulShift, + mulShift, + mulShift, + mulShift, + mulShift, + mulShift, + mulShift, + mulShift, + mulShift, + mulShift + }; + + mulFunc func = funcs[exp]; + + func(size, + src0Base, src0Stride, + src1Base, src1Stride, + dstBase, dstStride, + cpolicy); + + return; + } + + size_t roiw128 = size.width >= 3 ? size.width - 3 : 0; + size_t roiw64 = size.width >= 1 ? 
size.width - 1 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s32 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const s32 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + s32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + if (cpolicy == CONVERT_POLICY_SATURATE) + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw128; j += 4) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + int32x4_t v_src0 = internal::vld1q(src0 + j), v_src1 = internal::vld1q(src1 + j); + internal::vst1q(dst + j, internal::vcombine( + internal::vqmovn(internal::vmull(internal::vget_low(v_src0), + internal::vget_low(v_src1))), + internal::vqmovn(internal::vmull(internal::vget_high(v_src0), + internal::vget_high(v_src1))) + ) + ); + } + for (; j < roiw64; j += 2) + { + internal::vst1(dst + j, internal::vqmovn(internal::vmull(internal::vld1(src0 + j), + internal::vld1(src1 + j)))); + } + + for (; j < size.width; j++) + { + s64 val = (s64)src0[j] * (s64)src1[j]; + dst[j] = internal::saturate_cast<s32>(val); + } + } + else // generic case using floats + { + for (; j < size.width; j++) + { + f64 fval = src0[j] * src1[j] * scale; + dst[j] = internal::saturate_cast<s32>(fval); + } + } + } + else // CONVERT_POLICY_WRAP + { + if (is_integer_scale && iscale == 1) + { + for (; j < roiw128; j += 4) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + int32x4_t v_src0 = internal::vld1q(src0 + j), v_src1 = internal::vld1q(src1 + j); + internal::vst1q(dst + j, internal::vcombine( + internal::vmovn(internal::vmull(internal::vget_low(v_src0), + internal::vget_low(v_src1))), + internal::vmovn(internal::vmull(internal::vget_high(v_src0), + internal::vget_high(v_src1))) + ) + ); + } + for (; j < roiw64; j += 2) + { + internal::vst1(dst + j, internal::vmovn(internal::vmull(internal::vld1(src0 + j), + internal::vld1(src1 + j)))); + } + + for (; j < size.width; j++) + { + s64 val = (s64)src0[j] * (s64)src1[j]; + dst[j] = (s32)(val); + } + } + else // generic case using floats + { + for (; j < size.width; j++) + { + f64 fval = src0[j] * src1[j] * scale; + dst[j] = (s32)trunc(fval); + } + } + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)cpolicy; + (void)scale; +#endif +} + +void mul(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride, + f32 scale) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + if (scale == 0.0f) + { + for (size_t y = 0; y < size.height; ++y) + { + f32 * dst = internal::getRowPtr(dstBase, dstStride, y); + std::memset(dst, 0, sizeof(f32) * size.width); + } + return; + } + + size_t roiw128 = size.width >= 3 ? size.width - 3 : 0; + size_t roiw64 = size.width >= 1 ?
size.width - 1 : 0; + + if (std::fabs(scale - 1.0f) < FLT_EPSILON) + { + for (size_t i = 0; i < size.height; ++i) + { + const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + f32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw128; j += 4) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + vst1q_f32(dst + j, vmulq_f32(vld1q_f32(src0 + j), vld1q_f32(src1 + j))); + } + + for (; j < roiw64; j += 2) + { + vst1_f32(dst + j, vmul_f32(vld1_f32(src0 + j), vld1_f32(src1 + j))); + } + + for (; j < size.width; j++) + { + dst[j] = src0[j] * src1[j]; + } + } + } + else + { + for (size_t i = 0; i < size.height; ++i) + { + const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + f32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw128; j += 4) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + vst1q_f32(dst + j, vmulq_n_f32(vmulq_f32(vld1q_f32(src0 + j), vld1q_f32(src1 + j)), scale)); + } + + for (; j < roiw64; j += 2) + { + vst1_f32(dst + j, vmul_n_f32(vmul_f32(vld1_f32(src0 + j), vld1_f32(src1 + j)), scale)); + } + + for (; j < size.width; j++) + { + dst[j] = src0[j] * src1[j] * scale; + } + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)scale; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/norm.cpp b/3rdparty/carotene/src/norm.cpp new file mode 100644 index 0000000000..6ff2456597 --- /dev/null +++ b/3rdparty/carotene/src/norm.cpp @@ -0,0 +1,1310 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
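An aside on the mul() kernels that close above: every variant dispatches on two axes, the conversion policy (saturate vs. wrap) and the shape of the scale factor (an identity integer scale takes the pure widen-multiply-narrow NEON path, a power-of-two scale is routed to a mulShift specialization, everything else goes through floats). Below is a minimal portable sketch of the per-element contract for u8; it is an illustration, not library code, and it assumes only what the scalar tails show: saturate clamps like vqmovn, wrap keeps the low bits like vmovn, and the float path truncates toward zero on wrap while saturate_cast is assumed to round to nearest, the usual OpenCV convention.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

enum ConvertPolicy { SATURATE, WRAP };

// Hypothetical scalar model of one mul() element for u8; not library code.
static uint8_t mulOne(uint8_t a, uint8_t b, float scale, ConvertPolicy policy)
{
    if (scale == 1.0f)                          // integer fast path
    {
        int32_t val = int32_t(a) * int32_t(b);  // widening multiply
        return policy == SATURATE
             ? uint8_t(std::min(val, 255))      // vqmovn-style clamp
             : uint8_t(val);                    // vmovn-style wrap (low 8 bits)
    }
    float fval = float(a) * float(b) * scale;   // generic path through floats
    if (policy == SATURATE)                     // assumes round-to-nearest clamp
        return uint8_t(std::clamp(std::lround(fval), 0L, 255L));
    return uint8_t(int32_t(std::trunc(fval)));  // truncate, then wrap
}

int main()
{
    std::cout << int(mulOne(200, 2, 1.0f, SATURATE)) << "\n"; // 255
    std::cout << int(mulOne(200, 2, 1.0f, WRAP))     << "\n"; // 144 (400 mod 256)
    std::cout << int(mulOne(200, 2, 0.5f, SATURATE)) << "\n"; // 200
}
```

The NEON loops above are simply the 16- and 8-lane unrollings of this scalar tail.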
+ * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +namespace CAROTENE_NS { + +//magic number; must be multiple of 4 +#define NORM32F_BLOCK_SIZE 2048 + +s32 normInf(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u8* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + if (size.width >= 16) + { + uint8x16_t s = vld1q_u8(src); + for (i = 16; i <= size.width - 16; i += 16) + { + internal::prefetch(src + i); + uint8x16_t s1 = vld1q_u8(src + i); + s = vmaxq_u8(s1, s); + } + u8 s2[8]; + uint8x8_t s3 = vmax_u8(vget_low_u8(s), vget_high_u8(s)); + vst1_u8(s2, s3); + for (u32 j = 0; j < 8; j++) + result = std::max((s32)(s2[j]), result); + } + for ( ; i < size.width; i++) + result = std::max((s32)(src[i]), result); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 normInf(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s8* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + if (size.width >= 16) + { + uint8x16_t s = vreinterpretq_u8_s8(vabsq_s8(vld1q_s8(src))); + for (i = 16; i <= size.width - 16; i += 16) + { + internal::prefetch(src + i); + uint8x16_t s1 = vreinterpretq_u8_s8(vabsq_s8(vld1q_s8(src + i))); + s = vmaxq_u8(s1, s); + } + u8 s2[8]; + uint8x8_t s3 = vmax_u8(vget_low_u8(s), vget_high_u8(s)); + vst1_u8(s2, s3); + for (u32 j = 0; j < 8; j++) + result = std::max((s32)(s2[j]), result); + } + for ( ; i < size.width; i++) + result = std::max((s32)(std::abs(src[i])), result); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 normInf(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u16* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + if (size.width >= 8) + { + uint16x8_t s = vld1q_u16(src); + for (i = 8; i <= size.width - 8; i += 8) + { + internal::prefetch(src + i); + uint16x8_t s1 = vld1q_u16(src + i); + s = vmaxq_u16(s1, s); + } + u16 s2[4]; + uint16x4_t s3 = vmax_u16(vget_low_u16(s), vget_high_u16(s)); + vst1_u16(s2, s3); + for (u32 j = 0; j < 4; j++) + result = std::max((s32)(s2[j]), result); + } + for ( ; i < size.width; i++) + result = std::max((s32)(src[i]), result); + } + return 
result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 normInf(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s16* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + if (size.width >= 8) + { + uint16x8_t s = vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(src))); + for (i = 8; i <= size.width - 8; i += 8) + { + internal::prefetch(src + i); + uint16x8_t s1 = vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(src + i))); + s = vmaxq_u16(s1, s); + } + u16 s2[4]; + uint16x4_t s3 = vmax_u16(vget_low_u16(s), vget_high_u16(s)); + vst1_u16(s2, s3); + for (u32 j = 0; j < 4; j++) + result = std::max((s32)(s2[j]), result); + } + for ( ; i < size.width; i++) + result = std::max(std::abs((s32)(src[i])), result); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 normInf(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s32* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + if (size.width >= 4) + { + uint32x4_t s = vreinterpretq_u32_s32(vabsq_s32(vld1q_s32(src))); + for (i = 4; i <= size.width - 4; i += 4) + { + internal::prefetch(src + i); + uint32x4_t s1 = vreinterpretq_u32_s32(vabsq_s32(vld1q_s32(src + i))); + s = vmaxq_u32(s1, s); + } + u32 s2[2]; + uint32x2_t s3 = vmax_u32(vget_low_u32(s), vget_high_u32(s)); + vst1_u32(s2, s3); + for (u32 j = 0; j < 2; j++) + result = std::max((s32)(s2[j]), result); + } + for ( ; i < size.width; i++) + result = std::max((s32)(std::abs(src[i])), result); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +f32 normInf(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + f32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const f32* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + if (size.width >= 4) + { + float32x4_t s = vabsq_f32(vld1q_f32(src)); + for (i = 4; i <= size.width - 4; i += 4 ) + { + internal::prefetch(src + i); + float32x4_t s1 = vld1q_f32(src + i); + float32x4_t sa = vabsq_f32(s1); + s = vmaxq_f32(sa, s); + } + f32 s2[2]; + float32x2_t s3 = vmax_f32(vget_low_f32(s), vget_high_f32(s)); + vst1_f32(s2, s3); + for (u32 j = 0; j < 2; j++) + result = std::max(s2[j], result); + } + for (; i < size.width; i++) + result = std::max(std::abs(src[i]), result); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0.; +#endif +} + +s32 normL1(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u8* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + uint32x4_t vs = vmovq_n_u32(0); + for (; i < roiw8;) + { + size_t limit = std::min(size.width, i + 256) - 8; + uint8x8_t s0 = vld1_u8(src + i); + uint16x8_t s = vmovl_u8(s0); + + for (i += 8; i <= limit; i += 8) + { + internal::prefetch(src + i); + uint8x8_t s1 = vld1_u8(src + i); + s = vaddw_u8(s, s1); + } + + uint16x4_t s4 = vadd_u16(vget_low_u16(s), vget_high_u16(s)); + vs = vaddw_u16(vs, s4); + } + + u32 s2[2]; + uint32x2_t vs2 = vadd_u32(vget_low_u32(vs), vget_high_u32(vs)); + vst1_u32(s2, vs2); + + result += (s32)(s2[0] + s2[1]); + + for ( ; i < size.width; i++) + result += (s32)(src[i]); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 normL1(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s8* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + uint32x4_t vs = vmovq_n_u32(0); + + for (; i < roiw8;) + { + size_t limit = std::min(size.width, i + 256) - 8; + uint8x8_t s0 = vreinterpret_u8_s8(vabs_s8(vld1_s8(src + i))); + uint16x8_t s = vmovl_u8(s0); + + for (i += 8; i <= limit; i += 8) + { + internal::prefetch(src + i); + uint8x8_t s1 = vreinterpret_u8_s8(vabs_s8(vld1_s8(src + i))); + s = vaddw_u8(s, s1); + } + + uint16x4_t s4 = vadd_u16(vget_low_u16(s), vget_high_u16(s)); + vs = vaddw_u16(vs, s4); + } + + u32 s2[2]; + uint32x2_t vs2 = vadd_u32(vget_low_u32(vs), vget_high_u32(vs)); + vst1_u32(s2, vs2); + + result += (s32)(s2[0] + s2[1]); + + for ( ; i < size.width; i++) + result += (s32)(std::abs(src[i])); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 normL1(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width >= 3 ? size.width - 3 : 0; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u16* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + uint32x4_t vs = vmovq_n_u32(0); + for (; i < roiw4; i += 4) + { + internal::prefetch(src + i); + uint16x4_t s = vld1_u16(src + i); + vs = vaddw_u16(vs, s); + } + u32 s2[4]; + vst1q_u32(s2, vs); + for (u32 j = 0; j < 4; j++) + result += s2[j]; + for ( ; i < size.width; i++) + result += (s32)(src[i]); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 normL1(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width >= 3 ? 
size.width - 3 : 0; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s16* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + uint32x4_t vs = vmovq_n_u32(0); + for (; i < roiw4; i += 4) + { + internal::prefetch(src + i); + uint16x4_t s = vreinterpret_u16_s16(vabs_s16(vld1_s16(src + i))); + vs = vaddw_u16(vs, s); + } + u32 s2[4]; + vst1q_u32(s2, vs); + for (u32 j = 0; j < 4; j++) + result += s2[j]; + for ( ; i < size.width; i++) + result += (s32)(std::abs(src[i])); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +f64 normL1(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width >= 3 ? size.width - 3 : 0; + f64 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s32* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + for (; i < roiw4;) + { + size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4; + float32x4_t s = vcvtq_f32_s32(vabsq_s32(vld1q_s32(src + i))); + for (i += 4; i <= limit; i += 4 ) + { + internal::prefetch(src + i); + float32x4_t s1 = vcvtq_f32_s32(vabsq_s32(vld1q_s32(src + i))); + s = vaddq_f32(s, s1); + } + + f32 s2[4]; + vst1q_f32(s2, s); + + for (u32 j = 0; j < 4; j++) + result += (f64)(s2[j]); + } + for ( ; i < size.width; i++) + result += (f64)(std::abs(src[i])); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0.; +#endif +} + +f64 normL1(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width >= 3 ? size.width - 3 : 0; + f64 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const f32* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + + for (; i < roiw4;) + { + size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4; + float32x4_t s = vabsq_f32(vld1q_f32(src + i)); + for (i += 4; i <= limit; i += 4) + { + internal::prefetch(src + i); + float32x4_t s1 = vld1q_f32(src + i); + float32x4_t sa = vabsq_f32(s1); + s = vaddq_f32(sa, s); + } + + f32 s2[4]; + vst1q_f32(s2, s); + + for (u32 j = 0; j < 4; j++) + result += (f64)(s2[j]); + } + for (; i < size.width; i++) + result += std::abs((f64)(src[i])); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0.; +#endif +} + +s32 normL2(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u8* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + + uint32x4_t sl = vmovq_n_u32(0); + uint32x4_t sh = vmovq_n_u32(0); + + for (; i < roiw8; i += 8) + { + internal::prefetch(src + i); + uint8x8_t s1 = vld1_u8(src + i); + uint16x8_t sq = vmull_u8(s1, s1); + + sl = vaddw_u16(sl, vget_low_u16(sq)); + sh = vaddw_u16(sh, vget_high_u16(sq)); + } + + uint32x4_t s = vaddq_u32(sl, sh); + uint32x2_t ss = vadd_u32(vget_low_u32(s), vget_high_u32(s)); + + u32 s2[2]; + vst1_u32(s2, ss); + + result += (s32)(s2[0] + s2[1]); + + for (; i < size.width; i++) + result += (s32)(src[i]) * (s32)(src[i]); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +s32 normL2(const Size2D &_size, + const s8 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s8* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + + int32x4_t sl = vmovq_n_s32(0); + int32x4_t sh = vmovq_n_s32(0); + + for (; i < roiw8; i += 8) + { + internal::prefetch(src + i); + int8x8_t s1 = vld1_s8(src + i); + int16x8_t sq = vmull_s8(s1, s1); + + sl = vaddw_s16(sl, vget_low_s16(sq)); + sh = vaddw_s16(sh, vget_high_s16(sq)); + } + + int32x4_t s = vaddq_s32(sl, sh); + int32x2_t ss = vadd_s32(vget_low_s32(s), vget_high_s32(s)); + + s32 s2[2]; + vst1_s32(s2, ss); + + result += s2[0] + s2[1]; + + for (; i < size.width; i++) + result += (s32)(src[i]) * (s32)(src[i]); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0; +#endif +} + +f64 normL2(const Size2D &_size, + const u16 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width >= 3 ? size.width - 3 : 0; + f64 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u16* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + for (; i < roiw4;) + { + size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4; + uint16x4_t s0 = vld1_u16(src+i); + float32x4_t s = vcvtq_f32_u32(vmull_u16(s0,s0)); + for (i += 4; i <= limit; i += 4 ) + { + internal::prefetch(src + i); + uint16x4_t s1 = vld1_u16(src+i); + float32x4_t sq = vcvtq_f32_u32(vmull_u16(s1, s1)); + s = vaddq_f32(s, sq); + } + f32 s2[4]; + vst1q_f32(s2, s); + for (u32 j = 0; j < 4; j++) + result += (f64)(s2[j]); + } + + for ( ; i < size.width; i++) + result += (f64)(src[i]) * (f64)(src[i]); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0.; +#endif +} + +f64 normL2(const Size2D &_size, + const s16 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width >= 3 ? 
size.width - 3 : 0; + f64 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s16* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + for (; i < roiw4;) + { + size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4; + int16x4_t s0 = vld1_s16(src+i); + float32x4_t s = vcvtq_f32_s32(vmull_s16(s0,s0)); + for (i += 4; i <= limit; i += 4 ) + { + internal::prefetch(src + i); + int16x4_t s1 = vld1_s16(src+i); + float32x4_t sq = vcvtq_f32_s32(vmull_s16(s1, s1)); + s = vaddq_f32(s, sq); + } + f32 s2[4]; + vst1q_f32(s2, s); + for (u32 j = 0; j < 4; j++) + result += (f64)(s2[j]); + } + + for ( ; i < size.width; i++) + result += (f64)(src[i]) * (f64)(src[i]); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0.; +#endif +} + +f64 normL2(const Size2D &_size, + const s32 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width >= 3 ? size.width - 3 : 0; + f64 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const s32* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + for (; i < roiw4;) + { + size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4; + float32x4_t s = vcvtq_f32_s32(vld1q_s32(src + i)); + s = vmulq_f32(s, s); + for (i += 4; i <= limit; i += 4 ) + { + internal::prefetch(src + i); + float32x4_t s1 = vcvtq_f32_s32(vld1q_s32(src + i)); + s = vmlaq_f32(s, s1, s1); + } + + f32 s2[4]; + vst1q_f32(s2, s); + + for (u32 j = 0; j < 4; j++) + result += (f64)(s2[j]); + } + for ( ; i < size.width; i++) + result += (f64)(src[i]) * (f64)(src[i]); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0.; +#endif +} + +f64 normL2(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + size_t roiw4 = size.width >= 3 ? 
size.width - 3 : 0; + f64 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const f32* src = internal::getRowPtr( srcBase, srcStride, k); + size_t i = 0; + for (; i < roiw4;) + { + size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4; + float32x4_t s = vld1q_f32(src + i); + s = vmulq_f32(s, s); + for (i += 4; i <= limit; i += 4 ) + { + internal::prefetch(src + i); + float32x4_t s1 = vld1q_f32(src + i); + s = vmlaq_f32(s, s1, s1); + } + + f32 s2[4]; + vst1q_f32(s2, s); + + for (u32 j = 0; j < 4; j++) + result += (f64)(s2[j]); + } + for ( ; i < size.width; i++) + result += (f64)(src[i]) * (f64)(src[i]); + } + return result; +#else + (void)_size; + (void)srcBase; + (void)srcStride; + + return 0.; +#endif +} + +s32 diffNormInf(const Size2D &_size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == src1Stride && + src0Stride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u8* src1 = internal::getRowPtr( src0Base, src0Stride, k); + const u8* src2 = internal::getRowPtr( src1Base, src1Stride, k); + size_t i = 0; + + if (size.width >= 16) + { + uint8x16_t vs3 = vdupq_n_u8(0); + for (; i < size.width - 16; i += 16) + { + internal::prefetch(src1 + i); + internal::prefetch(src2 + i); + + uint8x16_t vs1 = vld1q_u8(src1 + i); + uint8x16_t vs2 = vld1q_u8(src2 + i); + + vs3 = vmaxq_u8(vs3, vabdq_u8(vs1, vs2)); + } + + u8 s2[8]; + vst1_u8(s2, vpmax_u8(vget_low_u8(vs3), vget_high_u8(vs3))); + + for (u32 j = 0; j < 8; j++) + result = std::max((s32)(s2[j]), result); + } + + for (; i < size.width; i++) + { + result = std::max(std::abs((s32)(src1[i]) - (s32)(src2[i])), result); + } + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0; +#endif +} + +f32 diffNormInf(const Size2D &_size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == src1Stride && + src0Stride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + f32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const f32* src1 = internal::getRowPtr( src0Base, src0Stride, k); + const f32* src2 = internal::getRowPtr( src1Base, src1Stride, k); + size_t i = 0; + + if (size.width >= 4) + { + float32x4_t s = vabdq_f32(vld1q_f32(src1), vld1q_f32(src2)); + + for (i += 4; i <= size.width - 4; i += 4 ) + { + internal::prefetch(src1 + i); + internal::prefetch(src2 + i); + + float32x4_t vs1 = vld1q_f32(src1 + i); + float32x4_t vs2 = vld1q_f32(src2 + i); + + float32x4_t vd = vabdq_f32(vs2, vs1); + s = vmaxq_f32(s, vd); + } + + f32 s2[4]; + vst1q_f32(s2, s); + + for (u32 j = 0; j < 4; j++) + if (s2[j] > result) + result = s2[j]; + } + + for (; i < size.width; i++) + { + f32 v = std::abs(src1[i] - src2[i]); + if (v > result) + result = v; + } + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0.; +#endif +} + +s32 diffNormL1(const Size2D &_size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == 
src1Stride && + src0Stride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u8* src1 = internal::getRowPtr( src0Base, src0Stride, k); + const u8* src2 = internal::getRowPtr( src1Base, src1Stride, k); + size_t i = 0; + + if (size.width >= 16) + { + for(; i <= size.width - 16;) + { + size_t limit = std::min(size.width, i + 2*256) - 16; + uint16x8_t si1 = vmovq_n_u16(0); + uint16x8_t si2 = vmovq_n_u16(0); + + for (; i <= limit; i += 16) + { + internal::prefetch(src1 + i); + internal::prefetch(src2 + i); + + uint8x16_t vs1 = vld1q_u8(src1 + i); + uint8x16_t vs2 = vld1q_u8(src2 + i); + + si1 = vabal_u8(si1, vget_low_u8(vs1), vget_low_u8(vs2)); + si2 = vabal_u8(si2, vget_high_u8(vs1), vget_high_u8(vs2)); + } + + u32 s2[4]; + vst1q_u32(s2, vaddq_u32(vpaddlq_u16(si1), vpaddlq_u16(si2))); + + for (u32 j = 0; j < 4; j++) + { + if ((s32)(0x7fFFffFFu - s2[j]) <= result) + { + return 0x7fFFffFF; //result already saturated + } + result = (s32)((u32)(result) + s2[j]); + } + } + + } + + for (; i < size.width; i++) + { + u32 v = std::abs((s32)(src1[i]) - (s32)(src2[i])); + + if ((s32)(0x7fFFffFFu - v) <= result) + { + return 0x7fFFffFF; //result already saturated + } + result = (s32)((u32)(result) + v); + } + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0; +#endif +} + +f64 diffNormL1(const Size2D &_size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == src1Stride && + src0Stride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + f64 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const f32* src1 = internal::getRowPtr( src0Base, src0Stride, k); + const f32* src2 = internal::getRowPtr( src1Base, src1Stride, k); + size_t i = 0; + + if (size.width >= 4) + { + for(; i <= size.width - 4;) + { + size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4; + float32x4_t s = vmovq_n_f32(0.0f); + + for (; i <= limit; i += 4 ) + { + internal::prefetch(src1 + i); + internal::prefetch(src2 + i); + + float32x4_t vs1 = vld1q_f32(src1 + i); + float32x4_t vs2 = vld1q_f32(src2 + i); + + float32x4_t vd = vabdq_f32(vs2, vs1); + s = vaddq_f32(s, vd); + } + + f32 s2[4]; + vst1q_f32(s2, s); + + for (u32 j = 0; j < 4; j++) + result += (f64)(s2[j]); + } + } + + for (; i < size.width; i++) + { + f32 v = std::abs(src1[i] - src2[i]); + result += (f64)(v); + } + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0.; +#endif +} + +s32 diffNormL2(const Size2D &_size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == src1Stride && + src0Stride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + s32 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const u8* src1 = internal::getRowPtr( src0Base, src0Stride, k); + const u8* src2 = internal::getRowPtr( src1Base, src1Stride, k); + size_t i = 0; + +#define NORML28U_BLOCK_SIZE (33024*2) //bigger block size can result in integer overflow + if (size.width >= 16) + { + for(; i <= size.width - 16;) + { + size_t limit = std::min(size.width, i 
+ NORML28U_BLOCK_SIZE) - 16; + uint32x4_t si1 = vmovq_n_u32(0); + uint32x4_t si2 = vmovq_n_u32(0); + + for (; i <= limit; i += 16) + { + internal::prefetch(src1 + i); + internal::prefetch(src2 + i); + + uint8x16_t vs1 = vld1q_u8(src1 + i); + uint8x16_t vs2 = vld1q_u8(src2 + i); + + uint16x8_t vdlo = vabdl_u8(vget_low_u8(vs1), vget_low_u8(vs2)); + uint16x8_t vdhi = vabdl_u8(vget_high_u8(vs1), vget_high_u8(vs2)); + + si1 = vmlal_u16(si1, vget_low_u16(vdlo), vget_low_u16(vdlo)); + si2 = vmlal_u16(si2, vget_high_u16(vdlo), vget_high_u16(vdlo)); + + si1 = vmlal_u16(si1, vget_low_u16(vdhi), vget_low_u16(vdhi)); + si2 = vmlal_u16(si2, vget_high_u16(vdhi), vget_high_u16(vdhi)); + } + + u32 s2[4]; + vst1q_u32(s2, vqaddq_u32(si1, si2)); + + for (u32 j = 0; j < 4; j++) + { + if ((s32)(0x7fFFffFFu - s2[j]) <= result) + { + return 0x7fFFffFF; //result already saturated + } + result += (s32)s2[j]; + } + } + + } + + for (; i < size.width; i++) + { + s32 v = (s32)(src1[i]) - (s32)(src2[i]); + v *= v; + + if ((s32)(0x7fFFffFFu - (u32)(v)) <= result) + { + return 0x7fFFffFF; //result already saturated + } + result += v; + } + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0; +#endif +} + +f64 diffNormL2(const Size2D &_size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (src0Stride == src1Stride && + src0Stride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + f64 result = 0; + for(size_t k = 0; k < size.height; ++k) + { + const f32* src1 = internal::getRowPtr( src0Base, src0Stride, k); + const f32* src2 = internal::getRowPtr( src1Base, src1Stride, k); + size_t i = 0; + + if (size.width >= 4) + { + for(; i <= size.width - 4;) + { + size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4; + float32x4_t s = vmovq_n_f32(0.0f); + + for (; i <= limit; i += 4 ) + { + internal::prefetch(src1 + i); + internal::prefetch(src2 + i); + + float32x4_t vs1 = vld1q_f32(src1 + i); + float32x4_t vs2 = vld1q_f32(src2 + i); + + float32x4_t vd = vsubq_f32(vs2,vs1); + s = vmlaq_f32(s, vd, vd); + } + + f32 s2[4]; + vst1q_f32(s2, s); + + for (u32 j = 0; j < 4; j++) + result += (f64)(s2[j]); + } + } + + for (; i < size.width; i++) + { + f32 v = src1[i] - src2[i]; + result += v * v; + } + } + return result; +#else + (void)_size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + + return 0.; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/opticalflow.cpp b/3rdparty/carotene/src/opticalflow.cpp new file mode 100644 index 0000000000..fa9402a05c --- /dev/null +++ b/3rdparty/carotene/src/opticalflow.cpp @@ -0,0 +1,539 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. 
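Two overflow tactics are at work in the norm kernels above and are worth spelling out. First, blocked accumulation: normL1 for u8 adds at most 256 pixels (32 iterations of 8 lanes) into 16-bit lanes before widening to 32 bits, since 32 * 255 stays well under 65535 per lane; the float kernels use the same idea with NORM32F_BLOCK_SIZE, and diffNormL2 for u8 shrinks the block to NORML28U_BLOCK_SIZE for the same reason. Second, diffNormL1 and diffNormL2 saturate the s32 total at 0x7fFFffFF instead of overflowing. A small self-contained sketch of that saturating accumulate, using the same comparison trick as the source:

```cpp
#include <cstdint>
#include <iostream>
#include <limits>

// Mirrors the "result already saturated" check in diffNormL1/diffNormL2:
// add an unsigned partial sum to a signed total without overflowing.
static int32_t addSaturate(int32_t total, uint32_t partial)
{
    if (int32_t(0x7fFFffFFu - partial) <= total)
        return 0x7fFFffFF;                       // pin at INT32_MAX
    return int32_t(uint32_t(total) + partial);   // safe to add
}

int main()
{
    int32_t total = std::numeric_limits<int32_t>::max() - 10;
    total = addSaturate(total, 5);    // still fits
    std::cout << total << "\n";       // 2147483642
    total = addSaturate(total, 100);  // would overflow
    std::cout << total << "\n";       // 2147483647
}
```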
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" +#include "saturate_cast.hpp" +#include <vector> +#include <cfloat> // For FLT_EPSILON + +namespace CAROTENE_NS { + +#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) + +/* + * Pyramidal Lucas-Kanade Optical Flow level processing + */ +void pyrLKOptFlowLevel(const Size2D &size, s32 cn, + const u8 *prevData, ptrdiff_t prevStride, + const s16 *prevDerivData, ptrdiff_t prevDerivStride, + const u8 *nextData, ptrdiff_t nextStride, + u32 ptCount, + const f32 *prevPts, f32 *nextPts, + u8 *status, f32 *err, + const Size2D &winSize, + u32 terminationCount, f64 terminationEpsilon, + u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals, + f32 minEigThreshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + f32 halfWinX = (winSize.width-1)*0.5f, halfWinY = (winSize.height-1)*0.5f; + s32 cn2 = cn*2; + + std::vector<s16> _buf(winSize.total()*(cn + cn2)); + s16* IWinBuf = &_buf[0]; + s32 IWinBufStride = winSize.width*cn; + s16* derivIWinBuf = &_buf[winSize.total()*cn]; + s32 derivIWinBufStride = winSize.width*cn2; + + for( u32 ptidx = 0; ptidx < ptCount; ptidx++ ) + { + f32 levscale = (1./(1 << level)); + u32 ptref = ptidx << 1; + f32 prevPtX = prevPts[ptref+0]*levscale; + f32 prevPtY = prevPts[ptref+1]*levscale; + f32 nextPtX; + f32 nextPtY; + if( level == maxLevel ) + { + if( useInitialFlow ) + { + nextPtX = nextPts[ptref+0]*levscale; + nextPtY = nextPts[ptref+1]*levscale; + } + else + { + nextPtX = prevPtX; + nextPtY = prevPtY; + } + } + else + { + nextPtX = nextPts[ptref+0]*2.f; + nextPtY = nextPts[ptref+1]*2.f; + } + nextPts[ptref+0] = nextPtX; + nextPts[ptref+1] = nextPtY; + + s32 iprevPtX, iprevPtY; + s32 inextPtX, inextPtY; + prevPtX -= halfWinX; + prevPtY -= halfWinY; + iprevPtX = floor(prevPtX); + iprevPtY = floor(prevPtY); + + if( iprevPtX < -(s32)winSize.width || iprevPtX >= (s32)size.width || + iprevPtY < -(s32)winSize.height || iprevPtY >= (s32)size.height ) + { + if( level == 0 ) + { + if( status ) + status[ptidx] = false; + if( err ) + err[ptidx] =
0; + } + continue; + } + + f32 a = prevPtX - iprevPtX; + f32 b = prevPtY - iprevPtY; + const s32 W_BITS = 14, W_BITS1 = 14; + const f32 FLT_SCALE = 1.f/(1 << 20); + s32 iw00 = round((1.f - a)*(1.f - b)*(1 << W_BITS)); + s32 iw01 = round(a*(1.f - b)*(1 << W_BITS)); + s32 iw10 = round((1.f - a)*b*(1 << W_BITS)); + s32 iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + s32 dstep = prevDerivStride/sizeof(s16); + f32 A11 = 0, A12 = 0, A22 = 0; + + int16x4_t viw00 = vmov_n_s16((s16)iw00); + int16x4_t viw01 = vmov_n_s16((s16)iw01); + int16x4_t viw10 = vmov_n_s16((s16)iw10); + int16x4_t viw11 = vmov_n_s16((s16)iw11); + + float32x4_t vA11 = vmovq_n_f32(0); + float32x4_t vA12 = vmovq_n_f32(0); + float32x4_t vA22 = vmovq_n_f32(0); + + s32 wwcn = winSize.width*cn; + + // extract the patch from the first image, compute covariation matrix of derivatives + s32 x = 0; + for(s32 y = 0; y < (s32)winSize.height; y++ ) + { + const u8* src = prevData + prevStride*(y + iprevPtY) + iprevPtX*cn; + const s16* dsrc = prevDerivData + dstep*(y + iprevPtY) + iprevPtX*cn2; + + s16* Iptr = IWinBuf + y*IWinBufStride; + s16* dIptr = derivIWinBuf + y*derivIWinBufStride; + + internal::prefetch(src + x + prevStride * 2, 0); + for(x = 0; x <= wwcn - 8; x += 8) + { + uint8x8_t vsrc00 = vld1_u8(src + x); + uint8x8_t vsrc10 = vld1_u8(src + x + prevStride); + uint8x8_t vsrc01 = vld1_u8(src + x + cn); + uint8x8_t vsrc11 = vld1_u8(src + x + prevStride + cn); + + int16x8_t vs00 = vreinterpretq_s16_u16(vmovl_u8(vsrc00)); + int16x8_t vs10 = vreinterpretq_s16_u16(vmovl_u8(vsrc10)); + int16x8_t vs01 = vreinterpretq_s16_u16(vmovl_u8(vsrc01)); + int16x8_t vs11 = vreinterpretq_s16_u16(vmovl_u8(vsrc11)); + + int32x4_t vsuml = vmull_s16(vget_low_s16(vs00), viw00); + int32x4_t vsumh = vmull_s16(vget_high_s16(vs10), viw10); + + vsuml = vmlal_s16(vsuml, vget_low_s16(vs01), viw01); + vsumh = vmlal_s16(vsumh, vget_high_s16(vs11), viw11); + + vsuml = vmlal_s16(vsuml, vget_low_s16(vs10), viw10); + vsumh = vmlal_s16(vsumh, vget_high_s16(vs00), viw00); + + vsuml = vmlal_s16(vsuml, vget_low_s16(vs11), viw11); + vsumh = vmlal_s16(vsumh, vget_high_s16(vs01), viw01); + + int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5); + int16x4_t vsumnh = vrshrn_n_s32(vsumh, W_BITS1-5); + + vst1q_s16(Iptr + x, vcombine_s16(vsumnl, vsumnh)); + } + for(; x <= wwcn - 4; x += 4) + { + uint8x8_t vsrc00 = vld1_u8(src + x); + uint8x8_t vsrc10 = vld1_u8(src + x + prevStride); + uint8x8_t vsrc01 = vld1_u8(src + x + cn); + uint8x8_t vsrc11 = vld1_u8(src + x + prevStride + cn); + + int16x4_t vs00 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc00))); + int16x4_t vs10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc10))); + int16x4_t vs01 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc01))); + int16x4_t vs11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc11))); + + int32x4_t vsuml1 = vmull_s16(vs00, viw00); + int32x4_t vsuml2 = vmull_s16(vs01, viw01); + vsuml1 = vmlal_s16(vsuml1, vs10, viw10); + vsuml2 = vmlal_s16(vsuml2, vs11, viw11); + int32x4_t vsuml = vaddq_s32(vsuml1, vsuml2); + + int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5); + + vst1_s16(Iptr + x, vsumnl); + } + + internal::prefetch(dsrc + dstep * 2, 0); + for(x = 0; x <= wwcn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 ) + { +#if __GNUC_MINOR__ < 0 + __asm__ ( + "vld2.16 {d0-d1}, [%[dsrc00]] \n\t" + "vld2.16 {d2-d3}, [%[dsrc10]] \n\t" + "vld2.16 {d4-d5}, [%[dsrc01]] \n\t" + "vld2.16 {d6-d7}, [%[dsrc11]] \n\t" + "vmull.s16 q4, d3, %P[viw10] \n\t" + "vmull.s16 q5, d0, %P[viw00] \n\t" + "vmlal.s16 q4, d7, %P[viw11] 
\n\t" + "vmlal.s16 q5, d4, %P[viw01] \n\t" + "vmlal.s16 q4, d1, %P[viw00] \n\t" + "vmlal.s16 q5, d2, %P[viw10] \n\t" + "vmlal.s16 q4, d5, %P[viw01] \n\t" + "vmlal.s16 q5, d6, %P[viw11] \n\t" + "vrshrn.s32 d13, q4, %[W_BITS1] \n\t" + "vrshrn.s32 d12, q5, %[W_BITS1] \n\t" + "vmull.s16 q3, d13, d13 \n\t" + "vmull.s16 q4, d12, d12 \n\t" + "vmull.s16 q5, d13, d12 \n\t" + "vcvt.f32.s32 q3, q3 \n\t" + "vcvt.f32.s32 q4, q4 \n\t" + "vcvt.f32.s32 q5, q5 \n\t" + "vadd.f32 %q[vA22], q3 \n\t" + "vadd.f32 %q[vA11], q4 \n\t" + "vadd.f32 %q[vA12], q5 \n\t" + "vst2.16 {d12-d13}, [%[out]] \n\t" + : [vA22] "=w" (vA22), + [vA11] "=w" (vA11), + [vA12] "=w" (vA12) + : "0" (vA22), + "1" (vA11), + "2" (vA12), + [out] "r" (dIptr), + [dsrc00] "r" (dsrc), + [dsrc10] "r" (dsrc + dstep), + [dsrc01] "r" (dsrc + cn2), + [dsrc11] "r" (dsrc + dstep + cn2), + [viw00] "w" (viw00), + [viw10] "w" (viw10), + [viw01] "w" (viw01), + [viw11] "w" (viw11), + [W_BITS1] "I" (W_BITS1) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13" + ); +#else + int16x4x2_t vdsrc00 = vld2_s16(dsrc); + int16x4x2_t vdsrc10 = vld2_s16(dsrc + dstep); + int16x4x2_t vdsrc01 = vld2_s16(dsrc + cn2); + int16x4x2_t vdsrc11 = vld2_s16(dsrc + dstep + cn2); + + int32x4_t vsumy = vmull_s16(vdsrc10.val[1], viw10); + int32x4_t vsumx = vmull_s16(vdsrc00.val[0], viw00); + + vsumy = vmlal_s16(vsumy, vdsrc11.val[1], viw11); + vsumx = vmlal_s16(vsumx, vdsrc01.val[0], viw01); + + vsumy = vmlal_s16(vsumy, vdsrc00.val[1], viw00); + vsumx = vmlal_s16(vsumx, vdsrc10.val[0], viw10); + + vsumy = vmlal_s16(vsumy, vdsrc01.val[1], viw01); + vsumx = vmlal_s16(vsumx, vdsrc11.val[0], viw11); + + int16x4_t vsumny = vrshrn_n_s32(vsumy, W_BITS1); + int16x4_t vsumnx = vrshrn_n_s32(vsumx, W_BITS1); + + int32x4_t va22i = vmull_s16(vsumny, vsumny); + int32x4_t va11i = vmull_s16(vsumnx, vsumnx); + int32x4_t va12i = vmull_s16(vsumnx, vsumny); + + float32x4_t va22f = vcvtq_f32_s32(va22i); + float32x4_t va11f = vcvtq_f32_s32(va11i); + float32x4_t va12f = vcvtq_f32_s32(va12i); + + vA22 = vaddq_f32(vA22, va22f); + vA11 = vaddq_f32(vA11, va11f); + vA12 = vaddq_f32(vA12, va12f); + + int16x4x2_t vsum; + vsum.val[0] = vsumnx; + vsum.val[1] = vsumny; + vst2_s16(dIptr, vsum); +#endif + } + + for( ; x < wwcn; x++, dsrc += 2, dIptr += 2 ) + { + s32 ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 + + src[x+prevStride]*iw10 + src[x+prevStride+cn]*iw11, W_BITS1-5); + s32 ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 + + dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1); + s32 iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 + + dsrc[dstep+cn2+1]*iw11, W_BITS1); + Iptr[x] = (s16)ival; + dIptr[0] = (s16)ixval; + dIptr[1] = (s16)iyval; + + A11 += (f32)(ixval*ixval); + A12 += (f32)(ixval*iyval); + A22 += (f32)(iyval*iyval); + } + } + + f32 A11buf[2], A12buf[2], A22buf[2]; + vst1_f32(A11buf, vadd_f32(vget_low_f32(vA11), vget_high_f32(vA11))); + vst1_f32(A12buf, vadd_f32(vget_low_f32(vA12), vget_high_f32(vA12))); + vst1_f32(A22buf, vadd_f32(vget_low_f32(vA22), vget_high_f32(vA22))); + A11 += A11buf[0] + A11buf[1]; + A12 += A12buf[0] + A12buf[1]; + A22 += A22buf[0] + A22buf[1]; + + A11 *= FLT_SCALE; + A12 *= FLT_SCALE; + A22 *= FLT_SCALE; + + f32 D = A11*A22 - A12*A12; + f32 minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) + + 4.f*A12*A12))/(2*winSize.width*winSize.height); + + if( err && getMinEigenVals ) + err[ptidx] = (f32)minEig; + + if( minEig < minEigThreshold || D < FLT_EPSILON ) + { + if( level == 0 && status ) + status[ptidx] = false; + continue; + 
} + + D = 1.f/D; + + nextPtX -= halfWinX; + nextPtY -= halfWinY; + f32 prevDeltaX = 0; + f32 prevDeltaY = 0; + + for(u32 j = 0; j < terminationCount; j++ ) + { + inextPtX = floor(nextPtX); + inextPtY = floor(nextPtY); + + if( inextPtX < -(s32)winSize.width || inextPtX >= (s32)size.width || + inextPtY < -(s32)winSize.height || inextPtY >= (s32)size.height ) + { + if( level == 0 && status ) + status[ptidx] = false; + break; + } + + a = nextPtX - inextPtX; + b = nextPtY - inextPtY; + iw00 = round((1.f - a)*(1.f - b)*(1 << W_BITS)); + iw01 = round(a*(1.f - b)*(1 << W_BITS)); + iw10 = round((1.f - a)*b*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + f32 b1 = 0, b2 = 0; + + viw00 = vmov_n_s16((s16)iw00); + viw01 = vmov_n_s16((s16)iw01); + viw10 = vmov_n_s16((s16)iw10); + viw11 = vmov_n_s16((s16)iw11); + + float32x4_t vb1 = vmovq_n_f32(0); + float32x4_t vb2 = vmovq_n_f32(0); + + for(s32 y = 0; y < (s32)winSize.height; y++ ) + { + const u8* Jptr = nextData + nextStride*(y + inextPtY) + inextPtX*cn; + const s16* Iptr = IWinBuf + y*IWinBufStride; + const s16* dIptr = derivIWinBuf + y*derivIWinBufStride; + + x = 0; + + internal::prefetch(Jptr, nextStride * 2); + internal::prefetch(Iptr, IWinBufStride/2); + internal::prefetch(dIptr, derivIWinBufStride/2); + + for( ; x <= wwcn - 8; x += 8, dIptr += 8*2 ) + { + uint8x8_t vj00 = vld1_u8(Jptr + x); + uint8x8_t vj10 = vld1_u8(Jptr + x + nextStride); + uint8x8_t vj01 = vld1_u8(Jptr + x + cn); + uint8x8_t vj11 = vld1_u8(Jptr + x + nextStride + cn); + int16x8_t vI = vld1q_s16(Iptr + x); + int16x8x2_t vDerivI = vld2q_s16(dIptr); + + int16x8_t vs00 = vreinterpretq_s16_u16(vmovl_u8(vj00)); + int16x8_t vs10 = vreinterpretq_s16_u16(vmovl_u8(vj10)); + int16x8_t vs01 = vreinterpretq_s16_u16(vmovl_u8(vj01)); + int16x8_t vs11 = vreinterpretq_s16_u16(vmovl_u8(vj11)); + + int32x4_t vsuml = vmull_s16(vget_low_s16(vs00), viw00); + int32x4_t vsumh = vmull_s16(vget_high_s16(vs10), viw10); + + vsuml = vmlal_s16(vsuml, vget_low_s16(vs01), viw01); + vsumh = vmlal_s16(vsumh, vget_high_s16(vs11), viw11); + + vsuml = vmlal_s16(vsuml, vget_low_s16(vs10), viw10); + vsumh = vmlal_s16(vsumh, vget_high_s16(vs00), viw00); + + vsuml = vmlal_s16(vsuml, vget_low_s16(vs11), viw11); + vsumh = vmlal_s16(vsumh, vget_high_s16(vs01), viw01); + + int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5); + int16x4_t vsumnh = vrshrn_n_s32(vsumh, W_BITS1-5); + + int16x8_t diff = vqsubq_s16(vcombine_s16(vsumnl, vsumnh), vI); + + int32x4_t vb1l = vmull_s16(vget_low_s16(diff), vget_low_s16(vDerivI.val[0])); + int32x4_t vb2h = vmull_s16(vget_high_s16(diff), vget_high_s16(vDerivI.val[1])); + int32x4_t vb1i = vmlal_s16(vb1l, vget_high_s16(diff), vget_high_s16(vDerivI.val[0])); + int32x4_t vb2i = vmlal_s16(vb2h, vget_low_s16(diff), vget_low_s16(vDerivI.val[1])); + + float32x4_t vb1f = vcvtq_f32_s32(vb1i); + float32x4_t vb2f = vcvtq_f32_s32(vb2i); + + vb1 = vaddq_f32(vb1, vb1f); + vb2 = vaddq_f32(vb2, vb2f); + } + + for( ; x < wwcn; x++, dIptr += 2 ) + { + s32 diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+nextStride]*iw10 + Jptr[x+nextStride+cn]*iw11, + W_BITS1-5) - Iptr[x]; + b1 += (f32)(diff*dIptr[0]); + b2 += (f32)(diff*dIptr[1]); + } + } + + f32 bbuf[2]; + float32x2_t vb = vpadd_f32(vadd_f32(vget_low_f32(vb1), vget_high_f32(vb1)), vadd_f32(vget_low_f32(vb2), vget_high_f32(vb2))); + vst1_f32(bbuf, vb); + b1 += bbuf[0]; + b2 += bbuf[1]; + + b1 *= FLT_SCALE; + b2 *= FLT_SCALE; + + f32 deltaX = (f32)((A12*b2 - A22*b1) * D); + f32 deltaY = (f32)((A12*b1 - A11*b2) * D); + + nextPtX += 
deltaX; + nextPtY += deltaY; + nextPts[ptref+0] = nextPtX + halfWinX; + nextPts[ptref+1] = nextPtY + halfWinY; + + if( ((double)deltaX*deltaX + (double)deltaY*deltaY) <= terminationEpsilon ) + break; + + if( j > 0 && std::abs(deltaX + prevDeltaX) < 0.01 && + std::abs(deltaY + prevDeltaY) < 0.01 ) + { + nextPts[ptref+0] -= deltaX*0.5f; + nextPts[ptref+1] -= deltaY*0.5f; + break; + } + prevDeltaX = deltaX; + prevDeltaY = deltaY; + } + + if( status && status[ptidx] && err && level == 0 && !getMinEigenVals ) + { + f32 nextPointX = nextPts[ptref+0] - halfWinX; + f32 nextPointY = nextPts[ptref+1] - halfWinY; + + s32 inextPointX = floor(nextPointX); + s32 inextPointY = floor(nextPointY); + + if( inextPointX < -(s32)winSize.width || inextPointX >= (s32)size.width || + inextPointY < -(s32)winSize.height || inextPointY >= (s32)size.height ) + { + if( status ) + status[ptidx] = false; + continue; + } + + f32 aa = nextPointX - inextPointX; + f32 bb = nextPointY - inextPointY; + iw00 = round((1.f - aa)*(1.f - bb)*(1 << W_BITS)); + iw01 = round(aa*(1.f - bb)*(1 << W_BITS)); + iw10 = round((1.f - aa)*bb*(1 << W_BITS)); + iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + f32 errval = 0.f; + + for(s32 y = 0; y < (s32)winSize.height; y++ ) + { + const u8* Jptr = nextData + nextStride*(y + inextPointY) + inextPointX*cn; + const s16* Iptr = IWinBuf + y*IWinBufStride; + + for( x = 0; x < wwcn; x++ ) + { + s32 diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 + + Jptr[x+nextStride]*iw10 + Jptr[x+nextStride+cn]*iw11, + W_BITS1-5) - Iptr[x]; + errval += std::abs((f32)diff); + } + } + err[ptidx] = errval / (32*wwcn*winSize.height); + } + } +#else + (void)size; + (void)cn; + (void)prevData; + (void)prevStride; + (void)prevDerivData; + (void)prevDerivStride; + (void)nextData; + (void)nextStride; + (void)prevPts; + (void)nextPts; + (void)status; + (void)err; + (void)winSize; + (void)terminationCount; + (void)terminationEpsilon; + (void)level; + (void)maxLevel; + (void)useInitialFlow; + (void)getMinEigenVals; + (void)minEigThreshold; + (void)ptCount; +#endif +} + +}//CAROTENE_NS + diff --git a/3rdparty/carotene/src/phase.cpp b/3rdparty/carotene/src/phase.cpp new file mode 100644 index 0000000000..141b1e864a --- /dev/null +++ b/3rdparty/carotene/src/phase.cpp @@ -0,0 +1,274 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
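Stripped of the NEON window sampling, the iteration loop in pyrLKOptFlowLevel above is a Newton-style solve of a 2x2 linear system: the gradient matrix G = [A11 A12; A12 A22] accumulated over the patch against the image-difference vector b = (b1, b2), with the step written out explicitly through the determinant D. A compact sketch of just that update rule; the matrix entries here are made up for illustration.

```cpp
#include <cstdio>

struct Delta { float x, y; };

// One Lucas-Kanade displacement update, using the same deltaX/deltaY
// formulas as pyrLKOptFlowLevel; inputs are hypothetical.
static Delta lkStep(float A11, float A12, float A22, float b1, float b2)
{
    float D = A11 * A22 - A12 * A12;  // determinant of the gradient matrix
    D = 1.f / D;                      // caller already rejected D near zero
    return { (A12 * b2 - A22 * b1) * D,
             (A12 * b1 - A11 * b2) * D };
}

int main()
{
    Delta d = lkStep(8.f, 1.f, 6.f, -3.f, 2.f);
    std::printf("delta = (%f, %f)\n", d.x, d.y);

    // The loop stops once the squared step drops below terminationEpsilon,
    // or halves the step when two successive deltas nearly cancel.
    double eps = 1e-4;
    std::printf("converged: %d\n",
                (double)d.x * d.x + (double)d.y * d.y <= eps);
}
```

The minEig gate seen earlier rejects patches whose smaller eigenvalue of G is below minEigThreshold, i.e. windows without enough texture to make this system well conditioned.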
+ * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include <cmath> +#include <cfloat> + +#include "common.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace { + +#define FASTATAN2CONST(scale) \ + f32 P1((f32)( 0.9997878412794807 * (180.0 / M_PI) * scale)), \ + P3((f32)(-0.3258083974640975 * (180.0 / M_PI) * scale)), \ + P5((f32)( 0.1555786518463281 * (180.0 / M_PI) * scale)), \ + P7((f32)(-0.04432655554792128 * (180.0 / M_PI) * scale)), \ + A_90((f32)(90.f * scale)), \ + A_180((f32)(180.f * scale)), \ + A_360((f32)(360.f * scale)); \ + float32x4_t eps(vdupq_n_f32((float)DBL_EPSILON)), \ + _90(vdupq_n_f32(A_90)), \ + _180(vdupq_n_f32(A_180)), \ + _360(vdupq_n_f32(A_360)), \ + z(vdupq_n_f32(0.0f)), \ + p1(vdupq_n_f32(P1)), \ + p3(vdupq_n_f32(P3)), \ + p5(vdupq_n_f32(P5)), \ + p7(vdupq_n_f32(P7)); + +#define FASTATAN2SCALAR(y, x, a) \ + { \ + f32 ax = std::abs(x), ay = std::abs(y); \ + f32 c, c2; \ + if (ax >= ay) \ + { \ + c = ay / (ax + (float)DBL_EPSILON); \ + c2 = c * c; \ + a = (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; \ + } \ + else \ + { \ + c = ax / (ay + (float)DBL_EPSILON); \ + c2 = c * c; \ + a = A_90 - (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; \ + } \ + if (x < 0) \ + a = A_180 - a; \ + if (y < 0) \ + a = A_360 - a; \ + } + +#define FASTATAN2VECTOR(v_y, v_x, a) \ + { \ + float32x4_t ax = vabsq_f32(v_x), ay = vabsq_f32(v_y); \ + float32x4_t tmin = vminq_f32(ax, ay), tmax = vmaxq_f32(ax, ay); \ + float32x4_t c = vmulq_f32(tmin, internal::vrecpq_f32(vaddq_f32(tmax, eps))); \ + float32x4_t c2 = vmulq_f32(c, c); \ + a = vmulq_f32(c2, p7); \ + \ + a = vmulq_f32(vaddq_f32(a, p5), c2); \ + a = vmulq_f32(vaddq_f32(a, p3), c2); \ + a = vmulq_f32(vaddq_f32(a, p1), c); \ + \ + a = vbslq_f32(vcgeq_f32(ax, ay), a, vsubq_f32(_90, a)); \ + a = vbslq_f32(vcltq_f32(v_x, z), vsubq_f32(_180, a), a); \ + a = vbslq_f32(vcltq_f32(v_y, z), vsubq_f32(_360, a), a); \ + \ + } + +} // namespace + +#endif + +void phase(const Size2D &size, + const s16 * src0Base, ptrdiff_t src0Stride, + const s16 * src1Base, ptrdiff_t src1Stride, + u8 * dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + FASTATAN2CONST(256.0f / 360.0f) + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + size_t roiw8 = size.width >= 7 ?
size.width - 7 : 0; + + float32x4_t v_05 = vdupq_n_f32(0.5f); + + for (size_t i = 0; i < size.height; ++i) + { + const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + + int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8); + int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8); + + // 0 + float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00))); + float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10))); + float32x4_t v_dst32f0; + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0) + + v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00))); + v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10))); + float32x4_t v_dst32f1; + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1) + + uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))), + vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05)))); + + // 1 + v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src01))); + v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src11))); + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0) + + v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src01))); + v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src11))); + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1) + + uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))), + vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05)))); + + vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst16s0), + vmovn_u16(v_dst16s1))); + } + for (; j < roiw8; j += 8) + { + int16x8_t v_src0 = vld1q_s16(src0 + j); + int16x8_t v_src1 = vld1q_s16(src1 + j); + + float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src0))); + float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))); + float32x4_t v_dst32f0; + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0) + + v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src0))); + v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))); + float32x4_t v_dst32f1; + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1) + + uint16x8_t v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))), + vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05)))); + + vst1_u8(dst + j, vmovn_u16(v_dst)); + } + + for (; j < size.width; j++) + { + f32 x = src0[j], y = src1[j]; + f32 a; + FASTATAN2SCALAR(y, x, a) + dst[j] = (u8)(s32)floor(a + 0.5f); + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void phase(const Size2D &size, + const f32 * src0Base, ptrdiff_t src0Stride, + const f32 * src1Base, ptrdiff_t src1Stride, + f32 * dstBase, ptrdiff_t dstStride, + f32 scale) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + FASTATAN2CONST(scale) + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + f32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + + float32x4_t v_src00 = vld1q_f32(src0 + j), v_src01 = vld1q_f32(src0 + j + 4); + float32x4_t v_src10 = vld1q_f32(src1 + j), v_src11 = vld1q_f32(src1 + j + 4); + + float32x4_t v_dst32f; + // 0 + FASTATAN2VECTOR(v_src10, v_src00, v_dst32f) + vst1q_f32(dst + j, v_dst32f); + // 1 + FASTATAN2VECTOR(v_src11, v_src01, v_dst32f) + vst1q_f32(dst + j + 4, v_dst32f); + } + if(j + 4 <= size.width) + { + float32x4_t v_src0 = vld1q_f32(src0 + j); + float32x4_t v_src1 = vld1q_f32(src1 + j); + + float32x4_t v_dst32f; + FASTATAN2VECTOR(v_src1, v_src0, v_dst32f) + vst1q_f32(dst + j, v_dst32f); + j += 4; + } + + for (; j < size.width; j++) + { + f32 a; + FASTATAN2SCALAR(src1[j], src0[j], a) + dst[j] = a; + } + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; + (void)scale; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/pyramid.cpp b/3rdparty/carotene/src/pyramid.cpp new file mode 100644 index 0000000000..546ccecd97 --- /dev/null +++ b/3rdparty/carotene/src/pyramid.cpp @@ -0,0 +1,1414 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
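Both `phase` overloads above rely on the same trick: instead of calling `atan2`, they evaluate a degree-7 odd polynomial on the ratio of the smaller to the larger of |x| and |y|, which keeps the argument in [0, 1], and then fold the result back into the correct octant and quadrant. A minimal scalar sketch with `scale = 1`, so the output is in degrees (`fastAtan2Deg` is an illustrative name, not a library entry point):

```cpp
#include <cmath>
#include <cfloat>

// Scalar restatement of FASTATAN2SCALAR with scale = 1 (degrees).
static float fastAtan2Deg(float y, float x)
{
    static const float P1 = (float)( 0.9997878412794807  * (180.0 / M_PI));
    static const float P3 = (float)(-0.3258083974640975  * (180.0 / M_PI));
    static const float P5 = (float)( 0.1555786518463281  * (180.0 / M_PI));
    static const float P7 = (float)(-0.04432655554792128 * (180.0 / M_PI));

    float ax = std::abs(x), ay = std::abs(y);
    // Divide the smaller magnitude by the larger one; eps avoids 0/0.
    float c  = ax >= ay ? ay / (ax + (float)DBL_EPSILON)
                        : ax / (ay + (float)DBL_EPSILON);
    float c2 = c * c;
    float a  = (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c;
    if (ax < ay) a = 90.f  - a;   // reflect across the 45-degree diagonal
    if (x  < 0)  a = 180.f - a;   // left half-plane
    if (y  < 0)  a = 360.f - a;   // lower half-plane
    return a;                     // in [0, 360)
}
```

The u8 overload instantiates the constants with `scale = 256/360` so a full turn maps onto the byte range, then rounds to nearest before narrowing; the f32 overload passes the caller's `scale` straight through.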
+ */ + +#include "common.hpp" + +#include <vector> + +namespace CAROTENE_NS { + +bool isGaussianPyramidDownRTZSupported(const Size2D &srcSize, const Size2D &dstSize, BORDER_MODE border_mode) +{ + if (!isSupportedConfiguration()) + return false; + // Need at least 8 pixels for vectorization. + // Need to make sure dst width is half the src width. + // Don't care about dst height. + if ( dstSize.width < 8 || std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2 ) + return false; + + // Current implementation only supports Reflect101 (i.e. UNDEFINED mode) + if (border_mode != BORDER_MODE_UNDEFINED) + return false; + + return true; +} + +bool isGaussianPyramidDownU8Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn) +{ + if (!isSupportedConfiguration()) + return false; + if ( (dstSize.width * cn) < 8 || + (cn != 1 && cn !=3 && cn!=4) || + std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2 || + std::abs((ptrdiff_t)dstSize.height*2 - (ptrdiff_t)srcSize.height) > 2 ) + return false; + + return true; +} + +bool isGaussianPyramidDownS16Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn) +{ + if (!isSupportedConfiguration()) + return false; + if ( (dstSize.width * cn) < 4 || + (cn != 1 && cn !=3 && cn!=4) || + std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2 || + std::abs((ptrdiff_t)dstSize.height*2 - (ptrdiff_t)srcSize.height) > 2 ) + return false; + + return true; +} + +bool isGaussianPyramidDownF32Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn) +{ + if (!isSupportedConfiguration()) + return false; + if ( (dstSize.width * cn) < 4 || + (cn != 1 && cn !=3 && cn!=4) || + std::abs((ptrdiff_t)dstSize.width*2 - (ptrdiff_t)srcSize.width) > 2 || + std::abs((ptrdiff_t)dstSize.height*2 - (ptrdiff_t)srcSize.height) > 2 ) + return false; + + return true; +} + +bool isGaussianPyramidUpU8Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn) +{ + if (!isSupportedConfiguration()) + return false; + if ( (srcSize.width * cn) < 8 || + (cn != 1 && cn !=3 && cn!=4) || + std::abs((ptrdiff_t)dstSize.width - (ptrdiff_t)srcSize.width*2) != (ptrdiff_t)dstSize.width % 2 || + std::abs((ptrdiff_t)dstSize.height - (ptrdiff_t)srcSize.height*2) != (ptrdiff_t)dstSize.height % 2 ) + return false; + + return true; +} + +bool isGaussianPyramidUpS16Supported(const Size2D &srcSize, const Size2D &dstSize, u8 cn) +{ + if (!isSupportedConfiguration()) + return false; + if ( (srcSize.width * cn) < 12 || + (cn != 1 && cn !=3 && cn!=4) || + std::abs((ptrdiff_t)dstSize.width - (ptrdiff_t)srcSize.width*2) != (ptrdiff_t)dstSize.width % 2 || + std::abs((ptrdiff_t)dstSize.height - (ptrdiff_t)srcSize.height*2) != (ptrdiff_t)dstSize.height % 2 ) + return false; + + return true; +} + +#ifdef CAROTENE_NEON + +namespace { + +ptrdiff_t borderInterpolate101(ptrdiff_t p, ptrdiff_t len) +{ + if (len == 1) + return 0; + else + { + while ((unsigned)p >= (unsigned)len) + { + if (p < 0) + p = -p; + else + p = (len - 1)*2 - p; + } + } + return p; +} + +} // namespace + +#endif + +void gaussianPyramidDownRTZ(const Size2D &srcSize, + const u8 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + u8 *dstBase, ptrdiff_t dstStride, + BORDER_MODE border, u8 borderValue) +{ + internal::assertSupportedConfiguration(isGaussianPyramidDownRTZSupported(srcSize, dstSize, border)); +#ifdef CAROTENE_NEON + // Single-core NEON code + const size_t dwidth = dstSize.width; + const size_t dheight = dstSize.height; + const size_t swidth = srcSize.width; + const size_t sheight =
srcSize.height; + + ptrdiff_t idx_l1 = borderInterpolate101(-1, swidth); + ptrdiff_t idx_l2 = borderInterpolate101(-2, swidth); + ptrdiff_t idx_r1 = borderInterpolate101(swidth + 0, swidth); + ptrdiff_t idx_r2 = borderInterpolate101(swidth + 1, swidth); + + //1-line buffer + std::vector _buf((swidth + 4) + 32/sizeof(u16)); + u16* lane = internal::alignPtr(&_buf[2], 32); + + uint8x8_t vc6u8 = vmov_n_u8(6); + uint16x8_t vc6u16 = vmovq_n_u16(6); + uint16x8_t vc4u16 = vmovq_n_u16(4); + + u8* dst = dstBase; + + for (size_t i = 0; i < dheight; ++i, dst += dstStride) + { + //vertical convolution + const u8* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, sheight)); + const u8* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, sheight)); + const u8* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, sheight)); + const u8* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, sheight)); + const u8* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, sheight)); + + size_t x = 0; + for (; x <= swidth - 8; x += 8) + { + internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, x % 5 - 2)); + uint8x8_t v0 = vld1_u8(ln0+x); + uint8x8_t v1 = vld1_u8(ln1+x); + uint8x8_t v2 = vld1_u8(ln2+x); + uint8x8_t v3 = vld1_u8(ln3+x); + uint8x8_t v4 = vld1_u8(ln4+x); + + uint16x8_t v = vaddl_u8(v0, v4); + uint16x8_t v13 = vaddl_u8(v1, v3); + + v = vmlal_u8(v, v2, vc6u8); + v = vmlaq_u16(v, v13, vc4u16); + + vst1q_u16(lane + x, v); + } + for (; x < swidth; ++x) + { + lane[x] = ln0[x] + ln4[x] + 4u * (ln1[x] + ln3[x]) + 6u * ln2[x]; + } + + //left&right borders + lane[-1] = lane[idx_l1]; + lane[-2] = lane[idx_l2]; + + lane[swidth] = lane[idx_r1]; + lane[swidth+1] = lane[idx_r2]; + + //horizontal convolution + x = 0; + size_t vw = (swidth/2) - 7; // Using 7 instead of 8 allows swidth of 14 or 15. + for (; x < vw; x += 8) + { + internal::prefetch(lane + 2 * x); + uint16x8x2_t vLane0 = vld2q_u16(lane + 2*x-2); // L0[0] = x0 x2 x4 x6 x8 x10 x12 x14 L0[1] = x1 x3 x5 x7 x9 x11 x13 x15 + uint16x8x2_t vLane1 = vld2q_u16(lane + 2*x-1); // L1[0] = x1 x3 x5 x7 x9 x11 x13 x15 L1[1] = x2 x4 x6 x8 x10 x12 x14 x16 + uint16x8x2_t vLane2 = vld2q_u16(lane + 2*x+0); // L2[0] = x2 x4 x6 x8 x10 x12 x14 x16 L2[1] = x3 x5 x7 x9 x11 x13 x15 x17 + uint16x8x2_t vLane3 = vld2q_u16(lane + 2*x+1); // L3[0] = x3 x5 x7 x9 x11 x13 x15 x17 L3[1] = x4 x6 x8 x10 x12 x14 x16 x18 + uint16x8x2_t vLane4 = vld2q_u16(lane + 2*x+2); // L4[0] = x4 x6 x8 x10 x12 x14 x16 x18 L4[1] = x5 x7 x9 x11 x13 x15 x17 x19 + uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]); + uint16x8_t vSum_1_3 = vaddq_u16(vLane1.val[0], vLane3.val[0]); + vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16); + vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_1_3, vc4u16); + uint8x8_t vRes = vshrn_n_u16(vSum_0_4, 8); + + vst1_u8(dst + x, vRes); + } + + for (; x < dwidth; x++) + { + dst[x] = u8((lane[2*x-2] + lane[2*x+2] + 4u * (lane[2*x-1] + lane[2*x+1]) + 6u * lane[2*x]) >> 8); + } + } +#else + // Remove 'unused parameter' warnings. 
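Behind the NEON choreography, `gaussianPyramidDownRTZ` is the separable 5-tap binomial filter [1 4 6 4 1]/16 applied in both directions at every even source coordinate, with the final division by 256 done as a truncating shift (`vshrn_n_u16`, no rounding addend), which is what the RTZ (round-toward-zero) suffix refers to. A hedged single-pixel reference for the single-channel interior case, assuming out-of-range coordinates were already reflected by `borderInterpolate101`:

```cpp
#include <cstddef>

// One output pixel of the truncating Gaussian pyrDown (single channel,
// stride in elements). pyrDownRTZPixel is an illustrative name.
static unsigned char pyrDownRTZPixel(const unsigned char *src,
                                     std::ptrdiff_t stride,
                                     std::ptrdiff_t x, std::ptrdiff_t y)
{
    static const unsigned k[5] = { 1, 4, 6, 4, 1 };   // binomial taps, sum 16
    unsigned sum = 0;
    for (int dy = -2; dy <= 2; ++dy)
        for (int dx = -2; dx <= 2; ++dx)
            sum += k[dy + 2] * k[dx + 2]
                   * src[(2 * y + dy) * stride + (2 * x + dx)];
    return (unsigned char)(sum >> 8);   // 16 * 16 = 256; truncate, don't round
}
```

The production loop splits this into a vertical pass buffered in the u16 `lane` row and a horizontal pass that picks every other `lane` entry with `vld2q_u16`; the worst-case sum is 256 * 255 = 65280, which is why u16 intermediates are enough.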
+ (void)srcSize; + (void)srcBase; + (void)srcStride; + (void)dstSize; + (void)dstBase; + (void)dstStride; + (void)border; +#endif + (void)borderValue; +} + +void gaussianPyramidDown(const Size2D &srcSize, + const u8 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + u8 *dstBase, ptrdiff_t dstStride, u8 cn) +{ + internal::assertSupportedConfiguration(isGaussianPyramidDownU8Supported(srcSize, dstSize, cn)); +#ifdef CAROTENE_NEON + size_t dcolcn = dstSize.width*cn; + size_t scolcn = srcSize.width*cn; + size_t roiw8 = dcolcn - 7; + + size_t idx_l1 = borderInterpolate101(-1, srcSize.width) * cn; + size_t idx_l2 = borderInterpolate101(-2, srcSize.width) * cn; + size_t idx_r1 = borderInterpolate101(srcSize.width + 0, srcSize.width) * cn; + size_t idx_r2 = borderInterpolate101(srcSize.width + 1, srcSize.width) * cn; + + //1-line buffer + std::vector _buf(cn*(srcSize.width + 4) + 32/sizeof(u16)); + u16* lane = internal::alignPtr(&_buf[2*cn], 32); + + uint8x8_t vc6u8 = vmov_n_u8(6); + uint16x8_t vc6u16 = vmovq_n_u16(6); + uint16x8_t vc4u16 = vmovq_n_u16(4); + + for (size_t i = 0; i < dstSize.height; ++i) + { + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + //vertical convolution + const u8* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, srcSize.height)); + const u8* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, srcSize.height)); + const u8* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, srcSize.height)); + const u8* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, srcSize.height)); + const u8* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, srcSize.height)); + + size_t x = 0; + for (; x <= scolcn - 8; x += 8) + { + internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, (ptrdiff_t)x % 5 - 2)); + uint8x8_t v0 = vld1_u8(ln0+x); + uint8x8_t v1 = vld1_u8(ln1+x); + uint8x8_t v2 = vld1_u8(ln2+x); + uint8x8_t v3 = vld1_u8(ln3+x); + uint8x8_t v4 = vld1_u8(ln4+x); + + uint16x8_t v = vaddl_u8(v0, v4); + uint16x8_t v13 = vaddl_u8(v1, v3); + + v = vmlal_u8(v, v2, vc6u8); + v = vmlaq_u16(v, v13, vc4u16); + + vst1q_u16(lane + x, v); + } + for (; x < scolcn; ++x) + { + lane[x] = ln0[x] + ln4[x] + 4u * (ln1[x] + ln3[x]) + 6u * ln2[x]; + } + + //left&right borders + for (u32 k = 0; k < cn; ++k) + { + lane[(s32)(-cn+k)] = lane[idx_l1 + k]; + lane[(s32)(-cn-cn+k)] = lane[idx_l2 + k]; + + lane[scolcn+k] = lane[idx_r1 + k]; + lane[scolcn+cn+k] = lane[idx_r2 + k]; + } + + //horizontal convolution + x = 0; + switch(cn) + { + case 1: + for (; x < roiw8; x += 8) + { + internal::prefetch(lane + 2 * x); +#if __GNUC_MINOR__ < 7 + __asm__ ( + "vld2.16 {d0-d3}, [%[in0]] \n\t" + "vld2.16 {d4-d7}, [%[in4]] \n\t" + "vld2.16 {d12-d15}, [%[in1]] \n\t" + "vld2.16 {d16-d19}, [%[in3]] \n\t" + "vld2.16 {d8-d11}, [%[in2],:256] \n\t" + "vadd.i16 q0, q2 /*q0 = v0 + v4*/ \n\t" + "vadd.i16 q6, q8 /*q6 = v1 + v3*/ \n\t" + "vmla.i16 q0, q4, %q[c6] /*q0 += v2 * 6*/ \n\t" + "vmla.i16 q0, q6, %q[c4] /*q1 += (v1+v3) * 4*/ \n\t" + "vrshrn.u16 d8, q0, #8 \n\t" + "vst1.8 {d8}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + x), + [in0] "r" (lane + 2*x-2), + [in1] "r" (lane + 2*x-1), + [in2] "r" (lane + 2*x+0), + [in3] "r" (lane + 2*x+1), + [in4] "r" (lane + 2*x+2), + [c4] "w" (vc4u16), [c6] "w" (vc6u16) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" + ); +#else + uint16x8x2_t vLane0 = vld2q_u16(lane + 2*x-2); + uint16x8x2_t vLane1 
= vld2q_u16(lane + 2*x-1); + uint16x8x2_t vLane2 = vld2q_u16(lane + 2*x+0); + uint16x8x2_t vLane3 = vld2q_u16(lane + 2*x+1); + uint16x8x2_t vLane4 = vld2q_u16(lane + 2*x+2); + + uint16x8_t vSum_0_4 = vaddq_u16(vLane0.val[0], vLane4.val[0]); + uint16x8_t vSum_1_3 = vaddq_u16(vLane1.val[0], vLane3.val[0]); + vSum_0_4 = vmlaq_u16(vSum_0_4, vLane2.val[0], vc6u16); + vSum_0_4 = vmlaq_u16(vSum_0_4, vSum_1_3, vc4u16); + uint8x8_t vRes = vrshrn_n_u16(vSum_0_4, 8); + + vst1_u8(dst + x, vRes); +#endif + } + break; + case 3: + { + uint16x4_t vx1 = vld1_u16(lane - 2*3); + uint16x4_t vx2 = vld1_u16(lane - 1*3); + uint16x4_t vx3 = vld1_u16(lane + 0*3); + uint16x8_t v0 = vcombine_u16(vx1, vx3); + + uint8x8_t map = vreinterpret_u8_u64(vmov_n_u64(0xFFFF060504020100ULL)); + for (; x < roiw8; x += 6) + { + internal::prefetch(lane + 2 * x + 12); + + uint16x4_t vx_ = vld1_u16(lane + 2*x-1*3 + 6); + uint16x4_t vx4 = vld1_u16(lane + 2*x+0*3 + 6); + uint16x4_t vx5 = vld1_u16(lane + 2*x+1*3 + 6); + uint16x4_t vx6 = vld1_u16(lane + 2*x+2*3 + 6); + + uint16x8_t v1 = vcombine_u16(vx2, vx_); + uint16x8_t v2 = vcombine_u16(vget_high_u16(v0), vx4); + uint16x8_t v3 = vcombine_u16(vx_, vx5); + uint16x8_t v4 = vcombine_u16(vx4, vx6); + vx2 = vx5; + + uint16x8_t v = vaddq_u16(v0, v4); + uint16x8_t v13 = vaddq_u16(v1, v3); + + v = vmlaq_u16(v, v2, vc6u16); + v = vmlaq_u16(v, v13, vc4u16); + + uint8x8_t v8 = vrshrn_n_u16(v, 8); + + v0 = v4; + + vst1_u8(dst + x, vtbl1_u8(v8, map)); + } + } + break; + case 4: + { + uint16x4_t vx1 = vld1_u16(lane - 2*4); + uint16x4_t vx2 = vld1_u16(lane - 1*4); + uint16x4_t vx3 = vld1_u16(lane + 0*4); + uint16x8_t v0 = vcombine_u16(vx1, vx3); + + for (; x < roiw8; x += 8) + { + internal::prefetch(lane + 2 * x + 16); + + uint16x4_t vx_ = vld1_u16(lane + 2 * x - 1*4 + 8); + uint16x4_t vx4 = vld1_u16(lane + 2 * x + 0*4 + 8); + uint16x4_t vx5 = vld1_u16(lane + 2 * x + 1*4 + 8); + uint16x4_t vx6 = vld1_u16(lane + 2 * x + 2*4 + 8); + + uint16x8_t v1 = vcombine_u16(vx2, vx_); + uint16x8_t v2 = vcombine_u16(vget_high_u16(v0), vx4); + uint16x8_t v3 = vcombine_u16(vx_, vx5); + uint16x8_t v4 = vcombine_u16(vx4, vx6); + vx2 = vx5; + + uint16x8_t v = vaddq_u16(v0, v4); + uint16x8_t v13 = vaddq_u16(v1, v3); + + v = vmlaq_u16(v, v2, vc6u16); + v = vmlaq_u16(v, v13, vc4u16); + + uint8x8_t v8 = vrshrn_n_u16(v, 8); + + v0 = v4; + + vst1_u8(dst + x, v8); + } + } + break; + } + + for (u32 h = 0; h < cn; ++h) + { + u16* ln = lane + h; + u8* dt = dst + h; + for (size_t k = x; k < dcolcn; k += cn) + dt[k] = u8((ln[2*k-2*cn] + ln[2*k+2*cn] + 4u * (ln[2*k-cn] + ln[2*k+cn]) + 6u * ln[2*k] + (1 << 7)) >> 8); + } + } +#else + // Remove 'unused parameter' warnings. 
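This overload differs from the RTZ variant in two ways: it handles interleaved 1-, 3- and 4-channel rows, and it rounds to nearest by adding half the divisor (the `(1 << 7)` term, or `vrshrn_n_u16` in the vector paths) before shifting. The scalar tail above only finishes the last few columns; written out for a whole row it states the intended arithmetic most plainly. A sketch assuming the same `lane` layout; the function name is illustrative:

```cpp
#include <cstddef>
#include <cstdint>

// Horizontal pass over the vertically pre-filtered row, all channels.
// `lane` must expose 2*cn mirrored entries before index 0 and another
// 2*cn past the row end, exactly as the border fix-up above arranges.
static void pyrDownHorizontalRef(const uint16_t *lane, uint8_t *dst,
                                 std::ptrdiff_t dstWidth, std::ptrdiff_t cn)
{
    for (std::ptrdiff_t p = 0; p < dstWidth; ++p)       // output pixel
        for (std::ptrdiff_t h = 0; h < cn; ++h)         // channel
        {
            const uint16_t *c = lane + 2 * p * cn + h;  // center tap
            dst[p * cn + h] = (uint8_t)((c[-2 * cn] + c[2 * cn]
                                       + 4u * (c[-cn] + c[cn])
                                       + 6u * c[0]
                                       + (1u << 7)) >> 8); // +128: nearest
        }
}
```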
+ (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void gaussianPyramidDown(const Size2D &srcSize, + const s16 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + s16 *dstBase, ptrdiff_t dstStride, u8 cn) +{ + internal::assertSupportedConfiguration(isGaussianPyramidDownS16Supported(srcSize, dstSize, cn)); +#ifdef CAROTENE_NEON + size_t dcolcn = dstSize.width*cn; + size_t scolcn = srcSize.width*cn; + size_t roiw4 = dcolcn - 3; + + size_t idx_l1 = borderInterpolate101(-1, srcSize.width) * cn; + size_t idx_l2 = borderInterpolate101(-2, srcSize.width) * cn; + size_t idx_r1 = borderInterpolate101(srcSize.width + 0, srcSize.width) * cn; + size_t idx_r2 = borderInterpolate101(srcSize.width + 1, srcSize.width) * cn; + + //1-line buffer + std::vector _buf(cn*(srcSize.width + 4) + 32/sizeof(s32)); + s32* lane = internal::alignPtr(&_buf[2*cn], 32); + + int16x4_t vc6s16 = vmov_n_s16(6); + int32x4_t vc6s32 = vmovq_n_s32(6); + int32x4_t vc4s32 = vmovq_n_s32(4); + + for (size_t i = 0; i < dstSize.height; ++i) + { + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + //vertical convolution + const s16* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, srcSize.height)); + const s16* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, srcSize.height)); + const s16* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, srcSize.height)); + const s16* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, srcSize.height)); + const s16* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, srcSize.height)); + + size_t x = 0; + for (; x <= scolcn - 4; x += 4) + { + internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, (ptrdiff_t)x % 5 - 2)); + int16x4_t v0 = vld1_s16(ln0 + x); + int16x4_t v1 = vld1_s16(ln1 + x); + int16x4_t v2 = vld1_s16(ln2 + x); + int16x4_t v3 = vld1_s16(ln3 + x); + int16x4_t v4 = vld1_s16(ln4 + x); + + int32x4_t v = vaddl_s16(v0, v4); + int32x4_t v13 = vaddl_s16(v1, v3); + + v = vmlal_s16(v, v2, vc6s16); + v = vmlaq_s32(v, v13, vc4s32); + + vst1q_s32(lane + x, v); + } + for (; x < scolcn; ++x) + { + lane[x] = ln0[x] + ln4[x] + 4 * (ln1[x] + ln3[x]) + 6 * ln2[x]; + } + + //left&right borders + for (u32 k = 0; k < cn; ++k) + { + lane[(s32)(-cn+k)] = lane[idx_l1 + k]; + lane[(s32)(-cn-cn+k)] = lane[idx_l2 + k]; + + lane[scolcn+k] = lane[idx_r1 + k]; + lane[scolcn+cn+k] = lane[idx_r2 + k]; + } + + //horizontal convolution + x = 0; + switch(cn) + { + case 1: + for (; x < roiw4; x += 4) + { + internal::prefetch(lane + 2 * x); +#if __GNUC_MINOR__ < 7 + __asm__ ( + "vld2.32 {d0-d3}, [%[in0]] \n\t" + "vld2.32 {d4-d7}, [%[in4]] \n\t" + "vld2.32 {d12-d15}, [%[in1]] \n\t" + "vld2.32 {d16-d19}, [%[in3]] \n\t" + "vld2.32 {d8-d11}, [%[in2],:256] \n\t" + "vadd.i32 q0, q2 \n\t" + "vadd.i32 q6, q8 \n\t" + "vmla.i32 q0, q4, %q[c6] \n\t" + "vmla.i32 q0, q6, %q[c4] \n\t" + "vrshrn.s32 d8, q0, #8 \n\t" + "vst1.16 {d8}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + x), + [in0] "r" (lane + 2*x-2), + [in1] "r" (lane + 2*x-1), + [in2] "r" (lane + 2*x+0), + [in3] "r" (lane + 2*x+1), + [in4] "r" (lane + 2*x+2), + [c4] "w" (vc4s32), [c6] "w" (vc6s32) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" + ); +#else + int32x4x2_t vLane0 = vld2q_s32(lane + 2*x-2); + int32x4x2_t vLane1 = vld2q_s32(lane + 2*x-1); + int32x4x2_t vLane2 = vld2q_s32(lane + 2*x+0); + int32x4x2_t vLane3 = vld2q_s32(lane + 
2*x+1); + int32x4x2_t vLane4 = vld2q_s32(lane + 2*x+2); + + int32x4_t vSum_0_4 = vaddq_s32(vLane0.val[0], vLane4.val[0]); + int32x4_t vSum_1_3 = vaddq_s32(vLane1.val[0], vLane3.val[0]); + vSum_0_4 = vmlaq_s32(vSum_0_4, vLane2.val[0], vc6s32); + vSum_0_4 = vmlaq_s32(vSum_0_4, vSum_1_3, vc4s32); + int16x4_t vRes = vrshrn_n_s32(vSum_0_4, 8); + + vst1_s16(dst + x, vRes); +#endif + } + break; + case 3: + { + int32x4_t v0 = vld1q_s32(lane - 2*3); + int32x4_t v1 = vld1q_s32(lane - 1*3); + int32x4_t v2 = vld1q_s32(lane + 0*3); + for (; x < roiw4; x += 3) + { + internal::prefetch(lane + 2 * x); + + int32x4_t v3 = vld1q_s32(lane + 2 * x + 1*3); + int32x4_t v4 = vld1q_s32(lane + 2 * x + 2*3); + + int32x4_t v = vaddq_s32(v0, v4); + int32x4_t v13 = vaddq_s32(v1, v3); + + v = vmlaq_s32(v, v2, vc6s32); + v = vmlaq_s32(v, v13, vc4s32); + + int16x4_t vv = vrshrn_n_s32(v, 8); + + v0 = v2; + v1 = v3; + v2 = v4; + + vst1_s16(dst + x, vv); + } + } + break; + case 4: + { + int32x4_t v0 = vld1q_s32(lane - 2*4); + int32x4_t v1 = vld1q_s32(lane - 1*4); + int32x4_t v2 = vld1q_s32(lane + 0*4); + for (; x < roiw4; x += 4) + { + internal::prefetch(lane + 2 * x + 8); + int32x4_t v3 = vld1q_s32(lane + 2 * x + 1*4); + int32x4_t v4 = vld1q_s32(lane + 2 * x + 2*4); + + int32x4_t v = vaddq_s32(v0, v4); + int32x4_t v13 = vaddq_s32(v1, v3); + + v = vmlaq_s32(v, v2, vc6s32); + v = vmlaq_s32(v, v13, vc4s32); + + int16x4_t vv = vrshrn_n_s32(v, 8); + + v0 = v2; + v1 = v3; + v2 = v4; + + vst1_s16(dst + x, vv); + } + } + break; + } + + for (u32 h = 0; h < cn; ++h) + { + s32* ln = lane + h; + s16* dt = dst + h; + for (size_t k = x; k < dcolcn; k += cn) + dt[k] = s16((ln[2*k-2*cn] + ln[2*k+2*cn] + 4 * (ln[2*k-cn] + ln[2*k+cn]) + 6 * ln[2*k] + (1 << 7)) >> 8); + } + } +#else + // Remove 'unused parameter' warnings. 
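The s16 flavour follows the same two-pass plan but accumulates in 32-bit lanes (`vaddl_s16`, `vmlaq_s32`) where the u8 version stayed in 16-bit ones. The head-room arithmetic below shows why; it is a standalone sanity check, not library code:

```cpp
#include <cassert>
#include <cstdint>

int main()
{
    const int64_t taps = 1 + 4 + 6 + 4 + 1;          // = 16 per separable pass

    // u8 input: even both passes combined fit in 16 bits, so the u8
    // overload can keep its intermediate sums in uint16x8_t vectors.
    assert(taps * 255 <= UINT16_MAX);                // 4080
    assert(taps * taps * 255 <= UINT16_MAX);         // 65280

    // s16 input: the vertical pass alone overflows 16 bits, hence the
    // s32 lane buffer and the widening vaddl_s16/vmlal_s16.
    assert(taps * 32767 > INT16_MAX);                // 524272
    assert(taps * taps * 32767 <= INT32_MAX);        // 8388352
    // After the rounding shift the result narrows back into s16 range:
    assert(((taps * taps * 32767 + (1 << 7)) >> 8) <= INT16_MAX);
    return 0;
}
```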
+ (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void gaussianPyramidDown(const Size2D &srcSize, + const f32 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + f32 *dstBase, ptrdiff_t dstStride, u8 cn) +{ + internal::assertSupportedConfiguration(isGaussianPyramidDownF32Supported(srcSize, dstSize, cn)); +#ifdef CAROTENE_NEON + size_t dcolcn = dstSize.width*cn; + size_t scolcn = srcSize.width*cn; + size_t roiw4 = dcolcn - 3; + + size_t idx_l1 = borderInterpolate101(-1, srcSize.width) * cn; + size_t idx_l2 = borderInterpolate101(-2, srcSize.width) * cn; + size_t idx_r1 = borderInterpolate101(srcSize.width + 0, srcSize.width) * cn; + size_t idx_r2 = borderInterpolate101(srcSize.width + 1, srcSize.width) * cn; + + //1-line buffer + std::vector _buf(cn*(srcSize.width + 4) + 32/sizeof(f32)); + f32* lane = internal::alignPtr(&_buf[2*cn], 32); + +#if __GNUC_MINOR__ < 7 + register float32x4_t vc6d4f32 asm ("q11") = vmovq_n_f32(1.5f); // 6/4 + register float32x4_t vc1d4f32 asm ("q12") = vmovq_n_f32(0.25f); // 1/4 + + register float32x4_t vc1d64f32 asm ("q13") = vmovq_n_f32(0.015625f); //1/4/16 + register float32x4_t vc4d64f32 asm ("q14") = vmovq_n_f32(0.0625f); //4/4/16 + register float32x4_t vc6d64f32 asm ("q15") = vmovq_n_f32(0.09375f); //6/4/16 +#else + register float32x4_t vc6d4f32 = vmovq_n_f32(1.5f); // 6/4 + register float32x4_t vc1d4f32 = vmovq_n_f32(0.25f); // 1/4 + + register float32x4_t vc1d64f32 = vmovq_n_f32(0.015625f); //1/4/16 + register float32x4_t vc4d64f32 = vmovq_n_f32(0.0625f); //4/4/16 + register float32x4_t vc6d64f32 = vmovq_n_f32(0.09375f); //6/4/16 +#endif + + for (size_t i = 0; i < dstSize.height; ++i) + { + f32* dst = internal::getRowPtr(dstBase, dstStride, i); + //vertical convolution + const f32* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-2, srcSize.height)); + const f32* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2-1, srcSize.height)); + const f32* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+0, srcSize.height)); + const f32* ln3 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+1, srcSize.height)); + const f32* ln4 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i*2+2, srcSize.height)); + + size_t x = 0; + for (; x <= scolcn - 4; x += 4) + { + internal::prefetch(internal::getRowPtr(ln2 + x, srcStride, (ptrdiff_t)x % 5 - 2)); + float32x4_t v0 = vld1q_f32((const float32_t*)ln0 + x); + float32x4_t v1 = vld1q_f32((const float32_t*)ln1 + x); + float32x4_t v2 = vld1q_f32((const float32_t*)ln2 + x); + float32x4_t v3 = vld1q_f32((const float32_t*)ln3 + x); + float32x4_t v4 = vld1q_f32((const float32_t*)ln4 + x); + + float32x4_t v = vaddq_f32(v1, v3); + float32x4_t v04 = vaddq_f32(v0, v4); + + v = vmlaq_f32(v, v2, vc6d4f32); + v = vmlaq_f32(v, v04, vc1d4f32); + + vst1q_f32(lane + x, v); + } + for (; x < scolcn; ++x) + { + lane[x] = 0.25f*(ln0[x] + ln4[x]) + (ln1[x] + ln3[x]) + 1.5f * ln2[x]; + } + + //left&right borders + for (u32 k = 0; k < cn; ++k) + { + lane[(s32)(-cn+k)] = lane[idx_l1 + k]; + lane[(s32)(-cn-cn+k)] = lane[idx_l2 + k]; + + lane[scolcn+k] = lane[idx_r1 + k]; + lane[scolcn+cn+k] = lane[idx_r2 + k]; + } + + //horizontal convolution + x = 0; + switch(cn) + { + case 1: + for (; x < roiw4; x += 4) + { + internal::prefetch(lane + 2 * x); +#if __GNUC_MINOR__ < 7 + __asm__ __volatile__ ( + "vld2.32 {d0-d3}, [%[in0]] \n\t" + "vld2.32 {d8-d11}, [%[in4]] \n\t" + "vld2.32 {d14-d17}, [%[in2],:256] \n\t" + "vld2.32 
{d10-d13}, [%[in1]] \n\t" + "vld2.32 {d16-d19}, [%[in3]] \n\t" + "vmul.f32 q7, %q[c6d64] \n\t" + "vadd.f32 q0, q4 @v04 \n\t" + "vadd.f32 q5, q8 @v13 \n\t" + "vmla.f32 q7, q0, %q[c1d64] \n\t" + "vmla.f32 q7, q5, %q[c4d64] \n\t" + "vst1.32 {d14-d15}, [%[out]] \n\t" + : + : [out] "r" (dst + x), + [in0] "r" (lane + 2*x-2), + [in1] "r" (lane + 2*x-1), + [in2] "r" (lane + 2*x+0), + [in3] "r" (lane + 2*x+1), + [in4] "r" (lane + 2*x+2), + [c4d64] "w" (vc4d64f32), [c6d64] "w" (vc6d64f32), [c1d64] "w" (vc1d64f32) + : "d0","d1","d2","d3","d4",/*"d5","d6","d7",*/"d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19" //ugly compiler "bug" - can't touch d5-d7 + ); +#else + float32x4x2_t vLane0 = vld2q_f32(lane + 2*x-2); + float32x4x2_t vLane1 = vld2q_f32(lane + 2*x-1); + float32x4x2_t vLane2 = vld2q_f32(lane + 2*x+0); + float32x4x2_t vLane3 = vld2q_f32(lane + 2*x+1); + float32x4x2_t vLane4 = vld2q_f32(lane + 2*x+2); + + float32x4_t vSum_0_4 = vaddq_f32(vLane0.val[0], vLane4.val[0]); + float32x4_t vSum_1_3 = vaddq_f32(vLane1.val[0], vLane3.val[0]); + float32x4_t vRes = vmulq_f32(vLane2.val[0], vc6d64f32); + vRes = vmlaq_f32(vRes, vSum_0_4, vc1d64f32); + vRes = vmlaq_f32(vRes, vSum_1_3, vc4d64f32); + + vst1q_f32(dst + x, vRes); +#endif + } + break; + case 3: + { + float32x4_t v0 = vld1q_f32((const float32_t*)lane - 2*3); + float32x4_t v1 = vld1q_f32((const float32_t*)lane - 1*3); + float32x4_t v2 = vld1q_f32((const float32_t*)lane + 0*3); + + for (; x < roiw4; x += 3) + { + internal::prefetch(lane + 2 * x); + + float32x4_t v3 = vld1q_f32((const float32_t*)lane + 2 * x + 1*3); + float32x4_t v4 = vld1q_f32((const float32_t*)lane + 2 * x + 2*3); + + float32x4_t v04 = vaddq_f32(v0, v4); + float32x4_t v13 = vaddq_f32(v1, v3); + + float32x4_t v = vmulq_f32(v2, vc6d64f32); + v = vmlaq_f32(v, v04, vc1d64f32); + v = vmlaq_f32(v, v13, vc4d64f32); + + v0 = v2; + v1 = v3; + v2 = v4; + + vst1q_f32(dst + x, v); + } + } + break; + case 4: + { + float32x4_t v0 = vld1q_f32((const float32_t*)lane - 2*4); + float32x4_t v1 = vld1q_f32((const float32_t*)lane - 1*4); + float32x4_t v2 = vld1q_f32((const float32_t*)lane + 0*4); + + for (; x < roiw4; x += 4) + { + internal::prefetch(lane + 2 * x + 8); + + float32x4_t v3 = vld1q_f32((const float32_t*)lane + 2 * x + 1*4); + float32x4_t v4 = vld1q_f32((const float32_t*)lane + 2 * x + 2*4); + + float32x4_t v04 = vaddq_f32(v0, v4); + float32x4_t v13 = vaddq_f32(v1, v3); + + float32x4_t v = vmulq_f32(v2, vc6d64f32); + v = vmlaq_f32(v, v04, vc1d64f32); + v = vmlaq_f32(v, v13, vc4d64f32); + + v0 = v2; + v1 = v3; + v2 = v4; + + vst1q_f32(dst + x, v); + } + } + break; + } + + for (u32 h = 0; h < cn; ++h) + { + f32* ln = lane + h; + f32* dt = dst + h; + for (size_t k = x; k < dcolcn; k += cn) + dt[k] = 0.015625f * (ln[2*k-2*cn] + ln[2*k+2*cn]) + 0.0625f * (ln[2*k-cn] + ln[2*k+cn]) + 0.09375f * ln[2*k]; + } + } +#else + // Remove 'unused parameter' warnings. 
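In the f32 flavour the normalization is folded straight into the tap weights: the vertical pass applies [1 4 6 4 1]/4, hence the 0.25, 1.0 and 1.5 coefficients, and the horizontal pass applies [1 4 6 4 1]/64, hence 0.015625, 0.0625 and 0.09375, so every output carries the full 1/256 factor of the 5x5 kernel without a separate divide. A hedged single-channel, interior-pixel reference with the stride in elements (illustrative name):

```cpp
#include <cstddef>

static float pyrDownF32Pixel(const float *src, std::ptrdiff_t stride,
                             std::ptrdiff_t x, std::ptrdiff_t y)
{
    float lane[5];   // vertical pass over the five columns this pixel needs
    for (int dx = -2; dx <= 2; ++dx)
    {
        const float *c = src + (2 * x + dx);
        lane[dx + 2] = 0.25f * (c[(2*y - 2) * stride] + c[(2*y + 2) * stride])
                     +         (c[(2*y - 1) * stride] + c[(2*y + 1) * stride])
                     + 1.5f  *  c[(2*y    ) * stride];
    }
    return 0.015625f * (lane[0] + lane[4])     // 1/64
         + 0.0625f   * (lane[1] + lane[3])     // 4/64
         + 0.09375f  *  lane[2];               // 6/64
}
```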
+ (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void gaussianPyramidUp(const Size2D &srcSize, + const u8 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + u8 *dstBase, ptrdiff_t dstStride, u8 cn) +{ + internal::assertSupportedConfiguration(isGaussianPyramidUpU8Supported(srcSize, dstSize, cn)); +#ifdef CAROTENE_NEON + size_t dcolshn = (dstSize.width/2) * cn; + size_t dcolshw = ((dstSize.width+1)/2) * cn; + size_t scolsn = srcSize.width*cn; + + size_t idx_l = (borderInterpolate101(-2, 2 * srcSize.width)/2) * cn; + size_t idx_r1 = (borderInterpolate101(2 * srcSize.width + 0, 2 * srcSize.width)/2) * cn; + size_t idx_r2 = (borderInterpolate101(2 * srcSize.width + 2, 2 * srcSize.width + 2)/2) * cn; + + //2-lines buffer + std::vector _buf(2*(cn*(srcSize.width + 3) + 32/sizeof(u16))); + u16* lane0 = internal::alignPtr(&_buf[cn], 32); + u16* lane1 = internal::alignPtr(lane0 + (3 + srcSize.width)*cn, 32); + + uint8x8_t vc6u8 = vmov_n_u8(6); + uint16x8_t vc6u16 = vmovq_n_u16(6); + + for (size_t i = 0; i < (dstSize.height + 1)/2; ++i) + { + u8* dst = internal::getRowPtr(dstBase, dstStride, 2*i); + //vertical convolution + const u8* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 - 2, srcSize.height * 2)/2); + const u8* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 0, srcSize.height * 2)/2); + const u8* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 2, srcSize.height * 2)/2); + + size_t x = 0; + for (; x <= scolsn - 8; x += 8) + { + internal::prefetch(internal::getRowPtr(ln1 + x, srcStride, (ptrdiff_t)x % 3 - 1)); + uint8x8_t v0 = vld1_u8(ln0+x); + uint8x8_t v2 = vld1_u8(ln2+x); + uint8x8_t v1 = vld1_u8(ln1+x); + + uint16x8_t vl0 = vaddl_u8(v0, v2); + uint16x8_t vl1 = vaddl_u8(v1, v2); + + vl0 = vmlal_u8(vl0, v1, vc6u8); + vl1 = vshlq_n_u16(vl1, 2); + + vst1q_u16(lane0 + x, vl0); + vst1q_u16(lane1 + x, vl1); + } + for (; x < scolsn; ++x) + { + lane0[x] = ln0[x] + ln2[x] + 6u * ln1[x]; + lane1[x] = 4u * (ln1[x] + ln2[x]); + } + + //left&right borders + for (u32 k = 0; k < cn; ++k) + { + lane0[(s32)(-cn+k)] = lane0[idx_l + k]; + lane1[(s32)(-cn+k)] = lane1[idx_l + k]; + + lane0[scolsn+k] = lane0[idx_r1 + k]; + lane0[scolsn+cn+k] = lane0[idx_r2 + k]; + lane1[scolsn+k] = lane1[idx_r1 + k]; + lane1[scolsn+cn+k] = lane1[idx_r2 + k]; + } + + //horizontal convolution + const u16* lane = lane0; +pyrUp8uHorizontalConvolution: + x = 0; + size_t lim; + switch(cn) + { + case 1: + lim = dcolshn > 7 ? 
dcolshn - 7 : 0; + for (; x < lim; x += 8) + { + internal::prefetch(lane + x); +#if defined(__GNUC__) && defined(__arm__) + __asm__ ( + "vld1.16 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t" + "vld1.16 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t" + "vld1.16 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t" + "vadd.i16 q0, q1 /*q0 = v0 + v2*/ \n\t" + "vadd.i16 q3, q1, q2 /*q3 = v1 + v2*/ \n\t" + "vmla.i16 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t" + "vrshrn.u16 d9, q3, #4 \n\t" + "vrshrn.u16 d8, q0, #6 \n\t" + "vst2.8 {d8-d9}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + x*2), + [in0] "r" (lane + x - 1), + [in1] "r" (lane + x + 0), + [in2] "r" (lane + x + 1), + [c6] "w" (vc6u16) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9" + ); +#else + uint16x8_t vLane0 = vld1q_u16(lane + x - 1); + uint16x8_t vLane1 = vld1q_u16(lane + x + 0); + uint16x8_t vLane2 = vld1q_u16(lane + x + 1); + + vLane0 = vaddq_u16(vLane0, vLane2); + vLane2 = vaddq_u16(vLane2, vLane1); + vLane0 = vmlaq_u16(vLane0, vLane1, vc6u16); + uint8x8x2_t vRes; + vRes.val[0] = vrshrn_n_u16(vLane0, 6); + vRes.val[1] = vrshrn_n_u16(vLane2, 4); + + vst2_u8(dst + x*2, vRes); +#endif + } + break; + case 3: + { + lim = dcolshn > 23 ? dcolshn - 23 : 0; + for (; x < lim; x += 24) + { + internal::prefetch(lane + x); +#if defined(__GNUC__) && defined(__arm__) + __asm__ ( + "vmov.u16 q9, #6 \n\t" + "vld3.16 {d0, d2, d4}, [%[in0]] /*v0*/ \n\t" + "vld3.16 {d1, d3, d5}, [%[in02]] \n\t" + "vld3.16 {d6, d8, d10}, [%[in2]] /*v2*/ \n\t" + "vld3.16 {d7, d9, d11}, [%[in22]] \n\t" + "vld3.16 {d12, d14, d16}, [%[in1]] /*v1*/ \n\t" + "vld3.16 {d13, d15, d17}, [%[in12]] \n\t" + "vadd.i16 q0, q3 /*v0 + v2*/ \n\t" + "vadd.i16 q1, q4 /*v0 + v2*/ \n\t" + "vadd.i16 q2, q5 /*v0 + v2*/ \n\t" + "vadd.i16 q3, q6 /*v1 + v2*/ \n\t" + "vadd.i16 q4, q7 /*v1 + v2*/ \n\t" + "vadd.i16 q5, q8 /*v1 + v2*/ \n\t" + "vmla.i16 q0, q6, q9 /*v0 + v2 + v1*6 */ \n\t" + "vmla.i16 q1, q7, q9 /*v0 + v2 + v1*6 */ \n\t" + "vmla.i16 q2, q8, q9 /*v0 + v2 + v1*6 */ \n\t" + "vrshrn.u16 d19, q3, #4 \n\t" + "vrshrn.u16 d21, q4, #4 \n\t" + "vrshrn.u16 d23, q5, #4 \n\t" + "vrshrn.u16 d18, q0, #6 \n\t" + "vrshrn.u16 d20, q1, #6 \n\t" + "vrshrn.u16 d22, q2, #6 \n\t" + "vzip.8 d18, d19 \n\t" + "vzip.8 d20, d21 \n\t" + "vzip.8 d22, d23 \n\t" + "vst3.8 {d18, d20, d22}, [%[out1]] \n\t" + "vst3.8 {d19, d21, d23}, [%[out2]] \n\t" + : /*no output*/ + : [out1] "r" (dst + 2 * x), + [out2] "r" (dst + 2 * x + 24), + [in0] "r" (lane + x - 3), + [in02] "r" (lane + x + 9), + [in1] "r" (lane + x), + [in12] "r" (lane + x + 12), + [in2] "r" (lane + x + 3), + [in22] "r" (lane + x + 15) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23" + ); +#else + uint16x8_t vc6 = vmovq_n_u16(6); + uint16x8x3_t vLane0 = vld3q_u16(lane + x - 3); + uint16x8x3_t vLane1 = vld3q_u16(lane + x + 0); + uint16x8x3_t vLane2 = vld3q_u16(lane + x + 3); + + uint16x8_t vSum_0_3 = vaddq_u16(vLane0.val[0], vLane2.val[0]); + uint16x8_t vSum_1_4 = vaddq_u16(vLane0.val[1], vLane2.val[1]); + uint16x8_t vSum_2_5 = vaddq_u16(vLane0.val[2], vLane2.val[2]); + uint16x8_t vSum_3_6 = vaddq_u16(vLane2.val[0], vLane1.val[0]); + uint16x8_t vSum_4_7 = vaddq_u16(vLane2.val[1], vLane1.val[1]); + uint16x8_t vSum_5_8 = vaddq_u16(vLane2.val[2], vLane1.val[2]); + + vSum_0_3 = vmlaq_u16(vSum_0_3, vLane1.val[0], vc6); + vSum_1_4 = vmlaq_u16(vSum_1_4, vLane1.val[1], vc6); + vSum_2_5 = vmlaq_u16(vSum_2_5, vLane1.val[2], vc6); + + uint8x8x2_t vSumShr3; + vSumShr3.val[0] = vrshrn_n_u16(vSum_3_6, 4); + 
vSumShr3.val[1] = vrshrn_n_u16(vSum_0_3, 6);; + uint8x8x2_t vSumShr4; + vSumShr4.val[0] = vrshrn_n_u16(vSum_4_7, 4); + vSumShr4.val[1] = vrshrn_n_u16(vSum_1_4, 6); + uint8x8x2_t vSumShr5; + vSumShr5.val[0] = vrshrn_n_u16(vSum_5_8, 4); + vSumShr5.val[1] = vrshrn_n_u16(vSum_2_5, 6); + + vSumShr3 = vzip_u8(vSumShr3.val[1], vSumShr3.val[0]); + vSumShr4 = vzip_u8(vSumShr4.val[1], vSumShr4.val[0]); + vSumShr5 = vzip_u8(vSumShr5.val[1], vSumShr5.val[0]); + + uint8x8x3_t vRes1; + vRes1.val[0] = vSumShr3.val[0]; + vRes1.val[1] = vSumShr4.val[0]; + vRes1.val[2] = vSumShr5.val[0]; + vst3_u8(dst + 2 * x, vRes1); + + uint8x8x3_t vRes2; + vRes2.val[0] = vSumShr3.val[1]; + vRes2.val[1] = vSumShr4.val[1]; + vRes2.val[2] = vSumShr5.val[1]; + vst3_u8(dst + 2 * x + 24, vRes2); +#endif + } + } + break; + case 4: + lim = dcolshn > 7 ? dcolshn - 7 : 0; + for (; x < lim; x += 8) + { + internal::prefetch(lane + x); +#if defined(__GNUC__) && defined(__arm__) + __asm__ ( + "vld1.16 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t" + "vld1.16 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t" + "vld1.16 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t" + "vadd.i16 q0, q1 /*q0 = v0 + v2*/ \n\t" + "vadd.i16 q3, q1, q2 /*q3 = v1 + v2*/ \n\t" + "vmla.i16 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t" + "vrshrn.u16 d9, q3, #4 \n\t" + "vrshrn.u16 d8, q0, #6 \n\t" + "vst2.32 {d8-d9}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + x*2), + [in0] "r" (lane + x-4), + [in1] "r" (lane + x), + [in2] "r" (lane + x+4), + [c6] "w" (vc6u16) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9" + ); +#else + uint16x8_t vLane0 = vld1q_u16(lane + x-4); + uint16x8_t vLane1 = vld1q_u16(lane + x+0); + uint16x8_t vLane2 = vld1q_u16(lane + x+4); + + vLane0 = vaddq_u16(vLane0, vLane2); + vLane2 = vaddq_u16(vLane2, vLane1); + vLane0 = vmlaq_u16(vLane0, vLane1, vc6u16); + uint32x2x2_t vRes; + vRes.val[1] = vreinterpret_u32_u8(vrshrn_n_u16(vLane2, 4)); + vRes.val[0] = vreinterpret_u32_u8(vrshrn_n_u16(vLane0, 6)); + + vst2_u32((uint32_t*)(dst + x*2), vRes); +#endif + } + break; + }; + + for (u32 h = 0; h < cn; ++h) + { + const u16* ln = lane + h; + u8* dt = dst + h; + size_t k = x; + for (; k < dcolshn; k += cn) + { + dt[2*k+0] = u8((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6u * ln[k] + (1 << 5)) >> 6); + dt[2*k+cn] = u8(((ln[k] + ln[k+cn]) * 4u + (1 << 5)) >> 6); + } + for (; k < dcolshw; k += cn) + dt[2*k] = u8((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6u * ln[k] + (1 << 5)) >> 6); + } + dst = internal::getRowPtr(dstBase, dstStride, 2*i+1); + + //second row + if (lane == lane0 && 2*i+1 < dstSize.height) + { + lane = lane1; + goto pyrUp8uHorizontalConvolution; + } + } +#else + // Remove 'unused parameter' warnings. 
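`gaussianPyramidUp` runs the interpolation the other way: each source pixel fans out into a 2x2 block of outputs, with even-parity taps [1 6 1]/8 and odd-parity taps [4 4]/8 in each direction. Those are exactly the roles of the `lane0`/`lane1` buffer pair and of the even/odd columns the horizontal pass interleaves on store. A hedged single-channel sketch of one block, where `r0`, `r1`, `r2` are three consecutive source rows with borders already reflected (illustrative name):

```cpp
#include <cstddef>
#include <cstdint>

static void pyrUpBlock(const uint8_t *r0, const uint8_t *r1, const uint8_t *r2,
                       std::ptrdiff_t x, uint8_t out[2][2])
{
    // Vertical pass: lane0 feeds the even output row, lane1 the odd one.
    auto lane0 = [&](std::ptrdiff_t i) { return (unsigned)r0[i] + r2[i] + 6u * r1[i]; };
    auto lane1 = [&](std::ptrdiff_t i) { return 4u * ((unsigned)r1[i] + r2[i]); };

    // Horizontal pass: even column = [1 6 1], odd column = [4 4]; both
    // lanes already carry a factor of 8, so the total weight is 64 and
    // the +32 makes the >> 6 round to nearest.
    out[0][0] = (uint8_t)((lane0(x-1) + lane0(x+1) + 6u * lane0(x) + 32u) >> 6);
    out[0][1] = (uint8_t)((4u * (lane0(x) + lane0(x+1))            + 32u) >> 6);
    out[1][0] = (uint8_t)((lane1(x-1) + lane1(x+1) + 6u * lane1(x) + 32u) >> 6);
    out[1][1] = (uint8_t)((4u * (lane1(x) + lane1(x+1))            + 32u) >> 6);
}
```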
+ (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +void gaussianPyramidUp(const Size2D &srcSize, + const s16 *srcBase, ptrdiff_t srcStride, + const Size2D &dstSize, + s16 *dstBase, ptrdiff_t dstStride, u8 cn) +{ + internal::assertSupportedConfiguration(isGaussianPyramidUpS16Supported(srcSize, dstSize, cn)); +#ifdef CAROTENE_NEON + size_t dcolshn = (dstSize.width/2) * cn; + size_t dcolshw = ((dstSize.width+1)/2) * cn; + size_t scolsn = srcSize.width*cn; + + size_t idx_l = (borderInterpolate101(-2, 2 * srcSize.width)/2) * cn; + size_t idx_r1 = (borderInterpolate101(2 * srcSize.width + 0, 2 * srcSize.width)/2) * cn; + size_t idx_r2 = (borderInterpolate101(2 * srcSize.width + 2, 2 * srcSize.width + 2)/2) * cn; + + //2-lines buffer + std::vector _buf(2*(cn*(srcSize.width + 3) + 32/sizeof(s32))); + s32* lane0 = internal::alignPtr(&_buf[cn], 32); + s32* lane1 = internal::alignPtr(lane0 + (3 + srcSize.width)*cn, 32); + + int16x4_t vc6s16 = vmov_n_s16(6); + int32x4_t vc6s32 = vmovq_n_s32(6); + + for (size_t i = 0; i < (dstSize.height + 1)/2; ++i) + { + s16* dst = internal::getRowPtr(dstBase, dstStride, 2*i); + //vertical convolution + const s16* ln0 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 - 2, srcSize.height * 2)/2); + const s16* ln1 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 0, srcSize.height * 2)/2); + const s16* ln2 = internal::getRowPtr(srcBase, srcStride, borderInterpolate101(i * 2 + 2, srcSize.height * 2)/2); + + size_t x = 0; + for (; x <= scolsn - 4; x += 4) + { + internal::prefetch(internal::getRowPtr(ln1 + x, srcStride, (ptrdiff_t)x % 3 - 1)); + int16x4_t v0 = vld1_s16(ln0 + x); + int16x4_t v2 = vld1_s16(ln2 + x); + int16x4_t v1 = vld1_s16(ln1 + x); + + int32x4_t vl0 = vaddl_s16(v0, v2); + int32x4_t vl1 = vaddl_s16(v1, v2); + + vl0 = vmlal_s16(vl0, v1, vc6s16); + vl1 = vshlq_n_s32(vl1, 2); + + vst1q_s32(lane0 + x, vl0); + vst1q_s32(lane1 + x, vl1); + } + for (; x < scolsn; ++x) + { + lane0[x] = ln0[x] + ln2[x] + 6 * ln1[x]; + lane1[x] = 4 * (ln1[x] + ln2[x]); + } + + //left&right borders + for (u32 k = 0; k < cn; ++k) + { + lane0[(s32)(-cn+k)] = lane0[idx_l + k]; + lane1[(s32)(-cn+k)] = lane1[idx_l + k]; + + lane0[scolsn+k] = lane0[idx_r1 + k]; + lane0[scolsn+cn+k] = lane0[idx_r2 + k]; + lane1[scolsn+k] = lane1[idx_r1 + k]; + lane1[scolsn+cn+k] = lane1[idx_r2 + k]; + } + + //horizontal convolution + const s32* lane = lane0; +pyrUp16sHorizontalConvolution: + x = 0; + size_t lim; + switch(cn) + { + case 1: + lim = dcolshn > 3 ? 
dcolshn - 3 : 0; + for (; x < lim; x += 4) + { + internal::prefetch(lane + x); +#if defined(__GNUC__) && defined(__arm__) + __asm__ ( + "vld1.32 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t" + "vld1.32 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t" + "vld1.32 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t" + "vadd.i32 q0, q0, q1 /*q0 = v0 + v2*/ \n\t" + "vadd.i32 q3, q1, q2 /*q3 = v1 + v2*/ \n\t" + "vmla.i32 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t" + "vrshrn.s32 d9, q3, #4 \n\t" + "vrshrn.s32 d8, q0, #6 \n\t" + "vst2.16 {d8-d9}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + x * 2), + [in0] "r" (lane + x - 1), + [in1] "r" (lane + x), + [in2] "r" (lane + x + 1), + [c6] "w" (vc6s32) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9" + ); +#else + int32x4_t vLane0 = vld1q_s32(lane + x - 1); + int32x4_t vLane1 = vld1q_s32(lane + x); + int32x4_t vLane2 = vld1q_s32(lane + x + 1); + + vLane0 = vaddq_s32(vLane0, vLane2); + vLane2 = vaddq_s32(vLane2, vLane1); + vLane0 = vmlaq_s32(vLane0, vLane1, vc6s32); + int16x4x2_t vRes; + vRes.val[0] = vrshrn_n_s32(vLane0, 6); + vRes.val[1] = vrshrn_n_s32(vLane2, 4); + + vst2_s16(dst + x * 2, vRes); +#endif + } + break; + case 3: + { + lim = dcolshn > 11 ? dcolshn - 11 : 0; + for (; x < lim; x += 12) + { + internal::prefetch(lane + x + 3); +#if defined(__GNUC__) && defined(__arm__) + __asm__ ( + "vmov.s32 q9, #6 \n\t" + "vld3.32 {d0, d2, d4}, [%[in0]] /*v0*/ \n\t" + "vld3.32 {d1, d3, d5}, [%[in2]] \n\t" + "vld3.32 {d6, d8, d10}, [%[in2]] /*v2*/ \n\t" + "vld3.32 {d7, d9, d11}, [%[in22]] \n\t" + "vld3.32 {d12, d14, d16}, [%[in1]] /*v1*/ \n\t" + "vld3.32 {d13, d15, d17}, [%[in12]] \n\t" + "vadd.i32 q0, q3 /*v0 + v2*/ \n\t" + "vadd.i32 q1, q4 /*v0 + v2*/ \n\t" + "vadd.i32 q2, q5 /*v0 + v2*/ \n\t" + "vadd.i32 q3, q6 /*v1 + v2*/ \n\t" + "vadd.i32 q4, q7 /*v1 + v2*/ \n\t" + "vadd.i32 q5, q8 /*v1 + v2*/ \n\t" + "vmla.i32 q0, q6, q9 /*v0 + v2 + v1*6 */ \n\t" + "vmla.i32 q1, q7, q9 /*v0 + v2 + v1*6 */ \n\t" + "vmla.i32 q2, q8, q9 /*v0 + v2 + v1*6 */ \n\t" + "vrshrn.s32 d19, q3, #4 \n\t" + "vrshrn.s32 d21, q4, #4 \n\t" + "vrshrn.s32 d23, q5, #4 \n\t" + "vrshrn.s32 d18, q0, #6 \n\t" + "vrshrn.s32 d20, q1, #6 \n\t" + "vrshrn.s32 d22, q2, #6 \n\t" + "vzip.16 d18, d19 \n\t" + "vzip.16 d20, d21 \n\t" + "vzip.16 d22, d23 \n\t" + "vst3.16 {d18, d20, d22}, [%[out1]] \n\t" + "vst3.16 {d19, d21, d23}, [%[out2]] \n\t" + : /*no output*/ + : [out1] "r" (dst + 2*x), + [out2] "r" (dst + 2*x + 12), + [in0] "r" (lane + x - 3), + [in1] "r" (lane + x), + [in12] "r" (lane + x + 6), + [in2] "r" (lane + x + 3), + [in22] "r" (lane + x + 9) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15","d16","d17","d18","d19","d20","d21","d22","d23" + ); +#else + int32x4_t vc6 = vmovq_n_s32(6); + int32x4x3_t vLane0 = vld3q_s32(lane + x - 3); + int32x4x3_t vLane1 = vld3q_s32(lane + x); + int32x4x3_t vLane2 = vld3q_s32(lane + x + 3); + + int32x4_t vSum_0_3 = vaddq_s32(vLane0.val[0], vLane2.val[0]); + int32x4_t vSum_1_4 = vaddq_s32(vLane0.val[1], vLane2.val[1]); + int32x4_t vSum_2_5 = vaddq_s32(vLane0.val[2], vLane2.val[2]); + int32x4_t vSum_3_6 = vaddq_s32(vLane2.val[0], vLane1.val[0]); + int32x4_t vSum_4_7 = vaddq_s32(vLane2.val[1], vLane1.val[1]); + int32x4_t vSum_5_8 = vaddq_s32(vLane2.val[2], vLane1.val[2]); + + vSum_0_3 = vmlaq_s32(vSum_0_3, vLane1.val[0], vc6); + vSum_1_4 = vmlaq_s32(vSum_1_4, vLane1.val[1], vc6); + vSum_2_5 = vmlaq_s32(vSum_2_5, vLane1.val[2], vc6); + + int16x4x2_t vSumShr1; + vSumShr1.val[1] = vrshrn_n_s32(vSum_3_6, 4); + vSumShr1.val[0] = vrshrn_n_s32(vSum_0_3, 6); + 
+ int16x4x2_t vSumShr2; + vSumShr2.val[1] = vrshrn_n_s32(vSum_4_7, 4); + vSumShr2.val[0] = vrshrn_n_s32(vSum_1_4, 6); + + int16x4x2_t vSumShr3; + vSumShr3.val[1] = vrshrn_n_s32(vSum_5_8, 4); + vSumShr3.val[0] = vrshrn_n_s32(vSum_2_5, 6); + + vSumShr1 = vzip_s16(vSumShr1.val[0], vSumShr1.val[1]); + vSumShr2 = vzip_s16(vSumShr2.val[0], vSumShr2.val[1]); + vSumShr3 = vzip_s16(vSumShr3.val[0], vSumShr3.val[1]); + + int16x4x3_t vRes1; + vRes1.val[0] = vSumShr1.val[0]; + vRes1.val[1] = vSumShr2.val[0]; + vRes1.val[2] = vSumShr3.val[0]; + vst3_s16((int16_t*)(dst + 2 * x), vRes1); + + int16x4x3_t vRes2; + vRes2.val[0] = vSumShr1.val[1]; + vRes2.val[1] = vSumShr2.val[1]; + vRes2.val[2] = vSumShr3.val[1]; + vst3_s16(dst + 2 * x + 12, vRes2); +#endif + } + } + break; + case 4: + lim = dcolshn > 3 ? dcolshn - 3 : 0; + for (; x < lim; x += 4) + { + internal::prefetch(lane + x); +#if defined(__GNUC__) && defined(__arm__) + __asm__ ( + "vld1.32 {d0-d1}, [%[in0]] /*q0 = v0*/ \n\t" + "vld1.32 {d2-d3}, [%[in2]] /*q1 = v2*/ \n\t" + "vld1.32 {d4-d5}, [%[in1],:128] /*q2 = v1*/ \n\t" + "vadd.i32 q0, q1 /*q0 = v0 + v2*/ \n\t" + "vadd.i32 q3, q1, q2 /*q3 = v1 + v2*/ \n\t" + "vmla.i32 q0, q2, %q[c6] /*q0 += v1*6*/ \n\t" + "vrshrn.s32 d9, q3, #4 \n\t" + "vrshrn.s32 d8, q0, #6 \n\t" + "vst1.16 {d8-d9}, [%[out]] \n\t" + : /*no output*/ + : [out] "r" (dst + x * 2), + [in0] "r" (lane + x - 4), + [in1] "r" (lane + x), + [in2] "r" (lane + x + 4), + [c6] "w" (vc6s32) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9" + ); +#else + int32x4_t vLane0 = vld1q_s32(lane + x - 4); + int32x4_t vLane1 = vld1q_s32(lane + x); + int32x4_t vLane2 = vld1q_s32(lane + x + 4); + + vLane0 = vaddq_s32(vLane0, vLane2); + vLane2 = vaddq_s32(vLane2, vLane1); + vLane0 = vmlaq_s32(vLane0, vLane1, vc6s32); + int16x4x2_t vRes; + vRes.val[0] = vrshrn_n_s32(vLane0, 6); + vRes.val[1] = vrshrn_n_s32(vLane2, 4); + + vst1q_s16(dst + x * 2, vcombine_s16(vRes.val[0], vRes.val[1])); +#endif + } + break; + }; + + for (u32 h = 0; h < cn; ++h) + { + const s32* ln = lane + h; + s16* dt = dst + h; + size_t k = x; + for (; k < dcolshn; k += cn) + { + dt[2*k+0] = s16((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6 * ln[k] + (1 << 5)) >> 6); + dt[2*k+cn] = s16(((ln[k] + ln[k+cn]) * 4 + (1 << 5)) >> 6); + } + for (; k < dcolshw; k += cn) + dt[2*k] = s16((ln[(ptrdiff_t)(k-cn)] + ln[k+cn] + 6 * ln[k] + (1 << 5)) >> 6); + } + dst = internal::getRowPtr(dstBase, dstStride, 2*i+1); + + //second row + if (lane == lane0 && 2*i+1 < dstSize.height) + { + lane = lane1; + goto pyrUp16sHorizontalConvolution; + } + } +#else + // Remove 'unused parameter' warnings. + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/reduce.cpp b/3rdparty/carotene/src/reduce.cpp new file mode 100644 index 0000000000..8c11c39e80 --- /dev/null +++ b/3rdparty/carotene/src/reduce.cpp @@ -0,0 +1,460 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. 
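The column reductions implemented below share one blocking trick that is easy to miss: the u8 `reduceColSum` accumulates at most 256 rows into u16 lanes at a time (256 * 255 = 65280 just fits) before widening and folding into four s32 accumulators with saturating `vqaddq_s32`. A plain scalar reference with the same clamp-at-INT32_MAX intent (an illustration, not the exported entry point):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>

static void reduceColSumRef(std::size_t width, std::size_t height,
                            const uint8_t *src, std::ptrdiff_t srcStride,
                            int32_t *dst)
{
    std::fill(dst, dst + width, 0);
    for (std::size_t y = 0; y < height; ++y)
        for (std::size_t x = 0; x < width; ++x)
        {
            // Saturate instead of wrapping, mirroring vqaddq_s32.
            int64_t s = (int64_t)dst[x] + src[y * srcStride + x];
            dst[x] = (int32_t)std::min<int64_t>(s, INT32_MAX);
        }
}
```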
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +#include <cstring> + +namespace CAROTENE_NS { + +void reduceColSum(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + s32 * dstBase) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + memset(dstBase, 0, size.width*sizeof(s32)); + size_t i = 0; + for (; i + 16 <= size.width; i += 16) + { + const u8* src_address = srcBase + i; + + int32x4_t sll = vmovq_n_s32(0); + int32x4_t slh = vmovq_n_s32(0); + int32x4_t shl = vmovq_n_s32(0); + int32x4_t shh = vmovq_n_s32(0); + + for (size_t h = 0; h < size.height; h += 256) + { + size_t lim = std::min(h + 256, size.height); + + uint16x8_t sl = vmovq_n_u16(0); + uint16x8_t sh = vmovq_n_u16(0); + + for (size_t k = h; k < lim; ++k, src_address += srcStride) + { + internal::prefetch(src_address + srcStride, 0); + + uint8x16_t v = vld1q_u8(src_address); + + sl = vaddw_u8(sl, vget_low_u8(v)); + sh = vaddw_u8(sh, vget_high_u8(v)); + } + + int32x4_t vsll = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sl))); + int32x4_t vslh = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sl))); + int32x4_t vshl = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sh))); + int32x4_t vshh = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sh))); + + sll = vqaddq_s32(sll, vsll); + slh = vqaddq_s32(slh, vslh); + shl = vqaddq_s32(shl, vshl); + shh = vqaddq_s32(shh, vshh); + } + + vst1q_s32(dstBase + i + 0, sll); + vst1q_s32(dstBase + i + 4, slh); + vst1q_s32(dstBase + i + 8, shl); + vst1q_s32(dstBase + i + 12, shh); + } + + for(size_t h = 0; h < size.height; ++h) + { + for(size_t j = i ; j < size.width; j++ ) + { + if (((u32)(dstBase[j] += srcBase[j + srcStride * h])) > 0x7fFFffFFu) + dstBase[j] = 0x7fFFffFF; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; +#endif +} + +void reduceColMax(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + memcpy(dstBase, srcBase, size.width); + size_t i = 0; + for (; i + 16*4 <=
size.width; i += 16*4) + { + const u8* src_address = srcBase + i; + + uint8x16_t s1 = vld1q_u8(src_address + 0); + uint8x16_t s2 = vld1q_u8(src_address + 16); + uint8x16_t s3 = vld1q_u8(src_address + 32); + uint8x16_t s4 = vld1q_u8(src_address + 48); + + src_address += srcStride; + + for(size_t h = 1; h < size.height; ++h, src_address += srcStride) + { + internal::prefetch(src_address + srcStride, 0); + internal::prefetch(src_address + srcStride, 32); + + uint8x16_t v1 = vld1q_u8(src_address + 0); + uint8x16_t v2 = vld1q_u8(src_address + 16); + uint8x16_t v3 = vld1q_u8(src_address + 32); + uint8x16_t v4 = vld1q_u8(src_address + 48); + + s1 = vmaxq_u8(s1, v1); + s2 = vmaxq_u8(s2, v2); + s3 = vmaxq_u8(s3, v3); + s4 = vmaxq_u8(s4, v4); + } + + vst1q_u8(dstBase + i + 0, s1); + vst1q_u8(dstBase + i + 16, s2); + vst1q_u8(dstBase + i + 32, s3); + vst1q_u8(dstBase + i + 48, s4); + } + + for (; i + 16 <= size.width; i += 16) + { + const u8* src_address = srcBase + i; + uint8x16_t s1 = vld1q_u8(src_address); + src_address += srcStride; + for(size_t h = 1; h < size.height; ++h, src_address += srcStride) + { + internal::prefetch(src_address + srcStride, 0); + + uint8x16_t v1 = vld1q_u8(src_address); + s1 = vmaxq_u8(s1, v1); + } + vst1q_u8(dstBase + i, s1); + } + + if (i < size.width) + for(size_t h = 1; h < size.height; ++h) + for(size_t j = i ; j < size.width; j++ ) + dstBase[j] = std::max(dstBase[j], srcBase[j + srcStride * h]); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; +#endif +} + +void reduceColMin(const Size2D &size, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + memcpy(dstBase, srcBase, size.width); + size_t i = 0; + for (; i + 16*4 <= size.width; i += 16*4) + { + const u8* src_address = srcBase + i; + + uint8x16_t s1 = vld1q_u8(src_address + 0); + uint8x16_t s2 = vld1q_u8(src_address + 16); + uint8x16_t s3 = vld1q_u8(src_address + 32); + uint8x16_t s4 = vld1q_u8(src_address + 48); + + src_address += srcStride; + + for(size_t h = 1; h < size.height; ++h, src_address += srcStride) + { + internal::prefetch(src_address + srcStride, 0); + internal::prefetch(src_address + srcStride, 32); + + uint8x16_t v1 = vld1q_u8(src_address + 0); + uint8x16_t v2 = vld1q_u8(src_address + 16); + uint8x16_t v3 = vld1q_u8(src_address + 32); + uint8x16_t v4 = vld1q_u8(src_address + 48); + + s1 = vminq_u8(s1, v1); + s2 = vminq_u8(s2, v2); + s3 = vminq_u8(s3, v3); + s4 = vminq_u8(s4, v4); + } + + vst1q_u8(dstBase + i + 0, s1); + vst1q_u8(dstBase + i + 16, s2); + vst1q_u8(dstBase + i + 32, s3); + vst1q_u8(dstBase + i + 48, s4); + } + + for (; i + 16 <= size.width; i += 16) + { + const u8* src_address = srcBase + i; + uint8x16_t s1 = vld1q_u8(src_address); + src_address += srcStride; + for(size_t h = 1; h < size.height; ++h, src_address += srcStride) + { + internal::prefetch(src_address + srcStride, 0); + + uint8x16_t v1 = vld1q_u8(src_address); + s1 = vminq_u8(s1, v1); + } + vst1q_u8(dstBase + i, s1); + } + + if (i < size.width) + for(size_t h = 1; h < size.height; ++h) + for(size_t j = i ; j < size.width; j++ ) + dstBase[j] = std::min(dstBase[j], srcBase[j + srcStride * h]); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; +#endif +} + +void reduceColSum(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + memcpy(dstBase, srcBase, size.width*sizeof(f32)); + size_t srcstep = 
srcStride/sizeof(f32); + size_t i = 0; + for (; i + 16 <= size.width; i += 16) + { + const f32* src_address = srcBase + i; + + float32x4_t s1 = vld1q_f32(src_address + 0); + float32x4_t s2 = vld1q_f32(src_address + 4); + float32x4_t s3 = vld1q_f32(src_address + 8); + float32x4_t s4 = vld1q_f32(src_address + 12); + + src_address += srcstep; + + for(size_t h = 1; h < size.height; ++h, src_address += srcstep) + { + internal::prefetch(src_address + srcstep, 0); + internal::prefetch(src_address + srcstep, 32); + + float32x4_t v1 = vld1q_f32(src_address + 0); + float32x4_t v2 = vld1q_f32(src_address + 4); + float32x4_t v3 = vld1q_f32(src_address + 8); + float32x4_t v4 = vld1q_f32(src_address + 12); + + s1 = vaddq_f32(s1, v1); + s2 = vaddq_f32(s2, v2); + s3 = vaddq_f32(s3, v3); + s4 = vaddq_f32(s4, v4); + } + + vst1q_f32(dstBase + i + 0, s1); + vst1q_f32(dstBase + i + 4, s2); + vst1q_f32(dstBase + i + 8, s3); + vst1q_f32(dstBase + i + 12, s4); + } + + for (; i + 4 <= size.width; i += 4) + { + const f32* src_address = srcBase + i; + float32x4_t s1 = vld1q_f32(src_address); + src_address += srcstep; + for(size_t h = 1; h < size.height; ++h, src_address += srcstep) + { + internal::prefetch(src_address + srcstep, 0); + + float32x4_t v1 = vld1q_f32(src_address); + s1 = vaddq_f32(s1, v1); + } + vst1q_f32(dstBase + i, s1); + } + + if (i < size.width) + for(size_t h = 1; h < size.height; ++h) + { + for(size_t j = i ; j < size.width; j++ ) + { + dstBase[j] += srcBase[j + srcstep * h]; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; +#endif +} + +void reduceColMax(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + memcpy(dstBase, srcBase, size.width*sizeof(f32)); + size_t srcstep = srcStride/sizeof(f32); + size_t i = 0; + for (; i + 16 <= size.width; i += 16) + { + const f32* src_address = srcBase + i; + + float32x4_t s1 = vld1q_f32(src_address + 0); + float32x4_t s2 = vld1q_f32(src_address + 4); + float32x4_t s3 = vld1q_f32(src_address + 8); + float32x4_t s4 = vld1q_f32(src_address + 12); + + src_address += srcstep; + + for(size_t h = 1; h < size.height; ++h, src_address += srcstep) + { + internal::prefetch(src_address + srcstep, 0); + internal::prefetch(src_address + srcstep, 32); + + float32x4_t v1 = vld1q_f32(src_address + 0); + float32x4_t v2 = vld1q_f32(src_address + 4); + float32x4_t v3 = vld1q_f32(src_address + 8); + float32x4_t v4 = vld1q_f32(src_address + 12); + + s1 = vmaxq_f32(s1, v1); + s2 = vmaxq_f32(s2, v2); + s3 = vmaxq_f32(s3, v3); + s4 = vmaxq_f32(s4, v4); + } + + vst1q_f32(dstBase + i + 0, s1); + vst1q_f32(dstBase + i + 4, s2); + vst1q_f32(dstBase + i + 8, s3); + vst1q_f32(dstBase + i + 12, s4); + } + + for (; i + 4 <= size.width; i += 4) + { + const f32* src_address = srcBase + i; + float32x4_t s1 = vld1q_f32(src_address); + src_address += srcstep; + for(size_t h = 1; h < size.height; ++h, src_address += srcstep) + { + internal::prefetch(src_address + srcstep, 0); + + float32x4_t v1 = vld1q_f32(src_address); + s1 = vmaxq_f32(s1, v1); + } + vst1q_f32(dstBase + i, s1); + } + + if (i < size.width) + for(size_t h = 1; h < size.height; ++h) + for(size_t j = i ; j < size.width; j++ ) + dstBase[j] = std::max(dstBase[j], srcBase[j + srcstep * h]); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; +#endif +} + +void reduceColMin(const Size2D &size, + const f32 * srcBase, ptrdiff_t srcStride, + f32 * dstBase) +{ + 
internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + memcpy(dstBase, srcBase, size.width*sizeof(f32)); + size_t srcstep = srcStride/sizeof(f32); + size_t i = 0; + for (; i + 16 <= size.width; i += 16) + { + const f32* src_address = srcBase + i; + + float32x4_t s1 = vld1q_f32(src_address + 0); + float32x4_t s2 = vld1q_f32(src_address + 4); + float32x4_t s3 = vld1q_f32(src_address + 8); + float32x4_t s4 = vld1q_f32(src_address + 12); + + src_address += srcstep; + + for(size_t h = 1; h < size.height; ++h, src_address += srcstep) + { + internal::prefetch(src_address + srcstep, 0); + internal::prefetch(src_address + srcstep, 32); + + float32x4_t v1 = vld1q_f32(src_address + 0); + float32x4_t v2 = vld1q_f32(src_address + 4); + float32x4_t v3 = vld1q_f32(src_address + 8); + float32x4_t v4 = vld1q_f32(src_address + 12); + + s1 = vminq_f32(s1, v1); + s2 = vminq_f32(s2, v2); + s3 = vminq_f32(s3, v3); + s4 = vminq_f32(s4, v4); + } + + vst1q_f32(dstBase + i + 0, s1); + vst1q_f32(dstBase + i + 4, s2); + vst1q_f32(dstBase + i + 8, s3); + vst1q_f32(dstBase + i + 12, s4); + } + + for (; i + 4 <= size.width; i += 4) + { + const f32* src_address = srcBase + i; + float32x4_t s1 = vld1q_f32(src_address); + src_address += srcstep; + for(size_t h = 1; h < size.height; ++h, src_address += srcstep) + { + internal::prefetch(src_address + srcstep, 0); + + float32x4_t v1 = vld1q_f32(src_address); + s1 = vminq_f32(s1, v1); + } + vst1q_f32(dstBase + i, s1); + } + + if (i < size.width) + for(size_t h = 1; h < size.height; ++h) + for(size_t j = i ; j < size.width; j++ ) + dstBase[j] = std::min(dstBase[j], srcBase[j + srcstep * h]); +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/remap.cpp b/3rdparty/carotene/src/remap.cpp new file mode 100644 index 0000000000..a4b99c3db0 --- /dev/null +++ b/3rdparty/carotene/src/remap.cpp @@ -0,0 +1,694 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "remap.hpp" + +namespace CAROTENE_NS { + +#ifdef CAROTENE_NEON + +namespace internal { + +void remapNearestNeighborReplicate(const Size2D size, + const u8 * srcBase, + const s32 * map, + u8 * dstBase, ptrdiff_t dstStride) +{ + for (size_t y = 0; y < size.height; ++y) + { + const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32), y); + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y); + + for (size_t x = 0; x < size.width; ++x) + { + dst_row[x] = srcBase[map_row[x]]; + } + } +} + +void remapNearestNeighborConst(const Size2D size, + const u8 * srcBase, + const s32 * map, + u8 * dstBase, ptrdiff_t dstStride, + u8 borderValue) +{ + for (size_t y = 0; y < size.height; ++y) + { + const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32), y); + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y); + + for (size_t x = 0; x < size.width; ++x) + { + s32 src_idx = map_row[x]; + dst_row[x] = src_idx >= 0 ? srcBase[map_row[x]] : borderValue; + } + } +} + +void remapLinearReplicate(const Size2D size, + const u8 * srcBase, + const s32 * map, + const f32 * coeffs, + u8 * dstBase, ptrdiff_t dstStride) +{ + int16x8_t v_zero16 = vdupq_n_s16(0); + + for (size_t y = 0; y < size.height; ++y) + { + const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32) * 4, y); + const f32 * coeff_row = internal::getRowPtr(coeffs, size.width * sizeof(f32) * 2, y); + + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y); + + size_t x = 0; + for ( ; x + 8 < size.width; x += 8) + { + int16x8_t v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2)]], v_zero16, 0); + v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 4]], v_src00, 1); + v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 8]], v_src00, 2); + v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 12]], v_src00, 3); + v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 16]], v_src00, 4); + v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 20]], v_src00, 5); + v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 24]], v_src00, 6); + v_src00 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 28]], v_src00, 7); + + int16x8_t v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 1]], v_zero16, 0); + v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 5]], v_src01, 1); + v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 9]], v_src01, 2); + v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 13]], v_src01, 3); + v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 17]], v_src01, 4); + v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 21]], v_src01, 5); + v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 25]], v_src01, 6); + v_src01 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 29]], v_src01, 7); + + int16x8_t v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 2]], v_zero16, 0); + v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 6]], v_src10, 1); + v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 10]], v_src10, 2); + v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 
14]], v_src10, 3); + v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 18]], v_src10, 4); + v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 22]], v_src10, 5); + v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 26]], v_src10, 6); + v_src10 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 30]], v_src10, 7); + + int16x8_t v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 3]], v_zero16, 0); + v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 7]], v_src11, 1); + v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 11]], v_src11, 2); + v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 15]], v_src11, 3); + v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 19]], v_src11, 4); + v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 23]], v_src11, 5); + v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 27]], v_src11, 6); + v_src11 = vsetq_lane_s16(srcBase[map_row[(x << 2) + 31]], v_src11, 7); + + // first part + float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00))); + float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10))); + + float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1)); + float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01), + vget_low_s16(v_src00))), v_coeff.val[0]); + float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11), + vget_low_s16(v_src10))), v_coeff.val[0]); + + float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]); + uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst)); + + // second part + v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00))); + v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10))); + + v_coeff = vld2q_f32(coeff_row + (x << 1) + 8); + v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01), + vget_high_s16(v_src00))), v_coeff.val[0]); + v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11), + vget_high_s16(v_src10))), v_coeff.val[0]); + + v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]); + uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst)); + + // store + vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1))); + } + + for ( ; x < size.width; ++x) + { + s32 src00_index = map_row[(x << 2)]; + s32 src10_index = map_row[(x << 2) + 2]; + f32 dst_val_0 = (srcBase[map_row[(x << 2) + 1]] - srcBase[src00_index]) * coeff_row[x << 1] + + srcBase[src00_index]; + f32 dst_val_1 = (srcBase[map_row[(x << 2) + 3]] - srcBase[src10_index]) * coeff_row[x << 1] + + srcBase[src10_index]; + dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0); + } + } +} + +void remapLinearConst(const Size2D size, + const u8 * srcBase, + const s32 * map, + const f32 * coeffs, + u8 * dstBase, ptrdiff_t dstStride, + u8 borderValue) +{ + int16x8_t v_zero16 = vdupq_n_s16(0); + + for (size_t y = 0; y < size.height; ++y) + { + const s32 * map_row = internal::getRowPtr(map, size.width * sizeof(s32) * 4, y); + const f32 * coeff_row = internal::getRowPtr(coeffs, size.width * sizeof(f32) * 2, y); + + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, y); + + size_t x = 0; + for ( ; x + 8 < size.width; x += 8) + { + int16x8_t v_src00 = vsetq_lane_s16(map_row[(x << 2)] >= 0 ? srcBase[map_row[(x << 2)]] : borderValue, v_zero16, 0); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 4] >= 0 ? srcBase[map_row[(x << 2) + 4]] : borderValue, v_src00, 1); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 8] >= 0 ? 
srcBase[map_row[(x << 2) + 8]] : borderValue, v_src00, 2); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 12] >= 0 ? srcBase[map_row[(x << 2) + 12]] : borderValue, v_src00, 3); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 16] >= 0 ? srcBase[map_row[(x << 2) + 16]] : borderValue, v_src00, 4); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 20] >= 0 ? srcBase[map_row[(x << 2) + 20]] : borderValue, v_src00, 5); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 24] >= 0 ? srcBase[map_row[(x << 2) + 24]] : borderValue, v_src00, 6); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 28] >= 0 ? srcBase[map_row[(x << 2) + 28]] : borderValue, v_src00, 7); + + int16x8_t v_src01 = vsetq_lane_s16(map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue, v_zero16, 0); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 5] >= 0 ? srcBase[map_row[(x << 2) + 5]] : borderValue, v_src01, 1); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 9] >= 0 ? srcBase[map_row[(x << 2) + 9]] : borderValue, v_src01, 2); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 13] >= 0 ? srcBase[map_row[(x << 2) + 13]] : borderValue, v_src01, 3); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 17] >= 0 ? srcBase[map_row[(x << 2) + 17]] : borderValue, v_src01, 4); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 21] >= 0 ? srcBase[map_row[(x << 2) + 21]] : borderValue, v_src01, 5); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 25] >= 0 ? srcBase[map_row[(x << 2) + 25]] : borderValue, v_src01, 6); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 29] >= 0 ? srcBase[map_row[(x << 2) + 29]] : borderValue, v_src01, 7); + + int16x8_t v_src10 = vsetq_lane_s16(map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue, v_zero16, 0); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 6] >= 0 ? srcBase[map_row[(x << 2) + 6]] : borderValue, v_src10, 1); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 10] >= 0 ? srcBase[map_row[(x << 2) + 10]] : borderValue, v_src10, 2); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 14] >= 0 ? srcBase[map_row[(x << 2) + 14]] : borderValue, v_src10, 3); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 18] >= 0 ? srcBase[map_row[(x << 2) + 18]] : borderValue, v_src10, 4); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 22] >= 0 ? srcBase[map_row[(x << 2) + 22]] : borderValue, v_src10, 5); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 26] >= 0 ? srcBase[map_row[(x << 2) + 26]] : borderValue, v_src10, 6); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 30] >= 0 ? srcBase[map_row[(x << 2) + 30]] : borderValue, v_src10, 7); + + int16x8_t v_src11 = vsetq_lane_s16(map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue, v_zero16, 0); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 7] >= 0 ? srcBase[map_row[(x << 2) + 7]] : borderValue, v_src11, 1); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 11] >= 0 ? srcBase[map_row[(x << 2) + 11]] : borderValue, v_src11, 2); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 15] >= 0 ? srcBase[map_row[(x << 2) + 15]] : borderValue, v_src11, 3); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 19] >= 0 ? srcBase[map_row[(x << 2) + 19]] : borderValue, v_src11, 4); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 23] >= 0 ? srcBase[map_row[(x << 2) + 23]] : borderValue, v_src11, 5); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 27] >= 0 ? srcBase[map_row[(x << 2) + 27]] : borderValue, v_src11, 6); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 31] >= 0 ? 
srcBase[map_row[(x << 2) + 31]] : borderValue, v_src11, 7); + + // first part + float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00))); + float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10))); + + float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1)); + float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01), + vget_low_s16(v_src00))), v_coeff.val[0]); + float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11), + vget_low_s16(v_src10))), v_coeff.val[0]); + + float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]); + uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst)); + + // second part + v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00))); + v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10))); + + v_coeff = vld2q_f32(coeff_row + (x << 1) + 8); + v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01), + vget_high_s16(v_src00))), v_coeff.val[0]); + v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11), + vget_high_s16(v_src10))), v_coeff.val[0]); + + v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]); + uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst)); + + // store + vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1))); + } + + for ( ; x < size.width; ++x) + { + s16 src00 = map_row[(x << 2) + 0] >= 0 ? srcBase[map_row[(x << 2) + 0]] : borderValue; + s16 src01 = map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue; + s16 src10 = map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue; + s16 src11 = map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue; + + f32 dst_val_0 = (src01 - src00) * coeff_row[(x << 1)] + src00; + f32 dst_val_1 = (src11 - src10) * coeff_row[(x << 1)] + src10; + dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0); + } + } +} + +} // namespace internal + +#endif // CAROTENE_NEON + +bool isRemapNearestNeighborSupported(const Size2D &ssize) +{ +#if SIZE_MAX > UINT32_MAX + return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation + // is performed with u32 + isSupportedConfiguration(); +#else + (void)ssize; + return isSupportedConfiguration(); +#endif +} + +bool isRemapLinearSupported(const Size2D &ssize) +{ +#if SIZE_MAX > UINT32_MAX + return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation + // is performed with u32 + isSupportedConfiguration(); +#else + (void)ssize; + return isSupportedConfiguration(); +#endif +} + +void remapNearestNeighbor(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * tableBase, ptrdiff_t tableStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue) +{ + internal::assertSupportedConfiguration(isRemapNearestNeighborSupported(ssize)); +#ifdef CAROTENE_NEON + using namespace internal; + + s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16]; + s32 * map = alignPtr(_map, 16); + + int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1); + int32x2_t v_width2 = vdup_n_s32(ssize.width - 1), v_height2 = vdup_n_s32(ssize.height - 1); + int32x4_t v_step4 = vdupq_n_s32(srcStride); + int32x2_t v_step2 = vdup_n_s32(srcStride); + + if (borderMode == BORDER_MODE_REPLICATE) + { + int32x4_t v_zero4 = vdupq_n_s32(0); + 
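// BORDER_MODE_REPLICATE: every map entry is a valid source offset, because both
+ // coordinates are clamped to [0, width - 1] x [0, height - 1] below before the
+ // offset src_y * srcStride + src_x is formed.
+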
int32x2_t v_zero2 = vdup_n_s32(0); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1); + s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y); + + size_t x = 0; + for ( ; x + 8 <= blockWidth; x += 8) + { + float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)), + v_table1 = vld2q_f32(table_row + (x << 1) + 8); + + int32x4_t v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table0.val[0]))); + int32x4_t v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table0.val[1]))); + int32x4_t v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4); + vst1q_s32(map_row + x, v_dst_index); + + v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table1.val[0]))); + v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table1.val[1]))); + v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4); + vst1q_s32(map_row + x + 4, v_dst_index); + } + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)); + + int32x4_t v_dst_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_table0.val[0]))); + int32x4_t v_dst_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_table0.val[1]))); + int32x4_t v_dst_index = vmlaq_s32(v_dst_x, v_dst_y, v_step4); + vst1q_s32(map_row + x, v_dst_index); + } + + for ( ; x + 2 <= blockWidth; x += 2) + { + float32x2x2_t v_table0 = vld2_f32(table_row + (x << 1)); + + int32x2_t v_dst_x = vmax_s32(v_zero2, vmin_s32(v_width2, vcvt_s32_f32(v_table0.val[0]))); + int32x2_t v_dst_y = vmax_s32(v_zero2, vmin_s32(v_height2, vcvt_s32_f32(v_table0.val[1]))); + int32x2_t v_dst_index = vmla_s32(v_dst_x, v_dst_y, v_step2); + vst1_s32(map_row + x, v_dst_index); + } + + for ( ; x < blockWidth; ++x) + { + s32 src_x = std::max(0, std::min(ssize.width - 1, (s32)floorf(table_row[(x << 1) + 0]))); + s32 src_y = std::max(0, std::min(ssize.height - 1, (s32)floorf(table_row[(x << 1) + 1]))); + map_row[x] = src_y * srcStride + src_x; + } + } + + // make remap + remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride); + } + } + } + else if (borderMode == BORDER_MODE_CONSTANT) + { + int32x4_t v_m1_4 = vdupq_n_s32(-1); + int32x2_t v_m1_2 = vdup_n_s32(-1); + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + float32x2_t v_zero2 = vdup_n_f32(0.0f); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1); + s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y); + + size_t x = 0; + for ( ; x + 8 <= blockWidth; x += 8) + { + float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)), + v_table1 = vld2q_f32(table_row + (x << 1) + 8); + + int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]); + int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]); + uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)), + 
vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4))); + int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4); + vst1q_s32(map_row + x, v_dst_index); + + v_dst_x = vcvtq_s32_f32(v_table1.val[0]); + v_dst_y = vcvtq_s32_f32(v_table1.val[1]); + v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table1.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)), + vandq_u32(vcgeq_f32(v_table1.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4))); + v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4); + vst1q_s32(map_row + x + 4, v_dst_index); + } + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)); + + int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]); + int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]); + uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)), + vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4))); + int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4); + vst1q_s32(map_row + x, v_dst_index); + } + + for ( ; x + 2 <= blockWidth; x += 2) + { + float32x2x2_t v_table0 = vld2_f32(table_row + (x << 1)); + + int32x2_t v_dst_x = vcvt_s32_f32(v_table0.val[0]); + int32x2_t v_dst_y = vcvt_s32_f32(v_table0.val[1]); + uint32x2_t v_mask = vand_u32(vand_u32(vcge_f32(v_table0.val[0], v_zero2), vcle_s32(v_dst_x, v_width2)), + vand_u32(vcge_f32(v_table0.val[1], v_zero2), vcle_s32(v_dst_y, v_height2))); + int32x2_t v_dst_index = vbsl_s32(v_mask, vmla_s32(v_dst_x, v_dst_y, v_step2), v_m1_2); + vst1_s32(map_row + x, v_dst_index); + } + + for ( ; x < blockWidth; ++x) + { + s32 src_x = (s32)floorf(table_row[(x << 1) + 0]); + s32 src_y = (s32)floorf(table_row[(x << 1) + 1]); + map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) && + (src_y >= 0) && (src_y < (s32)ssize.height) ? 
src_y * srcStride + src_x : -1; + } + } + + // make remap + remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue); + } + } + } + +#else + (void)ssize; + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)tableBase; + (void)tableStride; + (void)dstBase; + (void)dstStride; + (void)borderMode; + (void)borderValue; +#endif +} + +void remapLinear(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * tableBase, ptrdiff_t tableStride, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue) +{ + internal::assertSupportedConfiguration(isRemapLinearSupported(ssize)); +#ifdef CAROTENE_NEON + using namespace internal; + + s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16]; + f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16]; + + s32 * map = alignPtr(_map, 16); + f32 * coeffs = alignPtr(_coeffs, 16); + + int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1); + int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1); + float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f); + + if (borderMode == BORDER_MODE_REPLICATE) + { + int32x4_t v_zero4 = vdupq_n_s32(0); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1); + + s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y); + f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y); + + size_t x = 0; + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4x2_t v_table = vld2q_f32(table_row + (x << 1)); + + int32x4_t v_src_x = vcvtq_s32_f32(v_table.val[0]); + int32x4_t v_src_y = vcvtq_s32_f32(v_table.val[1]); + + float32x4x2_t v_coeff; + v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x)); + v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y)); + uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f); + uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f); + v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]); + v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]); + v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x); + v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y); + + int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x)); + int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y)); + int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x))); + int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y))); + + int32x4x4_t v_dst_index; + v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4); + v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4); + v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4); + v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4); + + vst2q_f32(coeff_row + (x << 1), v_coeff); + vst4q_s32(map_row + (x << 2), v_dst_index); + } + + for ( ; x < blockWidth; ++x) + { + f32 src_x_f = table_row[(x << 1) + 0]; + f32 src_y_f = table_row[(x << 1) + 1]; + + s32 src0_x = (s32)floorf(src_x_f); + s32 src0_y = (s32)floorf(src_y_f); + + 
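// The fractional parts of the source coordinate become the bilinear weights,
+ // e.g. src_x_f = 3.7 gives src0_x = 3 and an x-weight of 0.7.
+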
coeff_row[x << 1] = src_x_f - src0_x; + coeff_row[(x << 1) + 1] = src_y_f - src0_y; + + s32 src1_y = std::max(0, std::min(ssize.height - 1, src0_y + 1)); + src0_y = std::max(0, std::min(ssize.height - 1, src0_y)); + s32 src1_x = std::max(0, std::min(ssize.width - 1, src0_x + 1)); + src0_x = std::max(0, std::min(ssize.width - 1, src0_x)); + + map_row[(x << 2) + 0] = src0_y * srcStride + src0_x; + map_row[(x << 2) + 1] = src0_y * srcStride + src1_x; + map_row[(x << 2) + 2] = src1_y * srcStride + src0_x; + map_row[(x << 2) + 3] = src1_y * srcStride + src1_x; + } + } + + remapLinearReplicate(Size2D(blockWidth, blockHeight), + srcBase, &map[0], &coeffs[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride); + } + } + } + else if (borderMode == BORDER_MODE_CONSTANT) + { + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + int32x4_t v_m1_4 = vdupq_n_s32(-1); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + const f32 * table_row = getRowPtr(tableBase, tableStride, i + y) + (j << 1); + + s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y); + f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y); + + size_t x = 0; + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4x2_t v_table = vld2q_f32(table_row + (x << 1)); + + int32x4_t v_src_x0 = vcvtq_s32_f32(v_table.val[0]); + int32x4_t v_src_y0 = vcvtq_s32_f32(v_table.val[1]); + + float32x4x2_t v_coeff; + v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x0)); + v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y0)); + uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f); + uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f); + v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]); + v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]); + v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0); + v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0); + + int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1); + int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1); + + int32x4x4_t v_dst_index; + v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4); + v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4); + v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4); + v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4); + + uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_table.val[0], v_zero4), vcleq_s32(v_src_x0, v_width4)); + uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[0], v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4)); + uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_table.val[1], v_zero4), vcleq_s32(v_src_y0, v_height4)); + uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[1], v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4)); + + v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4); + v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4); + v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4); + v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4); + + vst2q_f32(coeff_row + (x << 1), v_coeff); + vst4q_s32(map_row + (x << 2), v_dst_index); + } + + for ( ; 
x < blockWidth; ++x) + { + f32 src_x_f = table_row[(x << 1) + 0]; + f32 src_y_f = table_row[(x << 1) + 1]; + + s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1; + s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1; + + coeff_row[(x << 1)] = src_x_f - src0_x; + coeff_row[(x << 1) + 1] = src_y_f - src0_y; + + map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) && + (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1; + map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) && + (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1; + map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) && + (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1; + map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) && + (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1; + } + } + + remapLinearConst(Size2D(blockWidth, blockHeight), + srcBase, &map[0], &coeffs[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue); + } + } + } +#else + (void)ssize; + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)tableBase; + (void)tableStride; + (void)dstBase; + (void)dstStride; + (void)borderMode; + (void)borderValue; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/remap.hpp b/3rdparty/carotene/src/remap.hpp new file mode 100644 index 0000000000..0f9765965f --- /dev/null +++ b/3rdparty/carotene/src/remap.hpp @@ -0,0 +1,85 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */ + +#ifndef CAROTENE_SRC_REMAP_HPP +#define CAROTENE_SRC_REMAP_HPP + +#include "common.hpp" + +#include + +#ifdef CAROTENE_NEON + +namespace CAROTENE_NS { namespace internal { + +enum +{ + BLOCK_SIZE = 32 +}; + + +void remapNearestNeighborReplicate(const Size2D size, + const u8 * srcBase, + const s32 * map, + u8 * dstBase, ptrdiff_t dstStride); + +void remapNearestNeighborConst(const Size2D size, + const u8 * srcBase, + const s32 * map, + u8 * dstBase, ptrdiff_t dstStride, + u8 borderValue); + +void remapLinearReplicate(const Size2D size, + const u8 * srcBase, + const s32 * map, + const f32 * coeffs, + u8 * dstBase, ptrdiff_t dstStride); + +void remapLinearConst(const Size2D size, + const u8 * srcBase, + const s32 * map, + const f32 * coeffs, + u8 * dstBase, ptrdiff_t dstStride, + u8 borderValue); + +} } + +#endif // CAROTENE_NEON + +#endif // CAROTENE_SRC_REMAP_HPP diff --git a/3rdparty/carotene/src/resize.cpp b/3rdparty/carotene/src/resize.cpp new file mode 100644 index 0000000000..3a80d472df --- /dev/null +++ b/3rdparty/carotene/src/resize.cpp @@ -0,0 +1,2191 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+#include
+#include
+#include
+
+namespace CAROTENE_NS {
+
+bool isResizeNearestNeighborSupported(const Size2D &ssize, u32 elemSize)
+{
+#if SIZE_MAX <= UINT32_MAX
+ (void)ssize;
+#endif
+ bool supportedElemSize = (elemSize == 1) || (elemSize == 3) || (elemSize == 4);
+ return isSupportedConfiguration()
+#if SIZE_MAX > UINT32_MAX
+ && !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) // Restrict image size since internally used resizeGeneric performs
+ // index evaluation with u32
+#endif
+ && supportedElemSize;
+}
+
+bool isResizeAreaSupported(f32 wr, f32 hr, u32 channels)
+{
+ bool supportedRatio = false;
+
+ if (channels == 1)
+ supportedRatio = (hr == wr) && ((wr == 2.0f) || (wr == 4.0f) || (wr == 0.5f));
+ else if (channels == 3)
+ supportedRatio = (hr == wr) && ((wr == 2.0f) || (wr == 4.0f) || (wr == 0.5f));
+ else if (channels == 4)
+ supportedRatio = (hr == wr) && ((wr == 2.0f) || (wr == 4.0f) || (wr == 0.5f));
+
+ return isSupportedConfiguration() && supportedRatio;
+}
+
+bool isResizeLinearSupported(const Size2D &ssize, const Size2D &dsize,
+ f32 wr, f32 hr, u32 channels)
+{
+ if ((wr <= 2.0f) && (hr <= 2.0f))
+ {
+ bool channelsSupport = (channels == 1) || (channels == 3) || (channels == 4);
+ return (ssize.width >= 16) && (dsize.height >= 8) &&
+ (dsize.width >= 8) && channelsSupport;
+ }
+
+ return false;
+}
+
+bool isResizeLinearOpenCVSupported(const Size2D &ssize, const Size2D &dsize, u32 channels)
+{
+ switch(channels)
+ {
+ case 1:
+ if (ssize.width >= 8
+#if SIZE_MAX > UINT32_MAX
+ && !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) // Restrict image size since internal index evaluation
+ // is performed with u32
+#endif
+ && dsize.width >= 8 && dsize.height >= 8)
+ return isSupportedConfiguration();
+ return false;
+ case 4:
+ if (ssize.width >= 2
+#if SIZE_MAX > UINT32_MAX
+ && !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) // Restrict image size since internal index evaluation
+ // is performed with u32
+#endif
+ && dsize.width >= 2 && dsize.height >= 8)
+ return isSupportedConfiguration();
+ return false;
+ default:
+ return false;
+ }
+}
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+u32 * calcLUT(size_t size, f32 ratio,
+ std::vector<u32> & _ofs)
+{
+ _ofs.resize(size);
+ u32 * ofs = &_ofs[0];
+
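+ // Nearest-neighbour column LUT: ofs[x] = floorf((x + 0.5f) * ratio), i.e. the source
+ // column whose centre the x-th destination column samples; it is filled 8 and 4
+ // entries at a time below, with a scalar tail.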
+ size_t roiw8 = size >= 7 ? size - 7 : 0;
+ size_t roiw4 = size >= 3 ? size - 3 : 0;
+ size_t x = 0;
+
+ f32 indices[4] = { 0, 1, 2, 3 };
+ float32x4_t v_index = vld1q_f32(indices), v_inc = vdupq_n_f32(4);
+ float32x4_t v_05 = vdupq_n_f32(0.5f), v_ratio = vdupq_n_f32(ratio);
+
+ for ( ; x < roiw8; x += 8)
+ {
+ float32x4_t v_dstf = vmulq_f32(vaddq_f32(v_index, v_05), v_ratio);
+ vst1q_u32(ofs + x, vcvtq_u32_f32(v_dstf));
+ v_index = vaddq_f32(v_index, v_inc);
+
+ v_dstf = vmulq_f32(vaddq_f32(v_index, v_05), v_ratio);
+ vst1q_u32(ofs + x + 4, vcvtq_u32_f32(v_dstf));
+ v_index = vaddq_f32(v_index, v_inc);
+ }
+
+ for ( ; x < roiw4; x += 4)
+ {
+ float32x4_t v_dstf = vmulq_f32(vaddq_f32(v_index, v_05), v_ratio);
+ vst1q_u32(ofs + x, vcvtq_u32_f32(v_dstf));
+ v_index = vaddq_f32(v_index, v_inc);
+ }
+
+ for ( ; x < size; ++x)
+ {
+ ofs[x] = static_cast<u32>(floorf((x + 0.5f) * ratio));
+ }
+
+ return ofs;
+}
+
+template <typename T>
+void resizeGeneric(const Size2D &dsize,
+ const void * srcBase, ptrdiff_t srcStride,
+ void * dstBase, ptrdiff_t dstStride,
+ f32 wr, f32 hr)
+{
+ std::vector<u32> _x_ofs;
+ u32 * x_ofs = calcLUT(dsize.width, wr, _x_ofs); // 32-bit LUT is used, hence the restriction to source dimensions below 2^32 in isResizeNearestNeighborSupported
+
+ for (size_t dst_y = 0; dst_y < dsize.height; ++dst_y)
+ {
+ size_t src_y = static_cast<size_t>(floorf((dst_y + 0.5f) * hr));
+ const T * src = internal::getRowPtr(static_cast<const T *>(srcBase), srcStride, src_y);
+ T * dst = internal::getRowPtr(static_cast<T *>(dstBase), dstStride, dst_y);
+
+ for (size_t dst_x = 0; dst_x < dsize.width; ++dst_x)
+ {
+ internal::prefetch(src + dst_x);
+ dst[dst_x] = src[x_ofs[dst_x]];
+ }
+ }
+}
+
+typedef struct _24bit_
+{
+ u8 a[3];
+} _24bit;
+
+} // namespace
+
+
+#endif
+
+void resizeNearestNeighbor(const Size2D &ssize, const Size2D &dsize,
+ const void * srcBase, ptrdiff_t srcStride,
+ void * dstBase, ptrdiff_t dstStride,
+ f32 wr, f32 hr, u32 elemSize)
+{
+ internal::assertSupportedConfiguration(wr > 0 && hr > 0 &&
+ (dsize.width - 0.5) * wr < ssize.width &&
+ (dsize.height - 0.5) * hr < ssize.height && // Ensure we have enough source data
+ (dsize.width + 0.5) * wr >= ssize.width &&
+ (dsize.height + 0.5) * hr >= ssize.height && // Ensure source isn't too big
+ isResizeNearestNeighborSupported(ssize, elemSize));
+#ifdef CAROTENE_NEON
+
+ if (elemSize == 1)
+ {
+ resizeGeneric<u8>(dsize,
+ srcBase, srcStride,
+ dstBase, dstStride,
+ wr, hr);
+ }
+ else if (elemSize == 3)
+ {
+ resizeGeneric<_24bit>(dsize,
+ srcBase, srcStride,
+ dstBase, dstStride,
+ wr, hr);
+ }
+ else if (elemSize == 4)
+ {
+ resizeGeneric<u32>(dsize,
+ srcBase, srcStride,
+ dstBase, dstStride,
+ wr, hr);
+ }
+
+#else
+ (void)dsize;
+ (void)srcBase;
+ (void)srcStride;
+ (void)dstBase;
+ (void)dstStride;
+ (void)wr;
+ (void)hr;
+#endif
+}
+
+#ifdef CAROTENE_NEON
+template
+inline uint8x8_t areaDownsamplingDivision(uint16x8_t data)
+{
+ return vshrn_n_u16(data, shiftsize);
+}
+template <>
+inline uint8x8_t areaDownsamplingDivision(uint16x8_t data)
+{
+ // rounding
+ return vrshrn_n_u16(data,2);
+}
+template <>
+inline uint8x8_t areaDownsamplingDivision(uint16x8_t data)
+{
+ // bankers rounding
+ return vrshrn_n_u16(vqsubq_u16(data, vshrq_n_u16(vbicq_u16(vdupq_n_u16(1<<4), data), 4)),4);
+}
+
+template
+inline u8 areaDownsamplingDivision(u16 data)
+{
+ return data >> shiftsize;
+}
+template <>
+inline u8 areaDownsamplingDivision(u16 data)
+{
+ // rounding
+ return (data + 2) >> 2;
+}
+template <>
+inline u8 areaDownsamplingDivision(u16 data)
+{
+ // bankers rounding
+ return (data - (((1<<4) & ~data) >> 4) + 8) >> 4;
+}
+#endif
+
+template
+inline void
resizeAreaRounding(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, u32 channels) +{ + internal::assertSupportedConfiguration(isResizeAreaSupported(wr, hr, channels) && + std::abs(dsize.width * wr - ssize.width) < 0.1 && + std::abs(dsize.height * hr - ssize.height) < 0.1); +#ifdef CAROTENE_NEON + if (channels == 1) + { + if ((wr == 2.0f) && (hr == 2.0f)) + { + size_t roiw8 = dsize.width >= 7 ? dsize.width - 7 : 0; + + for (size_t i = 0; i < dsize.height; ++i) + { + const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 1); + const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 1) + 1); + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0, dj = 0; + + for ( ; dj < roiw8; dj += 8, sj += 16) + { + internal::prefetch(src0_row + sj); + internal::prefetch(src1_row + sj); + + uint16x8_t vSum1 = vpaddlq_u8(vld1q_u8(src0_row + sj)); + uint16x8_t vSum2 = vpaddlq_u8(vld1q_u8(src1_row + sj)); + uint8x8_t vRes1 = areaDownsamplingDivision(vaddq_u16(vSum1, vSum2)); + + vst1_u8(dst_row + dj, vRes1); + } + + for ( ; dj < dsize.width; ++dj, sj += 2) + { + dst_row[dj] = areaDownsamplingDivision( + (u16)src0_row[sj] + src0_row[sj + 1] + + src1_row[sj] + src1_row[sj + 1]); + } + } + } + else if ((wr == 0.5f) && (hr == 0.5f)) + { + size_t roiw32 = dsize.width >= 31 ? dsize.width - 31 : 0; + size_t roiw16 = dsize.width >= 15 ? dsize.width - 15 : 0; + + for (size_t i = 0; i < dsize.height; i += 2) + { + const u8 * src_row = internal::getRowPtr(srcBase, srcStride, i >> 1); + u8 * dst0_row = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst1_row = internal::getRowPtr(dstBase, dstStride, std::min(i + 1, dsize.height - 1)); + size_t sj = 0, dj = 0; + + for ( ; dj < roiw32; dj += 32, sj += 16) + { + internal::prefetch(src_row + sj); + + uint8x16x2_t v_dst; + v_dst.val[0] = v_dst.val[1] = vld1q_u8(src_row + sj); + + vst2q_u8(dst0_row + dj, v_dst); + vst2q_u8(dst1_row + dj, v_dst); + } + + for ( ; dj < roiw16; dj += 16, sj += 8) + { + uint8x8x2_t v_dst; + v_dst.val[0] = v_dst.val[1] = vld1_u8(src_row + sj); + + vst2_u8(dst0_row + dj, v_dst); + vst2_u8(dst1_row + dj, v_dst); + } + + for ( ; dj < dsize.width; dj += 2, ++sj) + { + u8 src_val = src_row[sj]; + dst0_row[dj] = dst0_row[dj + 1] = src_val; + dst1_row[dj] = dst1_row[dj + 1] = src_val; + } + } + } + else //if ((wr == 4.0f) && (hr == 4.0f)) //the only scale that lasts after isSupported check + { +#ifndef ANDROID + size_t roiw16 = dsize.width >= 15 ? dsize.width - 15 : 0; +#endif + size_t roiw8 = dsize.width >= 7 ? 
dsize.width - 7 : 0; + + for (size_t i = 0; i < dsize.height; ++i) + { + const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 2); + const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 1); + const u8 * src2_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 2); + const u8 * src3_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 3); + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0, dj = 0; + +#ifndef ANDROID + for ( ; dj < roiw16; dj += 16, sj += 64) + { + internal::prefetch(src0_row + sj); + internal::prefetch(src1_row + sj); + internal::prefetch(src2_row + sj); + internal::prefetch(src3_row + sj); + + uint8x16x4_t vLane1 = vld4q_u8(src0_row + sj); + uint8x16x4_t vLane2 = vld4q_u8(src1_row + sj); + uint8x16x4_t vLane3 = vld4q_u8(src2_row + sj); + uint8x16x4_t vLane4 = vld4q_u8(src3_row + sj); + + uint16x8_t vSum_0 = vaddl_u8(vget_low_u8(vLane1.val[0]), vget_low_u8(vLane1.val[1])); + vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane1.val[2]), vget_low_u8(vLane1.val[3]))); + vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane2.val[0]), vget_low_u8(vLane2.val[1]))); + vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane2.val[2]), vget_low_u8(vLane2.val[3]))); + vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane3.val[0]), vget_low_u8(vLane3.val[1]))); + vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane3.val[2]), vget_low_u8(vLane3.val[3]))); + vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane4.val[0]), vget_low_u8(vLane4.val[1]))); + vSum_0 = vaddq_u16(vSum_0, vaddl_u8(vget_low_u8(vLane4.val[2]), vget_low_u8(vLane4.val[3]))); + + uint16x8_t vSum_1 = vaddl_u8(vget_high_u8(vLane1.val[0]), vget_high_u8(vLane1.val[1])); + vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane1.val[2]), vget_high_u8(vLane1.val[3]))); + vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane2.val[0]), vget_high_u8(vLane2.val[1]))); + vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane2.val[2]), vget_high_u8(vLane2.val[3]))); + vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane3.val[0]), vget_high_u8(vLane3.val[1]))); + vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane3.val[2]), vget_high_u8(vLane3.val[3]))); + vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane4.val[0]), vget_high_u8(vLane4.val[1]))); + vSum_1 = vaddq_u16(vSum_1, vaddl_u8(vget_high_u8(vLane4.val[2]), vget_high_u8(vLane4.val[3]))); + + uint8x8_t vRes_0 = areaDownsamplingDivision(vSum_0); + uint8x8_t vRes_1 = areaDownsamplingDivision(vSum_1); + + vst1q_u8(dst_row + dj, vcombine_u8(vRes_0, vRes_1)); + } +#endif + + for ( ; dj < roiw8; dj += 8, sj += 32) + { + internal::prefetch(src0_row + sj); + internal::prefetch(src1_row + sj); + internal::prefetch(src2_row + sj); + internal::prefetch(src3_row + sj); + + uint8x8x4_t vLane1 = vld4_u8(src0_row + sj); + uint8x8x4_t vLane2 = vld4_u8(src1_row + sj); + uint8x8x4_t vLane3 = vld4_u8(src2_row + sj); + uint8x8x4_t vLane4 = vld4_u8(src3_row + sj); + + uint16x8_t vSum = vaddl_u8(vLane1.val[0], vLane1.val[1]); + vSum = vaddq_u16(vSum, vaddl_u8(vLane1.val[2], vLane1.val[3])); + vSum = vaddq_u16(vSum, vaddl_u8(vLane2.val[0], vLane2.val[1])); + vSum = vaddq_u16(vSum, vaddl_u8(vLane2.val[2], vLane2.val[3])); + vSum = vaddq_u16(vSum, vaddl_u8(vLane3.val[0], vLane3.val[1])); + vSum = vaddq_u16(vSum, vaddl_u8(vLane3.val[2], vLane3.val[3])); + vSum = vaddq_u16(vSum, vaddl_u8(vLane4.val[0], vLane4.val[1])); + vSum = vaddq_u16(vSum, vaddl_u8(vLane4.val[2], vLane4.val[3])); + + vst1_u8(dst_row + dj, 
areaDownsamplingDivision(vSum)); + } + + for ( ; dj < dsize.width; ++dj, sj += 4) + { + dst_row[dj] = areaDownsamplingDivision( + (u16)src0_row[sj] + src0_row[sj + 1] + src0_row[sj + 2] + src0_row[sj + 3] + + src1_row[sj] + src1_row[sj + 1] + src1_row[sj + 2] + src1_row[sj + 3] + + src2_row[sj] + src2_row[sj + 1] + src2_row[sj + 2] + src2_row[sj + 3] + + src3_row[sj] + src3_row[sj + 1] + src3_row[sj + 2] + src3_row[sj + 3]); + } + } + } + } + else if (channels == 4) + { + if ((wr == 2.0f) && (hr == 2.0f)) + { +#ifndef ANDROID + size_t roiw4 = dsize.width >= 3 ? (dsize.width - 3) << 2 : 0; +#endif + size_t roiw2 = dsize.width >= 1 ? (dsize.width - 1) << 2 : 0; + + for (size_t i = 0; i < dsize.height; ++i) + { + const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 1); + const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 1) + 1); + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0, dj = 0; + +#ifndef ANDROID + for ( ; dj < roiw4; dj += 16, sj += 32) + { + internal::prefetch(src0_row + sj); + internal::prefetch(src1_row + sj); + + uint8x8_t vRes_0, vRes_1; + + { + uint8x16_t vLane1 = vld1q_u8(src0_row + sj); + uint8x16_t vLane2 = vld1q_u8(src1_row + sj); + + uint16x8_t vLane_l = vaddl_u8(vget_low_u8(vLane1), vget_low_u8(vLane2)); + uint16x8_t vLane_h = vaddl_u8(vget_high_u8(vLane1), vget_high_u8(vLane2)); + + uint16x4_t vSum_l = vadd_u16(vget_low_u16(vLane_l), vget_high_u16(vLane_l)); + uint16x4_t vSum_h = vadd_u16(vget_low_u16(vLane_h), vget_high_u16(vLane_h)); + + vRes_0 = areaDownsamplingDivision(vcombine_u16(vSum_l, vSum_h)); + } + + { + uint8x16_t vLane1 = vld1q_u8(src0_row + sj + 16); + uint8x16_t vLane2 = vld1q_u8(src1_row + sj + 16); + + uint16x8_t vLane_l = vaddl_u8(vget_low_u8(vLane1), vget_low_u8(vLane2)); + uint16x8_t vLane_h = vaddl_u8(vget_high_u8(vLane1), vget_high_u8(vLane2)); + + uint16x4_t vSum_l = vadd_u16(vget_low_u16(vLane_l), vget_high_u16(vLane_l)); + uint16x4_t vSum_h = vadd_u16(vget_low_u16(vLane_h), vget_high_u16(vLane_h)); + + vRes_1 = areaDownsamplingDivision(vcombine_u16(vSum_l, vSum_h)); + } + + vst1q_u8(dst_row + dj, vcombine_u8(vRes_0, vRes_1)); + } +#endif + + for ( ; dj < roiw2; dj += 8, sj += 16) + { + internal::prefetch(src0_row + sj); + internal::prefetch(src1_row + sj); + + uint8x16_t vLane1 = vld1q_u8(src0_row + sj); + uint8x16_t vLane2 = vld1q_u8(src1_row + sj); + + uint16x8_t vLane_l = vaddl_u8(vget_low_u8(vLane1), vget_low_u8(vLane2)); + uint16x8_t vLane_h = vaddl_u8(vget_high_u8(vLane1), vget_high_u8(vLane2)); + + uint16x4_t vSum_l = vadd_u16(vget_low_u16(vLane_l), vget_high_u16(vLane_l)); + uint16x4_t vSum_h = vadd_u16(vget_low_u16(vLane_h), vget_high_u16(vLane_h)); + + uint8x8_t vRes = areaDownsamplingDivision(vcombine_u16(vSum_l, vSum_h)); + vst1_u8(dst_row + dj, vRes); + } + + for (size_t dwidth = dsize.width << 2; dj < dwidth; dj += 4, sj += 8) + { + dst_row[dj ] = areaDownsamplingDivision( + (u16)src0_row[sj ] + src0_row[sj + 4] + + src1_row[sj ] + src1_row[sj + 4]); + dst_row[dj + 1] = areaDownsamplingDivision( + (u16)src0_row[sj + 1] + src0_row[sj + 5] + + src1_row[sj + 1] + src1_row[sj + 5]); + dst_row[dj + 2] = areaDownsamplingDivision( + (u16)src0_row[sj + 2] + src0_row[sj + 6] + + src1_row[sj + 2] + src1_row[sj + 6]); + dst_row[dj + 3] = areaDownsamplingDivision( + (u16)src0_row[sj + 3] + src0_row[sj + 7] + + src1_row[sj + 3] + src1_row[sj + 7]); + } + } + } + else if ((wr == 0.5f) && (hr == 0.5f)) + { +#ifndef ANDROID + size_t roiw32 = dsize.width >= 31 ? 
(dsize.width - 31) << 2 : 0; +#endif + size_t roiw16 = dsize.width >= 15 ? (dsize.width - 15) << 2 : 0; + + for (size_t i = 0; i < dsize.height; i += 2) + { + const u8 * src_row = internal::getRowPtr(srcBase, srcStride, i >> 1); + u8 * dst0_row = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst1_row = internal::getRowPtr(dstBase, dstStride, std::min(i + 1, dsize.height - 1)); + size_t sj = 0, dj = 0; + +#ifndef ANDROID + for ( ; dj < roiw32; dj += 128, sj += 64) + { + internal::prefetch(src_row + sj); + + uint8x16x4_t v_src = vld4q_u8(src_row + sj); + uint8x16x2_t v_c0 = vzipq_u8(v_src.val[0], v_src.val[0]); + uint8x16x2_t v_c1 = vzipq_u8(v_src.val[1], v_src.val[1]); + uint8x16x2_t v_c2 = vzipq_u8(v_src.val[2], v_src.val[2]); + uint8x16x2_t v_c3 = vzipq_u8(v_src.val[3], v_src.val[3]); + + uint8x16x4_t v_dst; + v_dst.val[0] = v_c0.val[0]; + v_dst.val[1] = v_c1.val[0]; + v_dst.val[2] = v_c2.val[0]; + v_dst.val[3] = v_c3.val[0]; + vst4q_u8(dst0_row + dj, v_dst); + vst4q_u8(dst1_row + dj, v_dst); + + v_dst.val[0] = v_c0.val[1]; + v_dst.val[1] = v_c1.val[1]; + v_dst.val[2] = v_c2.val[1]; + v_dst.val[3] = v_c3.val[1]; + vst4q_u8(dst0_row + dj + 64, v_dst); + vst4q_u8(dst1_row + dj + 64, v_dst); + } +#endif + + for ( ; dj < roiw16; dj += 64, sj += 32) + { + internal::prefetch(src_row + sj); + + uint8x8x4_t v_src = vld4_u8(src_row + sj); + uint8x8x2_t v_c0 = vzip_u8(v_src.val[0], v_src.val[0]); + uint8x8x2_t v_c1 = vzip_u8(v_src.val[1], v_src.val[1]); + uint8x8x2_t v_c2 = vzip_u8(v_src.val[2], v_src.val[2]); + uint8x8x2_t v_c3 = vzip_u8(v_src.val[3], v_src.val[3]); + + uint8x16x4_t v_dst; + v_dst.val[0] = vcombine_u8(v_c0.val[0], v_c0.val[1]); + v_dst.val[1] = vcombine_u8(v_c1.val[0], v_c1.val[1]); + v_dst.val[2] = vcombine_u8(v_c2.val[0], v_c2.val[1]); + v_dst.val[3] = vcombine_u8(v_c3.val[0], v_c3.val[1]); + vst4q_u8(dst0_row + dj, v_dst); + vst4q_u8(dst1_row + dj, v_dst); + } + + for (size_t dwidth = dsize.width << 2; dj < dwidth; dj += 8, sj += 4) + { + u8 src_val = src_row[sj]; + dst0_row[dj] = dst0_row[dj + 4] = src_val; + dst1_row[dj] = dst1_row[dj + 4] = src_val; + + src_val = src_row[sj + 1]; + dst0_row[dj + 1] = dst0_row[dj + 5] = src_val; + dst1_row[dj + 1] = dst1_row[dj + 5] = src_val; + + src_val = src_row[sj + 2]; + dst0_row[dj + 2] = dst0_row[dj + 6] = src_val; + dst1_row[dj + 2] = dst1_row[dj + 6] = src_val; + + src_val = src_row[sj + 3]; + dst0_row[dj + 3] = dst0_row[dj + 7] = src_val; + dst1_row[dj + 3] = dst1_row[dj + 7] = src_val; + } + } + } + else //if ((hr == 4.0f) && (wr == 4.0f)) //the only scale that lasts after isSupported check + { + size_t roiw4 = dsize.width >= 3 ? (dsize.width - 3) << 2 : 0; + size_t roiw2 = dsize.width >= 1 ? 
(dsize.width - 1) << 2 : 0; + + for (size_t i = 0; i < dsize.height; ++i) + { + const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 2); + const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 1); + const u8 * src2_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 2); + const u8 * src3_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 3); + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0, dj = 0; + + for ( ; dj < roiw4; dj += 16, sj += 64) + { + internal::prefetch(src0_row + sj); + internal::prefetch(src1_row + sj); + internal::prefetch(src2_row + sj); + internal::prefetch(src3_row + sj); + + uint8x16_t vLane10 = vld1q_u8(src0_row + sj), vLane11 = vld1q_u8(src0_row + sj + 16); + uint8x16_t vLane20 = vld1q_u8(src1_row + sj), vLane21 = vld1q_u8(src1_row + sj + 16); + uint8x16_t vLane30 = vld1q_u8(src2_row + sj), vLane31 = vld1q_u8(src2_row + sj + 16); + uint8x16_t vLane40 = vld1q_u8(src3_row + sj), vLane41 = vld1q_u8(src3_row + sj + 16); + + uint16x8_t v_part_0, v_part_1; + { + uint16x8_t v_sum0 = vaddl_u8(vget_low_u8(vLane10), vget_high_u8(vLane10)); + v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane20), vget_high_u8(vLane20))); + v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane30), vget_high_u8(vLane30))); + v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane40), vget_high_u8(vLane40))); + + uint16x8_t v_sum1 = vaddl_u8(vget_low_u8(vLane11), vget_high_u8(vLane11)); + v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane21), vget_high_u8(vLane21))); + v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane31), vget_high_u8(vLane31))); + v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane41), vget_high_u8(vLane41))); + + v_part_0 = vcombine_u16(vadd_u16(vget_low_u16(v_sum0), vget_high_u16(v_sum0)), + vadd_u16(vget_low_u16(v_sum1), vget_high_u16(v_sum1))); + } + + vLane10 = vld1q_u8(src0_row + sj + 32); + vLane11 = vld1q_u8(src0_row + sj + 48); + vLane20 = vld1q_u8(src1_row + sj + 32); + vLane21 = vld1q_u8(src1_row + sj + 48); + vLane30 = vld1q_u8(src2_row + sj + 32); + vLane31 = vld1q_u8(src2_row + sj + 48); + vLane40 = vld1q_u8(src3_row + sj + 32); + vLane41 = vld1q_u8(src3_row + sj + 48); + + { + uint16x8_t v_sum0 = vaddl_u8(vget_low_u8(vLane10), vget_high_u8(vLane10)); + v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane20), vget_high_u8(vLane20))); + v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane30), vget_high_u8(vLane30))); + v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane40), vget_high_u8(vLane40))); + + uint16x8_t v_sum1 = vaddl_u8(vget_low_u8(vLane11), vget_high_u8(vLane11)); + v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane21), vget_high_u8(vLane21))); + v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane31), vget_high_u8(vLane31))); + v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane41), vget_high_u8(vLane41))); + + v_part_1 = vcombine_u16(vadd_u16(vget_low_u16(v_sum0), vget_high_u16(v_sum0)), + vadd_u16(vget_low_u16(v_sum1), vget_high_u16(v_sum1))); + } + + vst1q_u8(dst_row + dj, vcombine_u8(areaDownsamplingDivision(v_part_0), + areaDownsamplingDivision(v_part_1))); + } + + for ( ; dj < roiw2; dj += 8, sj += 32) + { + uint8x16_t vLane10 = vld1q_u8(src0_row + sj), vLane11 = vld1q_u8(src0_row + sj + 16); + uint8x16_t vLane20 = vld1q_u8(src1_row + sj), vLane21 = vld1q_u8(src1_row + sj + 16); + uint8x16_t vLane30 = vld1q_u8(src2_row + sj), vLane31 = vld1q_u8(src2_row + sj + 16); + uint8x16_t vLane40 = vld1q_u8(src3_row + sj), vLane41 = vld1q_u8(src3_row + sj + 16); + + 
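// 4x4 area reduction for two RGBA output pixels: the vaddl_u8/vaddq_u16 chain below
+ // sums the four source rows, and the final pairwise vadd_u16 folds the four pixels
+ // of each block, so every u16 lane holds a 16-sample sum before the division.
+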
+                uint16x8_t v_sum0 = vaddl_u8(vget_low_u8(vLane10), vget_high_u8(vLane10));
+                v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane20), vget_high_u8(vLane20)));
+                v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane30), vget_high_u8(vLane30)));
+                v_sum0 = vaddq_u16(v_sum0, vaddl_u8(vget_low_u8(vLane40), vget_high_u8(vLane40)));
+
+                uint16x8_t v_sum1 = vaddl_u8(vget_low_u8(vLane11), vget_high_u8(vLane11));
+                v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane21), vget_high_u8(vLane21)));
+                v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane31), vget_high_u8(vLane31)));
+                v_sum1 = vaddq_u16(v_sum1, vaddl_u8(vget_low_u8(vLane41), vget_high_u8(vLane41)));
+
+                uint16x8_t v_sum = vcombine_u16(vadd_u16(vget_low_u16(v_sum0), vget_high_u16(v_sum0)),
+                                                vadd_u16(vget_low_u16(v_sum1), vget_high_u16(v_sum1)));
+
+                vst1_u8(dst_row + dj, areaDownsamplingDivision(v_sum));
+            }
+
+            for (size_t dwidth = dsize.width << 2; dj < dwidth; dj += 4, sj += 16)
+            {
+                dst_row[dj] = areaDownsamplingDivision(
+                                  (u16)src0_row[sj] + src0_row[sj + 4] +
+                                       src0_row[sj + 8] + src0_row[sj + 12] +
+                                       src1_row[sj] + src1_row[sj + 4] +
+                                       src1_row[sj + 8] + src1_row[sj + 12] +
+                                       src2_row[sj] + src2_row[sj + 4] +
+                                       src2_row[sj + 8] + src2_row[sj + 12] +
+                                       src3_row[sj] + src3_row[sj + 4] +
+                                       src3_row[sj + 8] + src3_row[sj + 12]);
+
+                dst_row[dj + 1] = areaDownsamplingDivision(
+                                  (u16)src0_row[sj + 1] + src0_row[sj + 5] +
+                                       src0_row[sj + 9] + src0_row[sj + 13] +
+                                       src1_row[sj + 1] + src1_row[sj + 5] +
+                                       src1_row[sj + 9] + src1_row[sj + 13] +
+                                       src2_row[sj + 1] + src2_row[sj + 5] +
+                                       src2_row[sj + 9] + src2_row[sj + 13] +
+                                       src3_row[sj + 1] + src3_row[sj + 5] +
+                                       src3_row[sj + 9] + src3_row[sj + 13]);
+
+                dst_row[dj + 2] = areaDownsamplingDivision(
+                                  (u16)src0_row[sj + 2] + src0_row[sj + 6] +
+                                       src0_row[sj + 10] + src0_row[sj + 14] +
+                                       src1_row[sj + 2] + src1_row[sj + 6] +
+                                       src1_row[sj + 10] + src1_row[sj + 14] +
+                                       src2_row[sj + 2] + src2_row[sj + 6] +
+                                       src2_row[sj + 10] + src2_row[sj + 14] +
+                                       src3_row[sj + 2] + src3_row[sj + 6] +
+                                       src3_row[sj + 10] + src3_row[sj + 14]);
+
+                dst_row[dj + 3] = areaDownsamplingDivision(
+                                  (u16)src0_row[sj + 3] + src0_row[sj + 7] +
+                                       src0_row[sj + 11] + src0_row[sj + 15] +
+                                       src1_row[sj + 3] + src1_row[sj + 7] +
+                                       src1_row[sj + 11] + src1_row[sj + 15] +
+                                       src2_row[sj + 3] + src2_row[sj + 7] +
+                                       src2_row[sj + 11] + src2_row[sj + 15] +
+                                       src3_row[sj + 3] + src3_row[sj + 7] +
+                                       src3_row[sj + 11] + src3_row[sj + 15]);
+            }
+        }
+        }
+    }
+    else if (channels == 3)
+    {
+        if ((wr == 2.0f) && (hr == 2.0f))
+        {
+#ifndef ANDROID
+            size_t roiw16 = dsize.width >= 15 ? (dsize.width - 15) * 3 : 0;
+#endif
+            size_t roiw8 = dsize.width >= 7 ?
(dsize.width - 7) * 3 : 0; + + for (size_t i = 0; i < dsize.height; ++i) + { + const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 1); + const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 1) + 1); + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0, dj = 0; + +#ifndef ANDROID + for ( ; dj < roiw16; dj += 48, sj += 96) + { + internal::prefetch(src0_row + sj); + internal::prefetch(src1_row + sj); + + uint8x16x3_t vLane1 = vld3q_u8(src0_row + sj); + uint8x16x3_t vLane2 = vld3q_u8(src1_row + sj); + + uint8x8x3_t v_dst0, v_dst1; + { + uint16x8_t v_el0 = vpaddlq_u8(vLane1.val[0]); + uint16x8_t v_el1 = vpaddlq_u8(vLane1.val[1]); + uint16x8_t v_el2 = vpaddlq_u8(vLane1.val[2]); + v_el0 = vpadalq_u8(v_el0, vLane2.val[0]); + v_el1 = vpadalq_u8(v_el1, vLane2.val[1]); + v_el2 = vpadalq_u8(v_el2, vLane2.val[2]); + + v_dst0.val[0] = areaDownsamplingDivision(v_el0); + v_dst0.val[1] = areaDownsamplingDivision(v_el1); + v_dst0.val[2] = areaDownsamplingDivision(v_el2); + } + + vLane1 = vld3q_u8(src0_row + sj + 48); + vLane2 = vld3q_u8(src1_row + sj + 48); + { + uint16x8_t v_el0 = vpaddlq_u8(vLane1.val[0]); + uint16x8_t v_el1 = vpaddlq_u8(vLane1.val[1]); + uint16x8_t v_el2 = vpaddlq_u8(vLane1.val[2]); + v_el0 = vpadalq_u8(v_el0, vLane2.val[0]); + v_el1 = vpadalq_u8(v_el1, vLane2.val[1]); + v_el2 = vpadalq_u8(v_el2, vLane2.val[2]); + + v_dst1.val[0] = areaDownsamplingDivision(v_el0); + v_dst1.val[1] = areaDownsamplingDivision(v_el1); + v_dst1.val[2] = areaDownsamplingDivision(v_el2); + } + + uint8x16x3_t v_dst; + v_dst.val[0] = vcombine_u8(v_dst0.val[0], v_dst1.val[0]); + v_dst.val[1] = vcombine_u8(v_dst0.val[1], v_dst1.val[1]); + v_dst.val[2] = vcombine_u8(v_dst0.val[2], v_dst1.val[2]); + + vst3q_u8(dst_row + dj, v_dst); + } +#endif + + for ( ; dj < roiw8; dj += 24, sj += 48) + { + internal::prefetch(src0_row + sj); + internal::prefetch(src1_row + sj); + + uint8x16x3_t vLane1 = vld3q_u8(src0_row + sj); + uint8x16x3_t vLane2 = vld3q_u8(src1_row + sj); + + uint16x8_t v_el0 = vpaddlq_u8(vLane1.val[0]); + uint16x8_t v_el1 = vpaddlq_u8(vLane1.val[1]); + uint16x8_t v_el2 = vpaddlq_u8(vLane1.val[2]); + v_el0 = vpadalq_u8(v_el0, vLane2.val[0]); + v_el1 = vpadalq_u8(v_el1, vLane2.val[1]); + v_el2 = vpadalq_u8(v_el2, vLane2.val[2]); + + uint8x8x3_t v_dst; + v_dst.val[0] = areaDownsamplingDivision(v_el0); + v_dst.val[1] = areaDownsamplingDivision(v_el1); + v_dst.val[2] = areaDownsamplingDivision(v_el2); + + vst3_u8(dst_row + dj, v_dst); + } + + for (size_t dwidth = dsize.width * 3; dj < dwidth; dj += 3, sj += 6) + { + dst_row[dj ] = areaDownsamplingDivision( + (u16)src0_row[sj ] + src0_row[sj + 3] + + src1_row[sj ] + src1_row[sj + 3]); + dst_row[dj + 1] = areaDownsamplingDivision( + (u16)src0_row[sj + 1] + src0_row[sj + 4] + + src1_row[sj + 1] + src1_row[sj + 4]); + dst_row[dj + 2] = areaDownsamplingDivision( + (u16)src0_row[sj + 2] + src0_row[sj + 5] + + src1_row[sj + 2] + src1_row[sj + 5]); + } + } + } + else if ((wr == 0.5f) && (hr == 0.5f)) + { +#ifndef ANDROID + size_t roiw32 = dsize.width >= 31 ? (dsize.width - 31) * 3 : 0; +#endif + size_t roiw16 = dsize.width >= 15 ? 
(dsize.width - 15) * 3 : 0; + + for (size_t i = 0; i < dsize.height; i += 2) + { + const u8 * src_row = internal::getRowPtr(srcBase, srcStride, i >> 1); + u8 * dst0_row = internal::getRowPtr(dstBase, dstStride, i); + u8 * dst1_row = internal::getRowPtr(dstBase, dstStride, std::min(i + 1, dsize.height - 1)); + size_t sj = 0, dj = 0; + +#ifndef ANDROID + for ( ; dj < roiw32; dj += 96, sj += 48) + { + internal::prefetch(src_row + sj); + + uint8x16x3_t v_src = vld3q_u8(src_row + sj); + uint8x16x2_t v_c0 = vzipq_u8(v_src.val[0], v_src.val[0]); + uint8x16x2_t v_c1 = vzipq_u8(v_src.val[1], v_src.val[1]); + uint8x16x2_t v_c2 = vzipq_u8(v_src.val[2], v_src.val[2]); + + uint8x16x3_t v_dst; + v_dst.val[0] = v_c0.val[0]; + v_dst.val[1] = v_c1.val[0]; + v_dst.val[2] = v_c2.val[0]; + vst3q_u8(dst0_row + dj, v_dst); + vst3q_u8(dst1_row + dj, v_dst); + + v_dst.val[0] = v_c0.val[1]; + v_dst.val[1] = v_c1.val[1]; + v_dst.val[2] = v_c2.val[1]; + vst3q_u8(dst0_row + dj + 48, v_dst); + vst3q_u8(dst1_row + dj + 48, v_dst); + } +#endif + + for ( ; dj < roiw16; dj += 48, sj += 24) + { + internal::prefetch(src_row + sj); + + uint8x8x3_t v_src = vld3_u8(src_row + sj); + uint8x8x2_t v_c0 = vzip_u8(v_src.val[0], v_src.val[0]); + uint8x8x2_t v_c1 = vzip_u8(v_src.val[1], v_src.val[1]); + uint8x8x2_t v_c2 = vzip_u8(v_src.val[2], v_src.val[2]); + + uint8x16x3_t v_dst; + v_dst.val[0] = vcombine_u8(v_c0.val[0], v_c0.val[1]); + v_dst.val[1] = vcombine_u8(v_c1.val[0], v_c1.val[1]); + v_dst.val[2] = vcombine_u8(v_c2.val[0], v_c2.val[1]); + vst3q_u8(dst0_row + dj, v_dst); + vst3q_u8(dst1_row + dj, v_dst); + } + + for (size_t dwidth = dsize.width * 3; dj < dwidth; dj += 6, sj += 3) + { + u8 src_val = src_row[sj]; + dst0_row[dj] = dst0_row[dj + 3] = src_val; + dst1_row[dj] = dst1_row[dj + 3] = src_val; + + src_val = src_row[sj + 1]; + dst0_row[dj + 1] = dst0_row[dj + 4] = src_val; + dst1_row[dj + 1] = dst1_row[dj + 4] = src_val; + + src_val = src_row[sj + 2]; + dst0_row[dj + 2] = dst0_row[dj + 5] = src_val; + dst1_row[dj + 2] = dst1_row[dj + 5] = src_val; + } + } + } + else //if ((hr == 4.0f) && (wr == 4.0f)) //the only scale that lasts after isSupported check + { +#ifndef ANDROID + size_t roiw8 = dsize.width >= 7 ? 
(dsize.width - 7) * 3 : 0; +#endif + + for (size_t i = 0; i < dsize.height; ++i) + { + const u8 * src0_row = internal::getRowPtr(srcBase, srcStride, i << 2); + const u8 * src1_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 1); + const u8 * src2_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 2); + const u8 * src3_row = internal::getRowPtr(srcBase, srcStride, (i << 2) + 3); + u8 * dst_row = internal::getRowPtr(dstBase, dstStride, i); + size_t sj = 0, dj = 0; + +#ifndef ANDROID + for ( ; dj < roiw8; dj += 24, sj += 96) + { + internal::prefetch(src0_row + sj); + internal::prefetch(src1_row + sj); + internal::prefetch(src2_row + sj); + internal::prefetch(src3_row + sj); + + uint8x16x3_t vLane10 = vld3q_u8(src0_row + sj), vLane11 = vld3q_u8(src0_row + sj + 48); + uint8x16x3_t vLane20 = vld3q_u8(src1_row + sj), vLane21 = vld3q_u8(src1_row + sj + 48); + uint8x16x3_t vLane30 = vld3q_u8(src2_row + sj), vLane31 = vld3q_u8(src2_row + sj + 48); + uint8x16x3_t vLane40 = vld3q_u8(src3_row + sj), vLane41 = vld3q_u8(src3_row + sj + 48); + + uint8x8x3_t v_dst; + + // channel 0 + { + uint16x8_t v_lane0 = vpaddlq_u8(vLane10.val[0]); + uint16x8_t v_lane1 = vpaddlq_u8(vLane20.val[0]); + uint16x8_t v_lane2 = vpaddlq_u8(vLane30.val[0]); + uint16x8_t v_lane3 = vpaddlq_u8(vLane40.val[0]); + v_lane0 = vaddq_u16(v_lane0, v_lane1); + v_lane0 = vaddq_u16(v_lane0, v_lane2); + v_lane0 = vaddq_u16(v_lane0, v_lane3); + + uint16x8_t v_lane0_ = vpaddlq_u8(vLane11.val[0]); + uint16x8_t v_lane1_ = vpaddlq_u8(vLane21.val[0]); + uint16x8_t v_lane2_ = vpaddlq_u8(vLane31.val[0]); + uint16x8_t v_lane3_ = vpaddlq_u8(vLane41.val[0]); + v_lane0_ = vaddq_u16(v_lane0_, v_lane1_); + v_lane0_ = vaddq_u16(v_lane0_, v_lane2_); + v_lane0_ = vaddq_u16(v_lane0_, v_lane3_); + + v_dst.val[0] = areaDownsamplingDivision( + vcombine_u16(vmovn_u32(vpaddlq_u16(v_lane0)), + vmovn_u32(vpaddlq_u16(v_lane0_)))); + } + + // channel 1 + { + uint16x8_t v_lane0 = vpaddlq_u8(vLane10.val[1]); + uint16x8_t v_lane1 = vpaddlq_u8(vLane20.val[1]); + uint16x8_t v_lane2 = vpaddlq_u8(vLane30.val[1]); + uint16x8_t v_lane3 = vpaddlq_u8(vLane40.val[1]); + v_lane0 = vaddq_u16(v_lane0, v_lane1); + v_lane0 = vaddq_u16(v_lane0, v_lane2); + v_lane0 = vaddq_u16(v_lane0, v_lane3); + + uint16x8_t v_lane0_ = vpaddlq_u8(vLane11.val[1]); + uint16x8_t v_lane1_ = vpaddlq_u8(vLane21.val[1]); + uint16x8_t v_lane2_ = vpaddlq_u8(vLane31.val[1]); + uint16x8_t v_lane3_ = vpaddlq_u8(vLane41.val[1]); + v_lane0_ = vaddq_u16(v_lane0_, v_lane1_); + v_lane0_ = vaddq_u16(v_lane0_, v_lane2_); + v_lane0_ = vaddq_u16(v_lane0_, v_lane3_); + + v_dst.val[1] = areaDownsamplingDivision( + vcombine_u16(vmovn_u32(vpaddlq_u16(v_lane0)), + vmovn_u32(vpaddlq_u16(v_lane0_)))); + } + + // channel 2 + { + uint16x8_t v_lane0 = vpaddlq_u8(vLane10.val[2]); + uint16x8_t v_lane1 = vpaddlq_u8(vLane20.val[2]); + uint16x8_t v_lane2 = vpaddlq_u8(vLane30.val[2]); + uint16x8_t v_lane3 = vpaddlq_u8(vLane40.val[2]); + v_lane0 = vaddq_u16(v_lane0, v_lane1); + v_lane0 = vaddq_u16(v_lane0, v_lane2); + v_lane0 = vaddq_u16(v_lane0, v_lane3); + + uint16x8_t v_lane0_ = vpaddlq_u8(vLane11.val[2]); + uint16x8_t v_lane1_ = vpaddlq_u8(vLane21.val[2]); + uint16x8_t v_lane2_ = vpaddlq_u8(vLane31.val[2]); + uint16x8_t v_lane3_ = vpaddlq_u8(vLane41.val[2]); + v_lane0_ = vaddq_u16(v_lane0_, v_lane1_); + v_lane0_ = vaddq_u16(v_lane0_, v_lane2_); + v_lane0_ = vaddq_u16(v_lane0_, v_lane3_); + + v_dst.val[2] = areaDownsamplingDivision( + vcombine_u16(vmovn_u32(vpaddlq_u16(v_lane0)), + vmovn_u32(vpaddlq_u16(v_lane0_)))); + } 
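+                    // v_dst still holds the three colour planes separately; the
+                    // vst3_u8 below re-interleaves them into 8 packed 3-channel
+                    // output pixels as it stores.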
+ + vst3_u8(dst_row + dj, v_dst); + } +#endif + + for (size_t dwidth = dsize.width * 3; dj < dwidth; dj += 3, sj += 12) + { + dst_row[dj ] = areaDownsamplingDivision( + (u16)src0_row[sj ] + src0_row[sj + 3] + + src0_row[sj + 6] + src0_row[sj + 9] + + src1_row[sj ] + src1_row[sj + 3] + + src1_row[sj + 6] + src1_row[sj + 9] + + src2_row[sj ] + src2_row[sj + 3] + + src2_row[sj + 6] + src2_row[sj + 9] + + src3_row[sj ] + src3_row[sj + 3] + + src3_row[sj + 6] + src3_row[sj + 9]); + + dst_row[dj + 1] = areaDownsamplingDivision( + (u16)src0_row[sj + 1] + src0_row[sj + 4] + + src0_row[sj + 7] + src0_row[sj + 10] + + src1_row[sj + 1] + src1_row[sj + 4] + + src1_row[sj + 7] + src1_row[sj + 10] + + src2_row[sj + 1] + src2_row[sj + 4] + + src2_row[sj + 7] + src2_row[sj + 10] + + src3_row[sj + 1] + src3_row[sj + 4] + + src3_row[sj + 7] + src3_row[sj + 10]); + + dst_row[dj + 2] = areaDownsamplingDivision( + (u16)src0_row[sj + 2] + src0_row[sj + 5] + + src0_row[sj + 8] + src0_row[sj + 11] + + src1_row[sj + 2] + src1_row[sj + 5] + + src1_row[sj + 8] + src1_row[sj + 11] + + src2_row[sj + 2] + src2_row[sj + 5] + + src2_row[sj + 8] + src2_row[sj + 11] + + src3_row[sj + 2] + src3_row[sj + 5] + + src3_row[sj + 8] + src3_row[sj + 11]); + } + } + } + } +#else + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)wr; + (void)hr; +#endif + (void)ssize; +} + +void resizeAreaOpenCV(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, u32 channels) +{ + resizeAreaRounding(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr, channels); +} + +void resizeArea(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, u32 channels) +{ + resizeAreaRounding(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr, channels); +} + +#ifdef CAROTENE_NEON + +namespace { + +uint8x8_t resizeLinearStep(uint8x16_t vr1, uint8x16_t vr2, + uint8x8_t vlutl, uint8x8_t vluth, + float32x4_t vrw, float32x4_t vcw0, float32x4_t vcw1) +{ + uint8x8_t vr1l = internal::vqtbl1_u8(vr1, vlutl); + uint8x8_t vr1h = internal::vqtbl1_u8(vr1, vluth); + uint8x8_t vr2l = internal::vqtbl1_u8(vr2, vlutl); + uint8x8_t vr2h = internal::vqtbl1_u8(vr2, vluth); + + uint16x8_t v1hw = vmovl_u8(vr1h); + uint16x8_t v2hw = vmovl_u8(vr2h); + + int16x8_t v1df = vreinterpretq_s16_u16(vsubl_u8(vr1l, vr1h)); + int16x8_t v2df = vreinterpretq_s16_u16(vsubl_u8(vr2l, vr2h)); + + float32x4_t v1L = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1hw))); + float32x4_t v1H = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1hw))); + float32x4_t v2L = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v2hw))); + float32x4_t v2H = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v2hw))); + + v1L = vmlaq_f32(v1L, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v1df))), vcw0); + v1H = vmlaq_f32(v1H, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v1df))), vcw1); + v2L = vmlaq_f32(v2L, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v2df))), vcw0); + v2H = vmlaq_f32(v2H, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v2df))), vcw1); + + float32x4_t vdiffL = vsubq_f32(v1L, v2L); + float32x4_t vdiffH = vsubq_f32(v1H, v2H); + + float32x4_t vL = vmlaq_f32(v2L, vdiffL, vrw); + float32x4_t vH = vmlaq_f32(v2H, vdiffH, vrw); + uint16x4_t vL_ = vmovn_u32(vcvtq_u32_f32(vL)); + uint16x4_t vH_ = vmovn_u32(vcvtq_u32_f32(vH)); + return vmovn_u16(vcombine_u16(vL_, vH_)); +} + +} // namespace + +namespace { + +void resize_bilinear_rows(const Size2D &ssize, const 
Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 hr, const u8** gcols, u8* gcweight, u8* buf) +{ + f32 scale_y_offset = 0.5f * hr - 0.5f; + + size_t dst_h8 = dsize.height & ~7; + size_t dst_w8 = dsize.width & ~7; + size_t src_w8 = ssize.width & ~7; + + size_t r = 0; + for (; r < dst_h8; r += 8) + { +resize8u_xystretch: + const u8* rows[16]; + u8 rweight[8]; + + for (u32 i = 0; i < 8; ++i) + { + f32 w = (i + r) * hr + scale_y_offset; + ptrdiff_t src_row = floorf(w); + ptrdiff_t src_row2 = src_row + 1; + + rweight[i] = (u8)((src_row2-w) * 128); + + if (src_row < 0) + src_row = 0; + if (src_row2 >= (ptrdiff_t)ssize.height) + src_row2 = ssize.height-1; + + rows[2 * i] = srcBase + src_row * srcStride; + rows[2 * i + 1] = srcBase + src_row2 * srcStride; + } + + uint8x8_t vr0w = vdup_n_u8(rweight[0]); + uint8x8_t vr1w = vdup_n_u8(rweight[1]); + uint8x8_t vr2w = vdup_n_u8(rweight[2]); + uint8x8_t vr3w = vdup_n_u8(rweight[3]); + uint8x8_t vr4w = vdup_n_u8(rweight[4]); + uint8x8_t vr5w = vdup_n_u8(rweight[5]); + uint8x8_t vr6w = vdup_n_u8(rweight[6]); + uint8x8_t vr7w = vdup_n_u8(rweight[7]); + + uint8x8_t vr0w2 = vdup_n_u8(128 - rweight[0]); + uint8x8_t vr1w2 = vdup_n_u8(128 - rweight[1]); + uint8x8_t vr2w2 = vdup_n_u8(128 - rweight[2]); + uint8x8_t vr3w2 = vdup_n_u8(128 - rweight[3]); + uint8x8_t vr4w2 = vdup_n_u8(128 - rweight[4]); + uint8x8_t vr5w2 = vdup_n_u8(128 - rweight[5]); + uint8x8_t vr6w2 = vdup_n_u8(128 - rweight[6]); + uint8x8_t vr7w2 = vdup_n_u8(128 - rweight[7]); + + size_t col = 0; + for(; col < src_w8; col += 8) + { + internal::prefetch(rows[3] + col); + internal::prefetch(rows[7] + col); + internal::prefetch(rows[11] + col); + internal::prefetch(rows[15] + col); +resize8u_ystretch: + uint8x8_t vsrc0l1 = vld1_u8(rows[0] + col); + uint8x8_t vsrc0l2 = vld1_u8(rows[1] + col); + uint8x8_t vsrc1l1 = vld1_u8(rows[2] + col); + uint8x8_t vsrc1l2 = vld1_u8(rows[3] + col); + + // (l1 * w + l2 * (128 - w) + 64) / 128 + uint16x8_t vdst0l = vmull_u8(vsrc0l1, vr0w); + uint16x8_t vdst1l = vmull_u8(vsrc1l1, vr1w); + + uint8x8_t vsrc2l1 = vld1_u8(rows[4] + col); + uint8x8_t vsrc2l2 = vld1_u8(rows[5] + col); + uint8x8_t vsrc3l1 = vld1_u8(rows[6] + col); + uint8x8_t vsrc3l2 = vld1_u8(rows[7] + col); + + vdst0l = vmlal_u8(vdst0l, vsrc0l2, vr0w2); + vdst1l = vmlal_u8(vdst1l, vsrc1l2, vr1w2); + uint16x8_t vdst2l = vmull_u8(vsrc2l1, vr2w); + uint16x8_t vdst3l = vmull_u8(vsrc3l1, vr3w); + + uint8x8_t vsrc4l1 = vld1_u8(rows[8] + col); + uint8x8_t vsrc4l2 = vld1_u8(rows[9] + col); + uint8x8_t vsrc5l1 = vld1_u8(rows[10] + col); + uint8x8_t vsrc5l2 = vld1_u8(rows[11] + col); + + vdst2l = vmlal_u8(vdst2l, vsrc2l2, vr2w2); + vdst3l = vmlal_u8(vdst3l, vsrc3l2, vr3w2); + uint16x8_t vdst4l = vmull_u8(vsrc4l1, vr4w); + uint16x8_t vdst5l = vmull_u8(vsrc5l1, vr5w); + + uint8x8_t vsrc6l1 = vld1_u8(rows[12] + col); + uint8x8_t vsrc6l2 = vld1_u8(rows[13] + col); + uint8x8_t vsrc7l1 = vld1_u8(rows[14] + col); + uint8x8_t vsrc7l2 = vld1_u8(rows[15] + col); + + uint8x8_t vdst0 = vrshrn_n_u16(vdst0l, 7); + uint8x8_t vdst1 = vrshrn_n_u16(vdst1l, 7); + vdst4l = vmlal_u8(vdst4l, vsrc4l2, vr4w2); + vdst5l = vmlal_u8(vdst5l, vsrc5l2, vr5w2); + uint16x8_t vdst6l = vmull_u8(vsrc6l1, vr6w); + uint16x8_t vdst7l = vmull_u8(vsrc7l1, vr7w); + + uint8x8_t vdst2 = vrshrn_n_u16(vdst2l, 7); + uint8x8_t vdst3 = vrshrn_n_u16(vdst3l, 7); + vdst6l = vmlal_u8(vdst6l, vsrc6l2, vr6w2); + vdst7l = vmlal_u8(vdst7l, vsrc7l2, vr7w2); + + uint8x8_t vdst4 = vrshrn_n_u16(vdst4l, 7); + uint8x8_t vdst5 = 
vrshrn_n_u16(vdst5l, 7); + uint8x8_t vdst6 = vrshrn_n_u16(vdst6l, 7); + uint8x8_t vdst7 = vrshrn_n_u16(vdst7l, 7); + + // == 8x8 matrix transpose == + + //00 01 02 03 04 05 06 07 d0 + //10 11 12 13 14 15 16 17 d1 + //20 21 22 23 24 25 26 27 d2 + //30 31 32 33 34 35 36 37 d3 + //40 41 42 43 44 45 46 47 d4 + //50 51 52 53 54 55 56 57 d5 + //60 61 62 63 64 65 66 67 d6 + //70 71 72 73 74 75 76 77 d7 + + uint8x8x2_t vdst10t = vtrn_u8(vdst0, vdst1); + uint8x8x2_t vdst32t = vtrn_u8(vdst2, vdst3); + uint8x8x2_t vdst54t = vtrn_u8(vdst4, vdst5); + uint8x8x2_t vdst76t = vtrn_u8(vdst6, vdst7); + + uint8x16_t vd1d0 = vcombine_u8(vdst10t.val[0], vdst10t.val[1]); + uint8x16_t vd3d2 = vcombine_u8(vdst32t.val[0], vdst32t.val[1]); + uint8x16_t vd5d4 = vcombine_u8(vdst54t.val[0], vdst54t.val[1]); + uint8x16_t vd7d6 = vcombine_u8(vdst76t.val[0], vdst76t.val[1]); + + //00 10 02 12 04 14 06 16 d0 + //01 11 03 13 05 15 07 17 d1 + //20 30 22 32 24 34 26 36 d2 + //21 31 23 33 25 35 27 37 d3 + //40 50 42 52 44 54 46 56 d4 + //41 51 43 53 45 55 47 57 d5 + //60 70 62 72 64 74 66 76 d6 + //61 71 63 73 65 75 67 77 d7 + + uint16x8x2_t vq1q0t = vtrnq_u16((uint16x8_t)vd1d0, (uint16x8_t)vd3d2); + uint16x8x2_t vq3q2t = vtrnq_u16((uint16x8_t)vd5d4, (uint16x8_t)vd7d6); + + //00 10 20 30 04 14 24 34 d0 + //01 11 21 31 05 15 25 35 d1 + //02 12 22 32 06 16 26 36 d2 + //03 13 23 33 07 17 27 37 d3 + //40 50 60 70 44 54 64 74 d4 + //41 51 61 71 45 55 65 75 d5 + //42 52 62 72 46 56 66 76 d6 + //43 53 63 73 47 57 67 77 d7 + + uint32x4x2_t vq2q0t = vtrnq_u32((uint32x4_t)vq1q0t.val[0], (uint32x4_t)vq3q2t.val[0]); + uint32x4x2_t vq3q1t = vtrnq_u32((uint32x4_t)vq1q0t.val[1], (uint32x4_t)vq3q2t.val[1]); + + //00 10 20 30 40 50 60 70 d0 + //01 11 21 31 41 51 61 71 d1 + //02 12 22 32 42 52 62 72 d2 + //03 13 23 33 43 53 63 73 d3 + //04 14 24 34 44 54 64 74 d4 + //05 15 25 35 45 55 65 75 d5 + //06 16 26 36 46 56 66 76 d6 + //07 17 27 37 47 57 67 77 d7 + + vst1q_u8(buf + col * 8 + 0, (uint8x16_t)vq2q0t.val[0]); + vst1q_u8(buf + col * 8 + 16, (uint8x16_t)vq3q1t.val[0]); + vst1q_u8(buf + col * 8 + 32, (uint8x16_t)vq2q0t.val[1]); + vst1q_u8(buf + col * 8 + 48, (uint8x16_t)vq3q1t.val[1]); + } + + if (col < ssize.width) + { + col = ssize.width - 8; + goto resize8u_ystretch; + } + + u8* dst_data = dstBase + r * dstStride; + const u8** cols = gcols; + u8* cweight = gcweight; + + size_t dcol = 0; + for (; dcol < dst_w8; dcol += 8, cols += 16, cweight += 8) + { + internal::prefetch(cols[0], 64*4); +resize8u_xstretch: + uint8x8_t vc0w = vdup_n_u8(cweight[0]); + uint8x8_t vc1w = vdup_n_u8(cweight[1]); + uint8x8_t vc2w = vdup_n_u8(cweight[2]); + uint8x8_t vc3w = vdup_n_u8(cweight[3]); + uint8x8_t vc4w = vdup_n_u8(cweight[4]); + uint8x8_t vc5w = vdup_n_u8(cweight[5]); + uint8x8_t vc6w = vdup_n_u8(cweight[6]); + uint8x8_t vc7w = vdup_n_u8(cweight[7]); + + uint8x8_t vc0w2 = vdup_n_u8(128 - cweight[0]); + uint8x8_t vc1w2 = vdup_n_u8(128 - cweight[1]); + uint8x8_t vc2w2 = vdup_n_u8(128 - cweight[2]); + uint8x8_t vc3w2 = vdup_n_u8(128 - cweight[3]); + uint8x8_t vc4w2 = vdup_n_u8(128 - cweight[4]); + uint8x8_t vc5w2 = vdup_n_u8(128 - cweight[5]); + uint8x8_t vc6w2 = vdup_n_u8(128 - cweight[6]); + uint8x8_t vc7w2 = vdup_n_u8(128 - cweight[7]); + + uint8x8_t vsrc0l1 = vld1_u8(cols[0]); + uint8x8_t vsrc0l2 = vld1_u8(cols[1]); + uint8x8_t vsrc1l1 = vld1_u8(cols[2]); + uint8x8_t vsrc1l2 = vld1_u8(cols[3]); + uint8x8_t vsrc2l1 = vld1_u8(cols[4]); + uint8x8_t vsrc2l2 = vld1_u8(cols[5]); + uint8x8_t vsrc3l1 = vld1_u8(cols[6]); + uint8x8_t vsrc3l2 = vld1_u8(cols[7]); + 
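+            // cols[2*k] and cols[2*k+1] point into the transposed row buffer at
+            // the left and right source columns for output column k, so each of
+            // these vld1_u8 loads fetches the same 8 output rows for one source column.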
uint8x8_t vsrc4l1 = vld1_u8(cols[8]); + uint8x8_t vsrc4l2 = vld1_u8(cols[9]); + uint8x8_t vsrc5l1 = vld1_u8(cols[10]); + uint8x8_t vsrc5l2 = vld1_u8(cols[11]); + uint8x8_t vsrc6l1 = vld1_u8(cols[12]); + uint8x8_t vsrc6l2 = vld1_u8(cols[13]); + uint8x8_t vsrc7l1 = vld1_u8(cols[14]); + uint8x8_t vsrc7l2 = vld1_u8(cols[15]); + + // (l1 * w + l2 * (128 - w) + 64) / 128 + uint16x8_t vdst0l = vmull_u8(vsrc0l1, vc0w); + uint16x8_t vdst1l = vmull_u8(vsrc1l1, vc1w); + uint16x8_t vdst2l = vmull_u8(vsrc2l1, vc2w); + uint16x8_t vdst3l = vmull_u8(vsrc3l1, vc3w); + uint16x8_t vdst4l = vmull_u8(vsrc4l1, vc4w); + uint16x8_t vdst5l = vmull_u8(vsrc5l1, vc5w); + uint16x8_t vdst6l = vmull_u8(vsrc6l1, vc6w); + uint16x8_t vdst7l = vmull_u8(vsrc7l1, vc7w); + + vdst0l = vmlal_u8(vdst0l, vsrc0l2, vc0w2); + vdst1l = vmlal_u8(vdst1l, vsrc1l2, vc1w2); + vdst2l = vmlal_u8(vdst2l, vsrc2l2, vc2w2); + vdst3l = vmlal_u8(vdst3l, vsrc3l2, vc3w2); + vdst4l = vmlal_u8(vdst4l, vsrc4l2, vc4w2); + vdst5l = vmlal_u8(vdst5l, vsrc5l2, vc5w2); + vdst6l = vmlal_u8(vdst6l, vsrc6l2, vc6w2); + vdst7l = vmlal_u8(vdst7l, vsrc7l2, vc7w2); + + uint8x8_t vdst0 = vrshrn_n_u16(vdst0l, 7); + uint8x8_t vdst1 = vrshrn_n_u16(vdst1l, 7); + uint8x8_t vdst2 = vrshrn_n_u16(vdst2l, 7); + uint8x8_t vdst3 = vrshrn_n_u16(vdst3l, 7); + uint8x8_t vdst4 = vrshrn_n_u16(vdst4l, 7); + uint8x8_t vdst5 = vrshrn_n_u16(vdst5l, 7); + uint8x8_t vdst6 = vrshrn_n_u16(vdst6l, 7); + uint8x8_t vdst7 = vrshrn_n_u16(vdst7l, 7); + + // == 8x8 matrix transpose == + uint8x8x2_t vdst10t = vtrn_u8(vdst0, vdst1); + uint8x8x2_t vdst32t = vtrn_u8(vdst2, vdst3); + uint8x8x2_t vdst54t = vtrn_u8(vdst4, vdst5); + uint8x8x2_t vdst76t = vtrn_u8(vdst6, vdst7); + uint8x16_t vd1d0 = vcombine_u8(vdst10t.val[0], vdst10t.val[1]); + uint8x16_t vd3d2 = vcombine_u8(vdst32t.val[0], vdst32t.val[1]); + uint8x16_t vd5d4 = vcombine_u8(vdst54t.val[0], vdst54t.val[1]); + uint8x16_t vd7d6 = vcombine_u8(vdst76t.val[0], vdst76t.val[1]); + uint16x8x2_t vq1q0t = vtrnq_u16((uint16x8_t)vd1d0, (uint16x8_t)vd3d2); + uint16x8x2_t vq3q2t = vtrnq_u16((uint16x8_t)vd5d4, (uint16x8_t)vd7d6); + uint32x4x2_t vq2q0t = vtrnq_u32((uint32x4_t)vq1q0t.val[0], (uint32x4_t)vq3q2t.val[0]); + uint32x4x2_t vq3q1t = vtrnq_u32((uint32x4_t)vq1q0t.val[1], (uint32x4_t)vq3q2t.val[1]); + + //save results + vst1_u8(dst_data + 0 * dstStride + dcol, (uint8x8_t)vget_low_u32(vq2q0t.val[0])); + vst1_u8(dst_data + 1 * dstStride + dcol, (uint8x8_t)vget_high_u32(vq2q0t.val[0])); + vst1_u8(dst_data + 2 * dstStride + dcol, (uint8x8_t)vget_low_u32(vq3q1t.val[0])); + vst1_u8(dst_data + 3 * dstStride + dcol, (uint8x8_t)vget_high_u32(vq3q1t.val[0])); + vst1_u8(dst_data + 4 * dstStride + dcol, (uint8x8_t)vget_low_u32(vq2q0t.val[1])); + vst1_u8(dst_data + 5 * dstStride + dcol, (uint8x8_t)vget_high_u32(vq2q0t.val[1])); + vst1_u8(dst_data + 6 * dstStride + dcol, (uint8x8_t)vget_low_u32(vq3q1t.val[1])); + vst1_u8(dst_data + 7 * dstStride + dcol, (uint8x8_t)vget_high_u32(vq3q1t.val[1])); + } + + if (dcol < dsize.width) + { + dcol = dsize.width - 8; + cols = gcols + dcol * 2; + cweight = gcweight + dcol; + goto resize8u_xstretch; + } + } + + if (r < dsize.height) + { + r = dsize.height - 8; + goto resize8u_xystretch; + } +} + +template struct resizeLinearInternals; +template <> struct resizeLinearInternals<1> +{ + int32x4_t vc_upd; + int32x4_t vc0; + int32x4_t vcmax; + + inline resizeLinearInternals(int32x4_t & vi, u32 srccols) + { + vc_upd = vdupq_n_s32(4); + vc0 = vdupq_n_s32(0); + vcmax = vdupq_n_s32(srccols-1); + + s32 tmp0123[] = {0, 1, 2, 3 }; + vi = 
vld1q_s32(tmp0123); + } + inline void updateIndexes(int32x4_t & vi, int32x4_t & vsrch, int32x4_t & vsrcl) + { + vsrch = vminq_s32(vsrch, vcmax); + vsrcl = vmaxq_s32(vsrcl, vc0); + vsrcl = vminq_s32(vsrcl, vcmax);//for safe tail + vsrch = vshlq_n_s32(vsrch, 3); + vsrcl = vshlq_n_s32(vsrcl, 3); + vi = vaddq_s32(vi, vc_upd); + } +}; +template <> struct resizeLinearInternals<4> +{ + int32x4_t vc_upd; + int32x4_t vc0; + int32x4_t vcmax; + int32x4_t v0123x8; + + inline resizeLinearInternals(int32x4_t & vi, u32 srccols) + { + vc_upd = vdupq_n_s32(1); + vc0 = vdupq_n_s32(0); + vcmax = vdupq_n_s32(srccols-1); + s32 tmp0123x8[] = {0, 8, 16, 24}; + v0123x8 = vld1q_s32(tmp0123x8); + + vi = vc0; + } + inline void updateIndexes(int32x4_t & vi, int32x4_t & vsrch, int32x4_t & vsrcl) + { + vsrch = vminq_s32(vsrch, vcmax); + vsrcl = vmaxq_s32(vsrcl, vc0); + vsrch = vshlq_n_s32(vsrch, 5); + vsrcl = vshlq_n_s32(vsrcl, 5); + vsrch = vaddq_s32(vsrch, v0123x8); + vsrcl = vaddq_s32(vsrcl, v0123x8); + vi = vaddq_s32(vi, vc_upd); + } +}; + +template +void resizeLinearOpenCVchan(const Size2D &_ssize, const Size2D &_dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr) +{ + float scale_x_offset = 0.5f * wr - 0.5f; + + Size2D ssize(_ssize.width*channels, _ssize.height); + Size2D dsize(_dsize.width*channels, _dsize.height); + + std::vector gcweight((dsize.width + 7) & ~7); + std::vector gcols(((dsize.width + 7) & ~7) * 2); + std::vector buf(((ssize.width + 7) & ~7) * 8); // (8 rows) x (width of src) + + float32x4_t vscale_x = vdupq_n_f32(wr); + float32x4_t vscale_x_offset = vdupq_n_f32(scale_x_offset); + int32x4_t vc1 = vdupq_n_s32(1); + float32x4_t vc128f = vdupq_n_f32(128.0f); + + int32x4_t vi; + resizeLinearInternals indexes(vi, _ssize.width);//u32 is used to store indexes + //so we could get issues on src image dimensions greater than (2^32-1) + + for (size_t dcol = 0; dcol < dsize.width; dcol += 8) + { + s32 idx[16]; + + float32x4_t vif = vcvtq_f32_s32(vi); + float32x4_t vw = vmlaq_f32(vscale_x_offset, vscale_x, vif); + int32x4_t vwi = vcvtq_s32_f32(vw); + float32x4_t vwif = vcvtq_f32_s32(vwi); + int32x4_t vmask = (int32x4_t)vcltq_f32(vwif, vw); + int32x4_t vsrch = vsubq_s32(vwi, vmask); + int32x4_t vsrcl = vsubq_s32(vsrch, vc1); + float32x4_t vsrchf = vcvtq_f32_s32(vsrch); + float32x4_t vw2 = vsubq_f32(vsrchf, vw); + + vw2 = vmulq_f32(vw2, vc128f); + uint32x4_t vw32u = vcvtq_u32_f32(vw2); + uint16x4_t vw16ul = vmovn_u32(vw32u); + indexes.updateIndexes(vi, vsrch, vsrcl); + + vst1q_s32(idx + 0, vsrcl); + vst1q_s32(idx + 8, vsrch); + + vif = vcvtq_f32_s32(vi); + vw = vmlaq_f32(vscale_x_offset, vscale_x, vif); + vwi = vcvtq_s32_f32(vw); + vwif = vcvtq_f32_s32(vwi); + vmask = (int32x4_t)vcltq_f32(vwif, vw); + vsrch = vsubq_s32(vwi, vmask); + vsrcl = vsubq_s32(vsrch, vc1); + vsrchf = vcvtq_f32_s32(vsrch); + vw2 = vsubq_f32(vsrchf, vw); + + vw2 = vmulq_f32(vw2, vc128f); + vw32u = vcvtq_u32_f32(vw2); + indexes.updateIndexes(vi, vsrch, vsrcl); + + uint16x4_t vw16uh = vmovn_u32(vw32u); + + vst1q_s32(idx + 4, vsrcl); + vst1q_s32(idx + 12, vsrch); + + uint8x8_t vw8u = vmovn_u16(vcombine_u16(vw16ul, vw16uh)); + + for (u32 i = 0; i < 8; ++i) + { + gcols[dcol * 2 + i*2] = &buf[idx[i]]; + gcols[dcol * 2 + i*2 + 1] = &buf[idx[i + 8]]; + } + + vst1_u8(&gcweight[dcol], vw8u); + } + + resize_bilinear_rows(ssize, dsize, srcBase, srcStride, dstBase, dstStride, hr, &gcols[0], &gcweight[0], &buf[0]); +} + +void downsample_bilinear_8uc1(const Size2D &ssize, const Size2D &dsize, + const 
u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr) +{ + internal::assertSupportedConfiguration(wr <= 2.f && hr <= 2.f); + + enum { SHIFT_BITS = 11 }; + + f32 scale_x_offset = 0.5f * wr - 0.5f; + f32 scale_y_offset = 0.5f * hr - 0.5f; + + std::vector _buf(dsize.height*(2*(sizeof(ptrdiff_t)/sizeof(s32))+1)+1); + ptrdiff_t* buf = (ptrdiff_t*)&_buf[0]; + s32* buf2 = (s32*)buf+2*(sizeof(ptrdiff_t)/sizeof(s32))*dsize.height; + for(size_t row = 0; row < (size_t)dsize.height; ++row) + { + f32 r = row * hr + scale_y_offset; + ptrdiff_t src_row = floorf(r); + ptrdiff_t src_row2 = src_row + 1; + + f32 rweight = src_row2 - r; + buf2[row] = floorf(rweight * (1 << SHIFT_BITS) + 0.5f); + buf[0 * dsize.height + row] = std::max(0, src_row); + buf[1 * dsize.height + row] = std::min((ptrdiff_t)ssize.height-1, src_row2); + } + +#define USE_CORRECT_VERSION 0 + + ptrdiff_t col = 0; +/***********************************************/ + for(; col <= (ptrdiff_t)dsize.width-16; col+=16) + { + ptrdiff_t col1[16]; + ptrdiff_t col2[16]; + s16 cwi[16]; + + for(s32 k = 0; k < 16; ++k) + { + f32 c = (col + k) * wr + scale_x_offset; + col1[k] = (ptrdiff_t)c; + col2[k] = col1[k] + 1; + + cwi[k] = (short)floorf((col2[k] - c) * (1 << SHIFT_BITS) + 0.5f); + + if(col1[k] < 0) col1[k] = 0; + if(col2[k] >= (ptrdiff_t)ssize.width) col2[k] = ssize.width-1; + } + + ptrdiff_t x = std::min(col1[0], (ptrdiff_t)ssize.width-16); + ptrdiff_t y = std::min(col1[8], (ptrdiff_t)ssize.width-16); + u8 lutl[16]; + u8 luth[16]; + for(s32 k = 0; k < 8; ++k) + { + lutl[k] = (u8)(col1[k] - x); + luth[k] = (u8)(col2[k] - x); + lutl[k+8] = (u8)(col1[k+8] - y); + luth[k+8] = (u8)(col2[k+8] - y); + } + + uint8x8_t vlutl = vld1_u8(lutl); + uint8x8_t vluth = vld1_u8(luth); + int16x8_t vcw = vld1q_s16(cwi); + + uint8x8_t vlutl_ = vld1_u8(lutl+8); + uint8x8_t vluth_ = vld1_u8(luth+8); + int16x8_t vcw_ = vld1q_s16(cwi+8); + + for(ptrdiff_t row = 0; row < (ptrdiff_t)dsize.height; ++row) + { +#if USE_CORRECT_VERSION + int32x4_t vrw = vdupq_n_s32(buf2[row]); +#else + int16x8_t vrw = vdupq_n_s16((int16_t)buf2[row]); + int16x8_t vrW = vdupq_n_s16((int16_t)((1 << SHIFT_BITS) - buf2[row])); +#endif + + internal::prefetch(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x, 2*srcStride); + internal::prefetch(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x, 3*srcStride); + + { + union { uint8x16_t v; uint8x8x2_t w; } vr1 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[0*dsize.height + row]) + x) }; + union { uint8x16_t v; uint8x8x2_t w; } vr2 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x) }; + + uint8x8_t vr1l = vtbl2_u8(vr1.w, vlutl); + uint8x8_t vr1h = vtbl2_u8(vr1.w, vluth); + uint8x8_t vr2l = vtbl2_u8(vr2.w, vlutl); + uint8x8_t vr2h = vtbl2_u8(vr2.w, vluth); + + uint16x8_t v1hw = vmovl_u8(vr1h); + uint16x8_t v2hw = vmovl_u8(vr2h); + + int16x8_t v1df = vreinterpretq_s16_u16(vsubl_u8(vr1l, vr1h)); + int16x8_t v2df = vreinterpretq_s16_u16(vsubl_u8(vr2l, vr2h)); + + int32x4_t v1L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v1hw), SHIFT_BITS)); + int32x4_t v1H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v1hw), SHIFT_BITS)); + int32x4_t v2L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v2hw), SHIFT_BITS)); + int32x4_t v2H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v2hw), SHIFT_BITS)); + + v1L = vmlal_s16(v1L, vget_low_s16(v1df), vget_low_s16(vcw)); + v1H = vmlal_s16(v1H, vget_high_s16(v1df), vget_high_s16(vcw)); + v2L = 
vmlal_s16(v2L, vget_low_s16(v2df), vget_low_s16(vcw)); + v2H = vmlal_s16(v2H, vget_high_s16(v2df), vget_high_s16(vcw)); + +#if USE_CORRECT_VERSION + /* correct version */ + int32x4_t vL = vshlq_n_s32(v2L, SHIFT_BITS); + int32x4_t vH = vshlq_n_s32(v2H, SHIFT_BITS); + int32x4_t vdiffL = vsubq_s32(v1L, v2L); + int32x4_t vdiffH = vsubq_s32(v1H, v2H); + + vL = vmlaq_s32(vL, vdiffL, vrw); + vH = vmlaq_s32(vH, vdiffH, vrw); + uint16x4_t vL_ = vqrshrun_n_s32(vL, 2*SHIFT_BITS - 8); + uint16x4_t vH_ = vqrshrun_n_s32(vH, 2*SHIFT_BITS - 8); + uint8x8_t vres = vrshrn_n_u16(vcombine_u16(vL_, vH_), 8); + vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col, vres); +#else + /* ugly version matching to OpenCV's SSE optimization */ + int16x4_t v1Ls = vshrn_n_s32(v1L, 4); + int16x4_t v1Hs = vshrn_n_s32(v1H, 4); + int16x4_t v2Ls = vshrn_n_s32(v2L, 4); + int16x4_t v2Hs = vshrn_n_s32(v2H, 4); + + int16x8_t v1s = vqdmulhq_s16(vcombine_s16(v1Ls, v1Hs), vrw); + int16x8_t v2s = vqdmulhq_s16(vcombine_s16(v2Ls, v2Hs), vrW); + + int16x8_t vsum = vaddq_s16(vshrq_n_s16(v1s,1), vshrq_n_s16(v2s,1)); + uint8x8_t vres = vqrshrun_n_s16(vsum, 2); + + vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col, vres); +#endif + } + + { + union { uint8x16_t v; uint8x8x2_t w; } vr1 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[0*dsize.height + row]) + y) }; + union { uint8x16_t v; uint8x8x2_t w; } vr2 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + y) }; + + uint8x8_t vr1l = vtbl2_u8(vr1.w, vlutl_); + uint8x8_t vr1h = vtbl2_u8(vr1.w, vluth_); + uint8x8_t vr2l = vtbl2_u8(vr2.w, vlutl_); + uint8x8_t vr2h = vtbl2_u8(vr2.w, vluth_); + + uint16x8_t v1hw = vmovl_u8(vr1h); + uint16x8_t v2hw = vmovl_u8(vr2h); + + int16x8_t v1df = vreinterpretq_s16_u16(vsubl_u8(vr1l, vr1h)); + int16x8_t v2df = vreinterpretq_s16_u16(vsubl_u8(vr2l, vr2h)); + + int32x4_t v1L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v1hw), SHIFT_BITS)); + int32x4_t v1H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v1hw), SHIFT_BITS)); + int32x4_t v2L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v2hw), SHIFT_BITS)); + int32x4_t v2H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v2hw), SHIFT_BITS)); + + v1L = vmlal_s16(v1L, vget_low_s16(v1df), vget_low_s16(vcw_)); + v1H = vmlal_s16(v1H, vget_high_s16(v1df), vget_high_s16(vcw_)); + v2L = vmlal_s16(v2L, vget_low_s16(v2df), vget_low_s16(vcw_)); + v2H = vmlal_s16(v2H, vget_high_s16(v2df), vget_high_s16(vcw_)); + +#if USE_CORRECT_VERSION + /* correct version */ + int32x4_t vL = vshlq_n_s32(v2L, SHIFT_BITS); + int32x4_t vH = vshlq_n_s32(v2H, SHIFT_BITS); + int32x4_t vdiffL = vsubq_s32(v1L, v2L); + int32x4_t vdiffH = vsubq_s32(v1H, v2H); + + vL = vmlaq_s32(vL, vdiffL, vrw); + vH = vmlaq_s32(vH, vdiffH, vrw); + uint16x4_t vL_ = vqrshrun_n_s32(vL, 2*SHIFT_BITS - 8); + uint16x4_t vH_ = vqrshrun_n_s32(vH, 2*SHIFT_BITS - 8); + uint8x8_t vres = vrshrn_n_u16(vcombine_u16(vL_, vH_), 8); + vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col + 8, vres); +#else + /* ugly version matching to OpenCV's SSE optimization */ + int16x4_t v1Ls = vshrn_n_s32(v1L, 4); + int16x4_t v1Hs = vshrn_n_s32(v1H, 4); + int16x4_t v2Ls = vshrn_n_s32(v2L, 4); + int16x4_t v2Hs = vshrn_n_s32(v2H, 4); + + int16x8_t v1s = vqdmulhq_s16(vcombine_s16(v1Ls, v1Hs), vrw); + int16x8_t v2s = vqdmulhq_s16(vcombine_s16(v2Ls, v2Hs), vrW); + + int16x8_t vsum = vaddq_s16(vshrq_n_s16(v1s,1), vshrq_n_s16(v2s,1)); + uint8x8_t vres = vqrshrun_n_s16(vsum, 2); + + vst1_u8(internal::getRowPtr(dstBase, dstStride, 
row) + col + 8, vres); +#endif + } + } + } +/***********************************************/ + for(; col <= (ptrdiff_t)dsize.width-8; col+=8) + { +downsample_bilinear_8uc1_col_loop8: + ptrdiff_t col1[8]; + ptrdiff_t col2[8]; + s16 cwi[8]; + + for(s32 k = 0; k < 8; ++k) + { + f32 c = (col + k) * wr + scale_x_offset; + col1[k] = (ptrdiff_t)c; + col2[k] = col1[k] + 1; + + cwi[k] = (s16)floorf((col2[k] - c) * (1 << SHIFT_BITS) + 0.5f); + + if(col1[k] < 0) col1[k] = 0; + if(col2[k] >= (ptrdiff_t)ssize.width) col2[k] = (ptrdiff_t)ssize.width-1; + } + + ptrdiff_t x = std::min(col1[0], (ptrdiff_t)ssize.width-16); + u8 lutl[8]; + u8 luth[8]; + for(s32 k = 0; k < 8; ++k) + { + lutl[k] = (u8)(col1[k] - x); + luth[k] = (u8)(col2[k] - x); + } + + uint8x8_t vlutl = vld1_u8(lutl); + uint8x8_t vluth = vld1_u8(luth); + int16x8_t vcw = vld1q_s16(cwi); + + for(ptrdiff_t row = 0; row < (ptrdiff_t)dsize.height; ++row) + { +#if USE_CORRECT_VERSION + int32x4_t vrw = vdupq_n_s32(buf2[row]); +#else + int16x8_t vrw = vdupq_n_s16((int16_t)buf2[row]); + int16x8_t vrW = vdupq_n_s16((int16_t)((1 << SHIFT_BITS) - buf2[row])); +#endif + + internal::prefetch(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x, 2*srcStride); + internal::prefetch(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x, 3*srcStride); + + union { uint8x16_t v; uint8x8x2_t w; } vr1 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[0*dsize.height + row]) + x) }; + union { uint8x16_t v; uint8x8x2_t w; } vr2 = { vld1q_u8(internal::getRowPtr(srcBase, srcStride, buf[1*dsize.height + row]) + x) }; + + uint8x8_t vr1l = vtbl2_u8(vr1.w, vlutl); + uint8x8_t vr1h = vtbl2_u8(vr1.w, vluth); + uint8x8_t vr2l = vtbl2_u8(vr2.w, vlutl); + uint8x8_t vr2h = vtbl2_u8(vr2.w, vluth); + + uint16x8_t v1hw = vmovl_u8(vr1h); + uint16x8_t v2hw = vmovl_u8(vr2h); + + int16x8_t v1df = vreinterpretq_s16_u16(vsubl_u8(vr1l, vr1h)); + int16x8_t v2df = vreinterpretq_s16_u16(vsubl_u8(vr2l, vr2h)); + + int32x4_t v1L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v1hw), SHIFT_BITS)); + int32x4_t v1H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v1hw), SHIFT_BITS)); + int32x4_t v2L = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(v2hw), SHIFT_BITS)); + int32x4_t v2H = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(v2hw), SHIFT_BITS)); + + v1L = vmlal_s16(v1L, vget_low_s16(v1df), vget_low_s16(vcw)); + v1H = vmlal_s16(v1H, vget_high_s16(v1df), vget_high_s16(vcw)); + v2L = vmlal_s16(v2L, vget_low_s16(v2df), vget_low_s16(vcw)); + v2H = vmlal_s16(v2H, vget_high_s16(v2df), vget_high_s16(vcw)); + +#if USE_CORRECT_VERSION + /* correct version */ + int32x4_t vL = vshlq_n_s32(v2L, SHIFT_BITS); + int32x4_t vH = vshlq_n_s32(v2H, SHIFT_BITS); + int32x4_t vdiffL = vsubq_s32(v1L, v2L); + int32x4_t vdiffH = vsubq_s32(v1H, v2H); + + vL = vmlaq_s32(vL, vdiffL, vrw); + vH = vmlaq_s32(vH, vdiffH, vrw); + uint16x4_t vL_ = vqrshrun_n_s32(vL, 2*SHIFT_BITS - 8); + uint16x4_t vH_ = vqrshrun_n_s32(vH, 2*SHIFT_BITS - 8); + uint8x8_t vres = vrshrn_n_u16(vcombine_u16(vL_, vH_), 8); + vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col, vres); +#else + /* ugly version matching to OpenCV's SSE optimization */ + int16x4_t v1Ls = vshrn_n_s32(v1L, 4); + int16x4_t v1Hs = vshrn_n_s32(v1H, 4); + int16x4_t v2Ls = vshrn_n_s32(v2L, 4); + int16x4_t v2Hs = vshrn_n_s32(v2H, 4); + + int16x8_t v1s = vqdmulhq_s16(vcombine_s16(v1Ls, v1Hs), vrw); + int16x8_t v2s = vqdmulhq_s16(vcombine_s16(v2Ls, v2Hs), vrW); + + int16x8_t vsum = vaddq_s16(vshrq_n_s16(v1s,1), 
vshrq_n_s16(v2s,1)); + uint8x8_t vres = vqrshrun_n_s16(vsum, 2); + + vst1_u8(internal::getRowPtr(dstBase, dstStride, row) + col, vres); +#endif + } + } + if (col < (ptrdiff_t)dsize.width) + { + col = dsize.width - 8; + goto downsample_bilinear_8uc1_col_loop8; + } +} + +} // namespace + +#endif + +void resizeLinearOpenCV(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, u32 channels) +{ + internal::assertSupportedConfiguration(wr > 0 && hr > 0 && + (dsize.width - 0.5) * wr - 0.5 < ssize.width && + (dsize.height - 0.5) * hr - 0.5 < ssize.height && // Ensure we have enough source data + (dsize.width + 0.5) * wr + 0.5 >= ssize.width && + (dsize.height + 0.5) * hr + 0.5 >= ssize.height && // Ensure source isn't too big + isResizeLinearOpenCVSupported(ssize, dsize, channels)); +#ifdef CAROTENE_NEON + if(1 == channels) + { + if (wr <= 1.f && hr <= 1.f) + resizeLinearOpenCVchan<1>(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr); + else if (wr <= 2.0f && hr <= 2.0f && ssize.width >= 16) + downsample_bilinear_8uc1(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr); + else + resizeLinearOpenCVchan<1>(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr); + } + else if(4 == channels) + resizeLinearOpenCVchan<4>(ssize, dsize, srcBase, srcStride, dstBase, dstStride, wr, hr); +#else + (void)ssize; + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)wr; + (void)hr; + (void)channels; +#endif +} + +void resizeLinear(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + u8 * dstBase, ptrdiff_t dstStride, + f32 wr, f32 hr, u32 channels) +{ + internal::assertSupportedConfiguration(wr > 0 && hr > 0 && + (dsize.width - 0.5) * wr - 0.5 < ssize.width && + (dsize.height - 0.5) * hr - 0.5 < ssize.height && // Ensure we have enough source data + (dsize.width + 0.5) * wr + 0.5 >= ssize.width && + (dsize.height + 0.5) * hr + 0.5 >= ssize.height && // Ensure source isn't too big + isResizeLinearSupported(ssize, dsize, + wr, hr, channels)); +#ifdef CAROTENE_NEON + f32 scale_x = wr; + f32 scale_x_offset = 0.5f * scale_x - 0.5f; + f32 scale_y = hr; + f32 scale_y_offset = 0.5f * scale_y - 0.5f; + + std::vector _buf(dsize.height * 3 + 1); + std::vector coeff(dsize.height); + ptrdiff_t * buf = &_buf[0]; + + for (size_t row = 0; row < dsize.height; ++row) + { + f32 r = row * scale_y + scale_y_offset; + ptrdiff_t src_row = floorf(r); + ptrdiff_t src_row2 = src_row + 1; + + f32 rweight = src_row2 - r; + buf[0 * dsize.height + row] = std::max(0, src_row); + buf[1 * dsize.height + row] = std::min(ssize.height - 1, src_row2); + coeff[row] = rweight; + } + + size_t col = 0; + for ( ; col + 16 <= dsize.width; col += 16) + { + ptrdiff_t col1[16], col2[16]; + f32 cwi[16]; + + for(s32 k = 0; k < 16; ++k) + { + f32 c = (col + k) * scale_x + scale_x_offset; + col1[k] = floorf(c); + col2[k] = col1[k] + 1; + + cwi[k] = col2[k] - c; + + if (col1[k] < 0) + col1[k] = 0; + if (col2[k] >= (ptrdiff_t)ssize.width) + col2[k] = ssize.width - 1; + } + + ptrdiff_t x = std::min(col1[0], ssize.width - 16); + ptrdiff_t y = std::min(col1[8], ssize.width - 16); + u8 lutl[16], luth[16]; + + for (s32 k = 0; k < 8; ++k) + { + lutl[k] = (u8)(col1[k] - x); + luth[k] = (u8)(col2[k] - x); + lutl[k + 8] = (u8)(col1[k + 8] - y); + luth[k + 8] = (u8)(col2[k + 8] - y); + } + + uint8x8_t vlutl = vld1_u8(lutl); + uint8x8_t vluth = vld1_u8(luth); + float32x4_t vcw0 = 
vld1q_f32(cwi); + float32x4_t vcw1 = vld1q_f32(cwi + 4); + + uint8x8_t vlutl_ = vld1_u8(lutl + 8); + uint8x8_t vluth_ = vld1_u8(luth + 8); + float32x4_t vcw0_ = vld1q_f32(cwi + 8); + float32x4_t vcw1_ = vld1q_f32(cwi + 12); + + if (channels == 1) + { + for (size_t row = 0; row < dsize.height; ++row) + { + float32x4_t vrw = vdupq_n_f32(coeff[row]); + + const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]); + u8 * drow = internal::getRowPtr(dstBase, dstStride, row); + + internal::prefetch(srow0 + x + 2 * srcStride); + internal::prefetch(srow1 + x + 2 * srcStride); + + uint8x8_t vres0 = resizeLinearStep(vld1q_u8(srow0 + x), vld1q_u8(srow1 + x), + vlutl, vluth, + vrw, vcw0, vcw1); + + uint8x8_t vres1 = resizeLinearStep(vld1q_u8(srow0 + y), vld1q_u8(srow1 + y), + vlutl_, vluth_, + vrw, vcw0_, vcw1_); + + vst1q_u8(drow + col, vcombine_u8(vres0, vres1)); + } + } + else if (channels == 3) + { + for (size_t row = 0; row < dsize.height; ++row) + { + float32x4_t vrw = vdupq_n_f32(coeff[row]); + + const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]); + u8 * drow = internal::getRowPtr(dstBase, dstStride, row); + + internal::prefetch(srow0 + x + 2 * srcStride); + internal::prefetch(srow1 + x + 2 * srcStride); + + uint8x16x3_t v_src10 = vld3q_u8(srow0 + (x * 3)); + uint8x16x3_t v_src20 = vld3q_u8(srow1 + (x * 3)); + + uint8x16x3_t v_src11 = vld3q_u8(srow0 + (y * 3)); + uint8x16x3_t v_src21 = vld3q_u8(srow1 + (y * 3)); + + uint8x16x3_t v_dst; + + v_dst.val[0] = vcombine_u8(resizeLinearStep(v_src10.val[0], v_src20.val[0], vlutl, vluth, vrw, vcw0, vcw1), + resizeLinearStep(v_src11.val[0], v_src21.val[0], vlutl_, vluth_, vrw, vcw0_, vcw1_)); + v_dst.val[1] = vcombine_u8(resizeLinearStep(v_src10.val[1], v_src20.val[1], vlutl, vluth, vrw, vcw0, vcw1), + resizeLinearStep(v_src11.val[1], v_src21.val[1], vlutl_, vluth_, vrw, vcw0_, vcw1_)); + v_dst.val[2] = vcombine_u8(resizeLinearStep(v_src10.val[2], v_src20.val[2], vlutl, vluth, vrw, vcw0, vcw1), + resizeLinearStep(v_src11.val[2], v_src21.val[2], vlutl_, vluth_, vrw, vcw0_, vcw1_)); + + vst3q_u8(drow + (col * 3), v_dst); + } + } + else if (channels == 4) + { + for (size_t row = 0; row < dsize.height; ++row) + { + float32x4_t vrw = vdupq_n_f32(coeff[row]); + + const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]); + u8 * drow = internal::getRowPtr(dstBase, dstStride, row); + + internal::prefetch(srow0 + x + 2 * srcStride); + internal::prefetch(srow1 + x + 2 * srcStride); + + uint8x16x4_t v_src10 = vld4q_u8(srow0 + (x << 2)); + uint8x16x4_t v_src20 = vld4q_u8(srow1 + (x << 2)); + + uint8x16x4_t v_src11 = vld4q_u8(srow0 + (y << 2)); + uint8x16x4_t v_src21 = vld4q_u8(srow1 + (y << 2)); + + uint8x16x4_t v_dst; + + v_dst.val[0] = vcombine_u8(resizeLinearStep(v_src10.val[0], v_src20.val[0], vlutl, vluth, vrw, vcw0, vcw1), + resizeLinearStep(v_src11.val[0], v_src21.val[0], vlutl_, vluth_, vrw, vcw0_, vcw1_)); + v_dst.val[1] = vcombine_u8(resizeLinearStep(v_src10.val[1], v_src20.val[1], vlutl, vluth, vrw, vcw0, vcw1), + resizeLinearStep(v_src11.val[1], v_src21.val[1], vlutl_, vluth_, vrw, vcw0_, vcw1_)); + v_dst.val[2] = vcombine_u8(resizeLinearStep(v_src10.val[2], v_src20.val[2], vlutl, vluth, vrw, 
vcw0, vcw1), + resizeLinearStep(v_src11.val[2], v_src21.val[2], vlutl_, vluth_, vrw, vcw0_, vcw1_)); + v_dst.val[3] = vcombine_u8(resizeLinearStep(v_src10.val[3], v_src20.val[3], vlutl, vluth, vrw, vcw0, vcw1), + resizeLinearStep(v_src11.val[3], v_src21.val[3], vlutl_, vluth_, vrw, vcw0_, vcw1_)); + + vst4q_u8(drow + (col << 2), v_dst); + } + } + } + + for ( ; col + 8 <= dsize.width; col += 8) + { +downsample_bilinear_8uc1_col_loop8: + ptrdiff_t col1[8], col2[8]; + f32 cwi[8]; + + for (s32 k = 0; k < 8; ++k) + { + f32 c = (col + k) * scale_x + scale_x_offset; + col1[k] = floorf(c); + col2[k] = col1[k] + 1; + + cwi[k] = col2[k] - c; + + if (col1[k] < 0) + col1[k] = 0; + if (col2[k] >= (ptrdiff_t)ssize.width) + col2[k] = ssize.width - 1; + } + + ptrdiff_t x = std::min(col1[0], ssize.width - 16); + u8 lutl[8], luth[8]; + for (s32 k = 0; k < 8; ++k) + { + lutl[k] = (u8)(col1[k] - x); + luth[k] = (u8)(col2[k] - x); + } + + uint8x8_t vlutl = vld1_u8(lutl); + uint8x8_t vluth = vld1_u8(luth); + float32x4_t vcw0 = vld1q_f32(cwi); + float32x4_t vcw1 = vld1q_f32(cwi + 4); + + if (channels == 1) + { + for (size_t row = 0; row < dsize.height; ++row) + { + float32x4_t vrw = vdupq_n_f32(coeff[row]); + + const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]); + u8 * drow = internal::getRowPtr(dstBase, dstStride, row); + + internal::prefetch(srow0 + x + 2 * srcStride); + internal::prefetch(srow1 + x + 2 * srcStride); + + uint8x8_t vres = resizeLinearStep(vld1q_u8(srow0 + x), vld1q_u8(srow1 + x), + vlutl, vluth, + vrw, vcw0, vcw1); + vst1_u8(drow + col, vres); + } + } + else if (channels == 3) + { + for (size_t row = 0; row < dsize.height; ++row) + { + float32x4_t vrw = vdupq_n_f32(coeff[row]); + + const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]); + u8 * drow = internal::getRowPtr(dstBase, dstStride, row); + + internal::prefetch(srow0 + x + 2 * srcStride); + internal::prefetch(srow1 + x + 2 * srcStride); + + uint8x16x3_t v_src1 = vld3q_u8(srow0 + (x * 3)); + uint8x16x3_t v_src2 = vld3q_u8(srow1 + (x * 3)); + + uint8x8x3_t v_dst; + + v_dst.val[0] = resizeLinearStep(v_src1.val[0], v_src2.val[0], vlutl, vluth, vrw, vcw0, vcw1); + v_dst.val[1] = resizeLinearStep(v_src1.val[1], v_src2.val[1], vlutl, vluth, vrw, vcw0, vcw1); + v_dst.val[2] = resizeLinearStep(v_src1.val[2], v_src2.val[2], vlutl, vluth, vrw, vcw0, vcw1); + + vst3_u8(drow + (col * 3), v_dst); + } + } + else if (channels == 4) + { + for (size_t row = 0; row < dsize.height; ++row) + { + float32x4_t vrw = vdupq_n_f32(coeff[row]); + + const u8 * srow0 = internal::getRowPtr(srcBase, srcStride, buf[0 * dsize.height + row]); + const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, buf[1 * dsize.height + row]); + u8 * drow = internal::getRowPtr(dstBase, dstStride, row); + + internal::prefetch(srow0 + x + 2 * srcStride); + internal::prefetch(srow1 + x + 2 * srcStride); + + uint8x16x4_t v_src1 = vld4q_u8(srow0 + (x << 2)); + uint8x16x4_t v_src2 = vld4q_u8(srow1 + (x << 2)); + + uint8x8x4_t v_dst; + + v_dst.val[0] = resizeLinearStep(v_src1.val[0], v_src2.val[0], vlutl, vluth, vrw, vcw0, vcw1); + v_dst.val[1] = resizeLinearStep(v_src1.val[1], v_src2.val[1], vlutl, vluth, vrw, vcw0, vcw1); + v_dst.val[2] = resizeLinearStep(v_src1.val[2], v_src2.val[2], vlutl, vluth, vrw, vcw0, vcw1); + v_dst.val[3] = 
resizeLinearStep(v_src1.val[3], v_src2.val[3], vlutl, vluth, vrw, vcw0, vcw1); + + vst4_u8(drow + (col << 2), v_dst); + } + } + } + + if (col < dsize.width) + { + col = dsize.width - 8; + goto downsample_bilinear_8uc1_col_loop8; + } + +#else + (void)ssize; + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)wr; + (void)hr; + (void)channels; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/saturate_cast.hpp b/3rdparty/carotene/src/saturate_cast.hpp new file mode 100644 index 0000000000..98f8545009 --- /dev/null +++ b/3rdparty/carotene/src/saturate_cast.hpp @@ -0,0 +1,199 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */
+
+#ifndef CAROTENE_SATURATE_CAST_HPP
+#define CAROTENE_SATURATE_CAST_HPP
+
+#include <algorithm>
+#include <climits>
+#include <cmath>
+
+#if defined _MSC_VER && defined _M_ARM
+# include <intrin.h>
+#endif
+
+#include <carotene/definitions.hpp>
+#include <carotene/types.hpp>
+
+namespace CAROTENE_NS { namespace internal {
+
+#if defined _MSC_VER && defined _M_ARM
+
+__declspec(naked) static void vcvtr_s32_f64_imp(f64 d)
+{
+    (void)d;
+    __emit(0xEEBD); // vcvtr.s32.f64 s0, d0
+    __emit(0x0B40);
+    __emit(0xEE10); // vmov r0, s0
+    __emit(0x0A10);
+    __emit(0x4770); // bx lr
+}
+
+# define CAROTENE_ROUND_FLT(x) return ((s32 (*)(f64))vcvtr_s32_f64_imp)((f64)x);
+# define CAROTENE_ROUND_DBL(x) return ((s32 (*)(f64))vcvtr_s32_f64_imp)(x);
+
+#elif defined CV_ICC || defined __GNUC__
+
+# if defined(__VFP_FP__) && !defined(__SOFTFP__) && !(defined _DEBUG || defined DEBUG) && !defined(__CUDACC__)
+#  define CAROTENE_ROUND_FLT(value) { \
+        register union { f32 f; s32 i; } result; \
+        asm ("ftosis %0, %1 \n" : "=w" (result.f) : "w" (value) ); \
+        return result.i; }
+#  define CAROTENE_ROUND_DBL(value) { \
+        register union { f32 f; s32 i; } __tegra_result; \
+        asm ( \
+            "ftosid %0, %P1\n" \
+            : "=w" (__tegra_result.f) \
+            : "w" (value) \
+        ); \
+        return __tegra_result.i; \
+    }
+# else
+#  define CAROTENE_ROUND_FLT(value) return (s32)lrintf(value);
+#  define CAROTENE_ROUND_DBL(value) return (s32)lrint(value);
+# endif
+
+#endif
+
+inline s32 round(f32 value)
+{
+#ifdef CAROTENE_ROUND_FLT
+    CAROTENE_ROUND_FLT(value)
+#else
+    s32 intpart = (s32)(value);
+    f32 fractpart = value - intpart;
+    if ((fractpart != 0.5 && fractpart != -0.5) || ((intpart % 2) != 0))
+        return (s32)(value + (value >= 0 ? 0.5 : -0.5));
+    else
+        return intpart;
+#endif
+}
+
+inline s32 round(f64 value)
+{
+#ifdef CAROTENE_ROUND_DBL
+    CAROTENE_ROUND_DBL(value)
+#else
+    s32 intpart = (s32)(value);
+    f64 fractpart = value - intpart;
+    if ((fractpart != 0.5 && fractpart != -0.5) || ((intpart % 2) != 0))
+        return (s32)(value + (value >= 0 ? 0.5 : -0.5));
+    else
+        return intpart;
+#endif
+}
+/////////////// saturate_cast (used in image & signal processing) ///////////////////
+
+template<typename _Tp> inline _Tp saturate_cast(u8 v)  { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(s8 v)  { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(u16 v) { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(s16 v) { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(u32 v) { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(s32 v) { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(s64 v) { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(u64 v) { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(f32 v) { return _Tp(v); }
+template<typename _Tp> inline _Tp saturate_cast(f64 v) { return _Tp(v); }
+
+template<> inline u8 saturate_cast<u8>(s8 v)  { return (u8)std::max((s32)v, 0); }
+template<> inline u8 saturate_cast<u8>(u16 v) { return (u8)std::min((u32)v, (u32)UCHAR_MAX); }
+template<> inline u8 saturate_cast<u8>(s32 v) { return (u8)((u32)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
+template<> inline u8 saturate_cast<u8>(s16 v) { return saturate_cast<u8>((s32)v); }
+template<> inline u8 saturate_cast<u8>(u32 v) { return (u8)std::min(v, (u32)UCHAR_MAX); }
+template<> inline u8 saturate_cast<u8>(s64 v) { return (u8)((u64)v <= UCHAR_MAX ? v : v > 0 ?
UCHAR_MAX : 0); }
+template<> inline u8 saturate_cast<u8>(u64 v) { return (u8)std::min(v, (u64)UCHAR_MAX); }
+template<> inline u8 saturate_cast<u8>(f32 v) { return saturate_cast<u8>(round(v)); }
+template<> inline u8 saturate_cast<u8>(f64 v) { return saturate_cast<u8>(round(v)); }
+
+template<> inline s8 saturate_cast<s8>(u8 v)  { return (s8)std::min((s32)v, SCHAR_MAX); }
+template<> inline s8 saturate_cast<s8>(u16 v) { return (s8)std::min((u32)v, (u32)SCHAR_MAX); }
+template<> inline s8 saturate_cast<s8>(s32 v) { return (s8)((u32)(v-SCHAR_MIN) <= (u32)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
+template<> inline s8 saturate_cast<s8>(s16 v) { return saturate_cast<s8>((s32)v); }
+template<> inline s8 saturate_cast<s8>(u32 v) { return (s8)std::min(v, (u32)SCHAR_MAX); }
+template<> inline s8 saturate_cast<s8>(s64 v) { return (s8)((u64)(v-SCHAR_MIN) <= (u64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
+template<> inline s8 saturate_cast<s8>(u64 v) { return (s8)std::min(v, (u64)SCHAR_MAX); }
+template<> inline s8 saturate_cast<s8>(f32 v) { return saturate_cast<s8>(round(v)); }
+template<> inline s8 saturate_cast<s8>(f64 v) { return saturate_cast<s8>(round(v)); }
+
+template<> inline u16 saturate_cast<u16>(s8 v)  { return (u16)std::max((s32)v, 0); }
+template<> inline u16 saturate_cast<u16>(s16 v) { return (u16)std::max((s32)v, 0); }
+template<> inline u16 saturate_cast<u16>(s32 v) { return (u16)((u32)v <= (u32)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
+template<> inline u16 saturate_cast<u16>(u32 v) { return (u16)std::min(v, (u32)USHRT_MAX); }
+template<> inline u16 saturate_cast<u16>(s64 v) { return (u16)((u64)v <= (u64)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
+template<> inline u16 saturate_cast<u16>(u64 v) { return (u16)std::min(v, (u64)USHRT_MAX); }
+template<> inline u16 saturate_cast<u16>(f32 v) { return saturate_cast<u16>(round(v)); }
+template<> inline u16 saturate_cast<u16>(f64 v) { return saturate_cast<u16>(round(v)); }
+
+template<> inline s16 saturate_cast<s16>(u16 v) { return (s16)std::min((s32)v, SHRT_MAX); }
+template<> inline s16 saturate_cast<s16>(s32 v) { return (s16)((u32)(v - SHRT_MIN) <= (u32)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
+template<> inline s16 saturate_cast<s16>(u32 v) { return (s16)std::min(v, (u32)SHRT_MAX); }
+template<> inline s16 saturate_cast<s16>(s64 v) { return (s16)((u64)(v - SHRT_MIN) <= (u64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
+template<> inline s16 saturate_cast<s16>(u64 v) { return (s16)std::min(v, (u64)SHRT_MAX); }
+template<> inline s16 saturate_cast<s16>(f32 v) { return saturate_cast<s16>(round(v)); }
+template<> inline s16 saturate_cast<s16>(f64 v) { return saturate_cast<s16>(round(v)); }
+
+template<> inline u32 saturate_cast<u32>(s8 v)  { return (u32)std::max(v, (s8)0); }
+template<> inline u32 saturate_cast<u32>(s16 v) { return (u32)std::max(v, (s16)0); }
+template<> inline u32 saturate_cast<u32>(s32 v) { return (u32)std::max(v, (s32)0); }
+template<> inline u32 saturate_cast<u32>(s64 v) { return (u32)((u64)v <= (u64)UINT_MAX ? v : v > 0 ? UINT_MAX : 0); }
+template<> inline u32 saturate_cast<u32>(u64 v) { return (u32)std::min(v, (u64)UINT_MAX); }
+//OpenCV like f32/f64 -> u32 conversion
+//we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc.
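+//For example (behaviour implied by the two overloads below):
+//  saturate_cast<u32>( 3.7f) == 4           (round to nearest)
+//  saturate_cast<u32>(-1.0f) == 0xffffffff  (the negative value wraps instead of clamping to 0)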
+template<> inline u32 saturate_cast<u32>(f32 v) { return round(v); }
+template<> inline u32 saturate_cast<u32>(f64 v) { return round(v); }
+//Negative clipping implementation:
+//template<> inline u32 saturate_cast<u32>(f32 v) { return saturate_cast<u32>(round(v)); }
+//template<> inline u32 saturate_cast<u32>(f64 v) { return saturate_cast<u32>(round(v)); }
+
+template<> inline s32 saturate_cast<s32>(u32 v) { return (s32)std::min(v, (u32)INT_MAX); }
+template<> inline s32 saturate_cast<s32>(s64 v) { return (s32)((u64)(v - INT_MIN) <= (u64)UINT_MAX ? v : v > 0 ? INT_MAX : INT_MIN); }
+template<> inline s32 saturate_cast<s32>(u64 v) { return (s32)std::min(v, (u64)INT_MAX); }
+template<> inline s32 saturate_cast<s32>(f32 v) { return round(v); }
+template<> inline s32 saturate_cast<s32>(f64 v) { return round(v); }
+
+template<> inline u64 saturate_cast<u64>(s8 v)  { return (u64)std::max(v, (s8)0); }
+template<> inline u64 saturate_cast<u64>(s16 v) { return (u64)std::max(v, (s16)0); }
+template<> inline u64 saturate_cast<u64>(s32 v) { return (u64)std::max(v, (s32)0); }
+template<> inline u64 saturate_cast<u64>(s64 v) { return (u64)std::max(v, (s64)0); }
+
+template<> inline s64 saturate_cast<s64>(u64 v) { return (s64)std::min(v, (u64)LLONG_MAX); }
+
+} }
+
+#endif
diff --git a/3rdparty/carotene/src/scharr.cpp b/3rdparty/carotene/src/scharr.cpp
new file mode 100644
index 0000000000..2c4ba29742
--- /dev/null
+++ b/3rdparty/carotene/src/scharr.cpp
@@ -0,0 +1,219 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include <vector>
+
+#include "common.hpp"
+
+namespace CAROTENE_NS {
+
+bool isScharr3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin)
+{
+    return (dx == 0 && dy == 1 &&
+            isSeparableFilter3x3Supported(size, border, 3, 1, borderMargin)) ||
+           (dx == 1 && dy == 0 &&
+            isSeparableFilter3x3Supported(size, border, 1, 3, borderMargin));
+}
+
+void Scharr3x3(const Size2D &size,
+               const u8 * srcBase, ptrdiff_t srcStride,
+               s16 * dstBase, ptrdiff_t dstStride,
+               s32 dx, s32 dy,
+               BORDER_MODE border, u8 borderValue, Margin borderMargin)
+{
+    internal::assertSupportedConfiguration(isScharr3x3Supported(size, border, dx, dy, borderMargin));
+#ifdef CAROTENE_NEON
+    // Scharr is separable: [3 10 3] smoothing along one axis, [-1 0 1]
+    // derivative along the other.
+    static s16 dw[] = {3, 10, 3};
+
+    if (dy == 1)
+        SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
+                           3, 1, dw, 0,
+                           border, borderValue, borderMargin);
+    else
+        SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
+                           1, 3, 0, dw,
+                           border, borderValue, borderMargin);
+#else
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)borderValue;
+#endif
+}
+
+void ScharrDeriv(const Size2D &size, s32 cn,
+                 const u8 * srcBase, ptrdiff_t srcStride,
+                 s16 * dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    size_t colsn = size.width*cn;
+    size_t roiw8 = colsn > 7 ? colsn - 7 : 0;
+
+    ptrdiff_t delta = (ptrdiff_t)(((size.width + 2)*cn + 15) & -16);//align size
+    std::vector<s16> _tempBuf((delta << 1) + 64);
+    s16 *trow0 = internal::alignPtr(&_tempBuf[cn], 16), *trow1 = internal::alignPtr(trow0 + delta, 16);
+
+    int16x8_t vc3 = vmovq_n_s16(3);
+    int16x8_t vc10 = vmovq_n_s16(10);
+    uint8x8_t v8c10 = vmov_n_u8(10);
+
+    for(size_t y = 0; y < size.height; y++ )
+    {
+        const u8* srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : size.height > 1 ? 1 : 0);
+        const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y);
+        const u8* srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height > 1 ? size.height-2 : 0);
+        s16* drow = internal::getRowPtr(dstBase, dstStride, y);
+
+        // do vertical convolution
+        size_t x = 0;
+        for( ; x < roiw8; x += 8 )
+        {
+            internal::prefetch(srow0 + x);
+            internal::prefetch(srow1 + x);
+            internal::prefetch(srow2 + x);
+#if __GNUC_MINOR__ < 7
+            __asm__ (
+                "vld1.8 {d0}, [%[src0]]                \n\t"
+                "vld1.8 {d2}, [%[src2]]                \n\t"
+                "vld1.8 {d1}, [%[src1]]                \n\t"
+                "vaddl.u8 q2, d2, d0                   \n\t"
+                "vmull.u8 q3, d1, %[vc10]              \n\t"
+                "vsubl.u8 q4, d2, d0                   \n\t"
+                "vmla.s16 q3, q2, %q[vc3]              \n\t"
+                "vst1.16 {d8-d9}, [%[out1],:128]       \n\t"
+                "vst1.16 {d6-d7}, [%[out0],:128]       \n\t"
+                :
+                : [out0] "r" (trow0 + x),
+                  [out1] "r" (trow1 + x),
+                  [src0] "r" (srow0 + x),
+                  [src1] "r" (srow1 + x),
+                  [src2] "r" (srow2 + x),
+                  [vc10] "w" (v8c10), [vc3] "w" (vc3)
+                : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15"
+            );
+#else
+            uint8x8_t s0 = vld1_u8(srow0 + x);
+            uint8x8_t s1 = vld1_u8(srow1 + x);
+            uint8x8_t s2 = vld1_u8(srow2 + x);
+
+            int16x8_t s1x10 = vreinterpretq_s16_u16(vmull_u8(s1, v8c10));
+            int16x8_t s02 = vreinterpretq_s16_u16(vaddl_u8(s2, s0));
+            int16x8_t t1 = vreinterpretq_s16_u16(vsubl_u8(s2, s0));
+            int16x8_t t0 = vmlaq_s16(s1x10, s02, vc3);
+
+            vst1q_s16(trow1 + x, t1);
+            vst1q_s16(trow0 + x, t0);
+#endif
+        }
+        for( ; x < colsn; x++ )
+        {
+            trow0[x] = (s16)((srow0[x] + srow2[x])*3 + srow1[x]*10);
+            trow1[x] = (s16)(srow2[x] - srow0[x]);
+        }
+
+        // make border
+        size_t x0 = (size.width > 1 ? cn : 0), x1 = (size.width > 1 ?
(size.width-2)*cn : 0); + for( s32 k = 0; k < cn; k++ ) + { + trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k]; + trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k]; + } + + // do horizontal convolution, interleave the results and store them to dst + x = 0; + for( ; x < roiw8; x += 8 ) + { +#if __GNUC_MINOR__ < 6 + __asm__ ( + "vld1.16 {d4-d5}, [%[s2ptr]] \n\t" + "vld1.16 {d8-d9}, [%[s4ptr]] \n\t" + "vld1.16 {d6-d7}, [%[s3ptr],:128] \n\t" + "vld1.16 {d0-d1}, [%[s0ptr]] \n\t" + "vld1.16 {d2-d3}, [%[s1ptr]] \n\t" + "vadd.i16 q7, q2, q4 \n\t" + "vmul.s16 q6, q3, %q[vc10] \n\t" + "vsub.s16 q5, q1, q0 \n\t" + "vmla.s16 q6, q7, %q[vc3] \n\t" + "vst2.16 {d10-d13}, [%[out]] \n\t" + : + : [out] "r" (drow + x * 2), + [s0ptr] "r" (trow0 + x - cn), + [s1ptr] "r" (trow0 + x + cn), + [s2ptr] "r" (trow1 + x - cn), + [s3ptr] "r" (trow1 + x), + [s4ptr] "r" (trow1 + x + cn), + [vc10] "w" (vc10), [vc3] "w" (vc3) + : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15" + ); +#else + int16x8_t s0 = vld1q_s16(trow0 + x - cn); + int16x8_t s1 = vld1q_s16(trow0 + x + cn); + int16x8_t s2 = vld1q_s16(trow1 + x - cn); + int16x8_t s3 = vld1q_s16(trow1 + x); + int16x8_t s4 = vld1q_s16(trow1 + x + cn); + + int16x8_t s3x10 = vmulq_s16(s3, vc10); + int16x8_t s24 = vaddq_s16(s2, s4); + + int16x8x2_t vr; + vr.val[0] = vsubq_s16(s1, s0); + vr.val[1] = vmlaq_s16(s3x10, s24, vc3); + + vst2q_s16(drow + x*2, vr); +#endif //__GNUC_MINOR__ < 6 + } + for( ; x < colsn; x++ ) + { + drow[x*2] = (s16)(trow0[x+cn] - trow0[x-cn]); + drow[x*2+1] = (s16)((trow1[x+cn] + trow1[x-cn])*3 + trow1[x]*10); + } + } +#else + (void)size; + (void)cn; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/separable_filter.cpp b/3rdparty/carotene/src/separable_filter.cpp new file mode 100644 index 0000000000..a06172c4e6 --- /dev/null +++ b/3rdparty/carotene/src/separable_filter.cpp @@ -0,0 +1,109 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+
+#include "separable_filter.hpp"
+
+namespace CAROTENE_NS {
+
+bool isSeparableFilter3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin)
+{
+    return isSupportedConfiguration() &&
+           size.width >= 9 && size.height >= 1 &&
+           (size.height + borderMargin.top + borderMargin.bottom) >= 2 &&
+           (dx >= 0) && (dx < 4) && (dy >= 0) && (dy < 4) &&
+           (border == BORDER_MODE_CONSTANT   ||
+            border == BORDER_MODE_REFLECT    ||
+            border == BORDER_MODE_REFLECT101 ||
+            border == BORDER_MODE_REPLICATE );
+}
+
+void SeparableFilter3x3(const Size2D &size,
+                        const u8 * srcBase, ptrdiff_t srcStride,
+                        s16 * dstBase, ptrdiff_t dstStride,
+                        const u8 rowFilter, const u8 colFilter, const s16 *xw, const s16 *yw,
+                        BORDER_MODE border, u8 borderValue, Margin borderMargin)
+{
+    internal::assertSupportedConfiguration(isSeparableFilter3x3Supported(size, border, rowFilter, colFilter, borderMargin));
+#ifdef CAROTENE_NEON
+    if(!((xw || rowFilter < 3) && (yw || colFilter < 3)))
+        std::abort(); // can't use the generic filter without provided weights
+
+    typedef void (*sepFilter3x3_8u16s_func)(const Size2D&, const u8*, ptrdiff_t, s16*, ptrdiff_t,
+                                            const s16*, const s16*, BORDER_MODE, u8, Margin);
+
+    // Indexed as quickFilters[colFilter][rowFilter]: 0 selects the [1 2 1]
+    // smoothing kernel, 1 the [-1 0 1] derivative, 2 the [1 -2 1] second
+    // derivative, and 3 the generic filter driven by the xw/yw weights.
+    static sepFilter3x3_8u16s_func quickFilters[4][4]=
+    {
+        /*d0y*/{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121,    internal::ColFilter3x3S16_121>::process,
+                 /*dx*/  internal::sepFilter3x3<internal::RowFilter3x3S16_m101,   internal::ColFilter3x3S16_121>::process,
+                 /*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21,   internal::ColFilter3x3S16_121>::process,
+                 /*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_121>::process},
+
+        /*dy */{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121,    internal::ColFilter3x3S16_m101>::process,
+                 /*dx*/  internal::sepFilter3x3<internal::RowFilter3x3S16_m101,   internal::ColFilter3x3S16_m101>::process,
+                 /*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21,   internal::ColFilter3x3S16_m101>::process,
+                 /*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_m101>::process},
+
+        /*d2y*/{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121,    internal::ColFilter3x3S16_1m21>::process,
+                 /*dx*/  internal::sepFilter3x3<internal::RowFilter3x3S16_m101,   internal::ColFilter3x3S16_1m21>::process,
+                 /*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21,   internal::ColFilter3x3S16_1m21>::process,
+                 /*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16_1m21>::process},
+
+        /*dNy*/{ /*d0x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_121,    internal::ColFilter3x3S16Generic>::process,
+                 /*dx*/  internal::sepFilter3x3<internal::RowFilter3x3S16_m101,   internal::ColFilter3x3S16Generic>::process,
+                 /*d2x*/ internal::sepFilter3x3<internal::RowFilter3x3S16_1m21,   internal::ColFilter3x3S16Generic>::process,
+                 /*dNx*/ internal::sepFilter3x3<internal::RowFilter3x3S16Generic, internal::ColFilter3x3S16Generic>::process}
+    };
+
+    quickFilters[colFilter][rowFilter](size, srcBase, srcStride, dstBase, dstStride,
+                                       xw, yw, border, borderValue, borderMargin);
+#else
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)xw;
+    (void)yw;
+    (void)borderValue;
+#endif
+}
+
+
+} // namespace CAROTENE_NS
diff --git a/3rdparty/carotene/src/separable_filter.hpp b/3rdparty/carotene/src/separable_filter.hpp
new file mode 100644
index 0000000000..b0f7307fa0
--- /dev/null
+++ b/3rdparty/carotene/src/separable_filter.hpp
@@ -0,0 +1,1161 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#ifndef CAROTENE_SRC_SEPARABLE_FILTER_HPP
+#define CAROTENE_SRC_SEPARABLE_FILTER_HPP
+
+#include "common.hpp"
+
+#include <vector>
+
+#include <algorithm>
+
+#ifdef CAROTENE_NEON
+
+namespace CAROTENE_NS {
+
+namespace internal {
+
+struct RowFilter3x3S16Base
+{
+    typedef u8 srcType;
+    /*
+       Various border types, image boundaries are denoted with '|'
+
+       * BORDER_REPLICATE:   aaaaaa|abcdefgh|hhhhhhh
+       * BORDER_REFLECT:     fedcba|abcdefgh|hgfedcb
+       * BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
+       * BORDER_WRAP:        cdefgh|abcdefgh|abcdefg
+       * BORDER_CONSTANT:    iiiiii|abcdefgh|iiiiiii with some specified 'i'
+    */
+    inline RowFilter3x3S16Base(const BORDER_MODE _borderType, const srcType _borderValue, const ptrdiff_t borderxl, const ptrdiff_t borderxr):
+        borderType(_borderType),borderValue(_borderValue)
+    {
+        // vfmask/vtmask are vtbl1_u8 index vectors that assemble the leading and
+        // trailing border pixels of a row; 0xFF entries select a zero lane, which
+        // is later overwritten with borderValue for BORDER_MODE_CONSTANT.
+        if (borderType == BORDER_MODE_CONSTANT)
+        {
+            vfmask = vreinterpret_u8_u64(vmov_n_u64(borderxl ? 0x00ffFFffFFffFFffULL : 0x0100FFffFFffFFffULL));
+            vtmask = vreinterpret_u8_u64(vmov_n_u64(borderxr ? 0xFF07060504030201ULL : 0x0706050403020100ULL));
+        }
+        else if (borderType == BORDER_MODE_REFLECT101)
+        {
+            vfmask = vreinterpret_u8_u64(vmov_n_u64(borderxl ? 0x0001FFffFFffFFffULL : 0x0100FFffFFffFFffULL));
+            vtmask = vreinterpret_u8_u64(vmov_n_u64(borderxr ? 0x0607060504030201ULL : 0x0706050403020100ULL));
+        }
+        else //if (borderType == BORDER_MODE_REFLECT || borderType == BORDER_MODE_REPLICATE)
+        {
+            vfmask = vreinterpret_u8_u64(vmov_n_u64(borderxl ? 0x0000FFffFFffFFffULL : 0x0100FFffFFffFFffULL));
+            vtmask = vreinterpret_u8_u64(vmov_n_u64(borderxr ?
0x0707060504030201ULL : 0x0706050403020100ULL)); + } + lookLeft = offsetk - borderxl; + lookRight = offsetk - borderxr; + } + + uint8x8_t vfmask; + uint8x8_t vtmask; + enum { offsetk = 1}; + ptrdiff_t lookLeft; + ptrdiff_t lookRight; + const BORDER_MODE borderType; + const srcType borderValue; +}; + +struct ColFilter3x3S16Base +{ + typedef s16 srcType; + + inline ColFilter3x3S16Base(const BORDER_MODE _borderType, const srcType _borderValue): + borderType(_borderType),borderValue(_borderValue) {} + + enum { offsetk = 1}; + const BORDER_MODE borderType; + const srcType borderValue; +}; + +struct RowFilter3x3S16Generic : public RowFilter3x3S16Base +{ + typedef s16 dstType; + + inline RowFilter3x3S16Generic(BORDER_MODE _borderType, const srcType _borderValue, ptrdiff_t borderxl, ptrdiff_t borderxr, const s16 *w): + RowFilter3x3S16Base(_borderType, _borderValue, borderxl, borderxr), borderFilter( (w[0]+w[1]+w[2]) * borderValue ) + { + vw0 = vdupq_n_s16(w[0]); + vw1 = vdupq_n_s16(w[1]); + vw2 = vdupq_n_s16(w[2]); + } + + int16x8_t vw0; + int16x8_t vw1; + int16x8_t vw2; + const dstType borderFilter; + + inline void operator()(const u8* src, s16* dst, ptrdiff_t width) + { + uint8x8_t l = vtbl1_u8(vld1_u8(src - lookLeft), vfmask); + if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) + l = vset_lane_u8(borderValue, l, 6); + + ptrdiff_t i = 0; + for (; i < width - 16 + lookRight; i += 16) + { + internal::prefetch(src + i); + uint8x8_t l18u = vld1_u8(src + i + 1); + vst1q_s16(dst + i, vaddq_s16(vmlaq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vext_u8(l, l18u, 6))), vw0), + vreinterpretq_s16_u16(vmovl_u8(vext_u8(l, l18u, 7))), vw1), + vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(l18u)), vw2))); + l = vld1_u8(src + i + 9); + vst1q_s16(dst + i + 8, vaddq_s16(vmlaq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vext_u8(l18u, l, 6))), vw0), + vreinterpretq_s16_u16(vmovl_u8(vext_u8(l18u, l, 7))), vw1), + vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(l)), vw2))); + } + if (i < width - 8 + lookRight) + { + uint8x8_t l18u = vld1_u8(src + i + 1); + vst1q_s16(dst + i, vaddq_s16(vmlaq_s16(vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vext_u8(l, l18u, 6))), vw0), + vreinterpretq_s16_u16(vmovl_u8(vext_u8(l, l18u, 7))), vw1), + vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(l18u)), vw2))); + i += 8; + } + + //tail + if (lookRight == 0 || i != width) + { + uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1 + uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask); + if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) + tail2 = vset_lane_u8(borderValue, tail2, 7); + uint8x8_t tail1 = vext_u8(vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(tail0), 8*6)), tail2, 7); + + int16x8_t l0 = vreinterpretq_s16_u16(vmovl_u8(tail0)); + int16x8_t l1 = vreinterpretq_s16_u16(vmovl_u8(tail1)); + int16x8_t l2 = vreinterpretq_s16_u16(vmovl_u8(tail2)); + + int16x8_t l0w = vmulq_s16(l0, vw0); + int16x8_t l2w = vmulq_s16(l2, vw2); + int16x8_t ls = vaddq_s16(vmlaq_s16(l0w, l1, vw1), l2w); + + vst1q_s16(dst + (width - 8), ls); + } + } +}; + +struct RowFilter3x3S16_m101 : public RowFilter3x3S16Base +{ + typedef s16 dstType; + + inline RowFilter3x3S16_m101(const BORDER_MODE _borderType, const srcType _borderValue, ptrdiff_t borderxl, ptrdiff_t borderxr, const s16*): + RowFilter3x3S16Base(_borderType, _borderValue, borderxl, borderxr), borderFilter(0) {} + + const dstType borderFilter; + + inline void operator()(const u8* src, s16* dst, ptrdiff_t width) + { + uint8x8_t l = 
vtbl1_u8(vld1_u8(src - lookLeft), vfmask); + if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) + l = vset_lane_u8(borderValue, l, 6); + + ptrdiff_t i = 0; + for (; i < width - 16 + lookRight; i += 16) + { + internal::prefetch(src + i); + + uint8x8_t l2 = vld1_u8(src + i + 1); + vst1q_s16(dst + i, vreinterpretq_s16_u16(vsubl_u8(l2, vext_u8(l, l2, 6)))); + + l = vld1_u8(src + i + 9); + vst1q_s16(dst + i + 8, vreinterpretq_s16_u16(vsubl_u8(l, vext_u8(l2, l, 6)))); + } + + if (i < width - 8 + lookRight) + { + uint8x8_t l2 = vld1_u8(src + i + 1); + vst1q_s16(dst + i, vreinterpretq_s16_u16(vsubl_u8(l2, vext_u8(l, l2, 6)))); + i += 8; + } + + //tail + if (lookRight == 0 || i != width) + { + uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1 + uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask); + if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) + tail2 = vset_lane_u8(borderValue, tail2, 7); + + int16x8_t ls = vreinterpretq_s16_u16(vsubl_u8(tail2, tail0)); + + vst1q_s16(dst + (width - 8), ls); + } + } +}; + +struct RowFilter3x3S16_121 : public RowFilter3x3S16Base +{ + typedef s16 dstType; + + inline RowFilter3x3S16_121(const BORDER_MODE _borderType, const srcType _borderValue, ptrdiff_t borderxl, ptrdiff_t borderxr, const s16*): + RowFilter3x3S16Base(_borderType, _borderValue, borderxl, borderxr), borderFilter(borderValue << 2) {} + + const dstType borderFilter; + + inline void operator()(const u8* src, s16* dst, ptrdiff_t width) + { + uint8x8_t l = vtbl1_u8(vld1_u8(src - lookLeft), vfmask); + if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) + l = vset_lane_u8(borderValue, l, 6); + + ptrdiff_t i = 0; + for (; i < width - 16 + lookRight; i += 16) + { + internal::prefetch(src + i); + + uint8x8_t l2 = vld1_u8(src + i + 1); + vst1q_s16(dst + i, vqaddq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l, l2, 6), l2)), + vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l2, 7), 1)))); + + l = vld1_u8(src + i + 9); + vst1q_s16(dst + i + 8, vqaddq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l2, l, 6), l)), + vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l2, l, 7), 1)))); + } + + if (i < width - 8 + lookRight) + { + uint8x8_t l2 = vld1_u8(src + i + 1); + vst1q_s16(dst + i, vqaddq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l, l2, 6), l2)), + vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l2, 7), 1)))); + i += 8; + } + + //tail + if (lookRight == 0 || i != width) + { + uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1 + uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask); + if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) + tail2 = vset_lane_u8(borderValue, tail2, 7); + uint8x8_t tail1 = vext_u8(vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(tail0), 8*6)), tail2, 7); + + int16x8_t tail02 = vreinterpretq_s16_u16(vaddl_u8(tail0, tail2)); + int16x8_t tail1x2 = vreinterpretq_s16_u16(vshll_n_u8(tail1, 1)); + + int16x8_t ls = vqaddq_s16(tail02, tail1x2); + + vst1q_s16(dst + (width - 8), ls); + } + } +}; + +struct RowFilter3x3S16_1m21 : public RowFilter3x3S16Base +{ + typedef s16 dstType; + + inline RowFilter3x3S16_1m21(const BORDER_MODE _borderType, const srcType _borderValue, ptrdiff_t borderxl, ptrdiff_t borderxr, const s16*): + RowFilter3x3S16Base(_borderType, _borderValue, borderxl, borderxr), borderFilter(0) {} + + const dstType borderFilter; + + inline void operator()(const u8* src, s16* dst, ptrdiff_t width) + { + uint8x8_t l = vtbl1_u8(vld1_u8(src - 
lookLeft), vfmask); + if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) + l = vset_lane_u8(borderValue, l, 6); + + ptrdiff_t i = 0; + for (; i < width - 16 + lookRight; i += 16) + { + internal::prefetch(src + i); + + uint8x8_t l2 = vld1_u8(src + i + 1); + vst1q_s16(dst + i, vqsubq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l, l2, 6), l2)), + vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l2, 7), 1)))); + + l = vld1_u8(src + i + 9); + vst1q_s16(dst + i + 8, vqsubq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l2, l, 6), l)), + vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l2, l, 7), 1)))); + } + + if (i < width - 8 + lookRight) + { + uint8x8_t l2 = vld1_u8(src + i + 1); + vst1q_s16(dst + i, vqsubq_s16(vreinterpretq_s16_u16(vaddl_u8(vext_u8(l, l2, 6), l2)), + vreinterpretq_s16_u16(vshll_n_u8(vext_u8(l, l2, 7), 1)))); + i += 8; + } + + //tail + if (lookRight == 0 || i != width) + { + uint8x8_t tail0 = vld1_u8(src + (width - 9));//can't get left 1 pixel another way if width==8*k+1 + uint8x8_t tail2 = vtbl1_u8(vld1_u8(src + (width - 8 + lookRight)), vtmask); + if (lookLeft == 0 && borderType == BORDER_MODE_CONSTANT) + tail2 = vset_lane_u8(borderValue, tail2, 7); + uint8x8_t tail1 = vext_u8(vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(tail0), 8*6)), tail2, 7); + + int16x8_t tail02 = vreinterpretq_s16_u16(vaddl_u8(tail0, tail2)); + int16x8_t tail1x2 = vreinterpretq_s16_u16(vshll_n_u8(tail1, 1)); + + int16x8_t ls = vqsubq_s16(tail02, tail1x2); + + vst1q_s16(dst + (width - 8), ls); + } + } +}; + +struct ColFilter3x3S16Generic : public ColFilter3x3S16Base +{ + typedef s16 dstType; + + inline ColFilter3x3S16Generic(const BORDER_MODE _borderType, const srcType _borderValue, const s16 *w): + ColFilter3x3S16Base(_borderType, _borderValue) + { + vw0 = vdupq_n_s16(w[0]); + vw1 = vdupq_n_s16(w[1]); + vw2 = vdupq_n_s16(w[2]); + } + + int16x8_t vw0; + int16x8_t vw1; + int16x8_t vw2; + + inline void operator()(const s16* src0, const s16* src1, const s16* src2, const s16* src3, s16* dst0, s16* dst1, ptrdiff_t width) + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + vst1q_s16(dst0 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), line1, vw1), line2, vw2)); + vst1q_s16(dst1 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src3 + j), vw2), line1, vw0), line2, vw1)); + + line1 = vld1q_s16(src1 + j + 8); + line2 = vld1q_s16(src2 + j + 8); + vst1q_s16(dst0 + j + 8, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j + 8), vw0), line1, vw1), line2, vw2)); + vst1q_s16(dst1 + j + 8, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src3 + j + 8), vw2), line1, vw0), line2, vw1)); + } + if (j <= width - 8) + { + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + vst1q_s16(dst0 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), line1, vw1), line2, vw2)); + vst1q_s16(dst1 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src3 + j), vw2), line1, vw0), line2, vw1)); + j += 8; + } + if (j != width) + { + j = width - 8; + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + vst1q_s16(dst0 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), line1, vw1), line2, vw2)); + vst1q_s16(dst1 + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src3 + j), vw2), line1, vw0), line2, vw1)); + } + } + + inline void operator()(const s16* src0, const s16* src1, const s16* src2, s16* dst, ptrdiff_t width) + { + if (src0 == 0 || src2 == 0) + { + int16x8_t vwl1 = vw0; + int16x8_t vwl2 = 
vw2; + if (src2 == 0) + { + src2 = src0; + vwl1 = vw2; + vwl2 = vw0; + } + + int16x8_t v_border = vdupq_n_s16(0); + if (borderType == BORDER_MODE_CONSTANT) + { + v_border = vmulq_s16(vdupq_n_s16(borderValue), vwl1); + vwl1 = vw1; + } + else if (borderType == BORDER_MODE_REFLECT101) + { + vwl1 = vw1; + vwl2 = vaddq_s16(vw0, vw2); + } + else //replicate\reflect + vwl1 = vaddq_s16(vwl1, vw1); + + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vaddq_s16(vmlaq_s16(v_border, vld1q_s16(src1 + j), vwl1), + vmulq_s16(vld1q_s16(src2 + j), vwl2))); + vst1q_s16(dst + j + 8, vaddq_s16(vmlaq_s16(v_border, vld1q_s16(src1 + j + 8), vwl1), + vmulq_s16(vld1q_s16(src2 + j + 8), vwl2))); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vaddq_s16(vmlaq_s16(v_border, vld1q_s16(src1 + j), vwl1), + vmulq_s16(vld1q_s16(src2 + j), vwl2))); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vaddq_s16(vmlaq_s16(v_border, vld1q_s16(src1 + j), vwl1), + vmulq_s16(vld1q_s16(src2 + j), vwl2))); + } + } + else + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), + vld1q_s16(src1 + j), vw1), + vld1q_s16(src2 + j), vw2)); + vst1q_s16(dst + j + 8, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j + 8), vw0), + vld1q_s16(src1 + j + 8), vw1), + vld1q_s16(src2 + j + 8), vw2)); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), + vld1q_s16(src1 + j), vw1), + vld1q_s16(src2 + j), vw2)); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vmlaq_s16(vmlaq_s16(vmulq_s16(vld1q_s16(src0 + j), vw0), + vld1q_s16(src1 + j), vw1), + vld1q_s16(src2 + j), vw2)); + } + } + } +}; + +struct ColFilter3x3S16_m101 : public ColFilter3x3S16Base +{ + typedef s16 dstType; + + inline ColFilter3x3S16_m101(const BORDER_MODE _borderType, const srcType _borderValue, const s16 *): + ColFilter3x3S16Base(_borderType, _borderValue) {} + + inline void operator()(const s16* src0, const s16* src1, const s16* src2, const s16* src3, s16* dst0, s16* dst1, ptrdiff_t width) + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst0 + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + vst1q_s16(dst1 + j, vqsubq_s16(vld1q_s16(src3 + j), vld1q_s16(src1 + j))); + vst1q_s16(dst0 + j + 8, vqsubq_s16(vld1q_s16(src2 + j + 8), vld1q_s16(src0 + j + 8))); + vst1q_s16(dst1 + j + 8, vqsubq_s16(vld1q_s16(src3 + j + 8), vld1q_s16(src1 + j + 8))); + } + if (j <= width - 8) + { + vst1q_s16(dst0 + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + vst1q_s16(dst1 + j, vqsubq_s16(vld1q_s16(src3 + j), vld1q_s16(src1 + j))); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst0 + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + vst1q_s16(dst1 + j, vqsubq_s16(vld1q_s16(src3 + j), vld1q_s16(src1 + j))); + } + } + + inline void operator()(const s16* src0, const s16* src1, const s16* src2, s16* dst, ptrdiff_t width) + { + if (src0 == 0 || src2 == 0) + { + if (borderType == BORDER_MODE_CONSTANT) + { + int16x8_t v_border = vdupq_n_s16(borderValue); + if (src0 == 0) + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), v_border)); + vst1q_s16(dst + j + 8, vqsubq_s16(vld1q_s16(src2 + j + 8), v_border)); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), v_border)); + j += 8; + } + if (j != width) + { + j = width - 8; + 
vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), v_border)); + } + } + else + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vqsubq_s16(v_border, vld1q_s16(src0 + j))); + vst1q_s16(dst + j + 8, vqsubq_s16(v_border, vld1q_s16(src0 + j + 8))); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vqsubq_s16(v_border, vld1q_s16(src0 + j))); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vqsubq_s16(v_border, vld1q_s16(src0 + j))); + } + } + } + else if (borderType == BORDER_MODE_REFLECT101) + { + int16x8_t vzero = vmovq_n_s16(0); + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vzero); + vst1q_s16(dst + j + 8, vzero); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vzero); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vzero); + } + } + else //replicate\reflect + { + if (src0 == 0) src0 = src1; else src2 = src1; + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + vst1q_s16(dst + j + 8, vqsubq_s16(vld1q_s16(src2 + j + 8), vld1q_s16(src0 + j + 8))); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + } + } + } + else + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + vst1q_s16(dst + j + 8, vqsubq_s16(vld1q_s16(src2 + j + 8), vld1q_s16(src0 + j + 8))); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src0 + j))); + } + } + } +}; + +struct ColFilter3x3S16_121 : public ColFilter3x3S16Base +{ + typedef s16 dstType; + + inline ColFilter3x3S16_121(const BORDER_MODE _borderType, const srcType _borderValue, const s16*): + ColFilter3x3S16Base(_borderType, _borderValue) {} + + inline void operator()(const s16* src0, const s16* src1, const s16* src2, const s16* src3, s16* dst0, s16* dst1, ptrdiff_t width) + { + ptrdiff_t j = 0; + //int16x8_t line0 = vld1q_s16(src0 + j);//1 + //int16x8_t line1 = vld1q_s16(src1 + j);//11 + //int16x8_t line2 = vld1q_s16(src2 + j);// 11 + //int16x8_t line3 = vld1q_s16(src3 + j);// 1 + for (; j <= width - 16; j += 16) + { + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + int16x8_t l12 = vqaddq_s16(line1, line2); + + vst1q_s16(dst0 + j, vqaddq_s16(vqaddq_s16(vld1q_s16(src0 + j), line1), l12)); + vst1q_s16(dst1 + j, vqaddq_s16(l12, vqaddq_s16(line2, vld1q_s16(src3 + j)))); + + line1 = vld1q_s16(src1 + j + 8); + line2 = vld1q_s16(src2 + j + 8); + + l12 = vqaddq_s16(line1, line2); + + vst1q_s16(dst0 + j + 8, vqaddq_s16(vqaddq_s16(vld1q_s16(src0 + j + 8), line1), l12)); + vst1q_s16(dst1 + j + 8, vqaddq_s16(l12, vqaddq_s16(line2, vld1q_s16(src3 + j + 8)))); + } + if (j <= width - 8) + { + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + int16x8_t l12 = vqaddq_s16(line1, line2); + + vst1q_s16(dst0 + j, vqaddq_s16(vqaddq_s16(vld1q_s16(src0 + j), line1), l12)); + vst1q_s16(dst1 + j, vqaddq_s16(l12, vqaddq_s16(line2, vld1q_s16(src3 + j)))); + j += 8; + } + if (j != width) + { + j = width - 8; + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + 
int16x8_t l12 = vqaddq_s16(line1, line2); + + vst1q_s16(dst0 + j, vqaddq_s16(vqaddq_s16(vld1q_s16(src0 + j), line1), l12)); + vst1q_s16(dst1 + j, vqaddq_s16(l12, vqaddq_s16(line2, vld1q_s16(src3 + j)))); + } + } + + inline void operator()(const s16* src0, const s16* src1, const s16* src2, s16* dst, ptrdiff_t width) + { + if (src0 == 0 || src2 == 0) + { + if (src2 == 0) + src2 = src0; + + if (borderType == BORDER_MODE_CONSTANT) + { + int16x8_t v_border = vdupq_n_s16(borderValue); + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1), + vqaddq_s16(v_border, vld1q_s16(src2 + j)))); + vst1q_s16(dst + j + 8, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j + 8), 1), + vqaddq_s16(v_border, vld1q_s16(src2 + j + 8)))); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1), + vqaddq_s16(v_border, vld1q_s16(src2 + j)))); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1), + vqaddq_s16(v_border, vld1q_s16(src2 + j)))); + } + } + else if (borderType == BORDER_MODE_REFLECT101) + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vqshlq_n_s16(vqaddq_s16(vld1q_s16(src1 + j), + vld1q_s16(src2 + j)), 1)); + vst1q_s16(dst + j + 8, vqshlq_n_s16(vqaddq_s16(vld1q_s16(src1 + j + 8), + vld1q_s16(src2 + j + 8)), 1)); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vqshlq_n_s16(vqaddq_s16(vld1q_s16(src1 + j), + vld1q_s16(src2 + j)), 1)); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vqshlq_n_s16(vqaddq_s16(vld1q_s16(src1 + j), + vld1q_s16(src2 + j)), 1)); + } + } + else //replicate\reflect + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + int16x8_t line1 = vld1q_s16(src1 + j); + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(line1, 1), + vqaddq_s16(line1, vld1q_s16(src2 + j)))); + + line1 = vld1q_s16(src1 + j + 8); + vst1q_s16(dst + j + 8, vqaddq_s16(vqshlq_n_s16(line1, 1), + vqaddq_s16(line1, vld1q_s16(src2 + j + 8)))); + } + if (j <= width - 8) + { + int16x8_t line1 = vld1q_s16(src1 + j); + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(line1, 1), + vqaddq_s16(line1, vld1q_s16(src2 + j)))); + j += 8; + } + if (j != width) + { + j = width - 8; + int16x8_t line1 = vld1q_s16(src1 + j); + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(line1, 1), + vqaddq_s16(line1, vld1q_s16(src2 + j)))); + } + } + } + else + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1), + vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)))); + + vst1q_s16(dst + j + 8, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j + 8), 1), + vqaddq_s16(vld1q_s16(src0 + j + 8), vld1q_s16(src2 + j + 8)))); + } + if (j <= width - 8) + { + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1), + vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)))); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1q_s16(dst + j, vqaddq_s16(vqshlq_n_s16(vld1q_s16(src1 + j), 1), + vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)))); + } + } + } +}; + +struct ColFilter3x3U8_121 : public ColFilter3x3S16Base +{ + typedef u8 dstType; + + inline ColFilter3x3U8_121(const BORDER_MODE _borderType, const srcType _borderValue, const s16*): + ColFilter3x3S16Base(_borderType, _borderValue) {} + + inline void operator()(const srcType* src0, const srcType* src1, const srcType* src2, const srcType* src3, dstType* dst0, dstType* dst1, ptrdiff_t width) + 
{ + ptrdiff_t j = 0; + //int16x8_t line0 = vld1q_s16(src0 + j);//1 + //int16x8_t line1 = vld1q_s16(src1 + j);//11 + //int16x8_t line2 = vld1q_s16(src2 + j);// 11 + //int16x8_t line3 = vld1q_s16(src3 + j);// 1 + for (; j <= width - 16; j += 16) + { + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + int16x8_t l12 = vaddq_s16(line1, line2); + + vst1_u8(dst0 + j, vqrshrun_n_s16(vaddq_s16(vaddq_s16(vld1q_s16(src0 + j), line1), l12), 4)); + vst1_u8(dst1 + j, vqrshrun_n_s16(vaddq_s16(l12, vaddq_s16(line2, vld1q_s16(src3 + j))), 4)); + + line1 = vld1q_s16(src1 + j + 8); + line2 = vld1q_s16(src2 + j + 8); + + l12 = vaddq_s16(line1, line2); + + vst1_u8(dst0 + j + 8, vqrshrun_n_s16(vaddq_s16(vaddq_s16(vld1q_s16(src0 + j + 8), line1), l12), 4)); + vst1_u8(dst1 + j + 8, vqrshrun_n_s16(vaddq_s16(l12, vaddq_s16(line2, vld1q_s16(src3 + j + 8))), 4)); + } + if (j <= width - 8) + { + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + int16x8_t l12 = vaddq_s16(line1, line2); + + vst1_u8(dst0 + j, vqrshrun_n_s16(vaddq_s16(vaddq_s16(vld1q_s16(src0 + j), line1), l12), 4)); + vst1_u8(dst1 + j, vqrshrun_n_s16(vaddq_s16(l12, vaddq_s16(line2, vld1q_s16(src3 + j))), 4)); + j += 8; + } + if (j != width) + { + j = width - 8; + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + int16x8_t l12 = vaddq_s16(line1, line2); + + vst1_u8(dst0 + j, vqrshrun_n_s16(vaddq_s16(vaddq_s16(vld1q_s16(src0 + j), line1), l12), 4)); + vst1_u8(dst1 + j, vqrshrun_n_s16(vaddq_s16(l12, vaddq_s16(line2, vld1q_s16(src3 + j))), 4)); + } + } + + inline void operator()(const srcType* src0, const srcType* src1, const srcType* src2, dstType* dst, ptrdiff_t width) + { + if (src0 == 0 || src2 == 0) + { + if (src2 == 0) + src2 = src0; + + if (borderType == BORDER_MODE_CONSTANT) + { + ptrdiff_t j = 0; + int16x8_t v_border = vdupq_n_s16(borderValue); + for (; j <= width - 16; j += 16) + { + //Store normalized result, essential for gaussianBlur + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1), + vaddq_s16(v_border, vld1q_s16(src2 + j))), 4)); + + vst1_u8(dst + j + 8, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j + 8), 1), + vaddq_s16(v_border, vld1q_s16(src2 + j + 8))), 4)); + } + if (j <= width - 8) + { + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1), + vaddq_s16(v_border, vld1q_s16(src2 + j))), 4)); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1), + vaddq_s16(v_border, vld1q_s16(src2 + j))), 4)); + } + } + else if (borderType == BORDER_MODE_REFLECT101) + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1_u8(dst + j, vqrshrun_n_s16(vshlq_n_s16(vaddq_s16(vld1q_s16(src1 + j), + vld1q_s16(src2 + j)), 1), 4)); + vst1_u8(dst + j + 8, vqrshrun_n_s16(vshlq_n_s16(vaddq_s16(vld1q_s16(src1 + j + 8), + vld1q_s16(src2 + j + 8)), 1), 4)); + } + if (j <= width - 8) + { + vst1_u8(dst + j, vqrshrun_n_s16(vshlq_n_s16(vaddq_s16(vld1q_s16(src1 + j), + vld1q_s16(src2 + j)), 1), 4)); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1_u8(dst + j, vqrshrun_n_s16(vshlq_n_s16(vaddq_s16(vld1q_s16(src1 + j), + vld1q_s16(src2 + j)), 1), 4)); + } + } + else //replicate\reflect + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + int16x8_t line1 = vld1q_s16(src1 + j); + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(line1, 1), + vaddq_s16(line1, vld1q_s16(src2 + j))), 4)); + + line1 = 
vld1q_s16(src1 + j + 8); + vst1_u8(dst + j + 8, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(line1, 1), + vaddq_s16(line1, vld1q_s16(src2 + j + 8))), 4)); + } + if (j <= width - 8) + { + int16x8_t line1 = vld1q_s16(src1 + j); + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(line1, 1), + vaddq_s16(line1, vld1q_s16(src2 + j))), 4)); + j += 8; + } + if (j != width) + { + j = width - 8; + int16x8_t line1 = vld1q_s16(src1 + j); + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(line1, 1), + vaddq_s16(line1, vld1q_s16(src2 + j))), 4)); + } + } + } + else + { + ptrdiff_t j = 0; + for (; j <= width - 16; j += 16) + { + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1), + vaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j))), 4)); + vst1_u8(dst + j + 8, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j + 8), 1), + vaddq_s16(vld1q_s16(src0 + j + 8), vld1q_s16(src2 + j + 8))), 4)); + } + if (j <= width - 8) + { + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1), + vaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j))), 4)); + j += 8; + } + if (j != width) + { + j = width - 8; + vst1_u8(dst + j, vqrshrun_n_s16(vaddq_s16(vshlq_n_s16(vld1q_s16(src1 + j), 1), + vaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j))), 4)); + } + } + } +}; + +struct ColFilter3x3S16_1m21 : public ColFilter3x3S16Base +{ + typedef s16 dstType; + + inline ColFilter3x3S16_1m21(const BORDER_MODE _borderType, const srcType _borderValue, const s16*): + ColFilter3x3S16Base(_borderType, _borderValue) {} + + inline void operator()(const s16* src0, const s16* src1, const s16* src2, const s16* src3, s16* dst0, s16* dst1, ptrdiff_t width) + { + ptrdiff_t j = 0; + //int16x8_t line0 = vld1q_s16(src0 + j);// 1 + //int16x8_t line1 = vld1q_s16(src1 + j);//-1 1 + //int16x8_t line2 = vld1q_s16(src2 + j);// -1 -1 + //int16x8_t line3 = vld1q_s16(src3 + j);// 1 + for (; j <= width - 16; j += 16) + { + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + int16x8_t l12 = vqsubq_s16(line1, line2); + + vst1q_s16(dst0 + j, vqsubq_s16(vqsubq_s16(vld1q_s16(src0 + j), line1), l12)); + vst1q_s16(dst1 + j, vqaddq_s16(vqsubq_s16(vld1q_s16(src3 + j), line2), l12)); + + line1 = vld1q_s16(src1 + j + 8); + line2 = vld1q_s16(src2 + j + 8); + + l12 = vqsubq_s16(line1, line2); + + vst1q_s16(dst0 + j + 8, vqsubq_s16(vqsubq_s16(vld1q_s16(src0 + j + 8), line1), l12)); + vst1q_s16(dst1 + j + 8, vqaddq_s16(vqsubq_s16(vld1q_s16(src3 + j + 8), line2), l12)); + } + if (j <= width - 8) + { + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + int16x8_t l12 = vqsubq_s16(line1, line2); + + vst1q_s16(dst0 + j, vqsubq_s16(vqsubq_s16(vld1q_s16(src0 + j), line1), l12)); + vst1q_s16(dst1 + j, vqaddq_s16(vqsubq_s16(vld1q_s16(src3 + j), line2), l12)); + j += 8; + } + if (j != width) + { + j = width - 8; + int16x8_t line1 = vld1q_s16(src1 + j); + int16x8_t line2 = vld1q_s16(src2 + j); + + int16x8_t l12 = vqsubq_s16(line1, line2); + + vst1q_s16(dst0 + j, vqsubq_s16(vqsubq_s16(vld1q_s16(src0 + j), line1), l12)); + vst1q_s16(dst1 + j, vqaddq_s16(vqsubq_s16(vld1q_s16(src3 + j), line2), l12)); + } + } + + inline void operator()(const s16* src0, const s16* src1, const s16* src2, s16* dst, ptrdiff_t width) + { + if (src0 == 0 || src2 == 0) + { + if (src2 == 0) + src2 = src0; + + if (borderType == BORDER_MODE_CONSTANT) + { + ptrdiff_t j = 0; + int16x8_t v_border = vdupq_n_s16(borderValue); + for (; j <= width - 16; j += 16) + { + vst1q_s16(dst + j, 
vqsubq_s16(vqaddq_s16(v_border, vld1q_s16(src2 + j)), vshlq_n_s16(vld1q_s16(src1 + j), 1)));
+                    vst1q_s16(dst + j + 8, vqsubq_s16(vqaddq_s16(v_border, vld1q_s16(src2 + j + 8)), vshlq_n_s16(vld1q_s16(src1 + j + 8), 1)));
+                }
+                if (j <= width - 8)
+                {
+                    vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(v_border, vld1q_s16(src2 + j)), vshlq_n_s16(vld1q_s16(src1 + j), 1)));
+                    j += 8;
+                }
+                if (j != width)
+                {
+                    j = width - 8;
+                    vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(v_border, vld1q_s16(src2 + j)), vshlq_n_s16(vld1q_s16(src1 + j), 1)));
+                }
+            }
+            else if (borderType == BORDER_MODE_REFLECT101)
+            {
+                ptrdiff_t j = 0;
+                for (; j <= width - 16; j += 16)
+                {
+                    vst1q_s16(dst + j, vqshlq_n_s16(vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)), 1));
+                    vst1q_s16(dst + j + 8, vqshlq_n_s16(vqsubq_s16(vld1q_s16(src2 + j + 8), vld1q_s16(src1 + j + 8)), 1));
+                }
+                if (j <= width - 8)
+                {
+                    vst1q_s16(dst + j, vqshlq_n_s16(vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)), 1));
+                    j += 8;
+                }
+                if (j != width)
+                {
+                    j = width - 8;
+                    vst1q_s16(dst + j, vqshlq_n_s16(vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)), 1));
+                }
+            }
+            else //replicate\reflect
+            {
+                ptrdiff_t j = 0;
+                for (; j <= width - 16; j += 16)
+                {
+                    vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)));
+                    vst1q_s16(dst + j + 8, vqsubq_s16(vld1q_s16(src2 + j + 8), vld1q_s16(src1 + j + 8)));
+                }
+                if (j <= width - 8)
+                {
+                    vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)));
+                    j += 8;
+                }
+                if (j != width)
+                {
+                    j = width - 8;
+                    vst1q_s16(dst + j, vqsubq_s16(vld1q_s16(src2 + j), vld1q_s16(src1 + j)));
+                }
+            }
+        }
+        else
+        {
+            ptrdiff_t j = 0;
+            for (; j <= width - 16; j += 16)
+            {
+                vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)),
+                                              vqshlq_n_s16(vld1q_s16(src1 + j), 1)));
+                vst1q_s16(dst + j + 8, vqsubq_s16(vqaddq_s16(vld1q_s16(src0 + j + 8), vld1q_s16(src2 + j + 8)),
+                                                  vqshlq_n_s16(vld1q_s16(src1 + j + 8), 1)));
+            }
+            if (j <= width - 8)
+            {
+                vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)),
+                                              vqshlq_n_s16(vld1q_s16(src1 + j), 1)));
+                j += 8;
+            }
+            if (j != width)
+            {
+                j = width - 8;
+                vst1q_s16(dst + j, vqsubq_s16(vqaddq_s16(vld1q_s16(src0 + j), vld1q_s16(src2 + j)),
+                                              vqshlq_n_s16(vld1q_s16(src1 + j), 1)));
+            }
+        }
+    }
+};
+
+template <typename RowFilter, typename ColFilter>
+struct sepFilter3x3
+{
+    typedef typename RowFilter::srcType srcType;
+    typedef typename RowFilter::dstType tmpType;
+    typedef typename ColFilter::dstType dstType;
+
+    static void process(const Size2D &ssize,
+                        const srcType * srcBase, ptrdiff_t srcStride,
+                        dstType * dstBase, ptrdiff_t dstStride,
+                        const s16 *xw, const s16 *yw,
+                        BORDER_MODE borderType, srcType borderValue, Margin borderMargin)
+    {
+        const ptrdiff_t offsetk = 1;
+        ptrdiff_t borderxl, borderxr, borderyt, borderyb;
+        borderxl = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.left);
+        borderyt = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.top);
+        borderxr = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.right);
+        borderyb = std::max<ptrdiff_t>(0, offsetk - (ptrdiff_t)borderMargin.bottom);
+
+        std::vector<tmpType> _buf(ssize.width << 2);
+        tmpType * buf = &_buf[0];
+
+        RowFilter filterX(borderType, borderValue, borderxl, borderxr, xw);
+        ColFilter filterY(borderType, filterX.borderFilter, yw);
+        const ptrdiff_t lookTop = offsetk - borderyt;
+        const ptrdiff_t lookBottom = offsetk - borderyb;
+
+        const srcType* src = srcBase - lookTop * srcStride / sizeof(srcType);
+        dstType* dst = dstBase;
+
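+        // buf holds a ring of four row-filtered lines: source row r lands in
+        // buf + ssize.width * (r % 4) (the "+ 4" terms below keep the modulus
+        // non-negative for the top-border rows). Each step of the main loop
+        // row-filters rows ridx and ridx+1, then column-filters the buffered
+        // rows ridx-2 .. ridx+1 into the output rows ridx-1 and ridx.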
+        ptrdiff_t ridx = -lookTop;
+        for (; ridx <= (ptrdiff_t)ssize.height + lookBottom - 2; ridx += 2)
+        {
+            for (ptrdiff_t bidx = 0; bidx < 2; ++bidx, src += srcStride / sizeof(srcType))
+                filterX(src, buf + ssize.width * ((4 + ridx + bidx) % 4), ssize.width);
+
+            if (ridx <= 0)
+            {
+                if (ridx == 0) //first row
+                {
+                    filterY(0, buf + ssize.width * ((ridx + 4) % 4), buf + ssize.width * ((ridx + 1) % 4), dst, ssize.width);
+                    dst += dstStride / sizeof(dstType);
+                }
+                continue;
+            }
+
+            filterY(buf + ssize.width * ((ridx + 2) % 4),
+                    buf + ssize.width * ((ridx + 3) % 4),
+                    buf + ssize.width * ((ridx + 4) % 4),
+                    buf + ssize.width * ((ridx + 1) % 4),
+                    dst, dst + dstStride / sizeof(dstType), ssize.width);
+
+            dst += dstStride * 2 / sizeof(dstType);
+        }
+
+        if (ridx < (ptrdiff_t)ssize.height + lookBottom)
+        {
+            filterX(src, buf + ssize.width * ((4 + ridx) % 4), ssize.width);
+            filterY(buf + ssize.width * ((2 + ridx) % 4),
+                    buf + ssize.width * ((3 + ridx) % 4),
+                    buf + ssize.width * ((4 + ridx) % 4), dst, ssize.width);
+            dst += dstStride / sizeof(dstType);
+            ridx++;
+        }
+        if (lookBottom == 0)
+            filterY(buf + ssize.width * ((ridx + 2) % 4), buf + ssize.width * ((ridx + 3) % 4), 0, dst, ssize.width);
+    }
+};
+
+} //namespace internal
+
+} //namespace CAROTENE_NS
+
+#endif // CAROTENE_NEON
+
+#endif // CAROTENE_SRC_SEPARABLE_FILTER_HPP
diff --git a/3rdparty/carotene/src/sobel.cpp b/3rdparty/carotene/src/sobel.cpp
new file mode 100644
index 0000000000..5d46045d9f
--- /dev/null
+++ b/3rdparty/carotene/src/sobel.cpp
@@ -0,0 +1,317 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
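+
+// Sobel 3x3 is separable: depending on dx/dy each axis applies [1 2 1]
+// (smoothing), [-1 0 1] (first derivative) or [1 -2 1] (second derivative);
+// e.g. dx = 1, dy = 0 uses [1 2 1]^T x [-1 0 1] = [-1 0 1; -2 0 2; -1 0 1].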
+
+#include <vector>
+
+#include "common.hpp"
+
+namespace CAROTENE_NS {
+
+bool isSobel3x3Supported(const Size2D &size, BORDER_MODE border,
+                         s32 dx, s32 dy, Margin borderMargin)
+{
+    return dx < 3 && dx >= 0 &&
+           dy < 3 && dy >= 0 &&
+           (dx + dy) > 0 &&
+           isSeparableFilter3x3Supported(size, border, dx, dy, borderMargin);
+}
+
+void Sobel3x3(const Size2D &size,
+              const u8 * srcBase, ptrdiff_t srcStride,
+              s16 * dstBase, ptrdiff_t dstStride,
+              s32 dx, s32 dy,
+              BORDER_MODE borderType, u8 borderValue, Margin borderMargin)
+{
+    internal::assertSupportedConfiguration(isSobel3x3Supported(size, borderType, dx, dy, borderMargin));
+#ifdef CAROTENE_NEON
+    SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride,
+                       dx, dy, 0, 0,
+                       borderType, borderValue, borderMargin);
+#else
+    (void)srcBase;
+    (void)srcStride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)borderValue;
+#endif
+}
+
+bool isSobel3x3f32Supported(const Size2D &size, BORDER_MODE border,
+                            s32 dx, s32 dy)
+{
+    return isSupportedConfiguration() &&
+           dx < 3 && dx >= 0 &&
+           dy < 3 && dy >= 0 &&
+           (dx + dy) > 0 &&
+           size.width >= 4 && size.height >= 2 &&
+           (border == BORDER_MODE_CONSTANT   ||
+            border == BORDER_MODE_REFLECT    ||
+            border == BORDER_MODE_REFLECT101 ||
+            border == BORDER_MODE_REPLICATE );
+}
+
+void Sobel3x3(const Size2D &size,
+              const f32 * srcBase, ptrdiff_t srcStride,
+              f32 * dstBase, ptrdiff_t dstStride,
+              s32 dx, s32 dy,
+              BORDER_MODE borderType, f32 borderValue)
+{
+    internal::assertSupportedConfiguration(isSobel3x3f32Supported(size, borderType, dx, dy));
+#ifdef CAROTENE_NEON
+    std::vector<f32> _tmp;
+    f32 *tmp = 0;
+    if (borderType == BORDER_MODE_CONSTANT)
+    {
+        _tmp.assign(size.width + 2, borderValue);
+        tmp = &_tmp[1];
+    }
+
+    ptrdiff_t delta = (ptrdiff_t)((size.width + 2 + 31) & -32);//align size
+    std::vector<f32> _tempBuf((delta << 1) + 64);
+    f32 *trow0 = internal::alignPtr(&_tempBuf[1], 32), *trow1 = internal::alignPtr(trow0 + delta, 32);
+
+    for( size_t y = 0; y < size.height; y++ )
+    {
+        const f32* srow0;
+        const f32* srow1 = internal::getRowPtr(srcBase, srcStride, y);
+        const f32* srow2;
+        f32* drow = internal::getRowPtr(dstBase, dstStride, y > 0 ? y-1 : 0);
+        f32* drow1 = internal::getRowPtr(dstBase, dstStride, y);
+        if (borderType == BORDER_MODE_REFLECT101) {
+            srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1);
+            srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2);
+        } else if (borderType == BORDER_MODE_CONSTANT) {
+            srow0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
+            srow2 = y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
+        } else { // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE
+            srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
+            srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1);
+        }
+
+        float32x4_t tprev = vmovq_n_f32(0.f);
+        float32x4_t tcurr = vmovq_n_f32(0.f);
+        float32x4_t tnext = vmovq_n_f32(0.f);
+        float32x4_t t0, t1, t2;
+        // do vertical convolution
+        size_t x = 0, bcolsn = y + 2 < size.height ?
size.width : (size.width - 4); + for( ; x <= bcolsn; x += 4 ) + { + internal::prefetch(srow0 + x); + internal::prefetch(srow1 + x); + internal::prefetch(srow2 + x); + + float32x4_t x0 = vld1q_f32(srow0 + x); + float32x4_t x1 = vld1q_f32(srow1 + x); + float32x4_t x2 = vld1q_f32(srow2 + x); + + tprev = tcurr; + tcurr = tnext; + if(!dy) + { + tnext = vaddq_f32(vaddq_f32(vaddq_f32(x1, x1), x2), x0); + } + else if(dy == 2) + { + tnext = vsubq_f32(vsubq_f32(x2, x1), vsubq_f32(x1, x0)); + } + else + { + tnext = vsubq_f32(x2, x0); + } + + if(!x) { + tcurr = tnext; + // make border + if (borderType == BORDER_MODE_CONSTANT) + { + tcurr = vsetq_lane_f32(borderValue,tcurr, 3); + } + else if (borderType == BORDER_MODE_REFLECT101) + { + tcurr = vsetq_lane_f32(vgetq_lane_f32(tcurr, 1),tcurr, 3); + } + else // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE + { + tcurr = vsetq_lane_f32(vgetq_lane_f32(tcurr, 0),tcurr, 3); + } + continue; + } + + internal::prefetch(trow0 + x); + internal::prefetch(trow1 + x); + + t0 = vextq_f32(tprev, tcurr, 3); + t1 = tcurr; + t2 = vextq_f32(tcurr, tnext, 1); + if(!dx) + { + t0 = vaddq_f32(t0, vaddq_f32(vaddq_f32(t1, t1), t2)); + } + else if(dx == 2) + { + t0 = vsubq_f32(vsubq_f32(t2, t1), vsubq_f32(t1, t0)); + } + else + { + t0 = vsubq_f32(t2, t0); + } + + if(!(y%2)) + { + vst1q_f32(trow0 + x - 4, t0); + } + else + { + vst1q_f32(trow1 + x - 4, t0); + } + } + x -= 4; + if(x == size.width){ + x--; + } + f32 prevx = 0, rowx = 0, nextx = 0; + if(!dy) + { + prevx = x > 0 ? srow2[x-1] + 2*srow1[x-1] + srow0[x-1] : + (borderType == BORDER_MODE_REFLECT101 ? srow2[1] + 2*srow1[1] + srow0[1] : + (borderType == BORDER_MODE_CONSTANT ? 4*borderValue : + srow2[0] + 2*srow1[0] + srow0[0]) ); + rowx = srow2[x] + 2*srow1[x] + srow0[x]; + } + else if(dy == 2) + { + prevx = x > 0 ? srow2[x-1] - 2*srow1[x-1] + srow0[x-1] : + (borderType == BORDER_MODE_REFLECT101 ? srow2[1] - 2*srow1[1] + srow0[1] : + (borderType == BORDER_MODE_CONSTANT ? 0.f : + srow2[0] - 2*srow1[0] + srow0[0]) ); + rowx = srow2[x] - 2*srow1[x] + srow0[x]; + } + else + { + prevx = x > 0 ? srow2[x-1] - srow0[x-1] : + (borderType == BORDER_MODE_REFLECT101 ? srow2[1] - srow0[1] : + (borderType == BORDER_MODE_CONSTANT ? 
0.f : + srow2[0] - srow0[0]) ); + rowx = srow2[x] - srow0[x]; + } + + for( ; x < size.width; x++ ) + { + if(x+1 == size.width) { + // make border + if (borderType == BORDER_MODE_CONSTANT) + { + if(!dy) { + nextx = 4*borderValue; + } else { + nextx = 0.f; + } + } else if (borderType == BORDER_MODE_REFLECT101) + { + if(!dy) { + nextx = srow2[x-1] + 2*srow1[x-1] + srow0[x-1]; + } else if(dy == 2) { + nextx = srow2[x-1] - 2*srow1[x-1] + srow0[x-1]; + } else { + nextx = srow2[x-1] - srow0[x-1]; + } + } else { + if(!dy) { + nextx = srow2[x] + 2*srow1[x] + srow0[x]; + } else if(dy == 2) { + nextx = srow2[x] - 2*srow1[x] + srow0[x]; + } else { + nextx = srow2[x] - srow0[x]; + } + } + } else { + if(!dy) { + nextx = srow2[x+1] + 2*srow1[x+1] + srow0[x+1]; + } else if(dy == 2) { + nextx = srow2[x+1] - 2*srow1[x+1] + srow0[x+1]; + } else { + nextx = srow2[x+1] - srow0[x+1]; + } + } + f32 res; + if(dx==1) { + res = nextx - prevx; + } else if(!dx) { + res = prevx + 2*rowx + nextx; + } else { + res = prevx - 2*rowx + nextx; + } + if(!(y%2)) { + *(trow0+x) = res; + } else { + *(trow1+x) = res; + } + prevx = rowx; + rowx = nextx; + } + + if(y>0) { + for(size_t x1 = 0; x1 < size.width; x1++ ) + { + if(y%2) + *(drow + x1) = trow0[x1]; + else + *(drow + x1) = trow1[x1]; + } + } + if(y == size.height-1) { + for(size_t x1 = 0; x1 < size.width; x1++ ) + { + if(!(y%2)) + *(drow1 + x1) = trow0[x1]; + else + *(drow1 + x1) = trow1[x1]; + } + } + } +#else + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)borderValue; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/sub.cpp b/3rdparty/carotene/src/sub.cpp new file mode 100644 index 0000000000..38853895e7 --- /dev/null +++ b/3rdparty/carotene/src/sub.cpp @@ -0,0 +1,621 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. 
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+#include "vtransform.hpp"
+
+namespace CAROTENE_NS {
+
+#ifdef CAROTENE_NEON
+
+namespace {
+
+template <typename T, typename WT = T>
+struct SubWrap
+{
+    typedef T type;
+
+    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
+                     const typename internal::VecTraits<T>::vec128 & v_src1,
+                     typename internal::VecTraits<T>::vec128 & v_dst) const
+    {
+        v_dst = internal::vsubq(v_src0, v_src1);
+    }
+
+    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
+                     const typename internal::VecTraits<T>::vec64 & v_src1,
+                     typename internal::VecTraits<T>::vec64 & v_dst) const
+    {
+        v_dst = internal::vsub(v_src0, v_src1);
+    }
+
+    void operator() (const T * src0, const T * src1, T * dst) const
+    {
+        dst[0] = (T)((WT)src0[0] - (WT)src1[0]);
+    }
+};
+
+template <typename T, typename WT = T>
+struct SubSaturate
+{
+    typedef T type;
+
+    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0,
+                     const typename internal::VecTraits<T>::vec128 & v_src1,
+                     typename internal::VecTraits<T>::vec128 & v_dst) const
+    {
+        v_dst = internal::vqsubq(v_src0, v_src1);
+    }
+
+    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0,
+                     const typename internal::VecTraits<T>::vec64 & v_src1,
+                     typename internal::VecTraits<T>::vec64 & v_dst) const
+    {
+        v_dst = internal::vqsub(v_src0, v_src1);
+    }
+
+    void operator() (const T * src0, const T * src1, T * dst) const
+    {
+        dst[0] = internal::saturate_cast<T>((WT)src0[0] - (WT)src1[0]);
+    }
+};
+
+} // namespace
+
+#endif
+
+void sub(const Size2D &size,
+         const u8 * src0Base, ptrdiff_t src0Stride,
+         const u8 * src1Base, ptrdiff_t src1Stride,
+         u8 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (policy == CONVERT_POLICY_SATURATE)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubSaturate<u8, s16>()); // s16 covers the full u8 - u8 range before clamping
+    }
+    else
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubWrap<u8, s16>());
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+void sub(const Size2D &size,
+         const u8 * src0Base, ptrdiff_t src0Stride,
+         const u8 * src1Base, ptrdiff_t src1Stride,
+         s16 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    size_t roiw32 = size.width >= 31 ? size.width - 31 : 0;
+    size_t roiw8 = size.width >= 7 ?
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + u16 * dstu16 = internal::getRowPtr((u16 *)dstBase, dstStride, i); + s16 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src00 = vld1q_u8(src0 + j), v_src01 = vld1q_u8(src0 + j + 16); + uint8x16_t v_src10 = vld1q_u8(src1 + j), v_src11 = vld1q_u8(src1 + j + 16); + vst1q_u16(dstu16 + j, vsubl_u8(vget_low_u8(v_src00), vget_low_u8(v_src10))); + vst1q_u16(dstu16 + j + 8, vsubl_u8(vget_high_u8(v_src00), vget_high_u8(v_src10))); + vst1q_u16(dstu16 + j + 16, vsubl_u8(vget_low_u8(v_src01), vget_low_u8(v_src11))); + vst1q_u16(dstu16 + j + 24, vsubl_u8(vget_high_u8(v_src01), vget_high_u8(v_src11))); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v_src0 = vld1_u8(src0 + j); + uint8x8_t v_src1 = vld1_u8(src1 + j); + vst1q_u16(dstu16 + j, vsubl_u8(v_src0, v_src1)); + } + + for (; j < size.width; j++) + dst[j] = (s16)src0[j] - (s16)src1[j]; + } +#else + (void)size; + (void)src0Base; + (void)src0Stride; + (void)src1Base; + (void)src1Stride; + (void)dstBase; + (void)dstStride; +#endif +} + +void sub(const Size2D &size, + const u8 * src0Base, ptrdiff_t src0Stride, + const u8 * src1Base, ptrdiff_t src1Stride, + f32 *dstBase, ptrdiff_t dstStride) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i); + const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i); + f32 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src0 + j); + internal::prefetch(src1 + j); + uint8x16_t v_src00 = vld1q_u8(src0 + j), v_src01 = vld1q_u8(src0 + j + 16); + uint8x16_t v_src10 = vld1q_u8(src1 + j), v_src11 = vld1q_u8(src1 + j + 16); + int16x8_t vsl = vreinterpretq_s16_u16(vsubl_u8( vget_low_u8(v_src00), vget_low_u8(v_src10))); + int16x8_t vsh = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(v_src00), vget_high_u8(v_src10))); + + vst1q_f32(dst + j + 0, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vsl) ))); + vst1q_f32(dst + j + 4, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vsl) ))); + vst1q_f32(dst + j + 8, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vsh) ))); + vst1q_f32(dst + j + 12, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vsh) ))); + + vsl = vreinterpretq_s16_u16(vsubl_u8( vget_low_u8(v_src01), vget_low_u8(v_src11))); + vsh = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(v_src01), vget_high_u8(v_src11))); + + vst1q_f32(dst + j + 16, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vsl) ))); + vst1q_f32(dst + j + 20, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vsl) ))); + vst1q_f32(dst + j + 24, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vsh) ))); + vst1q_f32(dst + j + 28, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vsh) ))); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v_src0 = vld1_u8(src0 + j); + uint8x8_t v_src1 = vld1_u8(src1 + j); + + int16x8_t vs = vreinterpretq_s16_u16(vsubl_u8(v_src0, v_src1)); + vst1q_f32(dst + j + 0, vcvtq_f32_s32(vmovl_s16( vget_low_s16(vs) ))); + vst1q_f32(dst + j + 4, vcvtq_f32_s32(vmovl_s16( vget_high_s16(vs) ))); + } + for(; j < size.width; j++) + dst[j] = (f32)src0[j] - (f32)src1[j]; + } +#else 
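+    // a note on the fallback branches in this file: when CAROTENE_NEON is not
+    // defined, assertSupportedConfiguration() above is expected to reject the
+    // call, so the casts below only keep unused-parameter warnings quiet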
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+void sub(const Size2D &size,
+         const u8 * src0Base, ptrdiff_t src0Stride,
+         const s16 * src1Base, ptrdiff_t src1Stride,
+         s16 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
+    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
+        const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
+        s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t j = 0;
+
+        if (policy == CONVERT_POLICY_SATURATE)
+        {
+            for (; j < roiw16; j += 16)
+            {
+                internal::prefetch(src0 + j);
+                internal::prefetch(src1 + j);
+                uint8x16_t v_src0 = vld1q_u8(src0 + j);
+                int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
+                int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
+                int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
+                int16x8_t v_dst0 = vqsubq_s16(v_src00, v_src10);
+                int16x8_t v_dst1 = vqsubq_s16(v_src01, v_src11);
+                vst1q_s16(dst + j, v_dst0);
+                vst1q_s16(dst + j + 8, v_dst1);
+            }
+            for (; j < roiw8; j += 8)
+            {
+                int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
+                int16x8_t v_src1 = vld1q_s16(src1 + j);
+                int16x8_t v_dst = vqsubq_s16(v_src0, v_src1);
+                vst1q_s16(dst + j, v_dst);
+            }
+
+            for (; j < size.width; j++)
+                dst[j] = internal::saturate_cast<s16>((s32)src0[j] - (s32)src1[j]);
+        }
+        else
+        {
+            for (; j < roiw16; j += 16)
+            {
+                internal::prefetch(src0 + j);
+                internal::prefetch(src1 + j);
+                uint8x16_t v_src0 = vld1q_u8(src0 + j);
+                int16x8_t v_src00 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src0)));
+                int16x8_t v_src01 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src0)));
+                int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);
+                int16x8_t v_dst0 = vsubq_s16(v_src00, v_src10);
+                int16x8_t v_dst1 = vsubq_s16(v_src01, v_src11);
+                vst1q_s16(dst + j, v_dst0);
+                vst1q_s16(dst + j + 8, v_dst1);
+            }
+            for (; j < roiw8; j += 8)
+            {
+                int16x8_t v_src0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0 + j)));
+                int16x8_t v_src1 = vld1q_s16(src1 + j);
+                int16x8_t v_dst = vsubq_s16(v_src0, v_src1);
+                vst1q_s16(dst + j, v_dst);
+            }
+
+            for (; j < size.width; j++)
+                dst[j] = (s16)((s32)src0[j] - (s32)src1[j]);
+        }
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+void sub(const Size2D &size,
+         const s16 * src0Base, ptrdiff_t src0Stride,
+         const u8 * src1Base, ptrdiff_t src1Stride,
+         s16 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
+    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
+
+    for (size_t i = 0; i < size.height; ++i)
+    {
+        const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
+        const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
+        s16 * dst = internal::getRowPtr(dstBase, dstStride, i);
+        size_t j = 0;
+
+        if (policy == CONVERT_POLICY_SATURATE)
+        {
+            for (; j < roiw16; j += 16)
+            {
+                internal::prefetch(src0 + j);
+                internal::prefetch(src1 + j);
+                int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8);
+                uint8x16_t v_src1 = vld1q_u8(src1 + j);
+                int16x8_t v_src10 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src1)));
+                int16x8_t v_src11 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src1)));
+                int16x8_t v_dst0 = vqsubq_s16(v_src00, v_src10);
+                int16x8_t v_dst1 = vqsubq_s16(v_src01, v_src11);
+                vst1q_s16(dst + j, v_dst0);
+                vst1q_s16(dst + j + 8, v_dst1);
+            }
+            for (; j < roiw8; j += 8)
+            {
+                int16x8_t v_src0 = vld1q_s16(src0 + j);
+                int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src1 + j)));
+                int16x8_t v_dst = vqsubq_s16(v_src0, v_src1);
+                vst1q_s16(dst + j, v_dst);
+            }
+
+            for (; j < size.width; j++)
+                dst[j] = internal::saturate_cast<s16>((s32)src0[j] - (s32)src1[j]);
+        }
+        else
+        {
+            for (; j < roiw16; j += 16)
+            {
+                internal::prefetch(src0 + j);
+                internal::prefetch(src1 + j);
+                int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8);
+                uint8x16_t v_src1 = vld1q_u8(src1 + j);
+                int16x8_t v_src10 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src1)));
+                int16x8_t v_src11 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src1)));
+                int16x8_t v_dst0 = vsubq_s16(v_src00, v_src10);
+                int16x8_t v_dst1 = vsubq_s16(v_src01, v_src11);
+                vst1q_s16(dst + j, v_dst0);
+                vst1q_s16(dst + j + 8, v_dst1);
+            }
+            for (; j < roiw8; j += 8)
+            {
+                int16x8_t v_src0 = vld1q_s16(src0 + j);
+                int16x8_t v_src1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src1 + j)));
+                int16x8_t v_dst = vsubq_s16(v_src0, v_src1);
+                vst1q_s16(dst + j, v_dst);
+            }
+
+            for (; j < size.width; j++)
+                dst[j] = (s16)((s32)src0[j] - (s32)src1[j]);
+        }
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+void sub(const Size2D &size,
+         const s8 * src0Base, ptrdiff_t src0Stride,
+         const s8 * src1Base, ptrdiff_t src1Stride,
+         s8 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (policy == CONVERT_POLICY_SATURATE)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubSaturate<s8, s16>());
+    }
+    else
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubWrap<s8, s16>());
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+void sub(const Size2D &size,
+         const s16 * src0Base, ptrdiff_t src0Stride,
+         const s16 * src1Base, ptrdiff_t src1Stride,
+         s16 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (policy == CONVERT_POLICY_SATURATE)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubSaturate<s16, s32>());
+    }
+    else
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubWrap<s16, s32>());
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+void sub(const Size2D &size,
+         const u16 * src0Base, ptrdiff_t src0Stride,
+         const u16 * src1Base, ptrdiff_t src1Stride,
+         u16 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (policy == CONVERT_POLICY_SATURATE)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubSaturate<u16, s32>());
+    }
+    else
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubWrap<u16, s32>());
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+void sub(const Size2D &size,
+         const s32 * src0Base, ptrdiff_t src0Stride,
+         const s32 * src1Base, ptrdiff_t src1Stride,
+         s32 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (policy == CONVERT_POLICY_SATURATE)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubSaturate<s32, s64>());
+    }
+    else
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubWrap<s32, s64>());
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+void sub(const Size2D &size,
+         const u32 * src0Base, ptrdiff_t src0Stride,
+         const u32 * src1Base, ptrdiff_t src1Stride,
+         u32 *dstBase, ptrdiff_t dstStride,
+         CONVERT_POLICY policy)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    if (policy == CONVERT_POLICY_SATURATE)
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubSaturate<u32, s64>());
+    }
+    else
+    {
+        internal::vtransform(size,
+                             src0Base, src0Stride,
+                             src1Base, src1Stride,
+                             dstBase, dstStride,
+                             SubWrap<u32, s64>());
+    }
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+    (void)policy;
+#endif
+}
+
+void sub(const Size2D &size,
+         const f32 * src0Base, ptrdiff_t src0Stride,
+         const f32 * src1Base, ptrdiff_t src1Stride,
+         f32 *dstBase, ptrdiff_t dstStride)
+{
+    internal::assertSupportedConfiguration();
+#ifdef CAROTENE_NEON
+    internal::vtransform(size,
+                         src0Base, src0Stride,
+                         src1Base, src1Stride,
+                         dstBase, dstStride,
+                         SubWrap<f32>());
+#else
+    (void)size;
+    (void)src0Base;
+    (void)src0Stride;
+    (void)src1Base;
+    (void)src1Stride;
+    (void)dstBase;
+    (void)dstStride;
+#endif
+}
+
+} // namespace CAROTENE_NS
diff --git a/3rdparty/carotene/src/sum.cpp b/3rdparty/carotene/src/sum.cpp
new file mode 100644
index 0000000000..812e7fca67
--- /dev/null
+++ b/3rdparty/carotene/src/sum.cpp
@@ -0,0 +1,385 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                          License Agreement
+ *               For Open Source Computer Vision Library
+ *                       (3-clause BSD License)
+ *
+ * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners.
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +#include "vtransform.hpp" + +namespace CAROTENE_NS { + +bool isSumSupported(u32 channels) +{ + return (channels && channels < 5); +} + +void sum(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + u32 * sumdst, u32 channels) +{ + internal::assertSupportedConfiguration(isSumSupported(channels)); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + const ptrdiff_t width = size.width * channels; + + for(size_t k = 0; k < size.height; ++k) + { + const u8* src = internal::getRowPtr( srcBase, srcStride, k); + ptrdiff_t i = 0; + + if (channels == 3) + { + uint32x4_t vs1231 = vdupq_n_u32(0); + uint32x4_t vs3123 = vdupq_n_u32(0); + uint32x4_t vs2312 = vdupq_n_u32(0); + for (; i <= width - 257*8*3; i += 257*8*3, src += 257*8*3) + { + uint16x8_t s1 = vmovl_u8(vld1_u8(src + 0)); + uint16x8_t s2 = vmovl_u8(vld1_u8(src + 8)); + uint16x8_t s3 = vmovl_u8(vld1_u8(src + 16)); + + for (ptrdiff_t j = 8*3; j < 257*8*3; j+= 8*3) + { + internal::prefetch(src + j + 24); + s1 = vaddw_u8(s1, vld1_u8(src + j + 0)); + s2 = vaddw_u8(s2, vld1_u8(src + j + 8)); + s3 = vaddw_u8(s3, vld1_u8(src + j + 16)); + } + + vs1231 = vqaddq_u32(vs1231, vaddl_u16(vget_low_u16(s1), vget_high_u16(s2))); + vs3123 = vqaddq_u32(vs3123, vaddl_u16(vget_low_u16(s2), vget_high_u16(s3))); + vs2312 = vqaddq_u32(vs2312, vaddl_u16(vget_low_u16(s3), vget_high_u16(s1))); + } + if (i <= width - 8*3) + { + uint16x8_t s1 = vmovl_u8(vld1_u8(src + 0)); + uint16x8_t s2 = vmovl_u8(vld1_u8(src + 8)); + uint16x8_t s3 = vmovl_u8(vld1_u8(src + 16)); + + for (i += 8*3, src += 8*3; i <= width - 8*3; i += 8*3, src += 8*3) + { + internal::prefetch(src + 24); + s1 = vaddw_u8(s1, vld1_u8(src + 0)); + s2 = vaddw_u8(s2, vld1_u8(src + 8)); + s3 = vaddw_u8(s3, vld1_u8(src + 16)); + } + + vs1231 = vqaddq_u32(vs1231, vaddl_u16(vget_low_u16(s1), vget_high_u16(s2))); + vs3123 = vqaddq_u32(vs3123, vaddl_u16(vget_low_u16(s2), vget_high_u16(s3))); 
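+                // lane bookkeeping: with 3 interleaved channels and 8-wide
+                // vectors the channel pattern rotates, so these accumulators
+                // hold per-lane sums in the orders (c1,c2,c3,c1), (c3,c1,c2,c3)
+                // and (c2,c3,c1,c2); the scalar epilogue below rebuilds the
+                // per-channel totals by striding through the spilled
+                // 12-element array in steps of 3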
+ vs2312 = vqaddq_u32(vs2312, vaddl_u16(vget_low_u16(s3), vget_high_u16(s1))); + } + + u32 sum[12]; + vst1q_u32(sum+0, vs1231); + vst1q_u32(sum+4, vs2312); + vst1q_u32(sum+8, vs3123); + + for (; i < width; i += 3, src += 3) + { + sumdst[0] += src[0]; + sumdst[1] += src[1]; + sumdst[2] += src[2]; + } + + sumdst[0] += sum[0] + sum[3] + sum[6] + sum[9]; + sumdst[1] += sum[1] + sum[4] + sum[7] + sum[10]; + sumdst[2] += sum[2] + sum[5] + sum[8] + sum[11]; + } + else + { + uint32x4_t vs = vdupq_n_u32(0); + for (; i <= width - 257*8; i += 257*8, src += 257 * 8) + { + uint16x8_t s1 = vmovl_u8(vld1_u8(src)); + + for (int j = 8; j < 257 * 8; j += 8) + { + internal::prefetch(src + j); + s1 = vaddw_u8(s1, vld1_u8(src + j)); + } + + vs = vqaddq_u32(vs, vaddl_u16(vget_low_u16(s1), vget_high_u16(s1))); + } + if (i < width - 7) + { + uint16x8_t s1 = vmovl_u8(vld1_u8(src)); + + for(i+=8,src+=8; i < width-7; i+=8,src+=8) + { + internal::prefetch(src); + s1 = vaddw_u8(s1, vld1_u8(src)); + } + vs = vqaddq_u32(vs, vaddl_u16(vget_low_u16(s1), vget_high_u16(s1))); + } + + if (channels == 1) + { + uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs)); + uint32x2_t vs1 = vreinterpret_u32_u64(vpaddl_u32(vs2)); + + u32 s0 = vget_lane_u32(vs1, 0); + for(; i < width; ++i,++src) + s0 += src[0]; + sumdst[0] += s0; + } + else if (channels == 4) + { + vst1q_u32(sumdst, vqaddq_u32(vs, vld1q_u32(sumdst))); + + for(; i < width; i+=4,src+=4) + { + sumdst[0] += src[0]; + sumdst[1] += src[1]; + sumdst[2] += src[2]; + sumdst[3] += src[3]; + } + } + else//if (channels == 2) + { + uint32x2_t vs2 = vqadd_u32(vget_low_u32(vs), vget_high_u32(vs)); + vst1_u32(sumdst, vqadd_u32(vs2, vld1_u32(sumdst))); + + for(; i < width; i+=2,src+=2) + { + sumdst[0] += src[0]; + sumdst[1] += src[1]; + } + } + }//channels != 3 + } +#else + (void)_size; + (void)srcBase; + (void)srcStride; + (void)sumdst; + (void)channels; +#endif +} + +void sum(const Size2D &_size, + const f32 * srcBase, ptrdiff_t srcStride, + f64 * sumdst, u32 channels) +{ + internal::assertSupportedConfiguration(isSumSupported(channels)); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width)) + { + size.width *= size.height; + size.height = 1; + } + const ptrdiff_t width = size.width * channels; + + for(size_t k = 0; k < size.height; ++k) + { + const f32* src = internal::getRowPtr( srcBase, srcStride, k); + ptrdiff_t i = 0; + + if (channels == 3) + { + float32x4_t vs1231 = vdupq_n_f32(0); + float32x4_t vs2312 = vdupq_n_f32(0); + float32x4_t vs3123 = vdupq_n_f32(0); + for(; i <= width-12; i += 12) + { + internal::prefetch(src + i + 12); + vs1231 = vaddq_f32(vs1231, vld1q_f32(src + i + 0)); + vs2312 = vaddq_f32(vs2312, vld1q_f32(src + i + 4)); + vs3123 = vaddq_f32(vs3123, vld1q_f32(src + i + 8)); + } + + f32 s[12]; + vst1q_f32(s + 0, vs1231); + vst1q_f32(s + 4, vs2312); + vst1q_f32(s + 8, vs3123); + + sumdst[0] += s[0] + s[3] + s[6] + s[9]; + sumdst[1] += s[1] + s[4] + s[7] + s[10]; + sumdst[2] += s[2] + s[5] + s[8] + s[11]; + for( ; i < width; i+=3) + { + sumdst[0] += src[i]; + sumdst[1] += src[i+1]; + sumdst[2] += src[i+2]; + } + } + else + { + float32x4_t vs = vdupq_n_f32(0); + for(; i <= width-4; i += 4) + { + internal::prefetch(src + i); + vs = vaddq_f32(vs, vld1q_f32(src+i)); + } + + if (channels == 1) + { + float32x2_t vs2 = vpadd_f32(vget_low_f32(vs), vget_high_f32(vs)); + f32 s[2]; + vst1_f32(s, vs2); + + sumdst[0] += s[0] + s[1]; + for( ; i < width; i++) + sumdst[0] += src[i]; + } + else if (channels == 4) + { + f32 s[4]; + 
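+                // four channels: the row width is a multiple of 4 here, so the
+                // vector loop consumes every element and each accumulator lane
+                // maps onto exactly one channel; spill and add directly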
vst1q_f32(s, vs); + + sumdst[0] += s[0]; + sumdst[1] += s[1]; + sumdst[2] += s[2]; + sumdst[3] += s[3]; + } + else//if (channels == 2) + { + float32x2_t vs2 = vadd_f32(vget_low_f32(vs), vget_high_f32(vs)); + f32 s[2]; + vst1_f32(s, vs2); + + sumdst[0] += s[0]; + sumdst[1] += s[1]; + + if(i < width) + { + sumdst[0] += src[i]; + sumdst[1] += src[i+1]; + } + } + }//channels != 3 + } +#else + (void)_size; + (void)srcBase; + (void)srcStride; + (void)sumdst; + (void)channels; +#endif +} + +bool isSqsumSupported(u32 channels) +{ + return (channels && ((4/channels)*channels == 4)); +} + +void sqsum(const Size2D &_size, + const u8 * srcBase, ptrdiff_t srcStride, + f64 * sumdst, f64 * sqsumdst, u32 channels) +{ + internal::assertSupportedConfiguration(isSqsumSupported(channels)); +#ifdef CAROTENE_NEON + Size2D size(_size); + if (srcStride == (ptrdiff_t)(size.width*channels)) + { + size.width *= size.height; + size.height = 1; + } + const size_t width = size.width * channels; + + size_t blockSize0 = 1 << 23; + size_t roiw8 = width & ~7; + + uint32x4_t v_zero = vdupq_n_u32(0u); + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + size_t j = 0u; + + while (j < roiw8) + { + size_t blockSize = std::min(roiw8 - j, blockSize0) + j; + uint32x4_t v_sum = v_zero; + uint32x4_t v_sqsum = v_zero; + + for ( ; j < blockSize ; j += 8, src += 8) + { + internal::prefetch(src); + uint8x8_t v_src0 = vld1_u8(src); + + uint16x8_t v_src = vmovl_u8(v_src0); + uint16x4_t v_srclo = vget_low_u16(v_src), v_srchi = vget_high_u16(v_src); + v_sum = vaddq_u32(v_sum, vaddl_u16(v_srclo, v_srchi)); + v_sqsum = vmlal_u16(v_sqsum, v_srclo, v_srclo); + v_sqsum = vmlal_u16(v_sqsum, v_srchi, v_srchi); + } + + u32 arsum[8]; + vst1q_u32(arsum, v_sum); + vst1q_u32(arsum + 4, v_sqsum); + + sumdst[0] += (f64)arsum[0]; + sumdst[1 % channels] += (f64)arsum[1]; + sumdst[2 % channels] += (f64)arsum[2]; + sumdst[3 % channels] += (f64)arsum[3]; + sqsumdst[0] += (f64)arsum[4]; + sqsumdst[1 % channels] += (f64)arsum[5]; + sqsumdst[2 % channels] += (f64)arsum[6]; + sqsumdst[3 % channels] += (f64)arsum[7]; + } + // collect a few last elements in the current row + // it's ok to process channels elements per step + // since we could handle 1,2 or 4 channels + // we always have channels-fold amount of elements remaining + for ( ; j < width; j+=channels, src+=channels) + { + for (u32 kk = 0; kk < channels; kk++) + { + u32 srcval = src[kk]; + sumdst[kk] += srcval; + sqsumdst[kk] += srcval * srcval; + } + } + } +#else + (void)_size; + (void)srcBase; + (void)srcStride; + (void)sumdst; + (void)sqsumdst; + (void)channels; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/template_matching.cpp b/3rdparty/carotene/src/template_matching.cpp new file mode 100644 index 0000000000..ad87085188 --- /dev/null +++ b/3rdparty/carotene/src/template_matching.cpp @@ -0,0 +1,241 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2013-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. 
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *     this list of conditions and the following disclaimer in the documentation
+ *     and/or other materials provided with the distribution.
+ *
+ *   * Neither the names of the copyright holders nor the names of the contributors
+ *     may be used to endorse or promote products derived from this software
+ *     without specific prior written permission.
+ *
+ * This software is provided by the copyright holders and contributors "as is" and
+ * any express or implied warranties, including, but not limited to, the implied
+ * warranties of merchantability and fitness for a particular purpose are disclaimed.
+ * In no event shall copyright holders or contributors be liable for any direct,
+ * indirect, incidental, special, exemplary, or consequential damages
+ * (including, but not limited to, procurement of substitute goods or services;
+ * loss of use, data, or profits; or business interruption) however caused
+ * and on any theory of liability, whether in contract, strict liability,
+ * or tort (including negligence or otherwise) arising in any way out of
+ * the use of this software, even if advised of the possibility of such damage.
+ */
+
+#include "common.hpp"
+
+#include <cmath>
+#include <cstring>
+#include <vector>
+
+namespace CAROTENE_NS {
+
+#define ENABLE4LINESMATCHING false // disabled since the overall time for simultaneous 4-line matching
+                                   // is greater than that for simultaneous 2-line matching on the same
+                                   // amount of data
+
+bool isMatchTemplateSupported(const Size2D &tmplSize)
+{
+    return isSupportedConfiguration() &&
+           tmplSize.width >= 8 &&  // actually the function could process even shorter templates,
+                                   // but there would be no NEON optimization in that case
+           (tmplSize.width * tmplSize.height) <= 256;  // also keeps the raw u8 dot product below 2^24
+                                                       // (255*255*256 < 2^24), so it is exactly representable in f32
+}
+
+void matchTemplate(const Size2D &srcSize,
+                   const u8 * srcBase, ptrdiff_t srcStride,
+                   const Size2D &tmplSize,
+                   const u8 * tmplBase, ptrdiff_t tmplStride,
+                   f32 * dstBase, ptrdiff_t dstStride,
+                   bool normalize)
+{
+    internal::assertSupportedConfiguration(isMatchTemplateSupported(tmplSize));
+#ifdef CAROTENE_NEON
+    const size_t tmplW = tmplSize.width;
+    const size_t tmplH = tmplSize.height;
+    const size_t dstW = srcSize.width - tmplSize.width + 1;
+    const size_t dstH = srcSize.height - tmplSize.height + 1;
+
+    // template correlation part
+    {
+#if ENABLE4LINESMATCHING
+        const size_t dstroiw4 = dstW & ~3u;
+#endif
+        const size_t dstroiw2 = dstW & ~1u;
+        const size_t tmplroiw = tmplW & ~7u;
+        const size_t dstride = dstStride >> 2; // dstStride is in bytes; dstride indexes f32 elements
+
+        f32 *corr = dstBase;
+        const u8 *imgrrow = srcBase;
+        for(size_t r = 0; r < dstH; ++r, corr+=dstride, imgrrow+=srcStride)
+        {
+            size_t c = 0;
+#if ENABLE4LINESMATCHING
+            for(; c < dstroiw4; c+=4)
+            {
+                u32 dot[4] = {0, 0, 0, 0};
+                uint32x4_t vdot0 = vmovq_n_u32(0);
+                uint32x4_t vdot1 = vmovq_n_u32(0);
+                uint32x4_t vdot2 = vmovq_n_u32(0);
+                uint32x4_t vdot3 = vmovq_n_u32(0);
+
+                const u8 *img = imgrrow;
+                const u8 *tmpl = tmplBase;
+                for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
+                {
+                    size_t j = 0;
+                    for(; j < tmplroiw; j+=8)
+                    {
+                        uint8x8_t vtmpl = vld1_u8(tmpl + j);
+
+                        uint8x8_t vimg0 = vld1_u8(img + j + c + 0);
+                        uint8x8_t vimg1 = vld1_u8(img + j + c + 1);
+                        uint8x8_t vimg2 = vld1_u8(img + j + c + 2);
+                        uint8x8_t vimg3 = vld1_u8(img + j + c + 3);
+
+                        uint16x8_t vd0 = vmull_u8(vtmpl, vimg0);
+                        uint16x8_t vd1 = vmull_u8(vtmpl, vimg1);
+                        uint16x8_t vd2 = vmull_u8(vtmpl, vimg2);
+                        uint16x8_t vd3 = vmull_u8(vtmpl, vimg3);
+
+                        vdot0 = vpadalq_u16(vdot0, vd0);
+                        vdot1 = vpadalq_u16(vdot1, vd1);
+                        vdot2 = vpadalq_u16(vdot2, vd2);
+                        vdot3 = vpadalq_u16(vdot3, vd3);
+                    }
+                    for(; j < tmplW; ++j)
+                    {
+                        dot[0] += tmpl[j] * img[j + c + 0];
+                        dot[1] += tmpl[j] * img[j + c + 1];
+                        dot[2] += tmpl[j] * img[j + c + 2];
+                        dot[3] += tmpl[j] * img[j + c + 3];
+                    }
+                }
+                uint32x4_t vdotx = vld1q_u32(dot);
+                uint32x2_t vdot_0 = vpadd_u32(vget_low_u32(vdot0), vget_high_u32(vdot0));
+                uint32x2_t vdot_1 = vpadd_u32(vget_low_u32(vdot1), vget_high_u32(vdot1));
+                uint32x2_t vdot_2 = vpadd_u32(vget_low_u32(vdot2), vget_high_u32(vdot2));
+                uint32x2_t vdot_3 = vpadd_u32(vget_low_u32(vdot3), vget_high_u32(vdot3));
+                uint32x2_t vdot_01 = vpadd_u32(vdot_0, vdot_1);
+                uint32x2_t vdot_23 = vpadd_u32(vdot_2, vdot_3);
+
+                vst1q_f32(corr + c, vcvtq_f32_u32(vaddq_u32(vdotx, vcombine_u32(vdot_01, vdot_23))));
+            }
+#endif
+
+            for(; c < dstroiw2; c+=2)
+            {
+                u32 dot[2] = {0, 0};
+                uint32x4_t vdot0 = vmovq_n_u32(0);
+                uint32x4_t vdot1 = vmovq_n_u32(0);
+                const u8 *img = imgrrow;
+                const u8 *tmpl = tmplBase;
+                for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
+                {
+                    size_t j = 0;
+                    for(; j < tmplroiw; j+=8)
+                    {
+                        uint8x8_t vtmpl = vld1_u8(tmpl + j);
+
+                        uint8x8_t vimg0 = vld1_u8(img + j + c + 0);
+                        uint8x8_t vimg1 = vld1_u8(img + j + c + 1);
+
+                        uint16x8_t vd0 = vmull_u8(vtmpl, vimg0);
+                        uint16x8_t vd1 = vmull_u8(vtmpl, vimg1);
+
+                        vdot0 = vpadalq_u16(vdot0, vd0);
+                        vdot1 = vpadalq_u16(vdot1, vd1);
+                    }
+                    for(; j < tmplW; ++j)
+                    {
+                        dot[0] += tmpl[j] * img[j + c + 0];
+                        dot[1] += tmpl[j] * img[j + c + 1];
+                    }
+                }
+                uint32x2_t vdotx = vld1_u32(dot);
+                uint32x2_t vdot_0 = vpadd_u32(vget_low_u32(vdot0), vget_high_u32(vdot0));
+                uint32x2_t vdot_1 = vpadd_u32(vget_low_u32(vdot1), vget_high_u32(vdot1));
+                uint32x2_t vdot_ = vpadd_u32(vdot_0, vdot_1);
+                vst1_f32(corr + c, vcvt_f32_u32(vadd_u32(vdotx, vdot_)));
+            }
+
+            for(; c < dstW; ++c)
+            {
+                u32 dot = 0;
+                uint32x4_t vdot = vmovq_n_u32(0);
+                const u8 *img = imgrrow;
+                const u8 *tmpl = tmplBase;
+                for(size_t i = 0; i < tmplH; ++i, tmpl+=tmplStride, img+=srcStride)
+                {
+                    size_t j = 0;
+                    for(; j < tmplroiw; j+=8)
+                    {
+                        uint8x8_t vtmpl = vld1_u8(tmpl + j);
+                        uint8x8_t vimg = vld1_u8(img + j + c);
+                        uint16x8_t vd = vmull_u8(vtmpl, vimg);
+                        vdot = vpadalq_u16(vdot, vd);
+                    }
+                    for(; j < tmplW; ++j)
+                        dot += tmpl[j] * img[j + c];
+                }
+                u32 wdot[2];
+                vst1_u32(wdot, vpadd_u32(vget_low_u32(vdot), vget_high_u32(vdot)));
+                dot += wdot[0] + wdot[1];
+                corr[c] = (f32)dot;
+            }
+        }
+    }
+
+    if(normalize)
+    {
+        f32 tn = std::sqrt((f32)normL2(tmplSize, tmplBase, tmplStride));
+
+        // build a (w+1) x (h+1) integral image of squared pixels; the zero top
+        // row and left column simplify the window-sum lookups below
+        size_t iw = srcSize.width+1;
+        size_t ih = srcSize.height+1;
+        std::vector<f64> _sqsum(iw*ih);
+        f64 *sqsum = &_sqsum[0];
+        memset(sqsum, 0, iw*sizeof(f64));
+        for(size_t i = 1; i < ih; ++i)
+            sqsum[iw*i] = 0.;
+        sqrIntegral(srcSize, srcBase, srcStride, sqsum + iw + 1, iw*sizeof(f64));
+
+        for(size_t i = 0; i < dstH; ++i)
+        {
+            f32 *result = internal::getRowPtr(dstBase, dstStride, i);
+            for(size_t j = 0; j < dstW; ++j)
+            {
+                double s2 = sqsum[iw*i + j] +
+                            sqsum[iw*(i + tmplSize.height) + j + tmplSize.width] -
+                            sqsum[iw*(i + tmplSize.height) + j] -
+                            sqsum[iw*i + j + tmplSize.width];
+
+                result[j] /= tn * std::sqrt(s2);
+            }
+        }
+    }
+#else
+    (void)srcSize;
+    (void)srcBase;
+    (void)srcStride;
+    (void)tmplBase;
(void)tmplStride; + (void)dstBase; + (void)dstStride; + (void)normalize; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/threshold.cpp b/3rdparty/carotene/src/threshold.cpp new file mode 100644 index 0000000000..8e03798b02 --- /dev/null +++ b/3rdparty/carotene/src/threshold.cpp @@ -0,0 +1,1627 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "common.hpp" + +namespace CAROTENE_NS { + +void thresholdBinary(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold, u8 trueValue, u8 falseValue) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint8x16_t vthreshold = vdupq_n_u8(threshold); + uint8x8_t vthreshold8 = vdup_n_u8(threshold); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + if(trueValue == 255 && falseValue == 0) + { + for (size_t i = 0; i < size.height; ++i) { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) { + internal::prefetch(src + j); + uint8x16_t v0 = vld1q_u8(src + j); + uint8x16_t v1 = vld1q_u8(src + j + 16); + uint8x16_t r0 = vcgtq_u8(v0, vthreshold); + uint8x16_t r1 = vcgtq_u8(v1, vthreshold); + vst1q_u8(dst + j, r0); + vst1q_u8(dst + j + 16, r1); + } + for (; j < roiw8; j += 8) { + uint8x8_t v0 = vld1_u8(src + j); + uint8x8_t r0 = vcgt_u8(v0, vthreshold8); + vst1_u8(dst + j, r0); + } + + for (; j < size.width; j++) { + *(dst + j) = *(src + j) > threshold ? 
255 : 0; + } + } + } + else + { + uint8x16_t vtrue_value = vdupq_n_u8(trueValue); + uint8x8_t vtrue_value8 = vdup_n_u8(trueValue); + uint8x16_t vfalse_value = vdupq_n_u8(falseValue); + uint8x8_t vfalse_value8 = vdup_n_u8(falseValue); + + for (size_t i = 0; i < size.height; ++i) { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) { + internal::prefetch(src + j); + uint8x16_t v0 = vld1q_u8(src + j); + uint8x16_t v1 = vld1q_u8(src + j + 16); + uint8x16_t r0 = vcgtq_u8(v0, vthreshold); + uint8x16_t r1 = vcgtq_u8(v1, vthreshold); + uint8x16_t r0a = vbslq_u8(r0, vtrue_value, vfalse_value); + uint8x16_t r1a = vbslq_u8(r1, vtrue_value, vfalse_value); + vst1q_u8(dst + j, r0a); + vst1q_u8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) { + uint8x8_t v0 = vld1_u8(src + j); + uint8x8_t r0 = vcgt_u8(v0, vthreshold8); + uint8x8_t r0a = vbsl_u8(r0, vtrue_value8, vfalse_value8); + vst1_u8(dst + j, r0a); + } + + for (; j < size.width; j++) { + *(dst + j) = *(src + j) > threshold ? trueValue : falseValue; + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)trueValue; + (void)falseValue; +#endif +} + +void thresholdRange(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 lowerThreshold, u8 upperThreshold, + u8 trueValue, u8 falseValue) +{ + internal::assertSupportedConfiguration(); + +#ifdef CAROTENE_NEON + uint8x16_t v_lower = vdupq_n_u8(lowerThreshold), v_upper = vdupq_n_u8(upperThreshold); + uint8x8_t v_lower8 = vdup_n_u8(lowerThreshold), v_upper8 = vdup_n_u8(upperThreshold); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + if(trueValue == 255 && falseValue == 0) + { + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + uint8x16_t v_src0 = vld1q_u8(src + j), v_src1 = vld1q_u8(src + j + 16); + uint8x16_t v_dst0 = vandq_u8(vcgeq_u8(v_src0, v_lower), vcleq_u8(v_src0, v_upper)); + uint8x16_t v_dst1 = vandq_u8(vcgeq_u8(v_src1, v_lower), vcleq_u8(v_src1, v_upper)); + vst1q_u8(dst + j, v_dst0); + vst1q_u8(dst + j + 16, v_dst1); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v_src = vld1_u8(src + j); + uint8x8_t v_dst = vand_u8(vcge_u8(v_src, v_lower8), vcle_u8(v_src, v_upper8)); + vst1_u8(dst + j, v_dst); + } + + for (; j < size.width; j++) + { + u8 srcVal = src[j]; + dst[j] = lowerThreshold <= srcVal && srcVal <= upperThreshold ? 
255 : 0; + } + } + } + else + { + uint8x16_t vtrue_value = vdupq_n_u8(trueValue); + uint8x8_t vtrue_value8 = vdup_n_u8(trueValue); + uint8x16_t vfalse_value = vdupq_n_u8(falseValue); + uint8x8_t vfalse_value8 = vdup_n_u8(falseValue); + + for (size_t i = 0; i < size.height; ++i) + { + const u8 * src = internal::getRowPtr(srcBase, srcStride, i); + u8 * dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + uint8x16_t v_src0 = vld1q_u8(src + j), v_src1 = vld1q_u8(src + j + 16); + uint8x16_t v_dst0 = vandq_u8(vcgeq_u8(v_src0, v_lower), vcleq_u8(v_src0, v_upper)); + uint8x16_t v_dst1 = vandq_u8(vcgeq_u8(v_src1, v_lower), vcleq_u8(v_src1, v_upper)); + v_dst0 = vbslq_u8(v_dst0, vtrue_value, vfalse_value); + v_dst1 = vbslq_u8(v_dst1, vtrue_value, vfalse_value); + vst1q_u8(dst + j, v_dst0); + vst1q_u8(dst + j + 16, v_dst1); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v_src = vld1_u8(src + j); + uint8x8_t v_dst = vand_u8(vcge_u8(v_src, v_lower8), vcle_u8(v_src, v_upper8)); + v_dst = vbsl_u8(v_dst, vtrue_value8, vfalse_value8); + vst1_u8(dst + j, v_dst); + } + + for (; j < size.width; j++) + { + u8 srcVal = src[j]; + dst[j] = lowerThreshold <= srcVal && srcVal <= upperThreshold ? trueValue : falseValue; + } + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)lowerThreshold; + (void)upperThreshold; + (void)trueValue; + (void)falseValue; +#endif +} + +void thresholdBinary(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold, u8 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint8x16_t vthreshold = vdupq_n_u8(threshold); + uint8x16_t vvalue = vdupq_n_u8(value); + uint8x8_t vthreshold8 = vdup_n_u8(threshold); + uint8x8_t vvalue8 = vdup_n_u8(value); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + uint8x16_t v0 = vld1q_u8(src + j); + uint8x16_t v1 = vld1q_u8(src + j + 16); + uint8x16_t r0 = vcgtq_u8(v0, vthreshold); + uint8x16_t r1 = vcgtq_u8(v1, vthreshold); + uint8x16_t r0a = vandq_u8(r0, vvalue); + uint8x16_t r1a = vandq_u8(r1, vvalue); + vst1q_u8(dst + j, r0a); + vst1q_u8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v0 = vld1_u8(src + j); + uint8x8_t r0 = vcgt_u8(v0, vthreshold8); + uint8x8_t r0a = vand_u8(r0, vvalue8); + vst1_u8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? value : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdBinaryInv(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold, u8 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint8x16_t vthreshold = vdupq_n_u8(threshold); + uint8x16_t vvalue = vdupq_n_u8(value); + uint8x8_t vthreshold8 = vdup_n_u8(threshold); + uint8x8_t vvalue8 = vdup_n_u8(value); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + uint8x16_t v0 = vld1q_u8(src + j); + uint8x16_t v1 = vld1q_u8(src + j + 16); + uint8x16_t r0 = vcleq_u8(v0, vthreshold); + uint8x16_t r1 = vcleq_u8(v1, vthreshold); + uint8x16_t r0a = vandq_u8(r0, vvalue); + uint8x16_t r1a = vandq_u8(r1, vvalue); + vst1q_u8(dst + j, r0a); + vst1q_u8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v0 = vld1_u8(src + j); + uint8x8_t r0 = vcle_u8(v0, vthreshold8); + uint8x8_t r0a = vand_u8(r0, vvalue8); + vst1_u8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : value; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdTruncate(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint8x16_t vthreshold = vdupq_n_u8(threshold); + uint8x8_t vthreshold8 = vdup_n_u8(threshold); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + uint8x16_t v0 = vld1q_u8(src + j); + uint8x16_t v1 = vld1q_u8(src + j + 16); + uint8x16_t r0 = vqsubq_u8(v0, vthreshold); + uint8x16_t r1 = vqsubq_u8(v1, vthreshold); + uint8x16_t r0a = vqsubq_u8(v0, r0); + uint8x16_t r1a = vqsubq_u8(v1, r1); + vst1q_u8(dst + j, r0a); + vst1q_u8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v0 = vld1_u8(src + j); + uint8x8_t r0 = vqsub_u8(v0, vthreshold8); + uint8x8_t r0a = vqsub_u8(v0, r0); + vst1_u8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? threshold : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZero(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint8x16_t vthreshold = vdupq_n_u8(threshold); + uint8x8_t vthreshold8 = vdup_n_u8(threshold); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + uint8x16_t v0 = vld1q_u8(src + j); + uint8x16_t v1 = vld1q_u8(src + j + 16); + uint8x16_t r0 = vcgtq_u8(v0, vthreshold); + uint8x16_t r1 = vcgtq_u8(v1, vthreshold); + uint8x16_t r0a = vandq_u8(v0, r0); + uint8x16_t r1a = vandq_u8(v1, r1); + vst1q_u8(dst + j, r0a); + vst1q_u8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v0 = vld1_u8(src + j); + uint8x8_t r0 = vcgt_u8(v0, vthreshold8); + uint8x8_t r0a = vand_u8(v0, r0); + vst1_u8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? *(src + j) : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZeroInv(const Size2D &size, + const u8 *srcBase, ptrdiff_t srcStride, + u8 *dstBase, ptrdiff_t dstStride, + u8 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint8x16_t vthreshold = vdupq_n_u8(threshold); + uint8x8_t vthreshold8 = vdup_n_u8(threshold); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u8* src = internal::getRowPtr(srcBase, srcStride, i); + u8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + uint8x16_t v0 = vld1q_u8(src + j); + uint8x16_t v1 = vld1q_u8(src + j + 16); + uint8x16_t r0 = vcgtq_u8(v0, vthreshold); + uint8x16_t r1 = vcgtq_u8(v1, vthreshold); + uint8x16_t r0a = vbicq_u8(v0, r0); + uint8x16_t r1a = vbicq_u8(v1, r1); + vst1q_u8(dst + j, r0a); + vst1q_u8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + uint8x8_t v0 = vld1_u8(src + j); + uint8x8_t r0 = vcgt_u8(v0, vthreshold8); + uint8x8_t r0a = vbic_u8(v0, r0); + vst1_u8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdBinary(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold, s8 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int8x16_t vthreshold = vdupq_n_s8(threshold); + int8x16_t vvalue = vdupq_n_s8(value); + int8x8_t vthreshold8 = vdup_n_s8(threshold); + int8x8_t vvalue8 = vdup_n_s8(value); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s8* src = internal::getRowPtr(srcBase, srcStride, i); + s8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + int8x16_t v0 = vld1q_s8(src + j); + int8x16_t v1 = vld1q_s8(src + j + 16); + int8x16_t r0 = vreinterpretq_s8_u8(vcgtq_s8(v0, vthreshold)); + int8x16_t r1 = vreinterpretq_s8_u8(vcgtq_s8(v1, vthreshold)); + int8x16_t r0a = vandq_s8(r0, vvalue); + int8x16_t r1a = vandq_s8(r1, vvalue); + vst1q_s8(dst + j, r0a); + vst1q_s8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + int8x8_t v0 = vld1_s8(src + j); + int8x8_t r0 = vreinterpret_s8_u8(vcgt_s8(v0, vthreshold8)); + int8x8_t r0a = vand_s8(r0, vvalue8); + vst1_s8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? value : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdBinaryInv(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold, s8 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int8x16_t vthreshold = vdupq_n_s8(threshold); + int8x16_t vvalue = vdupq_n_s8(value); + int8x8_t vthreshold8 = vdup_n_s8(threshold); + int8x8_t vvalue8 = vdup_n_s8(value); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s8* src = internal::getRowPtr(srcBase, srcStride, i); + s8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + int8x16_t v0 = vld1q_s8(src + j); + int8x16_t v1 = vld1q_s8(src + j + 16); + int8x16_t r0 = vreinterpretq_s8_u8(vcleq_s8(v0, vthreshold)); + int8x16_t r1 = vreinterpretq_s8_u8(vcleq_s8(v1, vthreshold)); + int8x16_t r0a = vandq_s8(r0, vvalue); + int8x16_t r1a = vandq_s8(r1, vvalue); + vst1q_s8(dst + j, r0a); + vst1q_s8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + int8x8_t v0 = vld1_s8(src + j); + int8x8_t r0 = vreinterpret_s8_u8(vcle_s8(v0, vthreshold8)); + int8x8_t r0a = vand_s8(r0, vvalue8); + vst1_s8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : value; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdTruncate(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int8x16_t vthreshold = vdupq_n_s8(threshold); + int8x8_t vthreshold8 = vdup_n_s8(threshold); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s8* src = internal::getRowPtr(srcBase, srcStride, i); + s8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + int8x16_t v0 = vld1q_s8(src + j); + int8x16_t v1 = vld1q_s8(src + j + 16); + int8x16_t r0 = vqsubq_s8(v0, vthreshold); + int8x16_t r1 = vqsubq_s8(v1, vthreshold); + int8x16_t r0a = vqsubq_s8(v0, r0); + int8x16_t r1a = vqsubq_s8(v1, r1); + vst1q_s8(dst + j, r0a); + vst1q_s8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + int8x8_t v0 = vld1_s8(src + j); + int8x8_t r0 = vqsub_s8(v0, vthreshold8); + int8x8_t r0a = vqsub_s8(v0, r0); + vst1_s8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? threshold : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZero(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int8x16_t vthreshold = vdupq_n_s8(threshold); + int8x8_t vthreshold8 = vdup_n_s8(threshold); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s8* src = internal::getRowPtr(srcBase, srcStride, i); + s8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + int8x16_t v0 = vld1q_s8(src + j); + int8x16_t v1 = vld1q_s8(src + j + 16); + int8x16_t r0 = vreinterpretq_s8_u8(vcgtq_s8(v0, vthreshold)); + int8x16_t r1 = vreinterpretq_s8_u8(vcgtq_s8(v1, vthreshold)); + int8x16_t r0a = vandq_s8(v0, r0); + int8x16_t r1a = vandq_s8(v1, r1); + vst1q_s8(dst + j, r0a); + vst1q_s8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + int8x8_t v0 = vld1_s8(src + j); + int8x8_t r0 = vreinterpret_s8_u8(vcgt_s8(v0, vthreshold8)); + int8x8_t r0a = vand_s8(v0, r0); + vst1_s8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? *(src + j) : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZeroInv(const Size2D &size, + const s8 *srcBase, ptrdiff_t srcStride, + s8 *dstBase, ptrdiff_t dstStride, + s8 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int8x16_t vthreshold = vdupq_n_s8(threshold); + int8x8_t vthreshold8 = vdup_n_s8(threshold); + size_t roiw32 = size.width >= 31 ? size.width - 31 : 0; + size_t roiw8 = size.width >= 7 ? 
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s8* src = internal::getRowPtr(srcBase, srcStride, i); + s8* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw32; j += 32) + { + internal::prefetch(src + j); + int8x16_t v0 = vld1q_s8(src + j); + int8x16_t v1 = vld1q_s8(src + j + 16); + int8x16_t r0 = vreinterpretq_s8_u8(vcgtq_s8(v0, vthreshold)); + int8x16_t r1 = vreinterpretq_s8_u8(vcgtq_s8(v1, vthreshold)); + int8x16_t r0a = vbicq_s8(v0, r0); + int8x16_t r1a = vbicq_s8(v1, r1); + vst1q_s8(dst + j, r0a); + vst1q_s8(dst + j + 16, r1a); + } + for (; j < roiw8; j += 8) + { + int8x8_t v0 = vld1_s8(src + j); + int8x8_t r0 = vreinterpret_s8_u8(vcgt_s8(v0, vthreshold8)); + int8x8_t r0a = vbic_s8(v0, r0); + vst1_s8(dst + j, r0a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdBinary(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold, s16 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int16x8_t vthreshold16 = vdupq_n_s16(threshold); + int16x8_t vvalue16 = vdupq_n_s16(value); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s16* src = internal::getRowPtr(srcBase, srcStride, i); + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + int16x8_t v0 = vld1q_s16(src + j); + int16x8_t v1 = vld1q_s16(src + j + 8); + uint16x8_t r0 = vcgtq_s16(v0, vthreshold16); + uint16x8_t r1 = vcgtq_s16(v1, vthreshold16); + uint16x8_t r0a = vandq_u16(r0, vreinterpretq_u16_s16(vvalue16)); + uint16x8_t r1a = vandq_u16(r1, vreinterpretq_u16_s16(vvalue16)); + vst1q_u16((u16*)dst + j, r0a); + vst1q_u16((u16*)dst + j + 8, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? value : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdBinaryInv(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold, s16 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int16x8_t vthreshold16 = vdupq_n_s16(threshold); + int16x8_t vvalue16 = vdupq_n_s16(value); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s16* src = internal::getRowPtr(srcBase, srcStride, i); + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + int16x8_t v0 = vld1q_s16(src + j); + int16x8_t v1 = vld1q_s16(src + j + 8); + uint16x8_t r0 = vcleq_s16(v0, vthreshold16); + uint16x8_t r1 = vcleq_s16(v1, vthreshold16); + uint16x8_t r0a = vandq_u16(r0, vreinterpretq_u16_s16(vvalue16)); + uint16x8_t r1a = vandq_u16(r1, vreinterpretq_u16_s16(vvalue16)); + vst1q_s16(dst + j, vreinterpretq_s16_u16(r0a)); + vst1q_s16(dst + j + 8, vreinterpretq_s16_u16(r1a)); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 
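+            // same test as the vcle-based vector path above, with the branches swapped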
0 : value; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdTruncate(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int16x8_t vthreshold16 = vdupq_n_s16(threshold); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s16* src = internal::getRowPtr(srcBase, srcStride, i); + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + int16x8_t v0 = vld1q_s16(src + j); + int16x8_t v1 = vld1q_s16(src + j + 8); + int16x8_t r0 = vminq_s16(v0, vthreshold16); + int16x8_t r1 = vminq_s16(v1, vthreshold16); + vst1q_s16(dst + j, r0); + vst1q_s16(dst + j + 8, r1); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? threshold : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZero(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int16x8_t vthreshold16 = vdupq_n_s16(threshold); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s16* src = internal::getRowPtr(srcBase, srcStride, i); + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + int16x8_t v0 = vld1q_s16(src + j); + int16x8_t v1 = vld1q_s16(src + j + 8); + uint16x8_t r0 = vcgtq_s16(v0, vthreshold16); + uint16x8_t r1 = vcgtq_s16(v1, vthreshold16); + uint16x8_t r0a = vandq_u16(vreinterpretq_u16_s16(v0), r0); + uint16x8_t r1a = vandq_u16(vreinterpretq_u16_s16(v1), r1); + vst1q_u16((u16*)dst + j, r0a); + vst1q_u16((u16*)dst + j + 8, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? *(src + j) : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZeroInv(const Size2D &size, + const s16 *srcBase, ptrdiff_t srcStride, + s16 *dstBase, ptrdiff_t dstStride, + s16 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int16x8_t vthreshold16 = vdupq_n_s16(threshold); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s16* src = internal::getRowPtr(srcBase, srcStride, i); + s16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + int16x8_t v0 = vld1q_s16(src + j); + int16x8_t v1 = vld1q_s16(src + j + 8); + uint16x8_t r0 = vcgtq_s16(v0, vthreshold16); + uint16x8_t r1 = vcgtq_s16(v1, vthreshold16); + uint16x8_t r0a = vbicq_u16(vreinterpretq_u16_s16(v0), r0); + uint16x8_t r1a = vbicq_u16(vreinterpretq_u16_s16(v1), r1); + vst1q_u16((u16*)dst + j, r0a); + vst1q_u16((u16*)dst + j + 8, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 
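+            // scalar tail of the vbic path: values above the threshold become 0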
0 : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdBinary(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold, u16 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint16x8_t vthreshold16 = vdupq_n_u16(threshold); + uint16x8_t vvalue16 = vdupq_n_u16(value); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u16* src = internal::getRowPtr(srcBase, srcStride, i); + u16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint16x8_t v0 = vld1q_u16(src + j); + uint16x8_t v1 = vld1q_u16(src + j + 8); + uint16x8_t r0 = vcgtq_u16(v0, vthreshold16); + uint16x8_t r1 = vcgtq_u16(v1, vthreshold16); + uint16x8_t r0a = vandq_u16(r0, vvalue16); + uint16x8_t r1a = vandq_u16(r1, vvalue16); + vst1q_u16(dst + j, r0a); + vst1q_u16(dst + j + 8, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? value : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdBinaryInv(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold, u16 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint16x8_t vthreshold16 = vdupq_n_u16(threshold); + uint16x8_t vvalue16 = vdupq_n_u16(value); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u16* src = internal::getRowPtr(srcBase, srcStride, i); + u16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint16x8_t v0 = vld1q_u16(src + j); + uint16x8_t v1 = vld1q_u16(src + j + 8); + uint16x8_t r0 = vcleq_u16(v0, vthreshold16); + uint16x8_t r1 = vcleq_u16(v1, vthreshold16); + uint16x8_t r0a = vandq_u16(r0, vvalue16); + uint16x8_t r1a = vandq_u16(r1, vvalue16); + vst1q_u16(dst + j, r0a); + vst1q_u16(dst + j + 8, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : value; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdTruncate(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint16x8_t vthreshold16 = vdupq_n_u16(threshold); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u16* src = internal::getRowPtr(srcBase, srcStride, i); + u16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint16x8_t v0 = vld1q_u16(src + j); + uint16x8_t v1 = vld1q_u16(src + j + 8); + uint16x8_t r0 = vminq_u16(v0, vthreshold16); + uint16x8_t r1 = vminq_u16(v1, vthreshold16); + vst1q_u16(dst + j, r0); + vst1q_u16(dst + j + 8, r1); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 
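+            // truncate is min(src, threshold), matching the vminq_u16 vector path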
threshold : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZero(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint16x8_t vthreshold16 = vdupq_n_u16(threshold); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u16* src = internal::getRowPtr(srcBase, srcStride, i); + u16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint16x8_t v0 = vld1q_u16(src + j); + uint16x8_t v1 = vld1q_u16(src + j + 8); + uint16x8_t r0 = vcgtq_u16(v0, vthreshold16); + uint16x8_t r1 = vcgtq_u16(v1, vthreshold16); + uint16x8_t r0a = vandq_u16(v0, r0); + uint16x8_t r1a = vandq_u16(v1, r1); + vst1q_u16(dst + j, r0a); + vst1q_u16(dst + j + 8, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? *(src + j) : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZeroInv(const Size2D &size, + const u16 *srcBase, ptrdiff_t srcStride, + u16 *dstBase, ptrdiff_t dstStride, + u16 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + uint16x8_t vthreshold16 = vdupq_n_u16(threshold); + size_t roiw16 = size.width >= 15 ? size.width - 15 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const u16* src = internal::getRowPtr(srcBase, srcStride, i); + u16* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw16; j += 16) + { + internal::prefetch(src + j); + uint16x8_t v0 = vld1q_u16(src + j); + uint16x8_t v1 = vld1q_u16(src + j + 8); + uint16x8_t r0 = vcgtq_u16(v0, vthreshold16); + uint16x8_t r1 = vcgtq_u16(v1, vthreshold16); + uint16x8_t r0a = vbicq_u16(v0, r0); + uint16x8_t r1a = vbicq_u16(v1, r1); + vst1q_u16(dst + j, r0a); + vst1q_u16(dst + j + 8, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdBinary(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold, s32 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int32x4_t vthreshold8 = vdupq_n_s32(threshold); + int32x4_t vvalue8 = vdupq_n_s32(value); + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s32* src = internal::getRowPtr(srcBase, srcStride, i); + s32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + int32x4_t v0 = vld1q_s32(src + j); + int32x4_t v1 = vld1q_s32(src + j + 4); + uint32x4_t r0 = vcgtq_s32(v0, vthreshold8); + uint32x4_t r1 = vcgtq_s32(v1, vthreshold8); + uint32x4_t r0a = vandq_u32(r0, vreinterpretq_u32_s32(vvalue8)); + uint32x4_t r1a = vandq_u32(r1, vreinterpretq_u32_s32(vvalue8)); + vst1q_u32((u32*)dst + j, r0a); + vst1q_u32((u32*)dst + j + 4, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 
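+            // scalar tail of the mask-and-value idiom: the vector path ANDs an all-ones lane mask with value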
value : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdBinaryInv(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold, s32 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int32x4_t vthreshold8 = vdupq_n_s32(threshold); + int32x4_t vvalue8 = vdupq_n_s32(value); + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s32* src = internal::getRowPtr(srcBase, srcStride, i); + s32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + int32x4_t v0 = vld1q_s32(src + j); + int32x4_t v1 = vld1q_s32(src + j + 4); + uint32x4_t r0 = vcleq_s32(v0, vthreshold8); + uint32x4_t r1 = vcleq_s32(v1, vthreshold8); + uint32x4_t r0a = vandq_u32(r0, vreinterpretq_u32_s32(vvalue8)); + uint32x4_t r1a = vandq_u32(r1, vreinterpretq_u32_s32(vvalue8)); + vst1q_s32(dst + j, vreinterpretq_s32_u32(r0a)); + vst1q_s32(dst + j + 4, vreinterpretq_s32_u32(r1a)); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : value; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdTruncate(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int32x4_t vthreshold8 = vdupq_n_s32(threshold); + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s32* src = internal::getRowPtr(srcBase, srcStride, i); + s32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + int32x4_t v0 = vld1q_s32(src + j); + int32x4_t v1 = vld1q_s32(src + j + 4); + int32x4_t r0 = vminq_s32(v0, vthreshold8); + int32x4_t r1 = vminq_s32(v1, vthreshold8); + vst1q_s32(dst + j, r0); + vst1q_s32(dst + j + 4, r1); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? threshold : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZero(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int32x4_t vthreshold8 = vdupq_n_s32(threshold); + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s32* src = internal::getRowPtr(srcBase, srcStride, i); + s32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + int32x4_t v0 = vld1q_s32(src + j); + int32x4_t v1 = vld1q_s32(src + j + 4); + uint32x4_t r0 = vcgtq_s32(v0, vthreshold8); + uint32x4_t r1 = vcgtq_s32(v1, vthreshold8); + uint32x4_t r0a = vandq_u32(vreinterpretq_u32_s32(v0), r0); + uint32x4_t r1a = vandq_u32(vreinterpretq_u32_s32(v1), r1); + vst1q_u32((u32*)dst + j, r0a); + vst1q_u32((u32*)dst + j + 4, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 
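+            // keep src only where src > threshold, as the vand mask does above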
*(src + j) : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZeroInv(const Size2D &size, + const s32 *srcBase, ptrdiff_t srcStride, + s32 *dstBase, ptrdiff_t dstStride, + s32 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + int32x4_t vthreshold8 = vdupq_n_s32(threshold); + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const s32* src = internal::getRowPtr(srcBase, srcStride, i); + s32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + int32x4_t v0 = vld1q_s32(src + j); + int32x4_t v1 = vld1q_s32(src + j + 4); + uint32x4_t r0 = vcgtq_s32(v0, vthreshold8); + uint32x4_t r1 = vcgtq_s32(v1, vthreshold8); + uint32x4_t r0a = vbicq_u32(vreinterpretq_u32_s32(v0), r0); + uint32x4_t r1a = vbicq_u32(vreinterpretq_u32_s32(v1), r1); + vst1q_u32((u32*)dst + j, r0a); + vst1q_u32((u32*)dst + j + 4, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdBinary(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold, f32 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + float32x4_t vthreshold8 = vdupq_n_f32(threshold); + float32x4_t vvalue8 = vdupq_n_f32(value); + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const f32* src = internal::getRowPtr(srcBase, srcStride, i); + f32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + float32x4_t v0 = vld1q_f32(src + j); + float32x4_t v1 = vld1q_f32(src + j + 4); + uint32x4_t r0 = vcgtq_f32(v0, vthreshold8); + uint32x4_t r1 = vcgtq_f32(v1, vthreshold8); + uint32x4_t r0a = vandq_u32(r0, vreinterpretq_u32_f32(vvalue8)); + uint32x4_t r1a = vandq_u32(r1, vreinterpretq_u32_f32(vvalue8)); + vst1q_u32((u32*)dst + j, r0a); + vst1q_u32((u32*)dst + j + 4, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? value : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdBinaryInv(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold, f32 value) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + float32x4_t vthreshold8 = vdupq_n_f32(threshold); + float32x4_t vvalue8 = vdupq_n_f32(value); + size_t roiw8 = size.width >= 7 ? 
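+    // f32 comparisons yield u32 lane masks, hence the reinterpret casts when masking the value bits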
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const f32* src = internal::getRowPtr(srcBase, srcStride, i); + f32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + float32x4_t v0 = vld1q_f32(src + j); + float32x4_t v1 = vld1q_f32(src + j + 4); + uint32x4_t r0 = vcleq_f32(v0, vthreshold8); + uint32x4_t r1 = vcleq_f32(v1, vthreshold8); + uint32x4_t r0a = vandq_u32(r0, vreinterpretq_u32_f32(vvalue8)); + uint32x4_t r1a = vandq_u32(r1, vreinterpretq_u32_f32(vvalue8)); + vst1q_f32(dst + j, vreinterpretq_f32_u32(r0a)); + vst1q_f32(dst + j + 4, vreinterpretq_f32_u32(r1a)); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : value; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; + (void)value; +#endif +} + +void thresholdTruncate(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + float32x4_t vthreshold8 = vdupq_n_f32(threshold); + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const f32* src = internal::getRowPtr(srcBase, srcStride, i); + f32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + float32x4_t v0 = vld1q_f32(src + j); + float32x4_t v1 = vld1q_f32(src + j + 4); + float32x4_t r0 = vminq_f32(v0, vthreshold8); + float32x4_t r1 = vminq_f32(v1, vthreshold8); + vst1q_f32(dst + j, r0); + vst1q_f32(dst + j + 4, r1); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? threshold : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZero(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + float32x4_t vthreshold8 = vdupq_n_f32(threshold); + size_t roiw8 = size.width >= 7 ? size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const f32* src = internal::getRowPtr(srcBase, srcStride, i); + f32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + float32x4_t v0 = vld1q_f32(src + j); + float32x4_t v1 = vld1q_f32(src + j + 4); + uint32x4_t r0 = vcgtq_f32(v0, vthreshold8); + uint32x4_t r1 = vcgtq_f32(v1, vthreshold8); + uint32x4_t r0a = vandq_u32(vreinterpretq_u32_f32(v0), r0); + uint32x4_t r1a = vandq_u32(vreinterpretq_u32_f32(v1), r1); + vst1q_u32((u32*)dst + j, r0a); + vst1q_u32((u32*)dst + j + 4, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? *(src + j) : 0; + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +void thresholdToZeroInv(const Size2D &size, + const f32 *srcBase, ptrdiff_t srcStride, + f32 *dstBase, ptrdiff_t dstStride, + f32 threshold) +{ + internal::assertSupportedConfiguration(); +#ifdef CAROTENE_NEON + float32x4_t vthreshold8 = vdupq_n_f32(threshold); + size_t roiw8 = size.width >= 7 ? 
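+    // vbic keeps the original float bits only in lanes where src <= threshold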
size.width - 7 : 0; + + for (size_t i = 0; i < size.height; ++i) + { + const f32* src = internal::getRowPtr(srcBase, srcStride, i); + f32* dst = internal::getRowPtr(dstBase, dstStride, i); + size_t j = 0; + + for (; j < roiw8; j += 8) + { + internal::prefetch(src + j); + float32x4_t v0 = vld1q_f32(src + j); + float32x4_t v1 = vld1q_f32(src + j + 4); + uint32x4_t r0 = vcgtq_f32(v0, vthreshold8); + uint32x4_t r1 = vcgtq_f32(v1, vthreshold8); + uint32x4_t r0a = vbicq_u32(vreinterpretq_u32_f32(v0), r0); + uint32x4_t r1a = vbicq_u32(vreinterpretq_u32_f32(v1), r1); + vst1q_u32((u32*)dst + j, r0a); + vst1q_u32((u32*)dst + j + 4, r1a); + } + for (; j < size.width; j++) + { + *(dst + j) = *(src + j) > threshold ? 0 : *(src + j); + } + } +#else + (void)size; + (void)srcBase; + (void)srcStride; + (void)dstBase; + (void)dstStride; + (void)threshold; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/vtransform.hpp b/3rdparty/carotene/src/vtransform.hpp new file mode 100644 index 0000000000..08841a2263 --- /dev/null +++ b/3rdparty/carotene/src/vtransform.hpp @@ -0,0 +1,689 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. 
+ */
+
+#ifndef CAROTENE_SRC_VTRANSFORM_HPP
+#define CAROTENE_SRC_VTRANSFORM_HPP
+
+#include "common.hpp"
+
+#include <arm_neon.h>
+
+#ifdef CAROTENE_NEON
+
+namespace CAROTENE_NS { namespace internal {
+
+////////////////////////////// Type Traits ///////////////////////
+
+template <typename T, int cn = 1>
+struct VecTraits;
+
+template <> struct VecTraits< u8, 1> { typedef uint8x16_t vec128; typedef uint8x8_t vec64; typedef VecTraits< u8, 1> unsign; };
+template <> struct VecTraits< s8, 1> { typedef int8x16_t vec128; typedef int8x8_t vec64; typedef VecTraits< u8, 1> unsign; };
+template <> struct VecTraits<u16, 1> { typedef uint16x8_t vec128; typedef uint16x4_t vec64; typedef VecTraits< u16, 1> unsign; };
+template <> struct VecTraits<s16, 1> { typedef int16x8_t vec128; typedef int16x4_t vec64; typedef VecTraits< u16, 1> unsign; };
+template <> struct VecTraits<s32, 1> { typedef int32x4_t vec128; typedef int32x2_t vec64; typedef VecTraits< u32, 1> unsign; };
+template <> struct VecTraits<u32, 1> { typedef uint32x4_t vec128; typedef uint32x2_t vec64; typedef VecTraits< u32, 1> unsign; };
+template <> struct VecTraits<s64, 1> { typedef int64x2_t vec128; typedef int64x1_t vec64; typedef VecTraits< u64, 1> unsign; };
+template <> struct VecTraits<u64, 1> { typedef uint64x2_t vec128; typedef uint64x1_t vec64; typedef VecTraits< u64, 1> unsign; };
+template <> struct VecTraits<f32, 1> { typedef float32x4_t vec128; typedef float32x2_t vec64; typedef VecTraits< u32, 1> unsign; };
+
+template <> struct VecTraits< u8, 2> { typedef uint8x16x2_t vec128; typedef uint8x8x2_t vec64; typedef VecTraits< u8, 2> unsign; };
+template <> struct VecTraits< s8, 2> { typedef int8x16x2_t vec128; typedef int8x8x2_t vec64; typedef VecTraits< u8, 2> unsign; };
+template <> struct VecTraits<u16, 2> { typedef uint16x8x2_t vec128; typedef uint16x4x2_t vec64; typedef VecTraits< u16, 2> unsign; };
+template <> struct VecTraits<s16, 2> { typedef int16x8x2_t vec128; typedef int16x4x2_t vec64; typedef VecTraits< u16, 2> unsign; };
+template <> struct VecTraits<s32, 2> { typedef int32x4x2_t vec128; typedef int32x2x2_t vec64; typedef VecTraits< u32, 2> unsign; };
+template <> struct VecTraits<u32, 2> { typedef uint32x4x2_t vec128; typedef uint32x2x2_t vec64; typedef VecTraits< u32, 2> unsign; };
+template <> struct VecTraits<s64, 2> { typedef int64x2x2_t vec128; typedef int64x1x2_t vec64; typedef VecTraits< u64, 2> unsign; };
+template <> struct VecTraits<u64, 2> { typedef uint64x2x2_t vec128; typedef uint64x1x2_t vec64; typedef VecTraits< u64, 2> unsign; };
+template <> struct VecTraits<f32, 2> { typedef float32x4x2_t vec128; typedef float32x2x2_t vec64; typedef VecTraits< u32, 2> unsign; };
+
+template <> struct VecTraits< u8, 3> { typedef uint8x16x3_t vec128; typedef uint8x8x3_t vec64; typedef VecTraits< u8, 3> unsign; };
+template <> struct VecTraits< s8, 3> { typedef int8x16x3_t vec128; typedef int8x8x3_t vec64; typedef VecTraits< u8, 3> unsign; };
+template <> struct VecTraits<u16, 3> { typedef uint16x8x3_t vec128; typedef uint16x4x3_t vec64; typedef VecTraits< u16, 3> unsign; };
+template <> struct VecTraits<s16, 3> { typedef int16x8x3_t vec128; typedef int16x4x3_t vec64; typedef VecTraits< u16, 3> unsign; };
+template <> struct VecTraits<s32, 3> { typedef int32x4x3_t vec128; typedef int32x2x3_t vec64; typedef VecTraits< u32, 3> unsign; };
+template <> struct VecTraits<u32, 3> { typedef uint32x4x3_t vec128; typedef uint32x2x3_t vec64; typedef VecTraits< u32, 3> unsign; };
+template <> struct VecTraits<s64, 3> { typedef int64x2x3_t vec128; typedef int64x1x3_t vec64; typedef VecTraits< u64, 2> unsign; };
+template <> struct VecTraits<u64, 3> { typedef uint64x2x3_t vec128; typedef uint64x1x3_t vec64; typedef
VecTraits< u64, 2> unsign; };
+template <> struct VecTraits<f32, 3> { typedef float32x4x3_t vec128; typedef float32x2x3_t vec64; typedef VecTraits< u32, 3> unsign; };
+
+template <> struct VecTraits< u8, 4> { typedef uint8x16x4_t vec128; typedef uint8x8x4_t vec64; typedef VecTraits< u8, 3> unsign; };
+template <> struct VecTraits< s8, 4> { typedef int8x16x4_t vec128; typedef int8x8x4_t vec64; typedef VecTraits< u8, 3> unsign; };
+template <> struct VecTraits<u16, 4> { typedef uint16x8x4_t vec128; typedef uint16x4x4_t vec64; typedef VecTraits< u16, 3> unsign; };
+template <> struct VecTraits<s16, 4> { typedef int16x8x4_t vec128; typedef int16x4x4_t vec64; typedef VecTraits< u16, 3> unsign; };
+template <> struct VecTraits<s32, 4> { typedef int32x4x4_t vec128; typedef int32x2x4_t vec64; typedef VecTraits< u32, 3> unsign; };
+template <> struct VecTraits<u32, 4> { typedef uint32x4x4_t vec128; typedef uint32x2x4_t vec64; typedef VecTraits< u32, 3> unsign; };
+template <> struct VecTraits<s64, 4> { typedef int64x2x4_t vec128; typedef int64x1x4_t vec64; typedef VecTraits< u64, 2> unsign; };
+template <> struct VecTraits<u64, 4> { typedef uint64x2x4_t vec128; typedef uint64x1x4_t vec64; typedef VecTraits< u64, 2> unsign; };
+template <> struct VecTraits<f32, 4> { typedef float32x4x4_t vec128; typedef float32x2x4_t vec64; typedef VecTraits< u32, 3> unsign; };
+
+////////////////////////////// vld1q ///////////////////////
+
+inline uint8x16_t vld1q(const u8 * ptr) { return vld1q_u8(ptr); }
+inline int8x16_t vld1q(const s8 * ptr) { return vld1q_s8(ptr); }
+inline uint16x8_t vld1q(const u16 * ptr) { return vld1q_u16(ptr); }
+inline int16x8_t vld1q(const s16 * ptr) { return vld1q_s16(ptr); }
+inline uint32x4_t vld1q(const u32 * ptr) { return vld1q_u32(ptr); }
+inline int32x4_t vld1q(const s32 * ptr) { return vld1q_s32(ptr); }
+inline float32x4_t vld1q(const f32 * ptr) { return vld1q_f32(ptr); }
+
+////////////////////////////// vld1 ///////////////////////
+
+inline uint8x8_t vld1(const u8 * ptr) { return vld1_u8(ptr); }
+inline int8x8_t vld1(const s8 * ptr) { return vld1_s8(ptr); }
+inline uint16x4_t vld1(const u16 * ptr) { return vld1_u16(ptr); }
+inline int16x4_t vld1(const s16 * ptr) { return vld1_s16(ptr); }
+inline uint32x2_t vld1(const u32 * ptr) { return vld1_u32(ptr); }
+inline int32x2_t vld1(const s32 * ptr) { return vld1_s32(ptr); }
+inline float32x2_t vld1(const f32 * ptr) { return vld1_f32(ptr); }
+
+////////////////////////////// vld2q ///////////////////////
+
+inline uint8x16x2_t vld2q(const u8 * ptr) { return vld2q_u8(ptr); }
+inline int8x16x2_t vld2q(const s8 * ptr) { return vld2q_s8(ptr); }
+inline uint16x8x2_t vld2q(const u16 * ptr) { return vld2q_u16(ptr); }
+inline int16x8x2_t vld2q(const s16 * ptr) { return vld2q_s16(ptr); }
+inline uint32x4x2_t vld2q(const u32 * ptr) { return vld2q_u32(ptr); }
+inline int32x4x2_t vld2q(const s32 * ptr) { return vld2q_s32(ptr); }
+inline float32x4x2_t vld2q(const f32 * ptr) { return vld2q_f32(ptr); }
+
+////////////////////////////// vld2 ///////////////////////
+
+inline uint8x8x2_t vld2(const u8 * ptr) { return vld2_u8(ptr); }
+inline int8x8x2_t vld2(const s8 * ptr) { return vld2_s8(ptr); }
+inline uint16x4x2_t vld2(const u16 * ptr) { return vld2_u16(ptr); }
+inline int16x4x2_t vld2(const s16 * ptr) { return vld2_s16(ptr); }
+inline uint32x2x2_t vld2(const u32 * ptr) { return vld2_u32(ptr); }
+inline int32x2x2_t vld2(const s32 * ptr) { return vld2_s32(ptr); }
+inline float32x2x2_t vld2(const f32 * ptr) { return vld2_f32(ptr); }
+
+////////////////////////////// vld3q ///////////////////////
+
+inline
uint8x16x3_t vld3q(const u8 * ptr) { return vld3q_u8(ptr); } +inline int8x16x3_t vld3q(const s8 * ptr) { return vld3q_s8(ptr); } +inline uint16x8x3_t vld3q(const u16 * ptr) { return vld3q_u16(ptr); } +inline int16x8x3_t vld3q(const s16 * ptr) { return vld3q_s16(ptr); } +inline uint32x4x3_t vld3q(const u32 * ptr) { return vld3q_u32(ptr); } +inline int32x4x3_t vld3q(const s32 * ptr) { return vld3q_s32(ptr); } +inline float32x4x3_t vld3q(const f32 * ptr) { return vld3q_f32(ptr); } + +////////////////////////////// vld3 /////////////////////// + +inline uint8x8x3_t vld3(const u8 * ptr) { return vld3_u8(ptr); } +inline int8x8x3_t vld3(const s8 * ptr) { return vld3_s8(ptr); } +inline uint16x4x3_t vld3(const u16 * ptr) { return vld3_u16(ptr); } +inline int16x4x3_t vld3(const s16 * ptr) { return vld3_s16(ptr); } +inline uint32x2x3_t vld3(const u32 * ptr) { return vld3_u32(ptr); } +inline int32x2x3_t vld3(const s32 * ptr) { return vld3_s32(ptr); } +inline float32x2x3_t vld3(const f32 * ptr) { return vld3_f32(ptr); } + +////////////////////////////// vld4q /////////////////////// + +inline uint8x16x4_t vld4q(const u8 * ptr) { return vld4q_u8(ptr); } +inline int8x16x4_t vld4q(const s8 * ptr) { return vld4q_s8(ptr); } +inline uint16x8x4_t vld4q(const u16 * ptr) { return vld4q_u16(ptr); } +inline int16x8x4_t vld4q(const s16 * ptr) { return vld4q_s16(ptr); } +inline uint32x4x4_t vld4q(const u32 * ptr) { return vld4q_u32(ptr); } +inline int32x4x4_t vld4q(const s32 * ptr) { return vld4q_s32(ptr); } +inline float32x4x4_t vld4q(const f32 * ptr) { return vld4q_f32(ptr); } + +////////////////////////////// vld4 /////////////////////// + +inline uint8x8x4_t vld4(const u8 * ptr) { return vld4_u8(ptr); } +inline int8x8x4_t vld4(const s8 * ptr) { return vld4_s8(ptr); } +inline uint16x4x4_t vld4(const u16 * ptr) { return vld4_u16(ptr); } +inline int16x4x4_t vld4(const s16 * ptr) { return vld4_s16(ptr); } +inline uint32x2x4_t vld4(const u32 * ptr) { return vld4_u32(ptr); } +inline int32x2x4_t vld4(const s32 * ptr) { return vld4_s32(ptr); } +inline float32x2x4_t vld4(const f32 * ptr) { return vld4_f32(ptr); } + +////////////////////////////// vst1q /////////////////////// + +inline void vst1q(u8 * ptr, const uint8x16_t & v) { return vst1q_u8(ptr, v); } +inline void vst1q(s8 * ptr, const int8x16_t & v) { return vst1q_s8(ptr, v); } +inline void vst1q(u16 * ptr, const uint16x8_t & v) { return vst1q_u16(ptr, v); } +inline void vst1q(s16 * ptr, const int16x8_t & v) { return vst1q_s16(ptr, v); } +inline void vst1q(u32 * ptr, const uint32x4_t & v) { return vst1q_u32(ptr, v); } +inline void vst1q(s32 * ptr, const int32x4_t & v) { return vst1q_s32(ptr, v); } +inline void vst1q(f32 * ptr, const float32x4_t & v) { return vst1q_f32(ptr, v); } + +////////////////////////////// vst1 /////////////////////// + +inline void vst1(u8 * ptr, const uint8x8_t & v) { return vst1_u8(ptr, v); } +inline void vst1(s8 * ptr, const int8x8_t & v) { return vst1_s8(ptr, v); } +inline void vst1(u16 * ptr, const uint16x4_t & v) { return vst1_u16(ptr, v); } +inline void vst1(s16 * ptr, const int16x4_t & v) { return vst1_s16(ptr, v); } +inline void vst1(u32 * ptr, const uint32x2_t & v) { return vst1_u32(ptr, v); } +inline void vst1(s32 * ptr, const int32x2_t & v) { return vst1_s32(ptr, v); } +inline void vst1(f32 * ptr, const float32x2_t & v) { return vst1_f32(ptr, v); } + +////////////////////////////// vst2q /////////////////////// + +inline void vst2q(u8 * ptr, const uint8x16x2_t & v) { return vst2q_u8(ptr, v); } +inline void vst2q(s8 * ptr, const 
int8x16x2_t & v) { return vst2q_s8(ptr, v); } +inline void vst2q(u16 * ptr, const uint16x8x2_t & v) { return vst2q_u16(ptr, v); } +inline void vst2q(s16 * ptr, const int16x8x2_t & v) { return vst2q_s16(ptr, v); } +inline void vst2q(u32 * ptr, const uint32x4x2_t & v) { return vst2q_u32(ptr, v); } +inline void vst2q(s32 * ptr, const int32x4x2_t & v) { return vst2q_s32(ptr, v); } +inline void vst2q(f32 * ptr, const float32x4x2_t & v) { return vst2q_f32(ptr, v); } + +////////////////////////////// vst2 /////////////////////// + +inline void vst2(u8 * ptr, const uint8x8x2_t & v) { return vst2_u8(ptr, v); } +inline void vst2(s8 * ptr, const int8x8x2_t & v) { return vst2_s8(ptr, v); } +inline void vst2(u16 * ptr, const uint16x4x2_t & v) { return vst2_u16(ptr, v); } +inline void vst2(s16 * ptr, const int16x4x2_t & v) { return vst2_s16(ptr, v); } +inline void vst2(u32 * ptr, const uint32x2x2_t & v) { return vst2_u32(ptr, v); } +inline void vst2(s32 * ptr, const int32x2x2_t & v) { return vst2_s32(ptr, v); } +inline void vst2(f32 * ptr, const float32x2x2_t & v) { return vst2_f32(ptr, v); } + +////////////////////////////// vst3q /////////////////////// + +inline void vst3q(u8 * ptr, const uint8x16x3_t & v) { return vst3q_u8(ptr, v); } +inline void vst3q(s8 * ptr, const int8x16x3_t & v) { return vst3q_s8(ptr, v); } +inline void vst3q(u16 * ptr, const uint16x8x3_t & v) { return vst3q_u16(ptr, v); } +inline void vst3q(s16 * ptr, const int16x8x3_t & v) { return vst3q_s16(ptr, v); } +inline void vst3q(u32 * ptr, const uint32x4x3_t & v) { return vst3q_u32(ptr, v); } +inline void vst3q(s32 * ptr, const int32x4x3_t & v) { return vst3q_s32(ptr, v); } +inline void vst3q(f32 * ptr, const float32x4x3_t & v) { return vst3q_f32(ptr, v); } + +////////////////////////////// vst3 /////////////////////// + +inline void vst3(u8 * ptr, const uint8x8x3_t & v) { return vst3_u8(ptr, v); } +inline void vst3(s8 * ptr, const int8x8x3_t & v) { return vst3_s8(ptr, v); } +inline void vst3(u16 * ptr, const uint16x4x3_t & v) { return vst3_u16(ptr, v); } +inline void vst3(s16 * ptr, const int16x4x3_t & v) { return vst3_s16(ptr, v); } +inline void vst3(u32 * ptr, const uint32x2x3_t & v) { return vst3_u32(ptr, v); } +inline void vst3(s32 * ptr, const int32x2x3_t & v) { return vst3_s32(ptr, v); } +inline void vst3(f32 * ptr, const float32x2x3_t & v) { return vst3_f32(ptr, v); } + +////////////////////////////// vst4q /////////////////////// + +inline void vst4q(u8 * ptr, const uint8x16x4_t & v) { return vst4q_u8(ptr, v); } +inline void vst4q(s8 * ptr, const int8x16x4_t & v) { return vst4q_s8(ptr, v); } +inline void vst4q(u16 * ptr, const uint16x8x4_t & v) { return vst4q_u16(ptr, v); } +inline void vst4q(s16 * ptr, const int16x8x4_t & v) { return vst4q_s16(ptr, v); } +inline void vst4q(u32 * ptr, const uint32x4x4_t & v) { return vst4q_u32(ptr, v); } +inline void vst4q(s32 * ptr, const int32x4x4_t & v) { return vst4q_s32(ptr, v); } +inline void vst4q(f32 * ptr, const float32x4x4_t & v) { return vst4q_f32(ptr, v); } + +////////////////////////////// vst4 /////////////////////// + +inline void vst4(u8 * ptr, const uint8x8x4_t & v) { return vst4_u8(ptr, v); } +inline void vst4(s8 * ptr, const int8x8x4_t & v) { return vst4_s8(ptr, v); } +inline void vst4(u16 * ptr, const uint16x4x4_t & v) { return vst4_u16(ptr, v); } +inline void vst4(s16 * ptr, const int16x4x4_t & v) { return vst4_s16(ptr, v); } +inline void vst4(u32 * ptr, const uint32x2x4_t & v) { return vst4_u32(ptr, v); } +inline void vst4(s32 * ptr, const int32x2x4_t & v) { return 
vst4_s32(ptr, v); } +inline void vst4(f32 * ptr, const float32x2x4_t & v) { return vst4_f32(ptr, v); } + +////////////////////////////// vabdq /////////////////////// + +inline uint8x16_t vabdq(const uint8x16_t & v0, const uint8x16_t & v1) { return vabdq_u8 (v0, v1); } +inline int8x16_t vabdq(const int8x16_t & v0, const int8x16_t & v1) { return vabdq_s8 (v0, v1); } +inline uint16x8_t vabdq(const uint16x8_t & v0, const uint16x8_t & v1) { return vabdq_u16(v0, v1); } +inline int16x8_t vabdq(const int16x8_t & v0, const int16x8_t & v1) { return vabdq_s16(v0, v1); } +inline uint32x4_t vabdq(const uint32x4_t & v0, const uint32x4_t & v1) { return vabdq_u32(v0, v1); } +inline int32x4_t vabdq(const int32x4_t & v0, const int32x4_t & v1) { return vabdq_s32(v0, v1); } +inline float32x4_t vabdq(const float32x4_t & v0, const float32x4_t & v1) { return vabdq_f32(v0, v1); } + +////////////////////////////// vabd /////////////////////// + +inline uint8x8_t vabd(const uint8x8_t & v0, const uint8x8_t & v1) { return vabd_u8 (v0, v1); } +inline int8x8_t vabd(const int8x8_t & v0, const int8x8_t & v1) { return vabd_s8 (v0, v1); } +inline uint16x4_t vabd(const uint16x4_t & v0, const uint16x4_t & v1) { return vabd_u16(v0, v1); } +inline int16x4_t vabd(const int16x4_t & v0, const int16x4_t & v1) { return vabd_s16(v0, v1); } +inline uint32x2_t vabd(const uint32x2_t & v0, const uint32x2_t & v1) { return vabd_u32(v0, v1); } +inline int32x2_t vabd(const int32x2_t & v0, const int32x2_t & v1) { return vabd_s32(v0, v1); } +inline float32x2_t vabd(const float32x2_t & v0, const float32x2_t & v1) { return vabd_f32(v0, v1); } + +////////////////////////////// vminq /////////////////////// + +inline uint8x16_t vminq(const uint8x16_t & v0, const uint8x16_t & v1) { return vminq_u8 (v0, v1); } +inline int8x16_t vminq(const int8x16_t & v0, const int8x16_t & v1) { return vminq_s8 (v0, v1); } +inline uint16x8_t vminq(const uint16x8_t & v0, const uint16x8_t & v1) { return vminq_u16(v0, v1); } +inline int16x8_t vminq(const int16x8_t & v0, const int16x8_t & v1) { return vminq_s16(v0, v1); } +inline uint32x4_t vminq(const uint32x4_t & v0, const uint32x4_t & v1) { return vminq_u32(v0, v1); } +inline int32x4_t vminq(const int32x4_t & v0, const int32x4_t & v1) { return vminq_s32(v0, v1); } +inline float32x4_t vminq(const float32x4_t & v0, const float32x4_t & v1) { return vminq_f32(v0, v1); } + +////////////////////////////// vmin /////////////////////// + +inline uint8x8_t vmin(const uint8x8_t & v0, const uint8x8_t & v1) { return vmin_u8 (v0, v1); } +inline int8x8_t vmin(const int8x8_t & v0, const int8x8_t & v1) { return vmin_s8 (v0, v1); } +inline uint16x4_t vmin(const uint16x4_t & v0, const uint16x4_t & v1) { return vmin_u16(v0, v1); } +inline int16x4_t vmin(const int16x4_t & v0, const int16x4_t & v1) { return vmin_s16(v0, v1); } +inline uint32x2_t vmin(const uint32x2_t & v0, const uint32x2_t & v1) { return vmin_u32(v0, v1); } +inline int32x2_t vmin(const int32x2_t & v0, const int32x2_t & v1) { return vmin_s32(v0, v1); } +inline float32x2_t vmin(const float32x2_t & v0, const float32x2_t & v1) { return vmin_f32(v0, v1); } + +////////////////////////////// vmaxq /////////////////////// + +inline uint8x16_t vmaxq(const uint8x16_t & v0, const uint8x16_t & v1) { return vmaxq_u8 (v0, v1); } +inline int8x16_t vmaxq(const int8x16_t & v0, const int8x16_t & v1) { return vmaxq_s8 (v0, v1); } +inline uint16x8_t vmaxq(const uint16x8_t & v0, const uint16x8_t & v1) { return vmaxq_u16(v0, v1); } +inline int16x8_t vmaxq(const int16x8_t & v0, const 
int16x8_t & v1) { return vmaxq_s16(v0, v1); } +inline uint32x4_t vmaxq(const uint32x4_t & v0, const uint32x4_t & v1) { return vmaxq_u32(v0, v1); } +inline int32x4_t vmaxq(const int32x4_t & v0, const int32x4_t & v1) { return vmaxq_s32(v0, v1); } +inline float32x4_t vmaxq(const float32x4_t & v0, const float32x4_t & v1) { return vmaxq_f32(v0, v1); } + +////////////////////////////// vmax /////////////////////// + +inline uint8x8_t vmax(const uint8x8_t & v0, const uint8x8_t & v1) { return vmax_u8 (v0, v1); } +inline int8x8_t vmax(const int8x8_t & v0, const int8x8_t & v1) { return vmax_s8 (v0, v1); } +inline uint16x4_t vmax(const uint16x4_t & v0, const uint16x4_t & v1) { return vmax_u16(v0, v1); } +inline int16x4_t vmax(const int16x4_t & v0, const int16x4_t & v1) { return vmax_s16(v0, v1); } +inline uint32x2_t vmax(const uint32x2_t & v0, const uint32x2_t & v1) { return vmax_u32(v0, v1); } +inline int32x2_t vmax(const int32x2_t & v0, const int32x2_t & v1) { return vmax_s32(v0, v1); } +inline float32x2_t vmax(const float32x2_t & v0, const float32x2_t & v1) { return vmax_f32(v0, v1); } + +////////////////////////////// vdupq_n /////////////////////// + +inline uint8x16_t vdupq_n(const u8 & val) { return vdupq_n_u8(val); } +inline int8x16_t vdupq_n(const s8 & val) { return vdupq_n_s8(val); } +inline uint16x8_t vdupq_n(const u16 & val) { return vdupq_n_u16(val); } +inline int16x8_t vdupq_n(const s16 & val) { return vdupq_n_s16(val); } +inline uint32x4_t vdupq_n(const u32 & val) { return vdupq_n_u32(val); } +inline int32x4_t vdupq_n(const s32 & val) { return vdupq_n_s32(val); } +inline uint64x2_t vdupq_n(const u64 & val) { return vdupq_n_u64(val); } +inline int64x2_t vdupq_n(const s64 & val) { return vdupq_n_s64(val); } +inline float32x4_t vdupq_n(const f32 & val) { return vdupq_n_f32(val); } + +////////////////////////////// vdup_n /////////////////////// + +inline uint8x8_t vdup_n(const u8 & val) { return vdup_n_u8(val); } +inline int8x8_t vdup_n(const s8 & val) { return vdup_n_s8(val); } +inline uint16x4_t vdup_n(const u16 & val) { return vdup_n_u16(val); } +inline int16x4_t vdup_n(const s16 & val) { return vdup_n_s16(val); } +inline uint32x2_t vdup_n(const u32 & val) { return vdup_n_u32(val); } +inline int32x2_t vdup_n(const s32 & val) { return vdup_n_s32(val); } +inline uint64x1_t vdup_n(const u64 & val) { return vdup_n_u64(val); } +inline int64x1_t vdup_n(const s64 & val) { return vdup_n_s64(val); } +inline float32x2_t vdup_n(const f32 & val) { return vdup_n_f32(val); } + +////////////////////////////// vget_low /////////////////////// + +inline uint8x8_t vget_low(const uint8x16_t & v) { return vget_low_u8 (v); } +inline int8x8_t vget_low(const int8x16_t & v) { return vget_low_s8 (v); } +inline uint16x4_t vget_low(const uint16x8_t & v) { return vget_low_u16(v); } +inline int16x4_t vget_low(const int16x8_t & v) { return vget_low_s16(v); } +inline uint32x2_t vget_low(const uint32x4_t & v) { return vget_low_u32(v); } +inline int32x2_t vget_low(const int32x4_t & v) { return vget_low_s32(v); } +inline float32x2_t vget_low(const float32x4_t & v) { return vget_low_f32(v); } + +////////////////////////////// vget_high /////////////////////// + +inline uint8x8_t vget_high(const uint8x16_t & v) { return vget_high_u8 (v); } +inline int8x8_t vget_high(const int8x16_t & v) { return vget_high_s8 (v); } +inline uint16x4_t vget_high(const uint16x8_t & v) { return vget_high_u16(v); } +inline int16x4_t vget_high(const int16x8_t & v) { return vget_high_s16(v); } +inline uint32x2_t vget_high(const uint32x4_t & v) 
{ return vget_high_u32(v); } +inline int32x2_t vget_high(const int32x4_t & v) { return vget_high_s32(v); } +inline float32x2_t vget_high(const float32x4_t & v) { return vget_high_f32(v); } + +////////////////////////////// vcombine /////////////////////// + +inline uint8x16_t vcombine(const uint8x8_t & v0, const uint8x8_t & v1) { return vcombine_u8 (v0, v1); } +inline int8x16_t vcombine(const int8x8_t & v0, const int8x8_t & v1) { return vcombine_s8 (v0, v1); } +inline uint16x8_t vcombine(const uint16x4_t & v0, const uint16x4_t & v1) { return vcombine_u16(v0, v1); } +inline int16x8_t vcombine(const int16x4_t & v0, const int16x4_t & v1) { return vcombine_s16(v0, v1); } +inline uint32x4_t vcombine(const uint32x2_t & v0, const uint32x2_t & v1) { return vcombine_u32(v0, v1); } +inline int32x4_t vcombine(const int32x2_t & v0, const int32x2_t & v1) { return vcombine_s32(v0, v1); } +inline float32x4_t vcombine(const float32x2_t & v0, const float32x2_t & v1) { return vcombine_f32(v0, v1); } + +////////////////////////////// vaddq /////////////////////// + +inline uint8x16_t vaddq(const uint8x16_t & v0, const uint8x16_t & v1) { return vaddq_u8 (v0, v1); } +inline int8x16_t vaddq(const int8x16_t & v0, const int8x16_t & v1) { return vaddq_s8 (v0, v1); } +inline uint16x8_t vaddq(const uint16x8_t & v0, const uint16x8_t & v1) { return vaddq_u16(v0, v1); } +inline int16x8_t vaddq(const int16x8_t & v0, const int16x8_t & v1) { return vaddq_s16(v0, v1); } +inline uint32x4_t vaddq(const uint32x4_t & v0, const uint32x4_t & v1) { return vaddq_u32(v0, v1); } +inline int32x4_t vaddq(const int32x4_t & v0, const int32x4_t & v1) { return vaddq_s32(v0, v1); } +inline float32x4_t vaddq(const float32x4_t & v0, const float32x4_t & v1) { return vaddq_f32(v0, v1); } + +////////////////////////////// vadd /////////////////////// + +inline uint8x8_t vadd(const uint8x8_t & v0, const uint8x8_t & v1) { return vadd_u8 (v0, v1); } +inline int8x8_t vadd(const int8x8_t & v0, const int8x8_t & v1) { return vadd_s8 (v0, v1); } +inline uint16x4_t vadd(const uint16x4_t & v0, const uint16x4_t & v1) { return vadd_u16(v0, v1); } +inline int16x4_t vadd(const int16x4_t & v0, const int16x4_t & v1) { return vadd_s16(v0, v1); } +inline uint32x2_t vadd(const uint32x2_t & v0, const uint32x2_t & v1) { return vadd_u32(v0, v1); } +inline int32x2_t vadd(const int32x2_t & v0, const int32x2_t & v1) { return vadd_s32(v0, v1); } +inline float32x2_t vadd(const float32x2_t & v0, const float32x2_t & v1) { return vadd_f32(v0, v1); } + +////////////////////////////// vqaddq /////////////////////// + +inline uint8x16_t vqaddq(const uint8x16_t & v0, const uint8x16_t & v1) { return vqaddq_u8 (v0, v1); } +inline int8x16_t vqaddq(const int8x16_t & v0, const int8x16_t & v1) { return vqaddq_s8 (v0, v1); } +inline uint16x8_t vqaddq(const uint16x8_t & v0, const uint16x8_t & v1) { return vqaddq_u16(v0, v1); } +inline int16x8_t vqaddq(const int16x8_t & v0, const int16x8_t & v1) { return vqaddq_s16(v0, v1); } +inline uint32x4_t vqaddq(const uint32x4_t & v0, const uint32x4_t & v1) { return vqaddq_u32(v0, v1); } +inline int32x4_t vqaddq(const int32x4_t & v0, const int32x4_t & v1) { return vqaddq_s32(v0, v1); } + +////////////////////////////// vqadd /////////////////////// + +inline uint8x8_t vqadd(const uint8x8_t & v0, const uint8x8_t & v1) { return vqadd_u8 (v0, v1); } +inline int8x8_t vqadd(const int8x8_t & v0, const int8x8_t & v1) { return vqadd_s8 (v0, v1); } +inline uint16x4_t vqadd(const uint16x4_t & v0, const uint16x4_t & v1) { return vqadd_u16(v0, v1); } +inline 
int16x4_t vqadd(const int16x4_t & v0, const int16x4_t & v1) { return vqadd_s16(v0, v1); } +inline uint32x2_t vqadd(const uint32x2_t & v0, const uint32x2_t & v1) { return vqadd_u32(v0, v1); } +inline int32x2_t vqadd(const int32x2_t & v0, const int32x2_t & v1) { return vqadd_s32(v0, v1); } + +////////////////////////////// vsubq /////////////////////// + +inline uint8x16_t vsubq(const uint8x16_t & v0, const uint8x16_t & v1) { return vsubq_u8 (v0, v1); } +inline int8x16_t vsubq(const int8x16_t & v0, const int8x16_t & v1) { return vsubq_s8 (v0, v1); } +inline uint16x8_t vsubq(const uint16x8_t & v0, const uint16x8_t & v1) { return vsubq_u16(v0, v1); } +inline int16x8_t vsubq(const int16x8_t & v0, const int16x8_t & v1) { return vsubq_s16(v0, v1); } +inline uint32x4_t vsubq(const uint32x4_t & v0, const uint32x4_t & v1) { return vsubq_u32(v0, v1); } +inline int32x4_t vsubq(const int32x4_t & v0, const int32x4_t & v1) { return vsubq_s32(v0, v1); } +inline float32x4_t vsubq(const float32x4_t & v0, const float32x4_t & v1) { return vsubq_f32(v0, v1); } + +////////////////////////////// vsub /////////////////////// + +inline uint8x8_t vsub(const uint8x8_t & v0, const uint8x8_t & v1) { return vsub_u8 (v0, v1); } +inline int8x8_t vsub(const int8x8_t & v0, const int8x8_t & v1) { return vsub_s8 (v0, v1); } +inline uint16x4_t vsub(const uint16x4_t & v0, const uint16x4_t & v1) { return vsub_u16(v0, v1); } +inline int16x4_t vsub(const int16x4_t & v0, const int16x4_t & v1) { return vsub_s16(v0, v1); } +inline uint32x2_t vsub(const uint32x2_t & v0, const uint32x2_t & v1) { return vsub_u32(v0, v1); } +inline int32x2_t vsub(const int32x2_t & v0, const int32x2_t & v1) { return vsub_s32(v0, v1); } +inline float32x2_t vsub(const float32x2_t & v0, const float32x2_t & v1) { return vsub_f32(v0, v1); } + +////////////////////////////// vqsubq /////////////////////// + +inline uint8x16_t vqsubq(const uint8x16_t & v0, const uint8x16_t & v1) { return vqsubq_u8 (v0, v1); } +inline int8x16_t vqsubq(const int8x16_t & v0, const int8x16_t & v1) { return vqsubq_s8 (v0, v1); } +inline uint16x8_t vqsubq(const uint16x8_t & v0, const uint16x8_t & v1) { return vqsubq_u16(v0, v1); } +inline int16x8_t vqsubq(const int16x8_t & v0, const int16x8_t & v1) { return vqsubq_s16(v0, v1); } +inline uint32x4_t vqsubq(const uint32x4_t & v0, const uint32x4_t & v1) { return vqsubq_u32(v0, v1); } +inline int32x4_t vqsubq(const int32x4_t & v0, const int32x4_t & v1) { return vqsubq_s32(v0, v1); } +inline uint64x2_t vqsubq(const uint64x2_t & v0, const uint64x2_t & v1) { return vqsubq_u64(v0, v1); } +inline int64x2_t vqsubq(const int64x2_t & v0, const int64x2_t & v1) { return vqsubq_s64(v0, v1); } + +////////////////////////////// vqsub /////////////////////// + +inline uint8x8_t vqsub(const uint8x8_t & v0, const uint8x8_t & v1) { return vqsub_u8 (v0, v1); } +inline int8x8_t vqsub(const int8x8_t & v0, const int8x8_t & v1) { return vqsub_s8 (v0, v1); } +inline uint16x4_t vqsub(const uint16x4_t & v0, const uint16x4_t & v1) { return vqsub_u16(v0, v1); } +inline int16x4_t vqsub(const int16x4_t & v0, const int16x4_t & v1) { return vqsub_s16(v0, v1); } +inline uint32x2_t vqsub(const uint32x2_t & v0, const uint32x2_t & v1) { return vqsub_u32(v0, v1); } +inline int32x2_t vqsub(const int32x2_t & v0, const int32x2_t & v1) { return vqsub_s32(v0, v1); } +inline uint64x1_t vqsub(const uint64x1_t & v0, const uint64x1_t & v1) { return vqsub_u64(v0, v1); } +inline int64x1_t vqsub(const int64x1_t & v0, const int64x1_t & v1) { return vqsub_s64(v0, v1); } + 
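+// Design note: the wrappers above and below are plain overload sets over the NEON
+// intrinsics. Type-generic transform code can call vld1q/vst1q/vqsub/vmull/...
+// uniformly and let C++ overload resolution pick the intrinsic matching the
+// element type, instead of dispatching on the type by hand.
+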
+////////////////////////////// vmull /////////////////////// + +inline uint16x8_t vmull(const uint8x8_t & v0, const uint8x8_t & v1) { return vmull_u8 (v0, v1); } +inline int16x8_t vmull(const int8x8_t & v0, const int8x8_t & v1) { return vmull_s8 (v0, v1); } +inline uint32x4_t vmull(const uint16x4_t & v0, const uint16x4_t & v1) { return vmull_u16(v0, v1); } +inline int32x4_t vmull(const int16x4_t & v0, const int16x4_t & v1) { return vmull_s16(v0, v1); } +inline uint64x2_t vmull(const uint32x2_t & v0, const uint32x2_t & v1) { return vmull_u32(v0, v1); } +inline int64x2_t vmull(const int32x2_t & v0, const int32x2_t & v1) { return vmull_s32(v0, v1); } + +////////////////////////////// vrev64q /////////////////////// + +inline uint8x16_t vrev64q(const uint8x16_t & v) { return vrev64q_u8 (v); } +inline int8x16_t vrev64q(const int8x16_t & v) { return vrev64q_s8 (v); } +inline uint16x8_t vrev64q(const uint16x8_t & v) { return vrev64q_u16(v); } +inline int16x8_t vrev64q(const int16x8_t & v) { return vrev64q_s16(v); } +inline uint32x4_t vrev64q(const uint32x4_t & v) { return vrev64q_u32(v); } +inline int32x4_t vrev64q(const int32x4_t & v) { return vrev64q_s32(v); } +inline float32x4_t vrev64q(const float32x4_t & v) { return vrev64q_f32(v); } + +////////////////////////////// vrev64 /////////////////////// + +inline uint8x8_t vrev64(const uint8x8_t & v) { return vrev64_u8 (v); } +inline int8x8_t vrev64(const int8x8_t & v) { return vrev64_s8 (v); } +inline uint16x4_t vrev64(const uint16x4_t & v) { return vrev64_u16(v); } +inline int16x4_t vrev64(const int16x4_t & v) { return vrev64_s16(v); } +inline uint32x2_t vrev64(const uint32x2_t & v) { return vrev64_u32(v); } +inline int32x2_t vrev64(const int32x2_t & v) { return vrev64_s32(v); } +inline float32x2_t vrev64(const float32x2_t & v) { return vrev64_f32(v); } + +////////////////////////////// vceqq /////////////////////// + +inline uint8x16_t vceqq(const uint8x16_t & v0, const uint8x16_t & v1) { return vceqq_u8 (v0, v1); } +inline uint8x16_t vceqq(const int8x16_t & v0, const int8x16_t & v1) { return vceqq_s8 (v0, v1); } +inline uint16x8_t vceqq(const uint16x8_t & v0, const uint16x8_t & v1) { return vceqq_u16(v0, v1); } +inline uint16x8_t vceqq(const int16x8_t & v0, const int16x8_t & v1) { return vceqq_s16(v0, v1); } +inline uint32x4_t vceqq(const uint32x4_t & v0, const uint32x4_t & v1) { return vceqq_u32(v0, v1); } +inline uint32x4_t vceqq(const int32x4_t & v0, const int32x4_t & v1) { return vceqq_s32(v0, v1); } +inline uint32x4_t vceqq(const float32x4_t & v0, const float32x4_t & v1) { return vceqq_f32(v0, v1); } + +////////////////////////////// vceq /////////////////////// + +inline uint8x8_t vceq(const uint8x8_t & v0, const uint8x8_t & v1) { return vceq_u8 (v0, v1); } +inline uint8x8_t vceq(const int8x8_t & v0, const int8x8_t & v1) { return vceq_s8 (v0, v1); } +inline uint16x4_t vceq(const uint16x4_t & v0, const uint16x4_t & v1) { return vceq_u16(v0, v1); } +inline uint16x4_t vceq(const int16x4_t & v0, const int16x4_t & v1) { return vceq_s16(v0, v1); } +inline uint32x2_t vceq(const uint32x2_t & v0, const uint32x2_t & v1) { return vceq_u32(v0, v1); } +inline uint32x2_t vceq(const int32x2_t & v0, const int32x2_t & v1) { return vceq_s32(v0, v1); } +inline uint32x2_t vceq(const float32x2_t & v0, const float32x2_t & v1) { return vceq_f32(v0, v1); } + +////////////////////////////// vcgtq /////////////////////// + +inline uint8x16_t vcgtq(const uint8x16_t & v0, const uint8x16_t & v1) { return vcgtq_u8 (v0, v1); } +inline uint8x16_t vcgtq(const int8x16_t 
& v0, const int8x16_t & v1) { return vcgtq_s8 (v0, v1); } +inline uint16x8_t vcgtq(const uint16x8_t & v0, const uint16x8_t & v1) { return vcgtq_u16(v0, v1); } +inline uint16x8_t vcgtq(const int16x8_t & v0, const int16x8_t & v1) { return vcgtq_s16(v0, v1); } +inline uint32x4_t vcgtq(const uint32x4_t & v0, const uint32x4_t & v1) { return vcgtq_u32(v0, v1); } +inline uint32x4_t vcgtq(const int32x4_t & v0, const int32x4_t & v1) { return vcgtq_s32(v0, v1); } +inline uint32x4_t vcgtq(const float32x4_t & v0, const float32x4_t & v1) { return vcgtq_f32(v0, v1); } + +////////////////////////////// vcgt /////////////////////// + +inline uint8x8_t vcgt(const uint8x8_t & v0, const uint8x8_t & v1) { return vcgt_u8 (v0, v1); } +inline uint8x8_t vcgt(const int8x8_t & v0, const int8x8_t & v1) { return vcgt_s8 (v0, v1); } +inline uint16x4_t vcgt(const uint16x4_t & v0, const uint16x4_t & v1) { return vcgt_u16(v0, v1); } +inline uint16x4_t vcgt(const int16x4_t & v0, const int16x4_t & v1) { return vcgt_s16(v0, v1); } +inline uint32x2_t vcgt(const uint32x2_t & v0, const uint32x2_t & v1) { return vcgt_u32(v0, v1); } +inline uint32x2_t vcgt(const int32x2_t & v0, const int32x2_t & v1) { return vcgt_s32(v0, v1); } +inline uint32x2_t vcgt(const float32x2_t & v0, const float32x2_t & v1) { return vcgt_f32(v0, v1); } + +////////////////////////////// vcgeq /////////////////////// + +inline uint8x16_t vcgeq(const uint8x16_t & v0, const uint8x16_t & v1) { return vcgeq_u8 (v0, v1); } +inline uint8x16_t vcgeq(const int8x16_t & v0, const int8x16_t & v1) { return vcgeq_s8 (v0, v1); } +inline uint16x8_t vcgeq(const uint16x8_t & v0, const uint16x8_t & v1) { return vcgeq_u16(v0, v1); } +inline uint16x8_t vcgeq(const int16x8_t & v0, const int16x8_t & v1) { return vcgeq_s16(v0, v1); } +inline uint32x4_t vcgeq(const uint32x4_t & v0, const uint32x4_t & v1) { return vcgeq_u32(v0, v1); } +inline uint32x4_t vcgeq(const int32x4_t & v0, const int32x4_t & v1) { return vcgeq_s32(v0, v1); } +inline uint32x4_t vcgeq(const float32x4_t & v0, const float32x4_t & v1) { return vcgeq_f32(v0, v1); } + +////////////////////////////// vcge /////////////////////// + +inline uint8x8_t vcge(const uint8x8_t & v0, const uint8x8_t & v1) { return vcge_u8 (v0, v1); } +inline uint8x8_t vcge(const int8x8_t & v0, const int8x8_t & v1) { return vcge_s8 (v0, v1); } +inline uint16x4_t vcge(const uint16x4_t & v0, const uint16x4_t & v1) { return vcge_u16(v0, v1); } +inline uint16x4_t vcge(const int16x4_t & v0, const int16x4_t & v1) { return vcge_s16(v0, v1); } +inline uint32x2_t vcge(const uint32x2_t & v0, const uint32x2_t & v1) { return vcge_u32(v0, v1); } +inline uint32x2_t vcge(const int32x2_t & v0, const int32x2_t & v1) { return vcge_s32(v0, v1); } +inline uint32x2_t vcge(const float32x2_t & v0, const float32x2_t & v1) { return vcge_f32(v0, v1); } + +////////////////////////////// vandq /////////////////////// + +inline uint8x16_t vandq(const uint8x16_t & v0, const uint8x16_t & v1) { return vandq_u8 (v0, v1); } +inline int8x16_t vandq(const int8x16_t & v0, const int8x16_t & v1) { return vandq_s8 (v0, v1); } +inline uint16x8_t vandq(const uint16x8_t & v0, const uint16x8_t & v1) { return vandq_u16(v0, v1); } +inline int16x8_t vandq(const int16x8_t & v0, const int16x8_t & v1) { return vandq_s16(v0, v1); } +inline uint32x4_t vandq(const uint32x4_t & v0, const uint32x4_t & v1) { return vandq_u32(v0, v1); } +inline int32x4_t vandq(const int32x4_t & v0, const int32x4_t & v1) { return vandq_s32(v0, v1); } + +////////////////////////////// vand /////////////////////// + 
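Each of these families follows the same idiom: one overloaded name per operation, dispatching to the width- and signedness-suffixed NEON intrinsic, so that templated kernels never have to spell out the _u8/_s16/... variants themselves. The q-register vandq overloads above and the d-register vand overloads just below, for example, are exactly what a generic element-wise functor needs in order to feed the vtransform driver defined at the end of this header. A minimal sketch of such a functor (illustrative only, not part of the patch; it assumes the VecTraits type-traits helper that vtransform itself relies on):

    // Hypothetical bitwise-AND functor for vtransform: the overloaded
    // vandq/vand wrappers select the correct intrinsic for any integer T.
    template <typename T>
    struct OpAnd
    {
        typedef T type;

        // 128-bit path: consumed by vtransform's 32-byte main loop
        void operator()(const typename VecTraits<T>::vec128 & v_src0,
                        const typename VecTraits<T>::vec128 & v_src1,
                        typename VecTraits<T>::vec128 & v_dst) const
        { v_dst = vandq(v_src0, v_src1); }

        // 64-bit path: consumed by the 8-byte tail loop
        void operator()(const typename VecTraits<T>::vec64 & v_src0,
                        const typename VecTraits<T>::vec64 & v_src1,
                        typename VecTraits<T>::vec64 & v_dst) const
        { v_dst = vand(v_src0, v_src1); }

        // scalar path: consumed for the final leftover elements
        void operator()(const T * src0, const T * src1, T * dst) const
        { *dst = (T)(*src0 & *src1); }
    };

Instantiated as, say, OpAnd<u8> and passed as the op argument of vtransform, the three overloads serve the main vector loop, the tail loop, and the scalar remainder respectively, while overload resolution on vandq/vand keeps the functor independent of the concrete element type.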
+inline uint8x8_t vand(const uint8x8_t & v0, const uint8x8_t & v1) { return vand_u8 (v0, v1); } +inline int8x8_t vand(const int8x8_t & v0, const int8x8_t & v1) { return vand_s8 (v0, v1); } +inline uint16x4_t vand(const uint16x4_t & v0, const uint16x4_t & v1) { return vand_u16(v0, v1); } +inline int16x4_t vand(const int16x4_t & v0, const int16x4_t & v1) { return vand_s16(v0, v1); } +inline uint32x2_t vand(const uint32x2_t & v0, const uint32x2_t & v1) { return vand_u32(v0, v1); } +inline int32x2_t vand(const int32x2_t & v0, const int32x2_t & v1) { return vand_s32(v0, v1); } + +////////////////////////////// vmovn /////////////////////// + +inline uint8x8_t vmovn(const uint16x8_t & v) { return vmovn_u16(v); } +inline int8x8_t vmovn(const int16x8_t & v) { return vmovn_s16(v); } +inline uint16x4_t vmovn(const uint32x4_t & v) { return vmovn_u32(v); } +inline int16x4_t vmovn(const int32x4_t & v) { return vmovn_s32(v); } +inline uint32x2_t vmovn(const uint64x2_t & v) { return vmovn_u64(v); } +inline int32x2_t vmovn(const int64x2_t & v) { return vmovn_s64(v); } + +////////////////////////////// vqmovn /////////////////////// + +inline uint8x8_t vqmovn(const uint16x8_t & v) { return vqmovn_u16(v); } +inline int8x8_t vqmovn(const int16x8_t & v) { return vqmovn_s16(v); } +inline uint16x4_t vqmovn(const uint32x4_t & v) { return vqmovn_u32(v); } +inline int16x4_t vqmovn(const int32x4_t & v) { return vqmovn_s32(v); } +inline uint32x2_t vqmovn(const uint64x2_t & v) { return vqmovn_u64(v); } +inline int32x2_t vqmovn(const int64x2_t & v) { return vqmovn_s64(v); } + +////////////////////////////// vmovl /////////////////////// + +inline uint16x8_t vmovl(const uint8x8_t & v) { return vmovl_u8(v); } +inline int16x8_t vmovl(const int8x8_t & v) { return vmovl_s8(v); } +inline uint32x4_t vmovl(const uint16x4_t & v) { return vmovl_u16(v); } +inline int32x4_t vmovl(const int16x4_t & v) { return vmovl_s16(v); } + +////////////////////////////// vmvnq /////////////////////// + +inline uint8x16_t vmvnq(const uint8x16_t & v) { return vmvnq_u8 (v); } +inline int8x16_t vmvnq(const int8x16_t & v) { return vmvnq_s8 (v); } +inline uint16x8_t vmvnq(const uint16x8_t & v) { return vmvnq_u16(v); } +inline int16x8_t vmvnq(const int16x8_t & v) { return vmvnq_s16(v); } +inline uint32x4_t vmvnq(const uint32x4_t & v) { return vmvnq_u32(v); } +inline int32x4_t vmvnq(const int32x4_t & v) { return vmvnq_s32(v); } + +////////////////////////////// vmvn /////////////////////// + +inline uint8x8_t vmvn(const uint8x8_t & v) { return vmvn_u8 (v); } +inline int8x8_t vmvn(const int8x8_t & v) { return vmvn_s8 (v); } +inline uint16x4_t vmvn(const uint16x4_t & v) { return vmvn_u16(v); } +inline int16x4_t vmvn(const int16x4_t & v) { return vmvn_s16(v); } +inline uint32x2_t vmvn(const uint32x2_t & v) { return vmvn_u32(v); } +inline int32x2_t vmvn(const int32x2_t & v) { return vmvn_s32(v); } + +////////////////////////////// vbicq /////////////////////// + +inline uint8x16_t vbicq(const uint8x16_t & v0, const uint8x16_t & v1) { return vbicq_u8 (v0, v1); } +inline int8x16_t vbicq(const int8x16_t & v0, const int8x16_t & v1) { return vbicq_s8 (v0, v1); } +inline uint16x8_t vbicq(const uint16x8_t & v0, const uint16x8_t & v1) { return vbicq_u16(v0, v1); } +inline int16x8_t vbicq(const int16x8_t & v0, const int16x8_t & v1) { return vbicq_s16(v0, v1); } +inline uint32x4_t vbicq(const uint32x4_t & v0, const uint32x4_t & v1) { return vbicq_u32(v0, v1); } +inline int32x4_t vbicq(const int32x4_t & v0, const int32x4_t & v1) { return vbicq_s32(v0, v1); } 
+inline uint64x2_t vbicq(const uint64x2_t & v0, const uint64x2_t & v1) { return vbicq_u64(v0, v1); }
+inline  int64x2_t vbicq(const  int64x2_t & v0, const  int64x2_t & v1) { return vbicq_s64(v0, v1); }
+
+////////////////////////////// vbic ///////////////////////
+
+inline uint8x8_t  vbic(const uint8x8_t  & v0, const uint8x8_t  & v1) { return vbic_u8 (v0, v1); }
+inline  int8x8_t  vbic(const  int8x8_t  & v0, const  int8x8_t  & v1) { return vbic_s8 (v0, v1); }
+inline uint16x4_t vbic(const uint16x4_t & v0, const uint16x4_t & v1) { return vbic_u16(v0, v1); }
+inline  int16x4_t vbic(const  int16x4_t & v0, const  int16x4_t & v1) { return vbic_s16(v0, v1); }
+inline uint32x2_t vbic(const uint32x2_t & v0, const uint32x2_t & v1) { return vbic_u32(v0, v1); }
+inline  int32x2_t vbic(const  int32x2_t & v0, const  int32x2_t & v1) { return vbic_s32(v0, v1); }
+inline uint64x1_t vbic(const uint64x1_t & v0, const uint64x1_t & v1) { return vbic_u64(v0, v1); }
+inline  int64x1_t vbic(const  int64x1_t & v0, const  int64x1_t & v1) { return vbic_s64(v0, v1); }
+
+////////////////////////////// vtransform ///////////////////////
+
+template <typename Op>
+void vtransform(Size2D size,
+                const typename Op::type * src0Base, ptrdiff_t src0Stride,
+                const typename Op::type * src1Base, ptrdiff_t src1Stride,
+                typename Op::type * dstBase, ptrdiff_t dstStride, const Op & op)
+{
+    typedef typename Op::type type;
+    typedef typename VecTraits<type>::vec128 vec128;
+    typedef typename VecTraits<type>::vec64 vec64;
+
+    if (src0Stride == src1Stride && src0Stride == dstStride &&
+        src0Stride == (ptrdiff_t)(size.width * sizeof(type)))
+    {
+        size.width *= size.height;
+        size.height = 1;
+    }
+
+    const size_t step_base = 32 / sizeof(type);
+    size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
+    const size_t step_tail = 8 / sizeof(type);
+    size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;
+
+    for (size_t y = 0; y < size.height; ++y)
+    {
+        const type * src0 = internal::getRowPtr(src0Base, src0Stride, y);
+        const type * src1 = internal::getRowPtr(src1Base, src1Stride, y);
+        typename Op::type * dst = internal::getRowPtr(dstBase, dstStride, y);
+        size_t x = 0;
+
+        for( ; x < roiw_base; x += step_base )
+        {
+            internal::prefetch(src0 + x);
+            internal::prefetch(src1 + x);
+
+            vec128 v_src00 = vld1q(src0 + x), v_src01 = vld1q(src0 + x + 16 / sizeof(type));
+            vec128 v_src10 = vld1q(src1 + x), v_src11 = vld1q(src1 + x + 16 / sizeof(type));
+            vec128 v_dst;
+
+            op(v_src00, v_src10, v_dst);
+            vst1q(dst + x, v_dst);
+
+            op(v_src01, v_src11, v_dst);
+            vst1q(dst + x + 16 / sizeof(type), v_dst);
+        }
+        for( ; x < roiw_tail; x += step_tail )
+        {
+            vec64 v_src0 = vld1(src0 + x);
+            vec64 v_src1 = vld1(src1 + x);
+            vec64 v_dst;
+
+            op(v_src0, v_src1, v_dst);
+            vst1(dst + x, v_dst);
+        }
+
+        for (; x < size.width; ++x)
+        {
+            op(src0 + x, src1 + x, dst + x);
+        }
+    }
+}
+
+} }
+
+#endif // CAROTENE_NEON
+
+#endif
diff --git a/3rdparty/carotene/src/warp_affine.cpp b/3rdparty/carotene/src/warp_affine.cpp
new file mode 100644
index 0000000000..d546efbc10
--- /dev/null
+++ b/3rdparty/carotene/src/warp_affine.cpp
@@ -0,0 +1,434 @@
+/*
+ * By downloading, copying, installing or using the software you agree to this license.
+ * If you do not agree to this license, do not download, install,
+ * copy or use the software.
+ *
+ *
+ *                           License Agreement
+ *                For Open Source Computer Vision Library
+ *                        (3-clause BSD License)
+ *
+ * Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
+ * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + +#include "remap.hpp" + +namespace CAROTENE_NS { + +bool isWarpAffineNearestNeighborSupported(const Size2D &ssize) +{ +#if SIZE_MAX > UINT32_MAX + return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation + // is performed with u32 + isSupportedConfiguration(); +#else + (void)ssize; + return isSupportedConfiguration(); +#endif +} + +bool isWarpAffineLinearSupported(const Size2D &ssize) +{ +#if SIZE_MAX > UINT32_MAX + return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation + // is performed with u32 + isSupportedConfiguration(); +#else + (void)ssize; + return isSupportedConfiguration(); +#endif +} + +void warpAffineNearestNeighbor(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * m, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue) +{ + internal::assertSupportedConfiguration(isWarpAffineNearestNeighborSupported(ssize)); +#ifdef CAROTENE_NEON + using namespace internal; + + s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16]; + s32 * map = alignPtr(_map, 16); + + int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1); + int32x4_t v_step4 = vdupq_n_s32(srcStride); + float32x4_t v_4 = vdupq_n_f32(4.0f); + + float32x4_t v_m0 = vdupq_n_f32(m[0]); + float32x4_t v_m1 = vdupq_n_f32(m[1]); + float32x4_t v_m2 = vdupq_n_f32(m[2]); + float32x4_t v_m3 = vdupq_n_f32(m[3]); + float32x4_t v_m4 = vdupq_n_f32(m[4]); + float32x4_t v_m5 = vdupq_n_f32(m[5]); + + if (borderMode == BORDER_MODE_REPLICATE) + { + int32x4_t v_zero4 = vdupq_n_s32(0); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t 
y = 0; y < blockHeight; ++y) + { + s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y); + + size_t x = 0, y_ = y + i; + f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_); + float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + + int32x4_t v_src_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_src_xf))); + int32x4_t v_src_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_src_yf))); + int32x4_t v_src_index = vmlaq_s32(v_src_x, v_src_y, v_step4); + vst1q_s32(map_row + x, v_src_index); + + v_x = vaddq_f32(v_x, v_4); + } + + f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5]; + for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_) + { + f32 src_x_f = m[0] * x_ + yx; + f32 src_y_f = m[1] * x_ + yy; + s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f); + + src_x = std::max(0, std::min(ssize.width - 1, src_x)); + src_y = std::max(0, std::min(ssize.height - 1, src_y)); + map_row[x] = src_y * srcStride + src_x; + } + } + + // make remap + remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride); + } + } + } + else if (borderMode == BORDER_MODE_CONSTANT) + { + int32x4_t v_m1_4 = vdupq_n_s32(-1); + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y); + + size_t x = 0, y_ = y + i; + f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_); + float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + + int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf); + uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x, v_width4)), + vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y, v_height4))); + int32x4_t v_src_index = vbslq_s32(v_mask, vmlaq_s32(v_src_x, v_src_y, v_step4), v_m1_4); + vst1q_s32(map_row + x, v_src_index); + + v_x = vaddq_f32(v_x, v_4); + } + + f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5]; + for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_) + { + f32 src_x_f = m[0] * x_ + yx; + f32 src_y_f = m[1] * x_ + yy; + s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f); + + map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) && + (src_y >= 0) && (src_y < (s32)ssize.height) ? 
src_y * srcStride + src_x : -1; + } + } + + // make remap + remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue); + } + } + } +#else + (void)ssize; + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)m; + (void)dstBase; + (void)dstStride; + (void)borderMode; + (void)borderValue; +#endif +} + +void warpAffineLinear(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * m, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue) +{ + internal::assertSupportedConfiguration(isWarpAffineLinearSupported(ssize)); +#ifdef CAROTENE_NEON + using namespace internal; + + s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16]; + f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16]; + s32 * map = alignPtr(_map, 16); + f32 * coeffs = alignPtr(_coeffs, 16); + + int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1); + int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1); + float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f); + + float32x4_t v_m0 = vdupq_n_f32(m[0]); + float32x4_t v_m1 = vdupq_n_f32(m[1]); + float32x4_t v_m2 = vdupq_n_f32(m[2]); + float32x4_t v_m3 = vdupq_n_f32(m[3]); + float32x4_t v_m4 = vdupq_n_f32(m[4]); + float32x4_t v_m5 = vdupq_n_f32(m[5]); + + if (borderMode == BORDER_MODE_REPLICATE) + { + int32x4_t v_zero4 = vdupq_n_s32(0); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y); + f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y); + + size_t x = 0, y_ = y + i; + f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_), v_4 = vdupq_n_f32(4.0f); + float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + + int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf); + + float32x4x2_t v_coeff; + v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x)); + v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y)); + uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f); + uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f); + v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]); + v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]); + v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x); + v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y); + + int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x)); + int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y)); + int32x4_t v_dst1_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vaddq_s32(v_1, v_src_x))); + int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y))); + + int32x4x4_t v_dst_index; + v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4); + v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4); + v_dst_index.val[2] = vmlaq_s32(v_dst0_x, 
v_dst1_y, v_step4); + v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4); + + vst2q_f32(coeff_row + (x << 1), v_coeff); + vst4q_s32(map_row + (x << 2), v_dst_index); + + v_x = vaddq_f32(v_x, v_4); + } + + f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5]; + for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_) + { + f32 src_x_f = m[0] * x_ + yx; + f32 src_y_f = m[1] * x_ + yy; + + s32 src0_x = (s32)floorf(src_x_f); + s32 src0_y = (s32)floorf(src_y_f); + + coeff_row[(x << 1) + 0] = src_x_f - src0_x; + coeff_row[(x << 1) + 1] = src_y_f - src0_y; + + s32 src1_y = std::max(0, std::min(ssize.height - 1, src0_y + 1)); + src0_y = std::max(0, std::min(ssize.height - 1, src0_y)); + s32 src1_x = std::max(0, std::min(ssize.width - 1, src0_x + 1)); + src0_x = std::max(0, std::min(ssize.width - 1, src0_x)); + + map_row[(x << 2) + 0] = src0_y * srcStride + src0_x; + map_row[(x << 2) + 1] = src0_y * srcStride + src1_x; + map_row[(x << 2) + 2] = src1_y * srcStride + src0_x; + map_row[(x << 2) + 3] = src1_y * srcStride + src1_x; + } + } + + remapLinearReplicate(Size2D(blockWidth, blockHeight), + srcBase, &map[0], &coeffs[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride); + } + } + } + else if (borderMode == BORDER_MODE_CONSTANT) + { + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + int32x4_t v_m1_4 = vdupq_n_s32(-1); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y); + f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y); + + size_t x = 0, y_ = y + i; + f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_), v_4 = vdupq_n_f32(4.0f); + float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + + int32x4_t v_src_x0 = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y0 = vcvtq_s32_f32(v_src_yf); + + float32x4x2_t v_coeff; + v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x0)); + v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y0)); + uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f); + uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f); + v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]); + v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]); + v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0); + v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0); + + int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1); + int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1); + + int32x4x4_t v_dst_index; + v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4); + v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4); + v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4); + v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4); + + uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x0, v_width4)); + uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_xf, v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4)); + uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_src_yf, v_zero4), 
vcleq_s32(v_src_y0, v_height4)); + uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_yf, v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4)); + + v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4); + v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4); + v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4); + v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4); + + vst2q_f32(coeff_row + (x << 1), v_coeff); + vst4q_s32(map_row + (x << 2), v_dst_index); + + v_x = vaddq_f32(v_x, v_4); + } + + f32 yx = m[2] * y_ + m[4], yy = m[3] * y_ + m[5]; + for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_) + { + f32 src_x_f = m[0] * x_ + yx; + f32 src_y_f = m[1] * x_ + yy; + + s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1; + s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1; + + coeff_row[(x << 1) + 0] = src_x_f - src0_x; + coeff_row[(x << 1) + 1] = src_y_f - src0_y; + + map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) && + (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1; + map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) && + (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1; + map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) && + (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1; + map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) && + (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src1_x : -1; + } + } + + remapLinearConst(Size2D(blockWidth, blockHeight), + srcBase, &map[0], &coeffs[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue); + } + } + } +#else + (void)ssize; + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)m; + (void)dstBase; + (void)dstStride; + (void)borderMode; + (void)borderValue; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/carotene/src/warp_perspective.cpp b/3rdparty/carotene/src/warp_perspective.cpp new file mode 100644 index 0000000000..4437661413 --- /dev/null +++ b/3rdparty/carotene/src/warp_perspective.cpp @@ -0,0 +1,464 @@ +/* + * By downloading, copying, installing or using the software you agree to this license. + * If you do not agree to this license, do not download, install, + * copy or use the software. + * + * + * License Agreement + * For Open Source Computer Vision Library + * (3-clause BSD License) + * + * Copyright (C) 2015, NVIDIA Corporation, all rights reserved. + * Third party copyrights are property of their respective owners. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the names of the copyright holders nor the names of the contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * This software is provided by the copyright holders and contributors "as is" and + * any express or implied warranties, including, but not limited to, the implied + * warranties of merchantability and fitness for a particular purpose are disclaimed. + * In no event shall copyright holders or contributors be liable for any direct, + * indirect, incidental, special, exemplary, or consequential damages + * (including, but not limited to, procurement of substitute goods or services; + * loss of use, data, or profits; or business interruption) however caused + * and on any theory of liability, whether in contract, strict liability, + * or tort (including negligence or otherwise) arising in any way out of + * the use of this software, even if advised of the possibility of such damage. + */ + + + +#include "remap.hpp" + +namespace CAROTENE_NS { + +bool isWarpPerspectiveNearestNeighborSupported(const Size2D &ssize) +{ +#if SIZE_MAX > UINT32_MAX + return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation + // is performed with u32 + isSupportedConfiguration(); +#else + (void)ssize; + return isSupportedConfiguration(); +#endif +} + +bool isWarpPerspectiveLinearSupported(const Size2D &ssize) +{ +#if SIZE_MAX > UINT32_MAX + return !(ssize.width > 0xffffFFFF || ssize.height > 0xffffFFFF) && // Restrict image size since internal index evaluation + // is performed with u32 + isSupportedConfiguration(); +#else + (void)ssize; + return isSupportedConfiguration(); +#endif +} + +void warpPerspectiveNearestNeighbor(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * m, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue) +{ + internal::assertSupportedConfiguration(isWarpPerspectiveNearestNeighborSupported(ssize)); +#ifdef CAROTENE_NEON + using namespace internal; + + s32 _map[BLOCK_SIZE * BLOCK_SIZE + 16]; + s32 * map = alignPtr(_map, 16); + + int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1); + int32x4_t v_step4 = vdupq_n_s32(srcStride); + float32x4_t v_4 = vdupq_n_f32(4.0f); + + float32x4_t v_m0 = vdupq_n_f32(m[0]); + float32x4_t v_m1 = vdupq_n_f32(m[1]); + float32x4_t v_m2 = vdupq_n_f32(m[2]); + float32x4_t v_m3 = vdupq_n_f32(m[3]); + float32x4_t v_m4 = vdupq_n_f32(m[4]); + float32x4_t v_m5 = vdupq_n_f32(m[5]); + float32x4_t v_m6 = vdupq_n_f32(m[6]); + float32x4_t v_m7 = vdupq_n_f32(m[7]); + float32x4_t v_m8 = vdupq_n_f32(m[8]); + + if (borderMode == BORDER_MODE_REPLICATE) + { + int32x4_t v_zero4 = vdupq_n_s32(0); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y); + + size_t x = 0, y_ = y + i; + f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_); + float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y), + v_yw = vmlaq_f32(v_m8, v_m5, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x)); + v_src_xf = vmulq_f32(v_wf, v_src_xf); + v_src_yf = 
vmulq_f32(v_wf, v_src_yf); + + int32x4_t v_src_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, vcvtq_s32_f32(v_src_xf))); + int32x4_t v_src_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vcvtq_s32_f32(v_src_yf))); + int32x4_t v_src_index = vmlaq_s32(v_src_x, v_src_y, v_step4); + vst1q_s32(map_row + x, v_src_index); + + v_x = vaddq_f32(v_x, v_4); + } + + f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8]; + for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_) + { + f32 w_f = 1.0f / (m[2] * x_ + yw); + f32 src_x_f = (m[0] * x_ + yx) * w_f; + f32 src_y_f = (m[1] * x_ + yy) * w_f; + s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f); + + src_x = std::max(0, std::min(ssize.width - 1, src_x)); + src_y = std::max(0, std::min(ssize.height - 1, src_y)); + map_row[x] = src_y * srcStride + src_x; + } + } + + // make remap + remapNearestNeighborReplicate(Size2D(blockWidth, blockHeight), srcBase, &map[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride); + } + } + } + else if (borderMode == BORDER_MODE_CONSTANT) + { + int32x4_t v_m1_4 = vdupq_n_s32(-1); + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + s32 * map_row = getRowPtr(&map[0], blockWidth * sizeof(s32), y); + + size_t x = 0, y_ = y + i; + f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_); + float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y), + v_yw = vmlaq_f32(v_m8, v_m5, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x)); + v_src_xf = vmulq_f32(v_wf, v_src_xf); + v_src_yf = vmulq_f32(v_wf, v_src_yf); + + int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf); + uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x, v_width4)), + vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y, v_height4))); + int32x4_t v_src_index = vbslq_s32(v_mask, vmlaq_s32(v_src_x, v_src_y, v_step4), v_m1_4); + vst1q_s32(map_row + x, v_src_index); + + v_x = vaddq_f32(v_x, v_4); + } + + f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8]; + for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_) + { + f32 w_f = 1.0f / (m[2] * x_ + yw); + f32 src_x_f = (m[0] * x_ + yx) * w_f; + f32 src_y_f = (m[1] * x_ + yy) * w_f; + s32 src_x = floorf(src_x_f), src_y = floorf(src_y_f); + + map_row[x] = (src_x >= 0) && (src_x < (s32)ssize.width) && + (src_y >= 0) && (src_y < (s32)ssize.height) ? 
src_y * srcStride + src_x : -1; + } + } + + // make remap + remapNearestNeighborConst(Size2D(blockWidth, blockHeight), srcBase, &map[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue); + } + } + } +#else + (void)ssize; + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)m; + (void)dstBase; + (void)dstStride; + (void)borderMode; + (void)borderValue; +#endif +} + +void warpPerspectiveLinear(const Size2D &ssize, const Size2D &dsize, + const u8 * srcBase, ptrdiff_t srcStride, + const f32 * m, + u8 * dstBase, ptrdiff_t dstStride, + BORDER_MODE borderMode, u8 borderValue) +{ + internal::assertSupportedConfiguration(isWarpPerspectiveLinearSupported(ssize)); +#ifdef CAROTENE_NEON + using namespace internal; + + s32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16]; + f32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16]; + s32 * map = alignPtr(_map, 16); + f32 * coeffs = alignPtr(_coeffs, 16); + + int32x4_t v_width4 = vdupq_n_s32(ssize.width - 1), v_height4 = vdupq_n_s32(ssize.height - 1); + int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1); + float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f); + + float32x4_t v_4 = vdupq_n_f32(4.0f); + + float32x4_t v_m0 = vdupq_n_f32(m[0]); + float32x4_t v_m1 = vdupq_n_f32(m[1]); + float32x4_t v_m2 = vdupq_n_f32(m[2]); + float32x4_t v_m3 = vdupq_n_f32(m[3]); + float32x4_t v_m4 = vdupq_n_f32(m[4]); + float32x4_t v_m5 = vdupq_n_f32(m[5]); + float32x4_t v_m6 = vdupq_n_f32(m[6]); + float32x4_t v_m7 = vdupq_n_f32(m[7]); + float32x4_t v_m8 = vdupq_n_f32(m[8]); + + if (borderMode == BORDER_MODE_REPLICATE) + { + int32x4_t v_zero4 = vdupq_n_s32(0); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y); + f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y); + + size_t x = 0, y_ = y + i; + f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_); + float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y), + v_yw = vmlaq_f32(v_m8, v_m5, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x)); + v_src_xf = vmulq_f32(v_wf, v_src_xf); + v_src_yf = vmulq_f32(v_wf, v_src_yf); + + int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf); + + float32x4x2_t v_coeff; + v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x)); + v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y)); + uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f); + uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f); + v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]); + v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]); + v_src_x = vbslq_s32(v_maskx, vsubq_s32(v_src_x, v_1), v_src_x); + v_src_y = vbslq_s32(v_masky, vsubq_s32(v_src_y, v_1), v_src_y); + + int32x4_t v_dst0_x = vmaxq_s32(v_zero4, vminq_s32(v_width4, v_src_x)); + int32x4_t v_dst0_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, v_src_y)); + int32x4_t v_dst1_x = vmaxq_s32(v_zero4, 
vminq_s32(v_width4, vaddq_s32(v_1, v_src_x))); + int32x4_t v_dst1_y = vmaxq_s32(v_zero4, vminq_s32(v_height4, vaddq_s32(v_1, v_src_y))); + + int32x4x4_t v_dst_index; + v_dst_index.val[0] = vmlaq_s32(v_dst0_x, v_dst0_y, v_step4); + v_dst_index.val[1] = vmlaq_s32(v_dst1_x, v_dst0_y, v_step4); + v_dst_index.val[2] = vmlaq_s32(v_dst0_x, v_dst1_y, v_step4); + v_dst_index.val[3] = vmlaq_s32(v_dst1_x, v_dst1_y, v_step4); + + vst2q_f32(coeff_row + (x << 1), v_coeff); + vst4q_s32(map_row + (x << 2), v_dst_index); + + v_x = vaddq_f32(v_x, v_4); + } + + f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8]; + for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_) + { + f32 w_f = 1.0f / (m[2] * x_ + yw); + f32 src_x_f = (m[0] * x_ + yx) * w_f; + f32 src_y_f = (m[1] * x_ + yy) * w_f; + + s32 src0_x = (s32)floorf(src_x_f); + s32 src0_y = (s32)floorf(src_y_f); + + coeff_row[(x << 1) + 0] = src_x_f - src0_x; + coeff_row[(x << 1) + 1] = src_y_f - src0_y; + + s32 src1_y = std::max(0, std::min(ssize.height - 1, src0_y + 1)); + src0_y = std::max(0, std::min(ssize.height - 1, src0_y)); + s32 src1_x = std::max(0, std::min(ssize.width - 1, src0_x + 1)); + src0_x = std::max(0, std::min(ssize.width - 1, src0_x)); + + map_row[(x << 2) + 0] = src0_y * srcStride + src0_x; + map_row[(x << 2) + 1] = src0_y * srcStride + src1_x; + map_row[(x << 2) + 2] = src1_y * srcStride + src0_x; + map_row[(x << 2) + 3] = src1_y * srcStride + src1_x; + } + } + + remapLinearReplicate(Size2D(blockWidth, blockHeight), + srcBase, &map[0], &coeffs[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride); + } + } + } + else if (borderMode == BORDER_MODE_CONSTANT) + { + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + int32x4_t v_m1_4 = vdupq_n_s32(-1); + + for (size_t i = 0; i < dsize.height; i += BLOCK_SIZE) + { + size_t blockHeight = std::min(BLOCK_SIZE, dsize.height - i); + for (size_t j = 0; j < dsize.width; j += BLOCK_SIZE) + { + size_t blockWidth = std::min(BLOCK_SIZE, dsize.width - j); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + s32 * map_row = getRowPtr(map, blockWidth * sizeof(s32) * 4, y); + f32 * coeff_row = getRowPtr(coeffs, blockWidth * sizeof(f32) * 2, y); + + size_t x = 0, y_ = y + i; + f32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_); + float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y), + v_yw = vmlaq_f32(v_m8, v_m5, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x)); + v_src_xf = vmulq_f32(v_wf, v_src_xf); + v_src_yf = vmulq_f32(v_wf, v_src_yf); + + int32x4_t v_src_x0 = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y0 = vcvtq_s32_f32(v_src_yf); + + float32x4x2_t v_coeff; + v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x0)); + v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y0)); + uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f); + uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f); + v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]); + v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]); + v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0); + v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0); + + int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1); + int32x4_t v_src_y1 = vaddq_s32(v_src_y0, 
v_1); + + int32x4x4_t v_dst_index; + v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4); + v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4); + v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4); + v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4); + + uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x0, v_width4)); + uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_xf, v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4)); + uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y0, v_height4)); + uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_yf, v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4)); + + v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4); + v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4); + v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4); + v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4); + + vst2q_f32(coeff_row + (x << 1), v_coeff); + vst4q_s32(map_row + (x << 2), v_dst_index); + + v_x = vaddq_f32(v_x, v_4); + } + + f32 yx = m[3] * y_ + m[6], yy = m[4] * y_ + m[7], yw = m[5] * y_ + m[8]; + for (ptrdiff_t x_ = x + j; x < blockWidth; ++x, ++x_) + { + f32 w_f = 1.0f / (m[2] * x_ + yw); + f32 src_x_f = (m[0] * x_ + yx) * w_f; + f32 src_y_f = (m[1] * x_ + yy) * w_f; + + s32 src0_x = (s32)floorf(src_x_f), src1_x = src0_x + 1; + s32 src0_y = (s32)floorf(src_y_f), src1_y = src0_y + 1; + + coeff_row[(x << 1) + 0] = src_x_f - src0_x; + coeff_row[(x << 1) + 1] = src_y_f - src0_y; + + map_row[(x << 2) + 0] = (src0_x >= 0) && (src0_x < (s32)ssize.width) && + (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src0_x : -1; + map_row[(x << 2) + 1] = (src1_x >= 0) && (src1_x < (s32)ssize.width) && + (src0_y >= 0) && (src0_y < (s32)ssize.height) ? src0_y * srcStride + src1_x : -1; + map_row[(x << 2) + 2] = (src0_x >= 0) && (src0_x < (s32)ssize.width) && + (src1_y >= 0) && (src1_y < (s32)ssize.height) ? src1_y * srcStride + src0_x : -1; + map_row[(x << 2) + 3] = (src1_x >= 0) && (src1_x < (s32)ssize.width) && + (src1_y >= 0) && (src1_y < (s32)ssize.height) ? 
src1_y * srcStride + src1_x : -1; + } + } + + remapLinearConst(Size2D(blockWidth, blockHeight), + srcBase, &map[0], &coeffs[0], + getRowPtr(dstBase, dstStride, i) + j, dstStride, borderValue); + } + } + } +#else + (void)ssize; + (void)dsize; + (void)srcBase; + (void)srcStride; + (void)m; + (void)dstBase; + (void)dstStride; + (void)borderMode; + (void)borderValue; +#endif +} + +} // namespace CAROTENE_NS diff --git a/3rdparty/tbb/CMakeLists.txt b/3rdparty/tbb/CMakeLists.txt index a76854d4a3..eddeaef56a 100644 --- a/3rdparty/tbb/CMakeLists.txt +++ b/3rdparty/tbb/CMakeLists.txt @@ -5,9 +5,9 @@ if (WIN32 AND NOT ARM) message(FATAL_ERROR "BUILD_TBB option supports Windows on ARM only!\nUse regular official TBB build instead of the BUILD_TBB option!") endif() -set(tbb_ver "tbb43_20141204oss") -set(tbb_url "http://www.threadingbuildingblocks.org/sites/default/files/software_releases/source/tbb43_20141204oss_src.tgz") -set(tbb_md5 "e903dd92d9433701f097fa7ca29a3c1f") +set(tbb_ver "tbb44_20160128oss") +set(tbb_url "http://www.threadingbuildingblocks.org/sites/default/files/software_releases/source/tbb44_20160128oss_src_0.tgz") +set(tbb_md5 "9d8a4cdf43496f1b3f7c473a5248e5cc") set(tbb_version_file "version_string.ver") ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4702) ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow) diff --git a/CMakeLists.txt b/CMakeLists.txt index f043acd614..7ea42680b9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -81,13 +81,14 @@ if(POLICY CMP0026) cmake_policy(SET CMP0026 OLD) endif() -if (POLICY CMP0042) - # silence cmake 3.0+ warnings about MACOSX_RPATH - cmake_policy(SET CMP0042 OLD) +if(POLICY CMP0042) + cmake_policy(SET CMP0042 NEW) endif() +include(cmake/OpenCVUtils.cmake) + # must go before the project command -set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "Configs" FORCE) +ocv_update(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "Configs" FORCE) if(DEFINED CMAKE_BUILD_TYPE) set_property( CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS ${CMAKE_CONFIGURATION_TYPES} ) endif() @@ -100,8 +101,6 @@ if(MSVC) set(CMAKE_USE_RELATIVE_PATHS ON CACHE INTERNAL "" FORCE) endif() -include(cmake/OpenCVUtils.cmake) - ocv_cmake_eval(DEBUG_PRE ONCE) ocv_clear_vars(OpenCVModules_TARGETS) @@ -170,6 +169,7 @@ endif() OCV_OPTION(WITH_1394 "Include IEEE1394 support" ON IF (NOT ANDROID AND NOT IOS AND NOT WINRT) ) OCV_OPTION(WITH_AVFOUNDATION "Use AVFoundation for Video I/O" ON IF IOS) OCV_OPTION(WITH_CARBON "Use Carbon for UI instead of Cocoa" OFF IF APPLE ) +OCV_OPTION(WITH_CAROTENE "Use NVidia carotene acceleration library for ARM platform" ON IF (ARM OR AARCH64) AND NOT IOS AND NOT (CMAKE_VERSION VERSION_LESS "2.8.11")) OCV_OPTION(WITH_VTK "Include VTK library support (and build opencv_viz module eiher)" ON IF (NOT ANDROID AND NOT IOS AND NOT WINRT AND NOT CMAKE_CROSSCOMPILING) ) OCV_OPTION(WITH_CUDA "Include NVidia Cuda Runtime support" ON IF (NOT IOS AND NOT WINRT) ) OCV_OPTION(WITH_CUFFT "Include NVidia Cuda Fast Fourier Transform (FFT) library support" ON IF (NOT IOS AND NOT WINRT) ) @@ -304,50 +304,50 @@ include(cmake/OpenCVVersion.cmake) # ---------------------------------------------------------------------------- # Save libs and executables in the same place -set(EXECUTABLE_OUTPUT_PATH "${CMAKE_BINARY_DIR}/bin" CACHE PATH "Output directory for applications" ) +set(EXECUTABLE_OUTPUT_PATH "${CMAKE_BINARY_DIR}/bin" CACHE PATH "Output directory for applications") -if (ANDROID) - if (ANDROID_ABI MATCHES "NEON") +if(ANDROID) + if(ANDROID_ABI MATCHES "NEON") set(ENABLE_NEON ON) 
endif() - if (ANDROID_ABI MATCHES "VFPV3") + if(ANDROID_ABI MATCHES "VFPV3") set(ENABLE_VFPV3 ON) endif() endif() if(ANDROID OR WIN32) - set(OPENCV_DOC_INSTALL_PATH doc) + ocv_update(OPENCV_DOC_INSTALL_PATH doc) else() - set(OPENCV_DOC_INSTALL_PATH share/OpenCV/doc) + ocv_update(OPENCV_DOC_INSTALL_PATH share/OpenCV/doc) endif() if(WIN32 AND CMAKE_HOST_SYSTEM_NAME MATCHES Windows) if(DEFINED OpenCV_RUNTIME AND DEFINED OpenCV_ARCH) - set(OpenCV_INSTALL_BINARIES_PREFIX "${OpenCV_ARCH}/${OpenCV_RUNTIME}/") + ocv_update(OpenCV_INSTALL_BINARIES_PREFIX "${OpenCV_ARCH}/${OpenCV_RUNTIME}/") else() message(STATUS "Can't detect runtime and/or arch") - set(OpenCV_INSTALL_BINARIES_PREFIX "") + ocv_update(OpenCV_INSTALL_BINARIES_PREFIX "") endif() elseif(ANDROID) - set(OpenCV_INSTALL_BINARIES_PREFIX "sdk/native/") + ocv_update(OpenCV_INSTALL_BINARIES_PREFIX "sdk/native/") else() - set(OpenCV_INSTALL_BINARIES_PREFIX "") + ocv_update(OpenCV_INSTALL_BINARIES_PREFIX "") endif() if(ANDROID) - set(OPENCV_SAMPLES_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}samples/${ANDROID_NDK_ABI_NAME}") + ocv_update(OPENCV_SAMPLES_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}samples/${ANDROID_NDK_ABI_NAME}") else() - set(OPENCV_SAMPLES_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}samples") + ocv_update(OPENCV_SAMPLES_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}samples") endif() if(ANDROID) - set(OPENCV_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}bin/${ANDROID_NDK_ABI_NAME}") + ocv_update(OPENCV_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}bin/${ANDROID_NDK_ABI_NAME}") else() - set(OPENCV_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}bin") + ocv_update(OPENCV_BIN_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}bin") endif() if(NOT OPENCV_TEST_INSTALL_PATH) - set(OPENCV_TEST_INSTALL_PATH "${OPENCV_BIN_INSTALL_PATH}") + ocv_update(OPENCV_TEST_INSTALL_PATH "${OPENCV_BIN_INSTALL_PATH}") endif() if (OPENCV_TEST_DATA_PATH) @@ -356,66 +356,74 @@ endif() if(OPENCV_TEST_DATA_PATH AND NOT OPENCV_TEST_DATA_INSTALL_PATH) if(ANDROID) - set(OPENCV_TEST_DATA_INSTALL_PATH "sdk/etc/testdata") + ocv_update(OPENCV_TEST_DATA_INSTALL_PATH "sdk/etc/testdata") elseif(WIN32) - set(OPENCV_TEST_DATA_INSTALL_PATH "testdata") + ocv_update(OPENCV_TEST_DATA_INSTALL_PATH "testdata") else() - set(OPENCV_TEST_DATA_INSTALL_PATH "share/OpenCV/testdata") + ocv_update(OPENCV_TEST_DATA_INSTALL_PATH "share/OpenCV/testdata") endif() endif() if(ANDROID) - set(LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/lib/${ANDROID_NDK_ABI_NAME}") - set(3P_LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/3rdparty/lib/${ANDROID_NDK_ABI_NAME}") - set(OPENCV_LIB_INSTALL_PATH sdk/native/libs/${ANDROID_NDK_ABI_NAME}) - set(OPENCV_3P_LIB_INSTALL_PATH sdk/native/3rdparty/libs/${ANDROID_NDK_ABI_NAME}) - set(OPENCV_CONFIG_INSTALL_PATH sdk/native/jni) - set(OPENCV_INCLUDE_INSTALL_PATH sdk/native/jni/include) - set(OPENCV_SAMPLES_SRC_INSTALL_PATH samples/native) - set(OPENCV_OTHER_INSTALL_PATH sdk/etc) + set(LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/lib/${ANDROID_NDK_ABI_NAME}") + ocv_update(3P_LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/3rdparty/lib/${ANDROID_NDK_ABI_NAME}") + ocv_update(OPENCV_LIB_INSTALL_PATH sdk/native/libs/${ANDROID_NDK_ABI_NAME}) + ocv_update(OPENCV_3P_LIB_INSTALL_PATH sdk/native/3rdparty/libs/${ANDROID_NDK_ABI_NAME}) + ocv_update(OPENCV_CONFIG_INSTALL_PATH sdk/native/jni) + ocv_update(OPENCV_INCLUDE_INSTALL_PATH sdk/native/jni/include) + ocv_update(OPENCV_SAMPLES_SRC_INSTALL_PATH samples/native) + ocv_update(OPENCV_OTHER_INSTALL_PATH 
sdk/etc) else() - set(LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/lib") - set(3P_LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/3rdparty/lib${LIB_SUFFIX}") + set(LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/lib") + ocv_update(3P_LIBRARY_OUTPUT_PATH "${OpenCV_BINARY_DIR}/3rdparty/lib${LIB_SUFFIX}") if(WIN32 AND CMAKE_HOST_SYSTEM_NAME MATCHES Windows) if(OpenCV_STATIC) - set(OPENCV_LIB_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}staticlib${LIB_SUFFIX}") + ocv_update(OPENCV_LIB_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}staticlib${LIB_SUFFIX}") else() - set(OPENCV_LIB_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}lib${LIB_SUFFIX}") + ocv_update(OPENCV_LIB_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}lib${LIB_SUFFIX}") endif() - set(OPENCV_3P_LIB_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}staticlib${LIB_SUFFIX}") - set(OPENCV_SAMPLES_SRC_INSTALL_PATH samples/native) - set(OPENCV_JAR_INSTALL_PATH java) - set(OPENCV_OTHER_INSTALL_PATH etc) + ocv_update(OPENCV_3P_LIB_INSTALL_PATH "${OpenCV_INSTALL_BINARIES_PREFIX}staticlib${LIB_SUFFIX}") + ocv_update(OPENCV_SAMPLES_SRC_INSTALL_PATH samples/native) + ocv_update(OPENCV_JAR_INSTALL_PATH java) + ocv_update(OPENCV_OTHER_INSTALL_PATH etc) + ocv_update(OPENCV_CONFIG_INSTALL_PATH ".") else() - set(OPENCV_LIB_INSTALL_PATH lib${LIB_SUFFIX}) - set(OPENCV_3P_LIB_INSTALL_PATH share/OpenCV/3rdparty/${OPENCV_LIB_INSTALL_PATH}) - set(OPENCV_SAMPLES_SRC_INSTALL_PATH share/OpenCV/samples) - set(OPENCV_JAR_INSTALL_PATH share/OpenCV/java) - set(OPENCV_OTHER_INSTALL_PATH share/OpenCV) - endif() - set(OPENCV_INCLUDE_INSTALL_PATH "include") - - math(EXPR SIZEOF_VOID_P_BITS "8 * ${CMAKE_SIZEOF_VOID_P}") - if(LIB_SUFFIX AND NOT SIZEOF_VOID_P_BITS EQUAL LIB_SUFFIX) - set(OPENCV_CONFIG_INSTALL_PATH lib${LIB_SUFFIX}/cmake/opencv) - else() - set(OPENCV_CONFIG_INSTALL_PATH share/OpenCV) + ocv_update(OPENCV_LIB_INSTALL_PATH lib${LIB_SUFFIX}) + ocv_update(OPENCV_3P_LIB_INSTALL_PATH share/OpenCV/3rdparty/${OPENCV_LIB_INSTALL_PATH}) + ocv_update(OPENCV_SAMPLES_SRC_INSTALL_PATH share/OpenCV/samples) + ocv_update(OPENCV_JAR_INSTALL_PATH share/OpenCV/java) + ocv_update(OPENCV_OTHER_INSTALL_PATH share/OpenCV) + + if(NOT DEFINED OPENCV_CONFIG_INSTALL_PATH) + math(EXPR SIZEOF_VOID_P_BITS "8 * ${CMAKE_SIZEOF_VOID_P}") + if(LIB_SUFFIX AND NOT SIZEOF_VOID_P_BITS EQUAL LIB_SUFFIX) + ocv_update(OPENCV_CONFIG_INSTALL_PATH lib${LIB_SUFFIX}/cmake/opencv) + else() + ocv_update(OPENCV_CONFIG_INSTALL_PATH share/OpenCV) + endif() + endif() endif() + ocv_update(OPENCV_INCLUDE_INSTALL_PATH "include") endif() -set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${OPENCV_LIB_INSTALL_PATH}") +ocv_update(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${OPENCV_LIB_INSTALL_PATH}") set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) if(INSTALL_TO_MANGLED_PATHS) set(OPENCV_INCLUDE_INSTALL_PATH ${OPENCV_INCLUDE_INSTALL_PATH}/opencv-${OPENCV_VERSION}) - string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_3P_LIB_INSTALL_PATH "${OPENCV_3P_LIB_INSTALL_PATH}") - string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_SAMPLES_SRC_INSTALL_PATH "${OPENCV_SAMPLES_SRC_INSTALL_PATH}") - string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_CONFIG_INSTALL_PATH "${OPENCV_CONFIG_INSTALL_PATH}") - string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_DOC_INSTALL_PATH "${OPENCV_DOC_INSTALL_PATH}") - string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_JAR_INSTALL_PATH "${OPENCV_JAR_INSTALL_PATH}") - string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_TEST_DATA_INSTALL_PATH 
"${OPENCV_TEST_DATA_INSTALL_PATH}") - string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" OPENCV_OTHER_INSTALL_PATH "${OPENCV_OTHER_INSTALL_PATH}") + foreach(v + OPENCV_3P_LIB_INSTALL_PATH + OPENCV_SAMPLES_SRC_INSTALL_PATH + OPENCV_CONFIG_INSTALL_PATH + OPENCV_DOC_INSTALL_PATH + OPENCV_JAR_INSTALL_PATH + OPENCV_TEST_DATA_INSTALL_PATH + OPENCV_OTHER_INSTALL_PATH + ) + string(REPLACE "OpenCV" "OpenCV-${OPENCV_VERSION}" ${v} "${${v}}") + string(REPLACE "opencv" "opencv-${OPENCV_VERSION}" ${v} "${${v}}") + endforeach() endif() @@ -440,7 +448,7 @@ endif() # ---------------------------------------------------------------------------- # Path for build/platform -specific headers # ---------------------------------------------------------------------------- -set(OPENCV_CONFIG_FILE_INCLUDE_DIR "${CMAKE_BINARY_DIR}/" CACHE PATH "Where to create the platform-dependant cvconfig.h") +ocv_update(OPENCV_CONFIG_FILE_INCLUDE_DIR "${CMAKE_BINARY_DIR}/" CACHE PATH "Where to create the platform-dependant cvconfig.h") ocv_include_directories(${OPENCV_CONFIG_FILE_INCLUDE_DIR}) # ---------------------------------------------------------------------------- @@ -453,7 +461,7 @@ set(OPENCV_EXTRA_MODULES_PATH "" CACHE PATH "Where to look for additional OpenCV # ---------------------------------------------------------------------------- find_host_package(Git QUIET) -if(GIT_FOUND) +if(NOT DEFINED OPENCV_VCSVERSION AND GIT_FOUND) execute_process(COMMAND "${GIT_EXECUTABLE}" describe --tags --always --dirty --match "[0-9].[0-9].[0-9]*" WORKING_DIRECTORY "${OpenCV_SOURCE_DIR}" OUTPUT_VARIABLE OPENCV_VCSVERSION @@ -464,7 +472,7 @@ if(GIT_FOUND) if(NOT GIT_RESULT EQUAL 0) set(OPENCV_VCSVERSION "unknown") endif() -else() +elseif(NOT DEFINED OPENCV_VCSVERSION) # We don't have git: set(OPENCV_VCSVERSION "unknown") endif() @@ -627,7 +635,20 @@ endmacro() if(NOT DEFINED OpenCV_HAL) set(OpenCV_HAL "OpenCV_HAL") endif() + +if(WITH_CAROTENE) + ocv_debug_message(STATUS "Enable carotene acceleration") + if(NOT ";${OpenCV_HAL};" MATCHES ";carotene;") + set(OpenCV_HAL "carotene;${OpenCV_HAL}") + endif() +endif() + foreach(hal ${OpenCV_HAL}) + if(hal STREQUAL "carotene") + add_subdirectory(3rdparty/carotene/hal) + ocv_hal_register(CAROTENE_HAL_LIBRARIES CAROTENE_HAL_HEADERS CAROTENE_HAL_INCLUDE_DIRS) + list(APPEND OpenCV_USED_HAL "carotene (ver ${CAROTENE_HAL_VERSION})") + else() ocv_debug_message(STATUS "OpenCV HAL: ${hal} ...") ocv_clear_vars(OpenCV_HAL_LIBRARIES OpenCV_HAL_HEADERS OpenCV_HAL_INCLUDE_DIRS) find_package(${hal} NO_MODULE QUIET) @@ -635,6 +656,7 @@ foreach(hal ${OpenCV_HAL}) ocv_hal_register(OpenCV_HAL_LIBRARIES OpenCV_HAL_HEADERS OpenCV_HAL_INCLUDE_DIRS) list(APPEND OpenCV_USED_HAL "${hal} (ver ${${hal}_VERSION})") endif() + endif() endforeach() configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/custom_hal.hpp.in" "${CMAKE_BINARY_DIR}/custom_hal.hpp" @ONLY) unset(_hal_includes) @@ -652,7 +674,7 @@ if(HAVE_CUDA) set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CUDA_cufft_LIBRARY}) endif() foreach(p ${CUDA_LIBS_PATH}) - set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} -L${p}) + set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} ${CMAKE_LIBRARY_PATH_FLAG}${p}) endforeach() endif() # ---------------------------------------------------------------------------- diff --git a/apps/visualisation/opencv_visualisation.cpp b/apps/visualisation/opencv_visualisation.cpp index 2c685f521a..75703bd528 100644 --- a/apps/visualisation/opencv_visualisation.cpp +++ b/apps/visualisation/opencv_visualisation.cpp @@ -47,7 +47,7 @@ Software for visualising 
cascade classifier models trained by OpenCV and to gain an understanding of the features used.

USAGE:
-./visualise_models -model <model.xml> -image <ref.png> -data <output folder>
+./opencv_visualisation --model=<model.xml> --image=<ref.png> --data=<video output folder>
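Stepping back to the warp kernels added above: warpAffine and warpPerspective, in both nearest-neighbour and linear flavours, all share one structure. For each BLOCK_SIZE x BLOCK_SIZE tile they first build a table of precomputed source offsets (plus interpolation coefficients in the linear case), with -1 marking out-of-bounds pixels under BORDER_MODE_CONSTANT, and only then hand that table to a shared remap routine. A scalar sketch of the nearest-neighbour, constant-border case (standalone and simplified to whole-image granularity; the tiling, NEON vectorisation, and carotene type aliases are omitted):

    #include <cmath>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Phase 1: build an index map. m is the 2x3 affine matrix laid out as in
    // the patch: src_x = m[0]*x + m[2]*y + m[4], src_y = m[1]*x + m[3]*y + m[5].
    // Out-of-range pixels map to -1 (the BORDER_MODE_CONSTANT convention).
    static void buildAffineNNMap(int sw, int sh, int dw, int dh,
                                 const float m[6], ptrdiff_t srcStride,
                                 std::vector<int32_t> & map)
    {
        map.resize((size_t)dw * dh);
        for (int y = 0; y < dh; ++y)
        {
            float yx = m[2] * y + m[4], yy = m[3] * y + m[5];
            for (int x = 0; x < dw; ++x)
            {
                int sx = (int)std::floor(m[0] * x + yx);
                int sy = (int)std::floor(m[1] * x + yy);
                bool inside = sx >= 0 && sx < sw && sy >= 0 && sy < sh;
                map[(size_t)y * dw + x] = inside ? (int32_t)(sy * srcStride + sx) : -1;
            }
        }
    }

    // Phase 2: "remap" — a gather through the table; -1 selects the border value.
    static void remapNNConst(const uint8_t * src, const std::vector<int32_t> & map,
                             int dw, int dh, uint8_t * dst, ptrdiff_t dstStride,
                             uint8_t borderValue)
    {
        for (int y = 0; y < dh; ++y)
            for (int x = 0; x < dw; ++x)
            {
                int32_t idx = map[(size_t)y * dw + x];
                dst[y * dstStride + x] = idx >= 0 ? src[idx] : borderValue;
            }
    }

Splitting the work this way keeps phase 1, the pure address arithmetic, in a form the NEON code above can evaluate four pixels at a time, while phase 2 degenerates to a simple gather that the remapNearestNeighbor*/remapLinear* helpers can share between the affine and perspective paths.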