mirror of https://github.com/opencv/opencv.git
- added new functions from core module: split, merge, add, sub, mul, div, ... - added function replacement mechanism - added example of HAL replacement librarypull/5743/head
parent
e8742be30b
commit
b4bcdd10a1
25 changed files with 6189 additions and 5571 deletions
@ -0,0 +1,6 @@ |
||||
#ifndef _CUSTOM_HAL_INCLUDED_ |
||||
#define _CUSTOM_HAL_INCLUDED_ |
||||
|
||||
@OPENCV_HAL_HEADERS_INCLUDES@ |
||||
|
||||
#endif |
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,91 @@ |
||||
#ifndef _HAL_INTERFACE_HPP_INCLUDED_ |
||||
#define _HAL_INTERFACE_HPP_INCLUDED_ |
||||
|
||||
#define CV_HAL_ERROR_OK 0 |
||||
#define CV_HAL_ERROR_NI 1 |
||||
#define CV_HAL_ERROR_UNKNOWN -1 |
||||
|
||||
#define CV_HAL_CMP_EQ 0 |
||||
#define CV_HAL_CMP_GT 1 |
||||
#define CV_HAL_CMP_GE 2 |
||||
#define CV_HAL_CMP_LT 3 |
||||
#define CV_HAL_CMP_LE 4 |
||||
#define CV_HAL_CMP_NE 5 |
||||
|
||||
#ifdef __cplusplus |
||||
namespace cv { namespace hal { |
||||
|
||||
namespace Error { |
||||
|
||||
enum
|
||||
{ |
||||
Ok = 0, |
||||
NotImplemented = 1, |
||||
Unknown = -1 |
||||
}; |
||||
|
||||
} |
||||
|
||||
enum
|
||||
{ |
||||
CMP_EQ = 0, |
||||
CMP_GT = 1, |
||||
CMP_GE = 2, |
||||
CMP_LT = 3, |
||||
CMP_LE = 4, |
||||
CMP_NE = 5 |
||||
}; |
||||
|
||||
}} |
||||
#endif |
||||
|
||||
#ifdef __cplusplus |
||||
#include <cstddef> |
||||
#else |
||||
#include <stddef.h> |
||||
#endif |
||||
|
||||
/* primitive types */ |
||||
/*
|
||||
schar - signed 1 byte integer |
||||
uchar - unsigned 1 byte integer |
||||
short - signed 2 byte integer |
||||
ushort - unsigned 2 byte integer |
||||
int - signed 4 byte integer |
||||
uint - unsigned 4 byte integer |
||||
int64 - signed 8 byte integer |
||||
uint64 - unsigned 8 byte integer |
||||
*/ |
||||
|
||||
#if !defined _MSC_VER && !defined __BORLANDC__ |
||||
# if defined __cplusplus && __cplusplus >= 201103L && !defined __APPLE__ |
||||
# include <cstdint> |
||||
typedef std::uint32_t uint; |
||||
# else |
||||
# include <stdint.h> |
||||
typedef uint32_t uint; |
||||
# endif |
||||
#else |
||||
typedef unsigned uint; |
||||
#endif |
||||
|
||||
typedef signed char schar; |
||||
|
||||
#ifndef __IPL_H__ |
||||
typedef unsigned char uchar; |
||||
typedef unsigned short ushort; |
||||
#endif |
||||
|
||||
#if defined _MSC_VER || defined __BORLANDC__ |
||||
typedef __int64 int64; |
||||
typedef unsigned __int64 uint64; |
||||
# define CV_BIG_INT(n) n##I64 |
||||
# define CV_BIG_UINT(n) n##UI64 |
||||
#else |
||||
typedef int64_t int64; |
||||
typedef uint64_t uint64; |
||||
# define CV_BIG_INT(n) n##LL |
||||
# define CV_BIG_UINT(n) n##ULL |
||||
#endif |
||||
|
||||
#endif |
@ -0,0 +1,12 @@ |
||||
cmake_minimum_required(VERSION 2.8.8 FATAL_ERROR) |
||||
|
||||
if(UNIX) |
||||
if(CMAKE_COMPILER_IS_GNUCXX OR CV_ICC) |
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") |
||||
endif() |
||||
endif() |
||||
|
||||
add_library(simple_hal simple.cpp) |
||||
set(OPENCV_HAL_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../..") |
||||
target_include_directories(simple_hal PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${OPENCV_HAL_DIR}/include) |
||||
|
@ -0,0 +1,34 @@ |
||||
#include "simple.hpp" |
||||
|
||||
int slow_and8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height) |
||||
{ |
||||
for(; height--; src1 = src1 + step1, src2 = src2 + step2, dst = dst + step) |
||||
for(int x = 0 ; x < width; x++ ) |
||||
dst[x] = src1[x] & src2[x]; |
||||
return cv::hal::Error::Ok; |
||||
} |
||||
|
||||
int slow_or8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height) |
||||
{ |
||||
for(; height--; src1 = src1 + step1, src2 = src2 + step2, dst = dst + step) |
||||
for(int x = 0 ; x < width; x++ ) |
||||
dst[x] = src1[x] | src2[x]; |
||||
return cv::hal::Error::Ok; |
||||
} |
||||
|
||||
int slow_xor8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height) |
||||
{ |
||||
for(; height--; src1 = src1 + step1, src2 = src2 + step2, dst = dst + step) |
||||
for(int x = 0 ; x < width; x++ ) |
||||
dst[x] = src1[x] ^ src2[x]; |
||||
return cv::hal::Error::Ok; |
||||
} |
||||
|
||||
int slow_not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height) |
||||
{ |
||||
for(; height--; src1 = src1 + step1, src2 = src2 + step2, dst = dst + step) |
||||
for(int x = 0 ; x < width; x++ ) |
||||
dst[x] = ~src1[x]; |
||||
return cv::hal::Error::Ok; |
||||
} |
||||
|
@ -0,0 +1,20 @@ |
||||
#ifndef _SIMPLE_HPP_INCLUDED_ |
||||
#define _SIMPLE_HPP_INCLUDED_ |
||||
|
||||
#include "opencv2/hal/interface.hpp" |
||||
|
||||
int slow_and8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height); |
||||
int slow_or8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height); |
||||
int slow_xor8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height); |
||||
int slow_not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height); |
||||
|
||||
#undef hal_and8u |
||||
#define hal_and8u slow_and8u |
||||
#undef hal_or8u |
||||
#define hal_or8u slow_or8u |
||||
#undef hal_xor8u |
||||
#define hal_xor8u slow_xor8u |
||||
#undef hal_not8u |
||||
#define hal_not8u slow_not8u |
||||
|
||||
#endif |
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,657 @@ |
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Copyright (C) 2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_HAL_ARITHM_CORE_HPP__ |
||||
#define __OPENCV_HAL_ARITHM_CORE_HPP__ |
||||
|
||||
#include "arithm_simd.hpp" |
||||
|
||||
const uchar g_Saturate8u[] = |
||||
{ |
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, |
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, |
||||
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, |
||||
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, |
||||
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, |
||||
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, |
||||
96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, |
||||
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, |
||||
128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, |
||||
144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, |
||||
160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, |
||||
176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, |
||||
192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, |
||||
208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, |
||||
224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, |
||||
240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, |
||||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
||||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
||||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
||||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
||||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
||||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
||||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
||||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
||||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
||||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
||||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
||||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
||||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
||||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
||||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
||||
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, |
||||
255 |
||||
}; |
||||
|
||||
|
||||
#define CV_FAST_CAST_8U(t) (assert(-256 <= (t) && (t) <= 512), g_Saturate8u[(t)+256]) |
||||
#define CV_MIN_8U(a,b) ((a) - CV_FAST_CAST_8U((a) - (b))) |
||||
#define CV_MAX_8U(a,b) ((a) + CV_FAST_CAST_8U((b) - (a))) |
||||
|
||||
const float g_8x32fTab[] = |
||||
{ |
||||
-128.f, -127.f, -126.f, -125.f, -124.f, -123.f, -122.f, -121.f, |
||||
-120.f, -119.f, -118.f, -117.f, -116.f, -115.f, -114.f, -113.f, |
||||
-112.f, -111.f, -110.f, -109.f, -108.f, -107.f, -106.f, -105.f, |
||||
-104.f, -103.f, -102.f, -101.f, -100.f, -99.f, -98.f, -97.f, |
||||
-96.f, -95.f, -94.f, -93.f, -92.f, -91.f, -90.f, -89.f, |
||||
-88.f, -87.f, -86.f, -85.f, -84.f, -83.f, -82.f, -81.f, |
||||
-80.f, -79.f, -78.f, -77.f, -76.f, -75.f, -74.f, -73.f, |
||||
-72.f, -71.f, -70.f, -69.f, -68.f, -67.f, -66.f, -65.f, |
||||
-64.f, -63.f, -62.f, -61.f, -60.f, -59.f, -58.f, -57.f, |
||||
-56.f, -55.f, -54.f, -53.f, -52.f, -51.f, -50.f, -49.f, |
||||
-48.f, -47.f, -46.f, -45.f, -44.f, -43.f, -42.f, -41.f, |
||||
-40.f, -39.f, -38.f, -37.f, -36.f, -35.f, -34.f, -33.f, |
||||
-32.f, -31.f, -30.f, -29.f, -28.f, -27.f, -26.f, -25.f, |
||||
-24.f, -23.f, -22.f, -21.f, -20.f, -19.f, -18.f, -17.f, |
||||
-16.f, -15.f, -14.f, -13.f, -12.f, -11.f, -10.f, -9.f, |
||||
-8.f, -7.f, -6.f, -5.f, -4.f, -3.f, -2.f, -1.f, |
||||
0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, |
||||
8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, |
||||
16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, |
||||
24.f, 25.f, 26.f, 27.f, 28.f, 29.f, 30.f, 31.f, |
||||
32.f, 33.f, 34.f, 35.f, 36.f, 37.f, 38.f, 39.f, |
||||
40.f, 41.f, 42.f, 43.f, 44.f, 45.f, 46.f, 47.f, |
||||
48.f, 49.f, 50.f, 51.f, 52.f, 53.f, 54.f, 55.f, |
||||
56.f, 57.f, 58.f, 59.f, 60.f, 61.f, 62.f, 63.f, |
||||
64.f, 65.f, 66.f, 67.f, 68.f, 69.f, 70.f, 71.f, |
||||
72.f, 73.f, 74.f, 75.f, 76.f, 77.f, 78.f, 79.f, |
||||
80.f, 81.f, 82.f, 83.f, 84.f, 85.f, 86.f, 87.f, |
||||
88.f, 89.f, 90.f, 91.f, 92.f, 93.f, 94.f, 95.f, |
||||
96.f, 97.f, 98.f, 99.f, 100.f, 101.f, 102.f, 103.f, |
||||
104.f, 105.f, 106.f, 107.f, 108.f, 109.f, 110.f, 111.f, |
||||
112.f, 113.f, 114.f, 115.f, 116.f, 117.f, 118.f, 119.f, |
||||
120.f, 121.f, 122.f, 123.f, 124.f, 125.f, 126.f, 127.f, |
||||
128.f, 129.f, 130.f, 131.f, 132.f, 133.f, 134.f, 135.f, |
||||
136.f, 137.f, 138.f, 139.f, 140.f, 141.f, 142.f, 143.f, |
||||
144.f, 145.f, 146.f, 147.f, 148.f, 149.f, 150.f, 151.f, |
||||
152.f, 153.f, 154.f, 155.f, 156.f, 157.f, 158.f, 159.f, |
||||
160.f, 161.f, 162.f, 163.f, 164.f, 165.f, 166.f, 167.f, |
||||
168.f, 169.f, 170.f, 171.f, 172.f, 173.f, 174.f, 175.f, |
||||
176.f, 177.f, 178.f, 179.f, 180.f, 181.f, 182.f, 183.f, |
||||
184.f, 185.f, 186.f, 187.f, 188.f, 189.f, 190.f, 191.f, |
||||
192.f, 193.f, 194.f, 195.f, 196.f, 197.f, 198.f, 199.f, |
||||
200.f, 201.f, 202.f, 203.f, 204.f, 205.f, 206.f, 207.f, |
||||
208.f, 209.f, 210.f, 211.f, 212.f, 213.f, 214.f, 215.f, |
||||
216.f, 217.f, 218.f, 219.f, 220.f, 221.f, 222.f, 223.f, |
||||
224.f, 225.f, 226.f, 227.f, 228.f, 229.f, 230.f, 231.f, |
||||
232.f, 233.f, 234.f, 235.f, 236.f, 237.f, 238.f, 239.f, |
||||
240.f, 241.f, 242.f, 243.f, 244.f, 245.f, 246.f, 247.f, |
||||
248.f, 249.f, 250.f, 251.f, 252.f, 253.f, 254.f, 255.f |
||||
}; |
||||
|
||||
#define CV_8TO32F(x) g_8x32fTab[(x)+128] |
||||
|
||||
namespace cv { |
||||
|
||||
template<> inline uchar OpAdd<uchar>::operator ()(uchar a, uchar b) const |
||||
{ return CV_FAST_CAST_8U(a + b); } |
||||
|
||||
template<> inline uchar OpSub<uchar>::operator ()(uchar a, uchar b) const |
||||
{ return CV_FAST_CAST_8U(a - b); } |
||||
|
||||
template<> inline short OpAbsDiff<short>::operator ()(short a, short b) const |
||||
{ return saturate_cast<short>(std::abs(a - b)); } |
||||
|
||||
template<> inline schar OpAbsDiff<schar>::operator ()(schar a, schar b) const |
||||
{ return saturate_cast<schar>(std::abs(a - b)); } |
||||
|
||||
template<> inline uchar OpMin<uchar>::operator ()(uchar a, uchar b) const { return CV_MIN_8U(a, b); } |
||||
|
||||
template<> inline uchar OpMax<uchar>::operator ()(uchar a, uchar b) const { return CV_MAX_8U(a, b); } |
||||
|
||||
} |
||||
|
||||
namespace cv { namespace hal { |
||||
|
||||
template<typename T, class Op, class VOp> |
||||
void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, int width, int height) |
||||
{ |
||||
#if CV_SSE2 || CV_NEON |
||||
VOp vop; |
||||
#endif |
||||
Op op; |
||||
|
||||
for( ; height--; src1 = (const T *)((const uchar *)src1 + step1), |
||||
src2 = (const T *)((const uchar *)src2 + step2), |
||||
dst = (T *)((uchar *)dst + step) ) |
||||
{ |
||||
int x = 0; |
||||
|
||||
#if CV_NEON || CV_SSE2 |
||||
#if CV_AVX2 |
||||
if( USE_AVX2 ) |
||||
{ |
||||
for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) ) |
||||
{ |
||||
typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x); |
||||
r0 = vop(r0, VLoadStore256<T>::load(src2 + x)); |
||||
VLoadStore256<T>::store(dst + x, r0); |
||||
} |
||||
} |
||||
#else |
||||
#if CV_SSE2 |
||||
if( USE_SSE2 ) |
||||
{ |
||||
#endif // CV_SSE2
|
||||
for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) ) |
||||
{ |
||||
typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x ); |
||||
typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 16/sizeof(T)); |
||||
r0 = vop(r0, VLoadStore128<T>::load(src2 + x )); |
||||
r1 = vop(r1, VLoadStore128<T>::load(src2 + x + 16/sizeof(T))); |
||||
VLoadStore128<T>::store(dst + x , r0); |
||||
VLoadStore128<T>::store(dst + x + 16/sizeof(T), r1); |
||||
} |
||||
#if CV_SSE2 |
||||
} |
||||
#endif // CV_SSE2
|
||||
#endif // CV_AVX2
|
||||
#endif // CV_NEON || CV_SSE2
|
||||
|
||||
#if CV_AVX2 |
||||
// nothing
|
||||
#elif CV_SSE2 |
||||
if( USE_SSE2 ) |
||||
{ |
||||
for( ; x <= width - 8/(int)sizeof(T); x += 8/sizeof(T) ) |
||||
{ |
||||
typename VLoadStore64<T>::reg_type r = VLoadStore64<T>::load(src1 + x); |
||||
r = vop(r, VLoadStore64<T>::load(src2 + x)); |
||||
VLoadStore64<T>::store(dst + x, r); |
||||
} |
||||
} |
||||
#endif |
||||
|
||||
#if CV_ENABLE_UNROLLED |
||||
for( ; x <= width - 4; x += 4 ) |
||||
{ |
||||
T v0 = op(src1[x], src2[x]); |
||||
T v1 = op(src1[x+1], src2[x+1]); |
||||
dst[x] = v0; dst[x+1] = v1; |
||||
v0 = op(src1[x+2], src2[x+2]); |
||||
v1 = op(src1[x+3], src2[x+3]); |
||||
dst[x+2] = v0; dst[x+3] = v1; |
||||
} |
||||
#endif |
||||
|
||||
for( ; x < width; x++ ) |
||||
dst[x] = op(src1[x], src2[x]); |
||||
} |
||||
} |
||||
|
||||
template<typename T, class Op, class Op32> |
||||
void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2, |
||||
T* dst, size_t step, int width, int height) |
||||
{ |
||||
#if CV_SSE2 || CV_NEON |
||||
Op32 op32; |
||||
#endif |
||||
Op op; |
||||
|
||||
for( ; height--; src1 = (const T *)((const uchar *)src1 + step1), |
||||
src2 = (const T *)((const uchar *)src2 + step2), |
||||
dst = (T *)((uchar *)dst + step) ) |
||||
{ |
||||
int x = 0; |
||||
|
||||
#if CV_AVX2 |
||||
if( USE_AVX2 ) |
||||
{ |
||||
if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) |
||||
{ |
||||
for( ; x <= width - 8; x += 8 ) |
||||
{ |
||||
typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x); |
||||
r0 = op32(r0, VLoadStore256Aligned<T>::load(src2 + x)); |
||||
VLoadStore256Aligned<T>::store(dst + x, r0); |
||||
} |
||||
} |
||||
} |
||||
#elif CV_SSE2 |
||||
if( USE_SSE2 ) |
||||
{ |
||||
if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) |
||||
{ |
||||
for( ; x <= width - 8; x += 8 ) |
||||
{ |
||||
typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x ); |
||||
typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 4); |
||||
r0 = op32(r0, VLoadStore128Aligned<T>::load(src2 + x )); |
||||
r1 = op32(r1, VLoadStore128Aligned<T>::load(src2 + x + 4)); |
||||
VLoadStore128Aligned<T>::store(dst + x , r0); |
||||
VLoadStore128Aligned<T>::store(dst + x + 4, r1); |
||||
} |
||||
} |
||||
} |
||||
#endif // CV_AVX2
|
||||
|
||||
#if CV_NEON || CV_SSE2 |
||||
#if CV_AVX2 |
||||
if( USE_AVX2 ) |
||||
{ |
||||
for( ; x <= width - 8; x += 8 ) |
||||
{ |
||||
typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x); |
||||
r0 = op32(r0, VLoadStore256<T>::load(src2 + x)); |
||||
VLoadStore256<T>::store(dst + x, r0); |
||||
} |
||||
} |
||||
#else |
||||
#if CV_SSE2 |
||||
if( USE_SSE2 ) |
||||
{ |
||||
#endif // CV_SSE2
|
||||
for( ; x <= width - 8; x += 8 ) |
||||
{ |
||||
typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x ); |
||||
typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 4); |
||||
r0 = op32(r0, VLoadStore128<T>::load(src2 + x )); |
||||
r1 = op32(r1, VLoadStore128<T>::load(src2 + x + 4)); |
||||
VLoadStore128<T>::store(dst + x , r0); |
||||
VLoadStore128<T>::store(dst + x + 4, r1); |
||||
} |
||||
#if CV_SSE2 |
||||
} |
||||
#endif // CV_SSE2
|
||||
#endif // CV_AVX2
|
||||
#endif // CV_NEON || CV_SSE2
|
||||
|
||||
#if CV_ENABLE_UNROLLED |
||||
for( ; x <= width - 4; x += 4 ) |
||||
{ |
||||
T v0 = op(src1[x], src2[x]); |
||||
T v1 = op(src1[x+1], src2[x+1]); |
||||
dst[x] = v0; dst[x+1] = v1; |
||||
v0 = op(src1[x+2], src2[x+2]); |
||||
v1 = op(src1[x+3], src2[x+3]); |
||||
dst[x+2] = v0; dst[x+3] = v1; |
||||
} |
||||
#endif |
||||
|
||||
for( ; x < width; x++ ) |
||||
dst[x] = op(src1[x], src2[x]); |
||||
} |
||||
} |
||||
|
||||
|
||||
template<typename T, class Op, class Op64> |
||||
void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2, |
||||
T* dst, size_t step, int width, int height) |
||||
{ |
||||
#if CV_SSE2 |
||||
Op64 op64; |
||||
#endif |
||||
Op op; |
||||
|
||||
for( ; height--; src1 = (const T *)((const uchar *)src1 + step1), |
||||
src2 = (const T *)((const uchar *)src2 + step2), |
||||
dst = (T *)((uchar *)dst + step) ) |
||||
{ |
||||
int x = 0; |
||||
|
||||
#if CV_AVX2 |
||||
if( USE_AVX2 ) |
||||
{ |
||||
if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) |
||||
{ |
||||
for( ; x <= width - 4; x += 4 ) |
||||
{ |
||||
typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x); |
||||
r0 = op64(r0, VLoadStore256Aligned<T>::load(src2 + x)); |
||||
VLoadStore256Aligned<T>::store(dst + x, r0); |
||||
} |
||||
} |
||||
} |
||||
#elif CV_SSE2 |
||||
if( USE_SSE2 ) |
||||
{ |
||||
if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) |
||||
{ |
||||
for( ; x <= width - 4; x += 4 ) |
||||
{ |
||||
typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x ); |
||||
typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 2); |
||||
r0 = op64(r0, VLoadStore128Aligned<T>::load(src2 + x )); |
||||
r1 = op64(r1, VLoadStore128Aligned<T>::load(src2 + x + 2)); |
||||
VLoadStore128Aligned<T>::store(dst + x , r0); |
||||
VLoadStore128Aligned<T>::store(dst + x + 2, r1); |
||||
} |
||||
} |
||||
} |
||||
#endif |
||||
|
||||
for( ; x <= width - 4; x += 4 ) |
||||
{ |
||||
T v0 = op(src1[x], src2[x]); |
||||
T v1 = op(src1[x+1], src2[x+1]); |
||||
dst[x] = v0; dst[x+1] = v1; |
||||
v0 = op(src1[x+2], src2[x+2]); |
||||
v1 = op(src1[x+3], src2[x+3]); |
||||
dst[x+2] = v0; dst[x+3] = v1; |
||||
} |
||||
|
||||
for( ; x < width; x++ ) |
||||
dst[x] = op(src1[x], src2[x]); |
||||
} |
||||
} |
||||
|
||||
template<typename T> static void |
||||
cmp_(const T* src1, size_t step1, const T* src2, size_t step2, |
||||
uchar* dst, size_t step, int width, int height, int code) |
||||
{ |
||||
step1 /= sizeof(src1[0]); |
||||
step2 /= sizeof(src2[0]); |
||||
if( code == CMP_GE || code == CMP_LT ) |
||||
{ |
||||
std::swap(src1, src2); |
||||
std::swap(step1, step2); |
||||
code = code == CMP_GE ? CMP_LE : CMP_GT; |
||||
} |
||||
|
||||
Cmp_SIMD<T> vop(code); |
||||
|
||||
if( code == CMP_GT || code == CMP_LE ) |
||||
{ |
||||
int m = code == CMP_GT ? 0 : 255; |
||||
for( ; height--; src1 += step1, src2 += step2, dst += step ) |
||||
{ |
||||
int x = vop(src1, src2, dst, width); |
||||
#if CV_ENABLE_UNROLLED |
||||
for( ; x <= width - 4; x += 4 ) |
||||
{ |
||||
int t0, t1; |
||||
t0 = -(src1[x] > src2[x]) ^ m; |
||||
t1 = -(src1[x+1] > src2[x+1]) ^ m; |
||||
dst[x] = (uchar)t0; dst[x+1] = (uchar)t1; |
||||
t0 = -(src1[x+2] > src2[x+2]) ^ m; |
||||
t1 = -(src1[x+3] > src2[x+3]) ^ m; |
||||
dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1; |
||||
} |
||||
#endif |
||||
for( ; x < width; x++ ) |
||||
dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); |
||||
} |
||||
} |
||||
else if( code == CMP_EQ || code == CMP_NE ) |
||||
{ |
||||
int m = code == CMP_EQ ? 0 : 255; |
||||
for( ; height--; src1 += step1, src2 += step2, dst += step ) |
||||
{ |
||||
int x = 0; |
||||
#if CV_ENABLE_UNROLLED |
||||
for( ; x <= width - 4; x += 4 ) |
||||
{ |
||||
int t0, t1; |
||||
t0 = -(src1[x] == src2[x]) ^ m; |
||||
t1 = -(src1[x+1] == src2[x+1]) ^ m; |
||||
dst[x] = (uchar)t0; dst[x+1] = (uchar)t1; |
||||
t0 = -(src1[x+2] == src2[x+2]) ^ m; |
||||
t1 = -(src1[x+3] == src2[x+3]) ^ m; |
||||
dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1; |
||||
} |
||||
#endif |
||||
for( ; x < width; x++ ) |
||||
dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); |
||||
} |
||||
} |
||||
} |
||||
|
||||
template<typename T, typename WT> static void |
||||
mul_( const T* src1, size_t step1, const T* src2, size_t step2, |
||||
T* dst, size_t step, int width, int height, WT scale ) |
||||
{ |
||||
step1 /= sizeof(src1[0]); |
||||
step2 /= sizeof(src2[0]); |
||||
step /= sizeof(dst[0]); |
||||
|
||||
Mul_SIMD<T, WT> vop; |
||||
|
||||
if( scale == (WT)1. ) |
||||
{ |
||||
for( ; height--; src1 += step1, src2 += step2, dst += step ) |
||||
{ |
||||
int i = vop(src1, src2, dst, width, scale); |
||||
#if CV_ENABLE_UNROLLED |
||||
for(; i <= width - 4; i += 4 ) |
||||
{ |
||||
T t0; |
||||
T t1; |
||||
t0 = saturate_cast<T>(src1[i ] * src2[i ]); |
||||
t1 = saturate_cast<T>(src1[i+1] * src2[i+1]); |
||||
dst[i ] = t0; |
||||
dst[i+1] = t1; |
||||
|
||||
t0 = saturate_cast<T>(src1[i+2] * src2[i+2]); |
||||
t1 = saturate_cast<T>(src1[i+3] * src2[i+3]); |
||||
dst[i+2] = t0; |
||||
dst[i+3] = t1; |
||||
} |
||||
#endif |
||||
for( ; i < width; i++ ) |
||||
dst[i] = saturate_cast<T>(src1[i] * src2[i]); |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
for( ; height--; src1 += step1, src2 += step2, dst += step ) |
||||
{ |
||||
int i = vop(src1, src2, dst, width, scale); |
||||
#if CV_ENABLE_UNROLLED |
||||
for(; i <= width - 4; i += 4 ) |
||||
{ |
||||
T t0 = saturate_cast<T>(scale*(WT)src1[i]*src2[i]); |
||||
T t1 = saturate_cast<T>(scale*(WT)src1[i+1]*src2[i+1]); |
||||
dst[i] = t0; dst[i+1] = t1; |
||||
|
||||
t0 = saturate_cast<T>(scale*(WT)src1[i+2]*src2[i+2]); |
||||
t1 = saturate_cast<T>(scale*(WT)src1[i+3]*src2[i+3]); |
||||
dst[i+2] = t0; dst[i+3] = t1; |
||||
} |
||||
#endif |
||||
for( ; i < width; i++ ) |
||||
dst[i] = saturate_cast<T>(scale*(WT)src1[i]*src2[i]); |
||||
} |
||||
} |
||||
} |
||||
|
||||
|
||||
template<typename T> static void |
||||
div_i( const T* src1, size_t step1, const T* src2, size_t step2, |
||||
T* dst, size_t step, int width, int height, double scale ) |
||||
{ |
||||
step1 /= sizeof(src1[0]); |
||||
step2 /= sizeof(src2[0]); |
||||
step /= sizeof(dst[0]); |
||||
|
||||
Div_SIMD<T> vop; |
||||
float scale_f = (float)scale; |
||||
|
||||
for( ; height--; src1 += step1, src2 += step2, dst += step ) |
||||
{ |
||||
int i = vop(src1, src2, dst, width, scale); |
||||
for( ; i < width; i++ ) |
||||
{ |
||||
T num = src1[i], denom = src2[i]; |
||||
dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0; |
||||
} |
||||
} |
||||
} |
||||
|
||||
template<typename T> static void |
||||
div_f( const T* src1, size_t step1, const T* src2, size_t step2, |
||||
T* dst, size_t step, int width, int height, double scale ) |
||||
{ |
||||
T scale_f = (T)scale; |
||||
step1 /= sizeof(src1[0]); |
||||
step2 /= sizeof(src2[0]); |
||||
step /= sizeof(dst[0]); |
||||
|
||||
Div_SIMD<T> vop; |
||||
|
||||
for( ; height--; src1 += step1, src2 += step2, dst += step ) |
||||
{ |
||||
int i = vop(src1, src2, dst, width, scale); |
||||
for( ; i < width; i++ ) |
||||
{ |
||||
T num = src1[i], denom = src2[i]; |
||||
dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0; |
||||
} |
||||
} |
||||
} |
||||
|
||||
template<typename T> static void |
||||
recip_i( const T*, size_t, const T* src2, size_t step2, |
||||
T* dst, size_t step, int width, int height, double scale ) |
||||
{ |
||||
step2 /= sizeof(src2[0]); |
||||
step /= sizeof(dst[0]); |
||||
|
||||
Recip_SIMD<T> vop; |
||||
float scale_f = (float)scale; |
||||
|
||||
for( ; height--; src2 += step2, dst += step ) |
||||
{ |
||||
int i = vop(src2, dst, width, scale); |
||||
for( ; i < width; i++ ) |
||||
{ |
||||
T denom = src2[i]; |
||||
dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0; |
||||
} |
||||
} |
||||
} |
||||
|
||||
template<typename T> static void |
||||
recip_f( const T*, size_t, const T* src2, size_t step2, |
||||
T* dst, size_t step, int width, int height, double scale ) |
||||
{ |
||||
T scale_f = (T)scale; |
||||
step2 /= sizeof(src2[0]); |
||||
step /= sizeof(dst[0]); |
||||
|
||||
Recip_SIMD<T> vop; |
||||
|
||||
for( ; height--; src2 += step2, dst += step ) |
||||
{ |
||||
int i = vop(src2, dst, width, scale); |
||||
for( ; i < width; i++ ) |
||||
{ |
||||
T denom = src2[i]; |
||||
dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0; |
||||
} |
||||
} |
||||
} |
||||
|
||||
template<typename T, typename WT> static void |
||||
addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2, |
||||
T* dst, size_t step, int width, int height, void* _scalars ) |
||||
{ |
||||
const double* scalars = (const double*)_scalars; |
||||
WT alpha = (WT)scalars[0], beta = (WT)scalars[1], gamma = (WT)scalars[2]; |
||||
step1 /= sizeof(src1[0]); |
||||
step2 /= sizeof(src2[0]); |
||||
step /= sizeof(dst[0]); |
||||
|
||||
AddWeighted_SIMD<T, WT> vop; |
||||
|
||||
for( ; height--; src1 += step1, src2 += step2, dst += step ) |
||||
{ |
||||
int x = vop(src1, src2, dst, width, alpha, beta, gamma); |
||||
#if CV_ENABLE_UNROLLED |
||||
for( ; x <= width - 4; x += 4 ) |
||||
{ |
||||
T t0 = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma); |
||||
T t1 = saturate_cast<T>(src1[x+1]*alpha + src2[x+1]*beta + gamma); |
||||
dst[x] = t0; dst[x+1] = t1; |
||||
|
||||
t0 = saturate_cast<T>(src1[x+2]*alpha + src2[x+2]*beta + gamma); |
||||
t1 = saturate_cast<T>(src1[x+3]*alpha + src2[x+3]*beta + gamma); |
||||
dst[x+2] = t0; dst[x+3] = t1; |
||||
} |
||||
#endif |
||||
for( ; x < width; x++ ) |
||||
dst[x] = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma); |
||||
} |
||||
} |
||||
|
||||
}} // cv::hal::
|
||||
|
||||
|
||||
#endif // __OPENCV_HAL_ARITHM_CORE_HPP__
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,221 @@ |
||||
#include "precomp.hpp" |
||||
|
||||
#if defined WIN32 || defined _WIN32 || defined WINCE |
||||
#include <windows.h> |
||||
#if defined _MSC_VER |
||||
#if _MSC_VER >= 1400 |
||||
#include <intrin.h> |
||||
#elif defined _M_IX86 |
||||
static void __cpuid(int* cpuid_data, int) |
||||
{ |
||||
__asm |
||||
{ |
||||
push ebx |
||||
push edi |
||||
mov edi, cpuid_data |
||||
mov eax, 1 |
||||
cpuid |
||||
mov [edi], eax |
||||
mov [edi + 4], ebx |
||||
mov [edi + 8], ecx |
||||
mov [edi + 12], edx |
||||
pop edi |
||||
pop ebx |
||||
} |
||||
} |
||||
static void __cpuidex(int* cpuid_data, int, int) |
||||
{ |
||||
__asm |
||||
{ |
||||
push edi |
||||
mov edi, cpuid_data |
||||
mov eax, 7 |
||||
mov ecx, 0 |
||||
cpuid |
||||
mov [edi], eax |
||||
mov [edi + 4], ebx |
||||
mov [edi + 8], ecx |
||||
mov [edi + 12], edx |
||||
pop edi |
||||
} |
||||
} |
||||
#endif |
||||
#endif |
||||
#endif |
||||
|
||||
#if defined ANDROID || defined __linux__ |
||||
# include <unistd.h> |
||||
# include <fcntl.h> |
||||
# include <elf.h> |
||||
# include <linux/auxvec.h> |
||||
#endif |
||||
|
||||
#if defined __linux__ || defined __APPLE__ || defined __EMSCRIPTEN__ |
||||
#include <unistd.h> |
||||
#include <stdio.h> |
||||
#include <sys/types.h> |
||||
#if defined ANDROID |
||||
#include <sys/sysconf.h> |
||||
#endif |
||||
#endif |
||||
|
||||
#ifdef ANDROID |
||||
# include <android/log.h> |
||||
#endif |
||||
|
||||
struct HWFeatures |
||||
{ |
||||
enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE }; |
||||
|
||||
HWFeatures(void) |
||||
{ |
||||
memset( have, 0, sizeof(have) ); |
||||
x86_family = 0; |
||||
} |
||||
|
||||
static HWFeatures initialize(void) |
||||
{ |
||||
HWFeatures f; |
||||
int cpuid_data[4] = { 0, 0, 0, 0 }; |
||||
|
||||
#if defined _MSC_VER && (defined _M_IX86 || defined _M_X64) |
||||
__cpuid(cpuid_data, 1); |
||||
#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__) |
||||
#ifdef __x86_64__ |
||||
asm __volatile__ |
||||
( |
||||
"movl $1, %%eax\n\t" |
||||
"cpuid\n\t" |
||||
:[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3]) |
||||
: |
||||
: "cc" |
||||
); |
||||
#else |
||||
asm volatile |
||||
( |
||||
"pushl %%ebx\n\t" |
||||
"movl $1,%%eax\n\t" |
||||
"cpuid\n\t" |
||||
"popl %%ebx\n\t" |
||||
: "=a"(cpuid_data[0]), "=c"(cpuid_data[2]), "=d"(cpuid_data[3]) |
||||
: |
||||
: "cc" |
||||
); |
||||
#endif |
||||
#endif |
||||
|
||||
f.x86_family = (cpuid_data[0] >> 8) & 15; |
||||
if( f.x86_family >= 6 ) |
||||
{ |
||||
f.have[CV_CPU_MMX] = (cpuid_data[3] & (1 << 23)) != 0; |
||||
f.have[CV_CPU_SSE] = (cpuid_data[3] & (1<<25)) != 0; |
||||
f.have[CV_CPU_SSE2] = (cpuid_data[3] & (1<<26)) != 0; |
||||
f.have[CV_CPU_SSE3] = (cpuid_data[2] & (1<<0)) != 0; |
||||
f.have[CV_CPU_SSSE3] = (cpuid_data[2] & (1<<9)) != 0; |
||||
f.have[CV_CPU_FMA3] = (cpuid_data[2] & (1<<12)) != 0; |
||||
f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0; |
||||
f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0; |
||||
f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0; |
||||
f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX
|
||||
|
||||
// make the second call to the cpuid command in order to get
|
||||
// information about extended features like AVX2
|
||||
#if defined _MSC_VER && (defined _M_IX86 || defined _M_X64) |
||||
__cpuidex(cpuid_data, 7, 0); |
||||
#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__) |
||||
#ifdef __x86_64__ |
||||
asm __volatile__ |
||||
( |
||||
"movl $7, %%eax\n\t" |
||||
"movl $0, %%ecx\n\t" |
||||
"cpuid\n\t" |
||||
:[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3]) |
||||
: |
||||
: "cc" |
||||
); |
||||
#else |
||||
asm volatile |
||||
( |
||||
"pushl %%ebx\n\t" |
||||
"movl $7,%%eax\n\t" |
||||
"movl $0,%%ecx\n\t" |
||||
"cpuid\n\t" |
||||
"movl %%ebx, %0\n\t" |
||||
"popl %%ebx\n\t" |
||||
: "=r"(cpuid_data[1]), "=c"(cpuid_data[2]) |
||||
: |
||||
: "cc" |
||||
); |
||||
#endif |
||||
#endif |
||||
f.have[CV_CPU_AVX2] = (cpuid_data[1] & (1<<5)) != 0; |
||||
|
||||
f.have[CV_CPU_AVX_512F] = (cpuid_data[1] & (1<<16)) != 0; |
||||
f.have[CV_CPU_AVX_512DQ] = (cpuid_data[1] & (1<<17)) != 0; |
||||
f.have[CV_CPU_AVX_512IFMA512] = (cpuid_data[1] & (1<<21)) != 0; |
||||
f.have[CV_CPU_AVX_512PF] = (cpuid_data[1] & (1<<26)) != 0; |
||||
f.have[CV_CPU_AVX_512ER] = (cpuid_data[1] & (1<<27)) != 0; |
||||
f.have[CV_CPU_AVX_512CD] = (cpuid_data[1] & (1<<28)) != 0; |
||||
f.have[CV_CPU_AVX_512BW] = (cpuid_data[1] & (1<<30)) != 0; |
||||
f.have[CV_CPU_AVX_512VL] = (cpuid_data[1] & (1<<31)) != 0; |
||||
f.have[CV_CPU_AVX_512VBMI] = (cpuid_data[2] & (1<<1)) != 0; |
||||
} |
||||
|
||||
#if defined ANDROID || defined __linux__ |
||||
#ifdef __aarch64__ |
||||
f.have[CV_CPU_NEON] = true; |
||||
#else |
||||
int cpufile = open("/proc/self/auxv", O_RDONLY); |
||||
|
||||
if (cpufile >= 0) |
||||
{ |
||||
Elf32_auxv_t auxv; |
||||
const size_t size_auxv_t = sizeof(auxv); |
||||
|
||||
while ((size_t)read(cpufile, &auxv, size_auxv_t) == size_auxv_t) |
||||
{ |
||||
if (auxv.a_type == AT_HWCAP) |
||||
{ |
||||
f.have[CV_CPU_NEON] = (auxv.a_un.a_val & 4096) != 0; |
||||
break; |
||||
} |
||||
} |
||||
|
||||
close(cpufile); |
||||
} |
||||
#endif |
||||
#elif (defined __clang__ || defined __APPLE__) && (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__)) |
||||
f.have[CV_CPU_NEON] = true; |
||||
#endif |
||||
|
||||
return f; |
||||
} |
||||
|
||||
int x86_family; |
||||
bool have[MAX_FEATURE+1]; |
||||
}; |
||||
|
||||
static HWFeatures featuresEnabled = HWFeatures::initialize(), featuresDisabled = HWFeatures(); |
||||
static HWFeatures* currentFeatures = &featuresEnabled; |
||||
volatile bool useOptimizedFlag = true; |
||||
|
||||
namespace cv { namespace hal { |
||||
|
||||
bool checkHardwareSupport(int feature) |
||||
{ |
||||
// CV_DbgAssert( 0 <= feature && feature <= CV_HARDWARE_MAX_FEATURE );
|
||||
return currentFeatures->have[feature]; |
||||
} |
||||
|
||||
void setUseOptimized( bool flag ) |
||||
{ |
||||
useOptimizedFlag = flag; |
||||
currentFeatures = flag ? &featuresEnabled : &featuresDisabled; |
||||
} |
||||
|
||||
bool useOptimized(void) |
||||
{ |
||||
return useOptimizedFlag; |
||||
} |
||||
|
||||
}} |
@ -0,0 +1,408 @@ |
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include "precomp.hpp" |
||||
|
||||
namespace cv { namespace hal { |
||||
|
||||
#if CV_NEON |
||||
template<typename T> struct VMerge2; |
||||
template<typename T> struct VMerge3; |
||||
template<typename T> struct VMerge4; |
||||
|
||||
#define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ |
||||
template<> \
|
||||
struct name<data_type>{ \
|
||||
void operator()(const data_type* src0, const data_type* src1, \
|
||||
data_type* dst){ \
|
||||
reg_type r; \
|
||||
r.val[0] = load_func(src0); \
|
||||
r.val[1] = load_func(src1); \
|
||||
store_func(dst, r); \
|
||||
} \
|
||||
} |
||||
|
||||
#define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ |
||||
template<> \
|
||||
struct name<data_type>{ \
|
||||
void operator()(const data_type* src0, const data_type* src1, \
|
||||
const data_type* src2, data_type* dst){ \
|
||||
reg_type r; \
|
||||
r.val[0] = load_func(src0); \
|
||||
r.val[1] = load_func(src1); \
|
||||
r.val[2] = load_func(src2); \
|
||||
store_func(dst, r); \
|
||||
} \
|
||||
} |
||||
|
||||
#define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ |
||||
template<> \
|
||||
struct name<data_type>{ \
|
||||
void operator()(const data_type* src0, const data_type* src1, \
|
||||
const data_type* src2, const data_type* src3, \
|
||||
data_type* dst){ \
|
||||
reg_type r; \
|
||||
r.val[0] = load_func(src0); \
|
||||
r.val[1] = load_func(src1); \
|
||||
r.val[2] = load_func(src2); \
|
||||
r.val[3] = load_func(src3); \
|
||||
store_func(dst, r); \
|
||||
} \
|
||||
} |
||||
|
||||
MERGE2_KERNEL_TEMPLATE(VMerge2, uchar , uint8x16x2_t, vld1q_u8 , vst2q_u8 ); |
||||
MERGE2_KERNEL_TEMPLATE(VMerge2, ushort, uint16x8x2_t, vld1q_u16, vst2q_u16); |
||||
MERGE2_KERNEL_TEMPLATE(VMerge2, int , int32x4x2_t, vld1q_s32, vst2q_s32); |
||||
MERGE2_KERNEL_TEMPLATE(VMerge2, int64 , int64x1x2_t, vld1_s64 , vst2_s64 ); |
||||
|
||||
MERGE3_KERNEL_TEMPLATE(VMerge3, uchar , uint8x16x3_t, vld1q_u8 , vst3q_u8 ); |
||||
MERGE3_KERNEL_TEMPLATE(VMerge3, ushort, uint16x8x3_t, vld1q_u16, vst3q_u16); |
||||
MERGE3_KERNEL_TEMPLATE(VMerge3, int , int32x4x3_t, vld1q_s32, vst3q_s32); |
||||
MERGE3_KERNEL_TEMPLATE(VMerge3, int64 , int64x1x3_t, vld1_s64 , vst3_s64 ); |
||||
|
||||
MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 ); |
||||
MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16); |
||||
MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, vst4q_s32); |
||||
MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 ); |
||||
|
||||
#elif CV_SSE2 |
||||
|
||||
template <typename T> |
||||
struct VMerge2 |
||||
{ |
||||
VMerge2() : support(false) { } |
||||
void operator()(const T *, const T *, T *) const { } |
||||
|
||||
bool support; |
||||
}; |
||||
|
||||
template <typename T> |
||||
struct VMerge3 |
||||
{ |
||||
VMerge3() : support(false) { } |
||||
void operator()(const T *, const T *, const T *, T *) const { } |
||||
|
||||
bool support; |
||||
}; |
||||
|
||||
template <typename T> |
||||
struct VMerge4 |
||||
{ |
||||
VMerge4() : support(false) { } |
||||
void operator()(const T *, const T *, const T *, const T *, T *) const { } |
||||
|
||||
bool support; |
||||
}; |
||||
|
||||
#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ |
||||
template <> \
|
||||
struct VMerge2<data_type> \
|
||||
{ \
|
||||
enum \ |
||||
{ \
|
||||
ELEMS_IN_VEC = 16 / sizeof(data_type) \
|
||||
}; \
|
||||
\
|
||||
VMerge2() \
|
||||
{ \
|
||||
support = checkHardwareSupport(se); \
|
||||
} \
|
||||
\
|
||||
void operator()(const data_type * src0, const data_type * src1, \
|
||||
data_type * dst) const \
|
||||
{ \
|
||||
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
|
||||
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
|
||||
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
|
||||
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
|
||||
\
|
||||
_mm_interleave(v_src0, v_src1, v_src2, v_src3); \
|
||||
\
|
||||
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
|
||||
} \
|
||||
\
|
||||
bool support; \
|
||||
} |
||||
|
||||
#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ |
||||
template <> \
|
||||
struct VMerge3<data_type> \
|
||||
{ \
|
||||
enum \ |
||||
{ \
|
||||
ELEMS_IN_VEC = 16 / sizeof(data_type) \
|
||||
}; \
|
||||
\
|
||||
VMerge3() \
|
||||
{ \
|
||||
support = checkHardwareSupport(se); \
|
||||
} \
|
||||
\
|
||||
void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\
|
||||
data_type * dst) const \
|
||||
{ \
|
||||
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
|
||||
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
|
||||
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
|
||||
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
|
||||
reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \
|
||||
reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \
|
||||
\
|
||||
_mm_interleave(v_src0, v_src1, v_src2, \
|
||||
v_src3, v_src4, v_src5); \
|
||||
\
|
||||
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \
|
||||
} \
|
||||
\
|
||||
bool support; \
|
||||
} |
||||
|
||||
#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ |
||||
template <> \
|
||||
struct VMerge4<data_type> \
|
||||
{ \
|
||||
enum \ |
||||
{ \
|
||||
ELEMS_IN_VEC = 16 / sizeof(data_type) \
|
||||
}; \
|
||||
\
|
||||
VMerge4() \
|
||||
{ \
|
||||
support = checkHardwareSupport(se); \
|
||||
} \
|
||||
\
|
||||
void operator()(const data_type * src0, const data_type * src1, \
|
||||
const data_type * src2, const data_type * src3, \
|
||||
data_type * dst) const \
|
||||
{ \
|
||||
reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \
|
||||
reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \
|
||||
reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \
|
||||
reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \
|
||||
reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \
|
||||
reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \
|
||||
reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3)); \
|
||||
reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC)); \
|
||||
\
|
||||
_mm_interleave(v_src0, v_src1, v_src2, v_src3, \
|
||||
v_src4, v_src5, v_src6, v_src7); \
|
||||
\
|
||||
_mm_storeu_##flavor((cast_type *)(dst), v_src0); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7); \
|
||||
} \
|
||||
\
|
||||
bool support; \
|
||||
} |
||||
|
||||
MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); |
||||
MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); |
||||
MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); |
||||
|
||||
#if CV_SSE4_1 |
||||
MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); |
||||
MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); |
||||
MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); |
||||
#endif |
||||
|
||||
MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); |
||||
MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); |
||||
MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); |
||||
|
||||
#endif |
||||
|
||||
template<typename T> static void |
||||
merge_( const T** src, T* dst, int len, int cn ) |
||||
{ |
||||
int k = cn % 4 ? cn % 4 : 4; |
||||
int i, j; |
||||
if( k == 1 ) |
||||
{ |
||||
const T* src0 = src[0]; |
||||
for( i = j = 0; i < len; i++, j += cn ) |
||||
dst[j] = src0[i]; |
||||
} |
||||
else if( k == 2 ) |
||||
{ |
||||
const T *src0 = src[0], *src1 = src[1]; |
||||
i = j = 0; |
||||
#if CV_NEON |
||||
if(cn == 2) |
||||
{ |
||||
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); |
||||
int inc_j = 2 * inc_i; |
||||
|
||||
VMerge2<T> vmerge; |
||||
for( ; i < len - inc_i; i += inc_i, j += inc_j) |
||||
vmerge(src0 + i, src1 + i, dst + j); |
||||
} |
||||
#elif CV_SSE2 |
||||
if(cn == 2) |
||||
{ |
||||
int inc_i = 32/sizeof(T); |
||||
int inc_j = 2 * inc_i; |
||||
|
||||
VMerge2<T> vmerge; |
||||
if (vmerge.support) |
||||
for( ; i < len - inc_i; i += inc_i, j += inc_j) |
||||
vmerge(src0 + i, src1 + i, dst + j); |
||||
} |
||||
#endif |
||||
for( ; i < len; i++, j += cn ) |
||||
{ |
||||
dst[j] = src0[i]; |
||||
dst[j+1] = src1[i]; |
||||
} |
||||
} |
||||
else if( k == 3 ) |
||||
{ |
||||
const T *src0 = src[0], *src1 = src[1], *src2 = src[2]; |
||||
i = j = 0; |
||||
#if CV_NEON |
||||
if(cn == 3) |
||||
{ |
||||
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); |
||||
int inc_j = 3 * inc_i; |
||||
|
||||
VMerge3<T> vmerge; |
||||
for( ; i < len - inc_i; i += inc_i, j += inc_j) |
||||
vmerge(src0 + i, src1 + i, src2 + i, dst + j); |
||||
} |
||||
#elif CV_SSE2 |
||||
if(cn == 3) |
||||
{ |
||||
int inc_i = 32/sizeof(T); |
||||
int inc_j = 3 * inc_i; |
||||
|
||||
VMerge3<T> vmerge; |
||||
if (vmerge.support) |
||||
for( ; i < len - inc_i; i += inc_i, j += inc_j) |
||||
vmerge(src0 + i, src1 + i, src2 + i, dst + j); |
||||
} |
||||
#endif |
||||
for( ; i < len; i++, j += cn ) |
||||
{ |
||||
dst[j] = src0[i]; |
||||
dst[j+1] = src1[i]; |
||||
dst[j+2] = src2[i]; |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3]; |
||||
i = j = 0; |
||||
#if CV_NEON |
||||
if(cn == 4) |
||||
{ |
||||
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); |
||||
int inc_j = 4 * inc_i; |
||||
|
||||
VMerge4<T> vmerge; |
||||
for( ; i < len - inc_i; i += inc_i, j += inc_j) |
||||
vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j); |
||||
} |
||||
#elif CV_SSE2 |
||||
if(cn == 4) |
||||
{ |
||||
int inc_i = 32/sizeof(T); |
||||
int inc_j = 4 * inc_i; |
||||
|
||||
VMerge4<T> vmerge; |
||||
if (vmerge.support) |
||||
for( ; i < len - inc_i; i += inc_i, j += inc_j) |
||||
vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j); |
||||
} |
||||
#endif |
||||
for( ; i < len; i++, j += cn ) |
||||
{ |
||||
dst[j] = src0[i]; dst[j+1] = src1[i]; |
||||
dst[j+2] = src2[i]; dst[j+3] = src3[i]; |
||||
} |
||||
} |
||||
|
||||
for( ; k < cn; k += 4 ) |
||||
{ |
||||
const T *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3]; |
||||
for( i = 0, j = k; i < len; i++, j += cn ) |
||||
{ |
||||
dst[j] = src0[i]; dst[j+1] = src1[i]; |
||||
dst[j+2] = src2[i]; dst[j+3] = src3[i]; |
||||
} |
||||
} |
||||
} |
||||
|
||||
|
||||
void merge8u(const uchar** src, uchar* dst, int len, int cn ) |
||||
{ |
||||
merge_(src, dst, len, cn); |
||||
} |
||||
|
||||
void merge16u(const ushort** src, ushort* dst, int len, int cn ) |
||||
{ |
||||
merge_(src, dst, len, cn); |
||||
} |
||||
|
||||
void merge32s(const int** src, int* dst, int len, int cn ) |
||||
{ |
||||
merge_(src, dst, len, cn); |
||||
} |
||||
|
||||
void merge64s(const int64** src, int64* dst, int len, int cn ) |
||||
{ |
||||
merge_(src, dst, len, cn); |
||||
} |
||||
|
||||
}} |
@ -0,0 +1,208 @@ |
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Copyright (C) 2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_HAL_REPLACEMENT_HPP__ |
||||
#define __OPENCV_HAL_REPLACEMENT_HPP__ |
||||
|
||||
#include "opencv2/hal.hpp" |
||||
|
||||
inline int hal_t_add8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_add8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_add16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_add16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_add32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_add32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_add64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_sub8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_sub8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_sub16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_sub16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_sub32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_sub32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_sub64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_max8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_max8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_max16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_max16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_max32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_max32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_max64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_min8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_min8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_min16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_min16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_min32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_min32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_min64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_absdiff8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_absdiff8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_absdiff16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_absdiff16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_absdiff32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_absdiff32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_absdiff64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_and8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_or8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_xor8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_not8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } |
||||
|
||||
#define hal_add8u hal_t_add8u |
||||
#define hal_add8s hal_t_add8s |
||||
#define hal_add16u hal_t_add16u |
||||
#define hal_add16s hal_t_add16s |
||||
#define hal_add32s hal_t_add32s |
||||
#define hal_add32f hal_t_add32f |
||||
#define hal_add64f hal_t_add64f |
||||
#define hal_sub8u hal_t_sub8u |
||||
#define hal_sub8s hal_t_sub8s |
||||
#define hal_sub16u hal_t_sub16u |
||||
#define hal_sub16s hal_t_sub16s |
||||
#define hal_sub32s hal_t_sub32s |
||||
#define hal_sub32f hal_t_sub32f |
||||
#define hal_sub64f hal_t_sub64f |
||||
#define hal_max8u hal_t_max8u |
||||
#define hal_max8s hal_t_max8s |
||||
#define hal_max16u hal_t_max16u |
||||
#define hal_max16s hal_t_max16s |
||||
#define hal_max32s hal_t_max32s |
||||
#define hal_max32f hal_t_max32f |
||||
#define hal_max64f hal_t_max64f |
||||
#define hal_min8u hal_t_min8u |
||||
#define hal_min8s hal_t_min8s |
||||
#define hal_min16u hal_t_min16u |
||||
#define hal_min16s hal_t_min16s |
||||
#define hal_min32s hal_t_min32s |
||||
#define hal_min32f hal_t_min32f |
||||
#define hal_min64f hal_t_min64f |
||||
#define hal_absdiff8u hal_t_absdiff8u |
||||
#define hal_absdiff8s hal_t_absdiff8s |
||||
#define hal_absdiff16u hal_t_absdiff16u |
||||
#define hal_absdiff16s hal_t_absdiff16s |
||||
#define hal_absdiff32s hal_t_absdiff32s |
||||
#define hal_absdiff32f hal_t_absdiff32f |
||||
#define hal_absdiff64f hal_t_absdiff64f |
||||
#define hal_and8u hal_t_and8u |
||||
#define hal_or8u hal_t_or8u |
||||
#define hal_xor8u hal_t_xor8u |
||||
#define hal_not8u hal_t_not8u |
||||
|
||||
inline int hal_t_cmp8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_cmp8s(const schar*, size_t, const schar*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_cmp16u(const ushort*, size_t, const ushort*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_cmp16s(const short*, size_t, const short*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_cmp32s(const int*, size_t, const int*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_cmp32f(const float*, size_t, const float*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_cmp64f(const double*, size_t, const double*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } |
||||
|
||||
#define hal_cmp8u hal_t_cmp8u |
||||
#define hal_cmp8s hal_t_cmp8s |
||||
#define hal_cmp16u hal_t_cmp16u |
||||
#define hal_cmp16s hal_t_cmp16s |
||||
#define hal_cmp32s hal_t_cmp32s |
||||
#define hal_cmp32f hal_t_cmp32f |
||||
#define hal_cmp64f hal_t_cmp64f |
||||
|
||||
inline int hal_t_mul8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_mul8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_mul16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_mul16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_mul32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_mul32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_mul64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_div8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_div8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_div16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_div16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_div32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_div32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_div64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_recip8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_recip8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_recip16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_recip16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_recip32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_recip32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_recip64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } |
||||
|
||||
#define hal_mul8u hal_t_mul8u |
||||
#define hal_mul8s hal_t_mul8s |
||||
#define hal_mul16u hal_t_mul16u |
||||
#define hal_mul16s hal_t_mul16s |
||||
#define hal_mul32s hal_t_mul32s |
||||
#define hal_mul32f hal_t_mul32f |
||||
#define hal_mul64f hal_t_mul64f |
||||
#define hal_div8u hal_t_div8u |
||||
#define hal_div8s hal_t_div8s |
||||
#define hal_div16u hal_t_div16u |
||||
#define hal_div16s hal_t_div16s |
||||
#define hal_div32s hal_t_div32s |
||||
#define hal_div32f hal_t_div32f |
||||
#define hal_div64f hal_t_div64f |
||||
#define hal_recip8u hal_t_recip8u |
||||
#define hal_recip8s hal_t_recip8s |
||||
#define hal_recip16u hal_t_recip16u |
||||
#define hal_recip16s hal_t_recip16s |
||||
#define hal_recip32s hal_t_recip32s |
||||
#define hal_recip32f hal_t_recip32f |
||||
#define hal_recip64f hal_t_recip64f |
||||
|
||||
inline int hal_t_addWeighted8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_addWeighted8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_addWeighted16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_addWeighted16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_addWeighted32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_addWeighted32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } |
||||
inline int hal_t_addWeighted64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } |
||||
|
||||
#define hal_addWeighted8u hal_t_addWeighted8u |
||||
#define hal_addWeighted8s hal_t_addWeighted8s |
||||
#define hal_addWeighted16u hal_t_addWeighted16u |
||||
#define hal_addWeighted16s hal_t_addWeighted16s |
||||
#define hal_addWeighted32s hal_t_addWeighted32s |
||||
#define hal_addWeighted32f hal_t_addWeighted32f |
||||
#define hal_addWeighted64f hal_t_addWeighted64f |
||||
|
||||
#include "custom_hal.hpp" |
||||
|
||||
#endif |
@ -0,0 +1,424 @@ |
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include "precomp.hpp" |
||||
|
||||
namespace cv { namespace hal { |
||||
|
||||
#if CV_NEON |
||||
template<typename T> struct VSplit2; |
||||
template<typename T> struct VSplit3; |
||||
template<typename T> struct VSplit4; |
||||
|
||||
#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ |
||||
template<> \
|
||||
struct name<data_type> \
|
||||
{ \
|
||||
void operator()(const data_type* src, data_type* dst0, \
|
||||
data_type* dst1) const \
|
||||
{ \
|
||||
reg_type r = load_func(src); \
|
||||
store_func(dst0, r.val[0]); \
|
||||
store_func(dst1, r.val[1]); \
|
||||
} \
|
||||
} |
||||
|
||||
#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ |
||||
template<> \
|
||||
struct name<data_type> \
|
||||
{ \
|
||||
void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
|
||||
data_type* dst2) const \
|
||||
{ \
|
||||
reg_type r = load_func(src); \
|
||||
store_func(dst0, r.val[0]); \
|
||||
store_func(dst1, r.val[1]); \
|
||||
store_func(dst2, r.val[2]); \
|
||||
} \
|
||||
} |
||||
|
||||
#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ |
||||
template<> \
|
||||
struct name<data_type> \
|
||||
{ \
|
||||
void operator()(const data_type* src, data_type* dst0, data_type* dst1, \
|
||||
data_type* dst2, data_type* dst3) const \
|
||||
{ \
|
||||
reg_type r = load_func(src); \
|
||||
store_func(dst0, r.val[0]); \
|
||||
store_func(dst1, r.val[1]); \
|
||||
store_func(dst2, r.val[2]); \
|
||||
store_func(dst3, r.val[3]); \
|
||||
} \
|
||||
} |
||||
|
||||
SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar , uint8x16x2_t, vld2q_u8 , vst1q_u8 ); |
||||
SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort, uint16x8x2_t, vld2q_u16, vst1q_u16); |
||||
SPLIT2_KERNEL_TEMPLATE(VSplit2, int , int32x4x2_t, vld2q_s32, vst1q_s32); |
||||
SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 , int64x1x2_t, vld2_s64 , vst1_s64 ); |
||||
|
||||
SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar , uint8x16x3_t, vld3q_u8 , vst1q_u8 ); |
||||
SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort, uint16x8x3_t, vld3q_u16, vst1q_u16); |
||||
SPLIT3_KERNEL_TEMPLATE(VSplit3, int , int32x4x3_t, vld3q_s32, vst1q_s32); |
||||
SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 , int64x1x3_t, vld3_s64 , vst1_s64 ); |
||||
|
||||
SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar , uint8x16x4_t, vld4q_u8 , vst1q_u8 ); |
||||
SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort, uint16x8x4_t, vld4q_u16, vst1q_u16); |
||||
SPLIT4_KERNEL_TEMPLATE(VSplit4, int , int32x4x4_t, vld4q_s32, vst1q_s32); |
||||
SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 , int64x1x4_t, vld4_s64 , vst1_s64 ); |
||||
|
||||
#elif CV_SSE2 |
||||
|
||||
template <typename T> |
||||
struct VSplit2 |
||||
{ |
||||
VSplit2() : support(false) { } |
||||
void operator()(const T *, T *, T *) const { } |
||||
|
||||
bool support; |
||||
}; |
||||
|
||||
template <typename T> |
||||
struct VSplit3 |
||||
{ |
||||
VSplit3() : support(false) { } |
||||
void operator()(const T *, T *, T *, T *) const { } |
||||
|
||||
bool support; |
||||
}; |
||||
|
||||
template <typename T> |
||||
struct VSplit4 |
||||
{ |
||||
VSplit4() : support(false) { } |
||||
void operator()(const T *, T *, T *, T *, T *) const { } |
||||
|
||||
bool support; |
||||
}; |
||||
|
||||
#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ |
||||
template <> \
|
||||
struct VSplit2<data_type> \
|
||||
{ \
|
||||
enum \ |
||||
{ \
|
||||
ELEMS_IN_VEC = 16 / sizeof(data_type) \
|
||||
}; \
|
||||
\
|
||||
VSplit2() \
|
||||
{ \
|
||||
support = checkHardwareSupport(CV_CPU_SSE2); \
|
||||
} \
|
||||
\
|
||||
void operator()(const data_type * src, \
|
||||
data_type * dst0, data_type * dst1) const \
|
||||
{ \
|
||||
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
|
||||
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
|
||||
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
|
||||
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
|
||||
\
|
||||
_mm_deinterleave(v_src0, v_src1, v_src2, v_src3); \
|
||||
\
|
||||
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
|
||||
} \
|
||||
\
|
||||
bool support; \
|
||||
} |
||||
|
||||
#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ |
||||
template <> \
|
||||
struct VSplit3<data_type> \
|
||||
{ \
|
||||
enum \ |
||||
{ \
|
||||
ELEMS_IN_VEC = 16 / sizeof(data_type) \
|
||||
}; \
|
||||
\
|
||||
VSplit3() \
|
||||
{ \
|
||||
support = checkHardwareSupport(CV_CPU_SSE2); \
|
||||
} \
|
||||
\
|
||||
void operator()(const data_type * src, \
|
||||
data_type * dst0, data_type * dst1, data_type * dst2) const \
|
||||
{ \
|
||||
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
|
||||
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
|
||||
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
|
||||
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
|
||||
reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
|
||||
reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
|
||||
\
|
||||
_mm_deinterleave(v_src0, v_src1, v_src2, \
|
||||
v_src3, v_src4, v_src5); \
|
||||
\
|
||||
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
|
||||
} \
|
||||
\
|
||||
bool support; \
|
||||
} |
||||
|
||||
#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ |
||||
template <> \
|
||||
struct VSplit4<data_type> \
|
||||
{ \
|
||||
enum \ |
||||
{ \
|
||||
ELEMS_IN_VEC = 16 / sizeof(data_type) \
|
||||
}; \
|
||||
\
|
||||
VSplit4() \
|
||||
{ \
|
||||
support = checkHardwareSupport(CV_CPU_SSE2); \
|
||||
} \
|
||||
\
|
||||
void operator()(const data_type * src, data_type * dst0, data_type * dst1, \
|
||||
data_type * dst2, data_type * dst3) const \
|
||||
{ \
|
||||
reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \
|
||||
reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \
|
||||
reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
|
||||
reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
|
||||
reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
|
||||
reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
|
||||
reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \
|
||||
reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \
|
||||
\
|
||||
_mm_deinterleave(v_src0, v_src1, v_src2, v_src3, \
|
||||
v_src4, v_src5, v_src6, v_src7); \
|
||||
\
|
||||
_mm_storeu_##flavor((cast_type *)(dst0), v_src0); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst2), v_src4); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst3), v_src6); \
|
||||
_mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7); \
|
||||
} \
|
||||
\
|
||||
bool support; \
|
||||
} |
||||
|
||||
SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); |
||||
SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); |
||||
SPLIT2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); |
||||
|
||||
SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); |
||||
SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); |
||||
SPLIT3_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); |
||||
|
||||
SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); |
||||
SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); |
||||
SPLIT4_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); |
||||
|
||||
#endif |
||||
|
||||
template<typename T> static void |
||||
split_( const T* src, T** dst, int len, int cn ) |
||||
{ |
||||
int k = cn % 4 ? cn % 4 : 4; |
||||
int i, j; |
||||
if( k == 1 ) |
||||
{ |
||||
T* dst0 = dst[0]; |
||||
|
||||
if(cn == 1) |
||||
{ |
||||
memcpy(dst0, src, len * sizeof(T)); |
||||
} |
||||
else |
||||
{ |
||||
for( i = 0, j = 0 ; i < len; i++, j += cn ) |
||||
dst0[i] = src[j]; |
||||
} |
||||
} |
||||
else if( k == 2 ) |
||||
{ |
||||
T *dst0 = dst[0], *dst1 = dst[1]; |
||||
i = j = 0; |
||||
|
||||
#if CV_NEON |
||||
if(cn == 2) |
||||
{ |
||||
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); |
||||
int inc_j = 2 * inc_i; |
||||
|
||||
VSplit2<T> vsplit; |
||||
for( ; i < len - inc_i; i += inc_i, j += inc_j) |
||||
vsplit(src + j, dst0 + i, dst1 + i); |
||||
} |
||||
#elif CV_SSE2 |
||||
if (cn == 2) |
||||
{ |
||||
int inc_i = 32/sizeof(T); |
||||
int inc_j = 2 * inc_i; |
||||
|
||||
VSplit2<T> vsplit; |
||||
if (vsplit.support) |
||||
{ |
||||
for( ; i <= len - inc_i; i += inc_i, j += inc_j) |
||||
vsplit(src + j, dst0 + i, dst1 + i); |
||||
} |
||||
} |
||||
#endif |
||||
for( ; i < len; i++, j += cn ) |
||||
{ |
||||
dst0[i] = src[j]; |
||||
dst1[i] = src[j+1]; |
||||
} |
||||
} |
||||
else if( k == 3 ) |
||||
{ |
||||
T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2]; |
||||
i = j = 0; |
||||
|
||||
#if CV_NEON |
||||
if(cn == 3) |
||||
{ |
||||
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); |
||||
int inc_j = 3 * inc_i; |
||||
|
||||
VSplit3<T> vsplit; |
||||
for( ; i <= len - inc_i; i += inc_i, j += inc_j) |
||||
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); |
||||
} |
||||
#elif CV_SSE2 |
||||
if (cn == 3) |
||||
{ |
||||
int inc_i = 32/sizeof(T); |
||||
int inc_j = 3 * inc_i; |
||||
|
||||
VSplit3<T> vsplit; |
||||
|
||||
if (vsplit.support) |
||||
{ |
||||
for( ; i <= len - inc_i; i += inc_i, j += inc_j) |
||||
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); |
||||
} |
||||
} |
||||
#endif |
||||
for( ; i < len; i++, j += cn ) |
||||
{ |
||||
dst0[i] = src[j]; |
||||
dst1[i] = src[j+1]; |
||||
dst2[i] = src[j+2]; |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3]; |
||||
i = j = 0; |
||||
|
||||
#if CV_NEON |
||||
if(cn == 4) |
||||
{ |
||||
int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); |
||||
int inc_j = 4 * inc_i; |
||||
|
||||
VSplit4<T> vsplit; |
||||
for( ; i <= len - inc_i; i += inc_i, j += inc_j) |
||||
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); |
||||
} |
||||
#elif CV_SSE2 |
||||
if (cn == 4) |
||||
{ |
||||
int inc_i = 32/sizeof(T); |
||||
int inc_j = 4 * inc_i; |
||||
|
||||
VSplit4<T> vsplit; |
||||
if (vsplit.support) |
||||
{ |
||||
for( ; i <= len - inc_i; i += inc_i, j += inc_j) |
||||
vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); |
||||
} |
||||
} |
||||
#endif |
||||
for( ; i < len; i++, j += cn ) |
||||
{ |
||||
dst0[i] = src[j]; dst1[i] = src[j+1]; |
||||
dst2[i] = src[j+2]; dst3[i] = src[j+3]; |
||||
} |
||||
} |
||||
|
||||
for( ; k < cn; k += 4 ) |
||||
{ |
||||
T *dst0 = dst[k], *dst1 = dst[k+1], *dst2 = dst[k+2], *dst3 = dst[k+3]; |
||||
for( i = 0, j = k; i < len; i++, j += cn ) |
||||
{ |
||||
dst0[i] = src[j]; dst1[i] = src[j+1]; |
||||
dst2[i] = src[j+2]; dst3[i] = src[j+3]; |
||||
} |
||||
} |
||||
} |
||||
|
||||
void split8u(const uchar* src, uchar** dst, int len, int cn ) |
||||
{ |
||||
split_(src, dst, len, cn); |
||||
} |
||||
|
||||
void split16u(const ushort* src, ushort** dst, int len, int cn ) |
||||
{ |
||||
split_(src, dst, len, cn); |
||||
} |
||||
|
||||
void split32s(const int* src, int** dst, int len, int cn ) |
||||
{ |
||||
split_(src, dst, len, cn); |
||||
} |
||||
|
||||
void split64s(const int64* src, int64** dst, int len, int cn ) |
||||
{ |
||||
split_(src, dst, len, cn); |
||||
} |
||||
|
||||
}} |
Loading…
Reference in new issue