From a2a9a01e05e87956c1f2c853a553eb61bb0c08dc Mon Sep 17 00:00:00 2001 From: Maksim Shabunin Date: Wed, 18 Jul 2018 15:22:42 +0300 Subject: [PATCH 01/19] AVI container: verbose error messages --- modules/videoio/src/container_avi.cpp | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/modules/videoio/src/container_avi.cpp b/modules/videoio/src/container_avi.cpp index 0c581b3f51..c2a93698bd 100644 --- a/modules/videoio/src/container_avi.cpp +++ b/modules/videoio/src/container_avi.cpp @@ -12,7 +12,7 @@ namespace cv // Utility function for safe integer conversions template -inline D safe_int_cast(S val) +inline D safe_int_cast(S val, const char * msg = 0) { typedef std::numeric_limits st; typedef std::numeric_limits dt; @@ -21,7 +21,10 @@ inline D safe_int_cast(S val) const bool in_range_l = (double)val >= (double)dt::min(); if (!in_range_r || !in_range_l) { - CV_Error_(cv::Error::StsOutOfRange, ("Can not convert integer values (%s -> %s), value 0x%llx is out of range", typeid(S).name(), typeid(D).name(), val)); + if (!msg) + CV_Error_(Error::StsOutOfRange, ("Can not convert integer values (%s -> %s), value 0x%llx is out of range", typeid(S).name(), typeid(D).name(), val)); + else + CV_Error(Error::StsOutOfRange, msg); } return static_cast(val); } @@ -128,7 +131,7 @@ public: VideoInputStream(); VideoInputStream(const String& filename); ~VideoInputStream(); - VideoInputStream& read(char*, uint64_t); + VideoInputStream& read(char*, uint32_t); VideoInputStream& seekg(uint64_t); uint64_t tellg(); bool isOpened() const; @@ -229,11 +232,11 @@ void VideoInputStream::close() } } -VideoInputStream& VideoInputStream::read(char* buf, uint64_t count) +VideoInputStream& VideoInputStream::read(char* buf, uint32_t count) { if(isOpened()) { - input.read(buf, safe_int_cast(count)); + input.read(buf, safe_int_cast(count, "Failed to read AVI file: requested chunk size is too large")); m_is_valid = (input.gcount() == (std::streamsize)count); } @@ -243,7 +246,7 @@ VideoInputStream& VideoInputStream::read(char* buf, uint64_t count) VideoInputStream& VideoInputStream::seekg(uint64_t pos) { input.clear(); - input.seekg(safe_int_cast(pos)); + input.seekg(safe_int_cast(pos, "Failed to seek in AVI file: position is out of range")); m_is_valid = !input.eof(); return *this; } @@ -668,7 +671,7 @@ void BitStream::writeBlock() } size_t BitStream::getPos() const { - return safe_int_cast(m_current - m_start) + m_pos; + return safe_int_cast(m_current - m_start, "Failed to determine AVI bufer position: value is out of range") + m_pos; } void BitStream::putByte(int val) @@ -737,7 +740,7 @@ void BitStream::patchInt(uint32_t val, size_t pos) { if( pos >= m_pos ) { - ptrdiff_t delta = safe_int_cast(pos - m_pos); + ptrdiff_t delta = safe_int_cast(pos - m_pos, "Failed to seek in AVI buffer: value is out of range"); CV_Assert( delta < m_current - m_start ); m_start[delta] = (uchar)val; m_start[delta+1] = (uchar)(val >> 8); @@ -747,7 +750,7 @@ void BitStream::patchInt(uint32_t val, size_t pos) else { std::streamoff fpos = output.tellp(); - output.seekp(safe_int_cast(pos)); + output.seekp(safe_int_cast(pos, "Failed to seek in AVI file: value is out of range")); uchar buf[] = { (uchar)val, (uchar)(val >> 8), (uchar)(val >> 16), (uchar)(val >> 24) }; output.write((char *)buf, 4); output.seekp(fpos); @@ -960,7 +963,7 @@ void AVIWriteContainer::endWriteChunk() size_t pospos = AVIChunkSizeIndex.back(); AVIChunkSizeIndex.pop_back(); CV_Assert(currpos >= pospos); - uint32_t chunksz = safe_int_cast(currpos - pospos); + uint32_t chunksz = safe_int_cast(currpos - pospos, "Failed to write AVI file: chunk size is out of bounds"); strm->patchInt(chunksz, pospos); } } @@ -996,7 +999,7 @@ void AVIWriteContainer::writeIndex(int stream_number, StreamType strm_type) void AVIWriteContainer::finishWriteAVI() { - uint32_t nframes = safe_int_cast(frameOffset.size()); + uint32_t nframes = safe_int_cast(frameOffset.size(), "Failed to write AVI file: number of frames is too large"); // Record frames numbers to AVI Header while (!frameNumIndexes.empty()) { From e526c4bfe4afab05d604880ee1a68e2fb40c3d04 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Wed, 18 Jul 2018 16:09:27 +0300 Subject: [PATCH 02/19] core(test): remove verbose messages --- modules/core/test/test_rand.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/core/test/test_rand.cpp b/modules/core/test/test_rand.cpp index 8677aa0c31..34b32a7e53 100644 --- a/modules/core/test/test_rand.cpp +++ b/modules/core/test/test_rand.cpp @@ -173,7 +173,7 @@ void Core_RandTest::run( int ) dsz = slice+1 < maxSlice ? (int)(cvtest::randInt(rng) % (SZ - sz) + 1) : SZ - sz; Mat aslice = arr[k].colRange(sz, sz + dsz); tested_rng.fill(aslice, dist_type, A, B); - printf("%d - %d\n", sz, sz + dsz); + //printf("%d - %d\n", sz, sz + dsz); } } From f4df537e277605480400c65e126914707ee6a28c Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Wed, 18 Jul 2018 15:20:43 +0300 Subject: [PATCH 03/19] ts: fix PERF_TEST() macro to allow test_case name reusing Example (reuse 'Transform' test case): PERF_TEST(Transform, getPerspectiveTransform_1000) { ... } PERF_TEST(Transform, getPerspectiveTransform_QR_1000) { ... } --- modules/ts/include/opencv2/ts/ts_ext.hpp | 18 +++++++++--------- modules/ts/include/opencv2/ts/ts_perf.hpp | 12 +----------- 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/modules/ts/include/opencv2/ts/ts_ext.hpp b/modules/ts/include/opencv2/ts/ts_ext.hpp index 37c399515a..265db94123 100644 --- a/modules/ts/include/opencv2/ts/ts_ext.hpp +++ b/modules/ts/include/opencv2/ts/ts_ext.hpp @@ -44,13 +44,13 @@ extern int testThreads; #undef TEST -#define TEST_(test_case_name, test_name, BODY_IMPL) \ - class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public ::testing::Test {\ +#define TEST_(test_case_name, test_name, parent_class, bodyMethodName, BODY_IMPL) \ + class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public parent_class {\ public:\ GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\ private:\ virtual void TestBody() CV_OVERRIDE;\ - virtual void Body();\ + virtual void bodyMethodName();\ static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;\ GTEST_DISALLOW_COPY_AND_ASSIGN_(\ GTEST_TEST_CLASS_NAME_(test_case_name, test_name));\ @@ -62,14 +62,14 @@ extern int testThreads; #test_case_name, #test_name, NULL, NULL, \ ::testing::internal::CodeLocation(__FILE__, __LINE__), \ (::testing::internal::GetTestTypeId()), \ - ::testing::Test::SetUpTestCase, \ - ::testing::Test::TearDownTestCase, \ + parent_class::SetUpTestCase, \ + parent_class::TearDownTestCase, \ new ::testing::internal::TestFactoryImpl<\ GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\ void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() BODY_IMPL( #test_case_name "_" #test_name ) \ - void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::Body() + void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::bodyMethodName() -#define TEST(test_case_name, test_name) TEST_(test_case_name, test_name, CV__TEST_BODY_IMPL) +#define TEST(test_case_name, test_name) TEST_(test_case_name, test_name, ::testing::Test, Body, CV__TEST_BODY_IMPL) #define CV__TEST_BIGDATA_BODY_IMPL(name) \ { \ @@ -92,9 +92,9 @@ extern int testThreads; // Special type of tests which require / use or validate processing of huge amount of data (>= 2Gb) #if defined(_M_X64) || defined(__x86_64__) || defined(__aarch64__) -#define BIGDATA_TEST(test_case_name, test_name) TEST_(BigData_ ## test_case_name, test_name, CV__TEST_BIGDATA_BODY_IMPL) +#define BIGDATA_TEST(test_case_name, test_name) TEST_(BigData_ ## test_case_name, test_name, ::testing::Test, Body, CV__TEST_BIGDATA_BODY_IMPL) #else -#define BIGDATA_TEST(test_case_name, test_name) TEST_(BigData_ ## test_case_name, DISABLED_ ## test_name, CV__TEST_BIGDATA_BODY_IMPL) +#define BIGDATA_TEST(test_case_name, test_name) TEST_(BigData_ ## test_case_name, DISABLED_ ## test_name, ::testing::Test, Body, CV__TEST_BIGDATA_BODY_IMPL) #endif #undef TEST_F diff --git a/modules/ts/include/opencv2/ts/ts_perf.hpp b/modules/ts/include/opencv2/ts/ts_perf.hpp index a5d0acfa41..e33850c267 100644 --- a/modules/ts/include/opencv2/ts/ts_perf.hpp +++ b/modules/ts/include/opencv2/ts/ts_perf.hpp @@ -546,17 +546,7 @@ void PrintTo(const Size& sz, ::std::ostream* os); // EXPECT_TRUE(foo.StatusIsOK()); // } #define PERF_TEST(test_case_name, test_name)\ - namespace PERF_PROXY_NAMESPACE_NAME_(test_case_name, test_name) {\ - class TestBase {/*compile error for this class means that you are trying to use perf::TestBase as a fixture*/};\ - class test_case_name : public ::perf::TestBase {\ - public:\ - test_case_name() {}\ - protected:\ - virtual void PerfTestBody();\ - };\ - TEST_F(test_case_name, test_name){ CV__PERF_TEST_BODY_IMPL(#test_case_name "_" #test_name); }\ - }\ - void PERF_PROXY_NAMESPACE_NAME_(test_case_name, test_name)::test_case_name::PerfTestBody() + TEST_(test_case_name, test_name, ::perf::TestBase, PerfTestBody, CV__PERF_TEST_BODY_IMPL) // Defines a performance test that uses a test fixture. // From 070393dfda25335024b932988266e78d939df405 Mon Sep 17 00:00:00 2001 From: Dmitry Kurtaev Date: Mon, 4 Jun 2018 23:51:28 +0300 Subject: [PATCH 04/19] uint8 inputs for deep learning networks --- modules/dnn/include/opencv2/dnn/dnn.hpp | 36 ++-- modules/dnn/src/dnn.cpp | 262 ++++++++++++++++++++---- modules/dnn/src/op_inf_engine.cpp | 50 ++++- modules/dnn/src/op_inf_engine.hpp | 10 +- modules/dnn/test/test_halide_layers.cpp | 10 +- modules/dnn/test/test_layers.cpp | 76 ++++++- modules/dnn/test/test_misc.cpp | 40 ++++ 7 files changed, 408 insertions(+), 76 deletions(-) diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp index 7cc95ca0c4..0809891942 100644 --- a/modules/dnn/include/opencv2/dnn/dnn.hpp +++ b/modules/dnn/include/opencv2/dnn/dnn.hpp @@ -46,9 +46,9 @@ #include #if !defined CV_DOXYGEN && !defined CV_DNN_DONT_ADD_EXPERIMENTAL_NS -#define CV__DNN_EXPERIMENTAL_NS_BEGIN namespace experimental_dnn_v5 { +#define CV__DNN_EXPERIMENTAL_NS_BEGIN namespace experimental_dnn_v6 { #define CV__DNN_EXPERIMENTAL_NS_END } -namespace cv { namespace dnn { namespace experimental_dnn_v5 { } using namespace experimental_dnn_v5; }} +namespace cv { namespace dnn { namespace experimental_dnn_v6 { } using namespace experimental_dnn_v6; }} #else #define CV__DNN_EXPERIMENTAL_NS_BEGIN #define CV__DNN_EXPERIMENTAL_NS_END @@ -487,14 +487,19 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN */ CV_WRAP void setPreferableTarget(int targetId); - /** @brief Sets the new value for the layer output blob - * @param name descriptor of the updating layer output blob. - * @param blob new blob. + /** @brief Sets the new input value for the network + * @param blob A new blob. Should have CV_32F or CV_8U depth. + * @param name A name of input layer. + * @param scalefactor An optional normalization scale. + * @param mean An optional mean subtraction values. * @see connect(String, String) to know format of the descriptor. - * @note If updating blob is not empty then @p blob must have the same shape, - * because network reshaping is not implemented yet. + * + * If scale or mean values are specified, a final input blob is computed + * as: + * \f[input(n,c,h,w) = scalefactor \times (blob(n,c,h,w) - mean_c)\f] */ - CV_WRAP void setInput(InputArray blob, const String& name = ""); + CV_WRAP void setInput(InputArray blob, const String& name = "", + double scalefactor = 1.0, const Scalar& mean = Scalar()); /** @brief Sets the new value for the learned param of the layer. * @param layer name or id of the layer. @@ -805,13 +810,15 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN * @param swapRB flag which indicates that swap first and last channels * in 3-channel image is necessary. * @param crop flag which indicates whether image will be cropped after resize or not + * @param ddepth Depth of output blob. Choose CV_32F or CV_8U. * @details if @p crop is true, input image is resized so one side after resize is equal to corresponding * dimension in @p size and another one is equal or larger. Then, crop from the center is performed. * If @p crop is false, direct resize without cropping and preserving aspect ratio is performed. * @returns 4-dimensional Mat with NCHW dimensions order. */ CV_EXPORTS_W Mat blobFromImage(InputArray image, double scalefactor=1.0, const Size& size = Size(), - const Scalar& mean = Scalar(), bool swapRB=true, bool crop=true); + const Scalar& mean = Scalar(), bool swapRB=true, bool crop=true, + int ddepth=CV_32F); /** @brief Creates 4-dimensional blob from image. * @details This is an overloaded member function, provided for convenience. @@ -819,7 +826,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN */ CV_EXPORTS void blobFromImage(InputArray image, OutputArray blob, double scalefactor=1.0, const Size& size = Size(), const Scalar& mean = Scalar(), - bool swapRB=true, bool crop=true); + bool swapRB=true, bool crop=true, int ddepth=CV_32F); /** @brief Creates 4-dimensional blob from series of images. Optionally resizes and @@ -833,13 +840,15 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN * @param swapRB flag which indicates that swap first and last channels * in 3-channel image is necessary. * @param crop flag which indicates whether image will be cropped after resize or not + * @param ddepth Depth of output blob. Choose CV_32F or CV_8U. * @details if @p crop is true, input image is resized so one side after resize is equal to corresponding * dimension in @p size and another one is equal or larger. Then, crop from the center is performed. * If @p crop is false, direct resize without cropping and preserving aspect ratio is performed. - * @returns 4-dimansional Mat with NCHW dimensions order. + * @returns 4-dimensional Mat with NCHW dimensions order. */ CV_EXPORTS_W Mat blobFromImages(InputArrayOfArrays images, double scalefactor=1.0, - Size size = Size(), const Scalar& mean = Scalar(), bool swapRB=true, bool crop=true); + Size size = Size(), const Scalar& mean = Scalar(), bool swapRB=true, bool crop=true, + int ddepth=CV_32F); /** @brief Creates 4-dimensional blob from series of images. * @details This is an overloaded member function, provided for convenience. @@ -847,7 +856,8 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN */ CV_EXPORTS void blobFromImages(InputArrayOfArrays images, OutputArray blob, double scalefactor=1.0, Size size = Size(), - const Scalar& mean = Scalar(), bool swapRB=true, bool crop=true); + const Scalar& mean = Scalar(), bool swapRB=true, bool crop=true, + int ddepth=CV_32F); /** @brief Parse a 4D blob and output the images it contains as 2D arrays through a simpler data structure * (std::vector). diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 994df854b0..5014365fdd 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -97,35 +97,42 @@ namespace } Mat blobFromImage(InputArray image, double scalefactor, const Size& size, - const Scalar& mean, bool swapRB, bool crop) + const Scalar& mean, bool swapRB, bool crop, int ddepth) { CV_TRACE_FUNCTION(); Mat blob; - blobFromImage(image, blob, scalefactor, size, mean, swapRB, crop); + blobFromImage(image, blob, scalefactor, size, mean, swapRB, crop, ddepth); return blob; } void blobFromImage(InputArray image, OutputArray blob, double scalefactor, - const Size& size, const Scalar& mean, bool swapRB, bool crop) + const Size& size, const Scalar& mean, bool swapRB, bool crop, int ddepth) { CV_TRACE_FUNCTION(); std::vector images(1, image.getMat()); - blobFromImages(images, blob, scalefactor, size, mean, swapRB, crop); + blobFromImages(images, blob, scalefactor, size, mean, swapRB, crop, ddepth); } Mat blobFromImages(InputArrayOfArrays images, double scalefactor, Size size, - const Scalar& mean, bool swapRB, bool crop) + const Scalar& mean, bool swapRB, bool crop, int ddepth) { CV_TRACE_FUNCTION(); Mat blob; - blobFromImages(images, blob, scalefactor, size, mean, swapRB, crop); + blobFromImages(images, blob, scalefactor, size, mean, swapRB, crop, ddepth); return blob; } void blobFromImages(InputArrayOfArrays images_, OutputArray blob_, double scalefactor, - Size size, const Scalar& mean_, bool swapRB, bool crop) + Size size, const Scalar& mean_, bool swapRB, bool crop, int ddepth) { CV_TRACE_FUNCTION(); + CV_CheckType(ddepth, ddepth == CV_32F || ddepth == CV_8U, "Blob depth should be CV_32F or CV_8U"); + if (ddepth == CV_8U) + { + CV_CheckEQ(scalefactor, 1.0, "Scaling is not supported for CV_8U blob depth"); + CV_Assert(mean_ == Scalar(), "Mean subtraction is not supported for CV_8U blob depth"); + } + std::vector images; images_.getMatVector(images); CV_Assert(!images.empty()); @@ -149,7 +156,7 @@ void blobFromImages(InputArrayOfArrays images_, OutputArray blob_, double scalef else resize(images[i], images[i], size, 0, 0, INTER_LINEAR); } - if(images[i].depth() == CV_8U) + if(images[i].depth() == CV_8U && ddepth == CV_32F) images[i].convertTo(images[i], CV_32F); Scalar mean = mean_; if (swapRB) @@ -167,20 +174,20 @@ void blobFromImages(InputArrayOfArrays images_, OutputArray blob_, double scalef if (nch == 3 || nch == 4) { int sz[] = { (int)nimages, nch, image0.rows, image0.cols }; - blob_.create(4, sz, CV_32F); + blob_.create(4, sz, ddepth); Mat blob = blob_.getMat(); Mat ch[4]; for( i = 0; i < nimages; i++ ) { image = images[i]; - CV_Assert(image.depth() == CV_32F); + CV_Assert(image.depth() == blob_.depth()); nch = image.channels(); CV_Assert(image.dims == 2 && (nch == 3 || nch == 4)); CV_Assert(image.size() == image0.size()); for( int j = 0; j < nch; j++ ) - ch[j] = Mat(image.rows, image.cols, CV_32F, blob.ptr((int)i, j)); + ch[j] = Mat(image.rows, image.cols, ddepth, blob.ptr((int)i, j)); if(swapRB) std::swap(ch[0], ch[2]); split(image, ch); @@ -190,18 +197,18 @@ void blobFromImages(InputArrayOfArrays images_, OutputArray blob_, double scalef { CV_Assert(nch == 1); int sz[] = { (int)nimages, 1, image0.rows, image0.cols }; - blob_.create(4, sz, CV_32F); + blob_.create(4, sz, ddepth); Mat blob = blob_.getMat(); for( i = 0; i < nimages; i++ ) { Mat image = images[i]; - CV_Assert(image.depth() == CV_32F); + CV_Assert(image.depth() == blob_.depth()); nch = image.channels(); CV_Assert(image.dims == 2 && (nch == 1)); CV_Assert(image.size() == image0.size()); - image.copyTo(Mat(image.rows, image.cols, CV_32F, blob.ptr((int)i, 0))); + image.copyTo(Mat(image.rows, image.cols, ddepth, blob.ptr((int)i, 0))); } } } @@ -408,7 +415,16 @@ struct LayerData //fake layer containing network input blobs struct DataLayer : public Layer { - void finalize(const std::vector&, std::vector&) CV_OVERRIDE {} + DataLayer() : Layer() + { + skip = false; + } + + virtual bool supportBackend(int backendId) CV_OVERRIDE + { + return backendId == DNN_BACKEND_OPENCV || + backendId == DNN_BACKEND_INFERENCE_ENGINE && inputsData.size() == 1; + } void forward(InputArrayOfArrays inputs, OutputArrayOfArrays outputs, OutputArrayOfArrays internals) CV_OVERRIDE { @@ -423,11 +439,36 @@ struct DataLayer : public Layer void forward(std::vector&, std::vector& outputs, std::vector &) CV_OVERRIDE { + // Supported modes: + // | Input type | Output type | + // | fp32 | fp32 | + // | uint8 | fp32 | for (int i = 0; i < inputsData.size(); ++i) { - if (inputsData[i].type() == CV_32F && outputs[i].type() == CV_16S) + double scale = scaleFactors[i]; + Scalar& mean = means[i]; + CV_Assert(mean == Scalar() || inputsData[i].size[1] <= 4, + outputs[i].type() == CV_32F); + + bool singleMean = true; + for (int j = 1; j < std::min(4, inputsData[i].size[1]) && singleMean; ++j) { - convertFp16(inputsData[i], outputs[i]); + singleMean = mean[j] == mean[j - 1]; + } + + if (singleMean) + { + inputsData[i].convertTo(outputs[i], CV_32F, scale, -mean[0] * scale); + } + else + { + for (int n = 0; n < inputsData[i].size[0]; ++n) + for (int c = 0; c < inputsData[i].size[1]; ++c) + { + Mat inp = getPlane(inputsData[i], n, c); + Mat out = getPlane(outputs[i], n, c); + inp.convertTo(out, CV_32F, scale, -mean[c] * scale); + } } } } @@ -435,13 +476,66 @@ struct DataLayer : public Layer #ifdef HAVE_OPENCL bool forward_ocl(InputArrayOfArrays, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_) { - if (outputs_.depth() == CV_16S) + // Supported modes: + // | Input type | Output type | + // | fp32 | fp32 | + // | fp32 | fp16 | + // | uint8 | fp32 | + std::vector outputs; + outputs_.getUMatVector(outputs); + + for (int i = 0; i < inputsData.size(); ++i) { - std::vector outputs; - outputs_.getUMatVector(outputs); - for (int i = 0; i < inputsData.size(); ++i) + double scale = scaleFactors[i]; + Scalar& mean = means[i]; + + CV_Assert(mean == Scalar() || inputsData[i].size[1] <= 4); + bool singleMean = true; + for (int j = 1; j < std::min(4, inputsData[i].size[1]) && singleMean; ++j) { - convertFp16(inputsData[i], outputs[i]); + singleMean = mean[j] == mean[j - 1]; + } + + if (outputs_.depth() == CV_16S) + { + if (singleMean) + convertFp16(scale * (inputsData[i] - mean[0]), outputs[i]); + else + { + for (int n = 0; n < inputsData[i].size[0]; ++n) + for (int c = 0; c < inputsData[i].size[1]; ++c) + { + Mat inp = getPlane(inputsData[i], n, c); + + std::vector plane(4, Range::all()); + plane[0] = Range(n, n + 1); + plane[1] = Range(c, c + 1); + UMat out = outputs[i](plane).reshape(1, inp.dims, inp.size); + + convertFp16(scale * (inp - mean[c]), out); + } + } + } + else + { + CV_Assert(outputs_.depth() == CV_32F); + if (singleMean) + inputsData[i].convertTo(outputs[i], CV_32F, scale, -mean[0] * scale); + else + { + for (int n = 0; n < inputsData[i].size[0]; ++n) + for (int c = 0; c < inputsData[i].size[1]; ++c) + { + Mat inp = getPlane(inputsData[i], n, c); + + std::vector plane(4, Range::all()); + plane[0] = Range(n, n + 1); + plane[1] = Range(c, c + 1); + UMat out = outputs[i](plane).reshape(1, inp.dims, inp.size); + + inp.convertTo(out, CV_32F, scale, -mean[c] * scale); + } + } } } return true; @@ -469,8 +563,61 @@ struct DataLayer : public Layer return false; } + void finalize(const std::vector&, std::vector& outputs) CV_OVERRIDE + { + CV_Assert(outputs.size() == scaleFactors.size(), outputs.size() == means.size(), + inputsData.size() == outputs.size()); + skip = true; + for (int i = 0; skip && i < inputsData.size(); ++i) + { + if (inputsData[i].data != outputs[i].data || scaleFactors[i] != 1.0 || means[i] != Scalar()) + skip = false; + } + } + + virtual Ptr initInfEngine(const std::vector >&) CV_OVERRIDE + { +#ifdef HAVE_INF_ENGINE + InferenceEngine::LayerParams lp; + lp.name = name; + lp.type = "ScaleShift"; + lp.precision = InferenceEngine::Precision::FP32; + std::shared_ptr ieLayer(new InferenceEngine::ScaleShiftLayer(lp)); + + CV_Assert(inputsData.size() == 1, inputsData[0].dims == 4); + const size_t numChannels = inputsData[0].size[1]; + CV_Assert(numChannels <= 4); + + // Scale + auto weights = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, + {numChannels}); + weights->allocate(); + weights->set(std::vector(numChannels, scaleFactors[0])); + ieLayer->_weights = weights; + + // Mean subtraction + auto biases = InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, + {numChannels}); + biases->allocate(); + std::vector biasesVec(numChannels); + for (int i = 0; i < numChannels; ++i) + { + biasesVec[i] = -means[0][i] * scaleFactors[0]; + } + biases->set(biasesVec); + ieLayer->_biases = biases; + + return Ptr(new InfEngineBackendNode(ieLayer)); +#endif // HAVE_INF_ENGINE + return Ptr(); + } + std::vector outNames; + // Preprocessing parameters for each network's input. + std::vector scaleFactors; + std::vector means; std::vector inputsData; + bool skip; }; struct BlobManager @@ -739,7 +886,7 @@ struct Net::Impl netInputLayer = Ptr(new DataLayer()); LayerData &inpl = layers.insert( make_pair(0, LayerData()) ).first->second; inpl.id = 0; - inpl.name = "_input"; + netInputLayer->name = inpl.name = "_input"; inpl.type = "__NetInputLayer__"; inpl.layerInstance = netInputLayer; layerNameToId.insert(std::make_pair(inpl.name, inpl.id)); @@ -930,6 +1077,11 @@ struct Net::Impl clear(); allocateLayers(blobsToKeep_); + + MapIdToLayerData::iterator it = layers.find(0); + CV_Assert(it != layers.end()); + it->second.skip = netInputLayer->skip; + initBackend(); if (!netWasAllocated ) @@ -1179,6 +1331,29 @@ struct Net::Impl MapIdToLayerData::iterator it; Ptr net; + for (it = layers.begin(); it != layers.end(); ++it) + { + LayerData &ld = it->second; + if (ld.id == 0) + { + CV_Assert((netInputLayer->outNames.empty() && ld.outputBlobsWrappers.size() == 1) || + (netInputLayer->outNames.size() == ld.outputBlobsWrappers.size())); + for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i) + { + InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]); + dataPtr->name = netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i]; + } + } + else + { + for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i) + { + InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]); + dataPtr->name = ld.name; + } + } + } + if (skipInfEngineInit) { Ptr node = layers[lastLayerId].backendNodes[preferableBackend]; @@ -1190,11 +1365,21 @@ struct Net::Impl for (it = layers.begin(); it != layers.end(); ++it) { LayerData &ld = it->second; - - for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i) + if (ld.id == 0) { - InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]); - dataPtr->name = ld.id == 0 ? netInputLayer->outNames[i] : ld.name; + for (int i = 0; i < ld.inputBlobsWrappers.size(); ++i) + { + InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.inputBlobsWrappers[i]); + dataPtr->name = netInputLayer->outNames[i]; + } + } + else + { + for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i) + { + InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]); + dataPtr->name = ld.name; + } } ieNode->net->addBlobs(ld.inputBlobsWrappers); ieNode->net->addBlobs(ld.outputBlobsWrappers); @@ -1210,11 +1395,11 @@ struct Net::Impl // some of layers is not implemented. // Set of all input and output blobs wrappers for current network. - std::map > netBlobsWrappers; + std::map > netBlobsWrappers; for (it = layers.begin(); it != layers.end(); ++it) { LayerData &ld = it->second; - if (ld.id == 0) + if (ld.id == 0 && ld.skip) continue; bool fused = ld.skip; @@ -1251,20 +1436,17 @@ struct Net::Impl // So we need to rewrap all the external blobs. for (int i = 0; i < ld.inputBlobsId.size(); ++i) { - int lid = ld.inputBlobsId[i].lid; - LayerData &inpLd = layers[lid]; - auto it = netBlobsWrappers.find(lid); + LayerPin inPin = ld.inputBlobsId[i]; + auto it = netBlobsWrappers.find(inPin); if (it == netBlobsWrappers.end()) { - ld.inputBlobsWrappers[i] = wrap(*ld.inputBlobs[i]); - auto dataPtr = infEngineDataNode(ld.inputBlobsWrappers[i]); - dataPtr->name = inpLd.name; - netBlobsWrappers[lid] = ld.inputBlobsWrappers[i]; + ld.inputBlobsWrappers[i] = InfEngineBackendWrapper::create(ld.inputBlobsWrappers[i]); + netBlobsWrappers[inPin] = ld.inputBlobsWrappers[i]; } else ld.inputBlobsWrappers[i] = it->second; } - netBlobsWrappers[ld.id] = ld.outputBlobsWrappers[0]; + netBlobsWrappers[LayerPin(ld.id, 0)] = ld.outputBlobsWrappers[0]; Ptr node; if (!net.empty()) @@ -2343,7 +2525,7 @@ void Net::setInputsNames(const std::vector &inputBlobNames) impl->netInputLayer->setNames(inputBlobNames); } -void Net::setInput(InputArray blob, const String& name) +void Net::setInput(InputArray blob, const String& name, double scalefactor, const Scalar& mean) { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); @@ -2360,6 +2542,8 @@ void Net::setInput(InputArray blob, const String& name) ld.outputBlobs.resize(numInputs); ld.outputBlobsWrappers.resize(numInputs); impl->netInputLayer->inputsData.resize(numInputs); + impl->netInputLayer->scaleFactors.resize(numInputs); + impl->netInputLayer->means.resize(numInputs); MatShape prevShape = shape(impl->netInputLayer->inputsData[pin.oid]); Mat blob_ = blob.getMat(); @@ -2378,6 +2562,8 @@ void Net::setInput(InputArray blob, const String& name) { ld.outputBlobsWrappers[pin.oid]->setHostDirty(); } + impl->netInputLayer->scaleFactors[pin.oid] = scalefactor; + impl->netInputLayer->means[pin.oid] = mean; impl->netWasAllocated = impl->netWasAllocated && oldShape; } diff --git a/modules/dnn/src/op_inf_engine.cpp b/modules/dnn/src/op_inf_engine.cpp index 9481fc347c..f60efef95d 100644 --- a/modules/dnn/src/op_inf_engine.cpp +++ b/modules/dnn/src/op_inf_engine.cpp @@ -68,19 +68,32 @@ static InferenceEngine::DataPtr wrapToInfEngineDataNode(const Mat& m, const std: { std::vector reversedShape(&m.size[0], &m.size[0] + m.dims); std::reverse(reversedShape.begin(), reversedShape.end()); - return InferenceEngine::DataPtr( - new InferenceEngine::Data(name, reversedShape, InferenceEngine::Precision::FP32, estimateLayout(m)) - ); + if (m.type() == CV_32F) + return InferenceEngine::DataPtr( + new InferenceEngine::Data(name, reversedShape, InferenceEngine::Precision::FP32, estimateLayout(m)) + ); + else if (m.type() == CV_8U) + return InferenceEngine::DataPtr( + new InferenceEngine::Data(name, reversedShape, InferenceEngine::Precision::U8, estimateLayout(m)) + ); + else + CV_Error(Error::StsNotImplemented, format("Unsupported data type %d", m.type())); } -InferenceEngine::TBlob::Ptr wrapToInfEngineBlob(const Mat& m, const std::vector& shape, - InferenceEngine::Layout layout) +InferenceEngine::Blob::Ptr wrapToInfEngineBlob(const Mat& m, const std::vector& shape, + InferenceEngine::Layout layout) { - return InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, - layout, shape, (float*)m.data); + if (m.type() == CV_32F) + return InferenceEngine::make_shared_blob(InferenceEngine::Precision::FP32, + layout, shape, (float*)m.data); + else if (m.type() == CV_8U) + return InferenceEngine::make_shared_blob(InferenceEngine::Precision::U8, + layout, shape, (uint8_t*)m.data); + else + CV_Error(Error::StsNotImplemented, format("Unsupported data type %d", m.type())); } -InferenceEngine::TBlob::Ptr wrapToInfEngineBlob(const Mat& m, InferenceEngine::Layout layout) +InferenceEngine::Blob::Ptr wrapToInfEngineBlob(const Mat& m, InferenceEngine::Layout layout) { std::vector reversedShape(&m.size[0], &m.size[0] + m.dims); std::reverse(reversedShape.begin(), reversedShape.end()); @@ -102,6 +115,24 @@ InfEngineBackendWrapper::InfEngineBackendWrapper(int targetId, const cv::Mat& m) blob = wrapToInfEngineBlob(m, estimateLayout(m)); } +InfEngineBackendWrapper::InfEngineBackendWrapper(Ptr wrapper) + : BackendWrapper(DNN_BACKEND_INFERENCE_ENGINE, wrapper->targetId) +{ + Ptr ieWrapper = wrapper.dynamicCast(); + CV_Assert(!ieWrapper.empty()); + InferenceEngine::DataPtr srcData = ieWrapper->dataPtr; + dataPtr = InferenceEngine::DataPtr( + new InferenceEngine::Data(srcData->name, srcData->dims, srcData->precision, + srcData->layout) + ); + blob = ieWrapper->blob; +} + +Ptr InfEngineBackendWrapper::create(Ptr wrapper) +{ + return Ptr(new InfEngineBackendWrapper(wrapper)); +} + InfEngineBackendWrapper::~InfEngineBackendWrapper() { @@ -329,6 +360,7 @@ void InfEngineBackendNet::init(int targetId) { CV_Assert(allBlobs.find(it.first) != allBlobs.end()); inpBlobs[it.first] = allBlobs[it.first]; + it.second->setPrecision(inpBlobs[it.first]->precision()); } // Set up output blobs. @@ -427,7 +459,7 @@ void InfEngineBackendNet::addBlobs(const std::vector >& ptrs auto wrappers = infEngineWrappers(ptrs); for (const auto& wrapper : wrappers) { - allBlobs[wrapper->dataPtr->name] = wrapper->blob; + allBlobs.insert({wrapper->dataPtr->name, wrapper->blob}); } } diff --git a/modules/dnn/src/op_inf_engine.hpp b/modules/dnn/src/op_inf_engine.hpp index 075c1be849..4295e10417 100644 --- a/modules/dnn/src/op_inf_engine.hpp +++ b/modules/dnn/src/op_inf_engine.hpp @@ -115,19 +115,23 @@ class InfEngineBackendWrapper : public BackendWrapper public: InfEngineBackendWrapper(int targetId, const Mat& m); + InfEngineBackendWrapper(Ptr wrapper); + ~InfEngineBackendWrapper(); + static Ptr create(Ptr wrapper); + virtual void copyToHost() CV_OVERRIDE; virtual void setHostDirty() CV_OVERRIDE; InferenceEngine::DataPtr dataPtr; - InferenceEngine::TBlob::Ptr blob; + InferenceEngine::Blob::Ptr blob; }; -InferenceEngine::TBlob::Ptr wrapToInfEngineBlob(const Mat& m, InferenceEngine::Layout layout = InferenceEngine::Layout::ANY); +InferenceEngine::Blob::Ptr wrapToInfEngineBlob(const Mat& m, InferenceEngine::Layout layout = InferenceEngine::Layout::ANY); -InferenceEngine::TBlob::Ptr wrapToInfEngineBlob(const Mat& m, const std::vector& shape, InferenceEngine::Layout layout); +InferenceEngine::Blob::Ptr wrapToInfEngineBlob(const Mat& m, const std::vector& shape, InferenceEngine::Layout layout); InferenceEngine::DataPtr infEngineDataNode(const Ptr& ptr); diff --git a/modules/dnn/test/test_halide_layers.cpp b/modules/dnn/test/test_halide_layers.cpp index 563ae993b6..eda414551f 100644 --- a/modules/dnn/test/test_halide_layers.cpp +++ b/modules/dnn/test/test_halide_layers.cpp @@ -107,12 +107,10 @@ TEST_P(Convolution, Accuracy) if (backendId == DNN_BACKEND_INFERENCE_ENGINE && targetId == DNN_TARGET_MYRIAD) throw SkipTestException(""); - // TODO: unstable test cases - if (backendId == DNN_BACKEND_OPENCV && (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16) && - inChannels == 6 && outChannels == 9 && group == 1 && inSize == Size(5, 6) && - kernel == Size(3, 1) && stride == Size(1, 1) && pad == Size(0, 1) && dilation == Size(1, 1) && - hasBias) - throw SkipTestException(""); + if (cvtest::skipUnstableTests && backendId == DNN_BACKEND_OPENCV && + (targetId == DNN_TARGET_OPENCL || targetId == DNN_TARGET_OPENCL_FP16) && + kernel == Size(3, 1) && stride == Size(1, 1) && pad == Size(0, 1)) + throw SkipTestException("Skip unstable test"); int sz[] = {outChannels, inChannels / group, kernel.height, kernel.width}; Mat weights(4, &sz[0], CV_32F); diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index ca6645057b..3ebb4172d9 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -291,7 +291,7 @@ TEST_P(Test_Caffe_layers, Fused_Concat) TEST_P(Test_Caffe_layers, Eltwise) { - if (backend == DNN_BACKEND_INFERENCE_ENGINE) + if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) throw SkipTestException(""); testLayerUsingCaffeModels("layer_eltwise"); } @@ -939,6 +939,25 @@ TEST(Layer_Test_Convolution_DLDT, Accuracy) ASSERT_EQ(net.getLayer(outLayers[0])->type, "Concat"); } +TEST(Layer_Test_Convolution_DLDT, setInput_uint8) +{ + Mat inp = blobFromNPY(_tf("blob.npy")); + + Mat inputs[] = {Mat(inp.dims, inp.size, CV_8U), Mat()}; + randu(inputs[0], 0, 255); + inputs[0].convertTo(inputs[1], CV_32F); + + Mat outs[2]; + for (int i = 0; i < 2; ++i) + { + Net net = readNet(_tf("layer_convolution.xml"), _tf("layer_convolution.bin")); + net.setInput(inputs[i]); + outs[i] = net.forward(); + ASSERT_EQ(outs[i].type(), CV_32F); + } + normAssert(outs[0], outs[1]); +} + // 1. Create a .prototxt file with the following network: // layer { // type: "Input" name: "data" top: "data" @@ -961,22 +980,65 @@ TEST(Layer_Test_Convolution_DLDT, Accuracy) // net.save('/path/to/caffemodel') // // 3. Convert using ModelOptimizer. -TEST(Test_DLDT, two_inputs) +typedef testing::TestWithParam > Test_DLDT_two_inputs; +TEST_P(Test_DLDT_two_inputs, as_IR) { + int firstInpType = get<0>(GetParam()); + int secondInpType = get<1>(GetParam()); + // TODO: It looks like a bug in Inference Engine. + if (secondInpType == CV_8U) + throw SkipTestException(""); + Net net = readNet(_tf("net_two_inputs.xml"), _tf("net_two_inputs.bin")); int inpSize[] = {1, 2, 3}; - Mat firstInp(3, &inpSize[0], CV_32F); - Mat secondInp(3, &inpSize[0], CV_32F); - randu(firstInp, -1, 1); - randu(secondInp, -1, 1); + Mat firstInp(3, &inpSize[0], firstInpType); + Mat secondInp(3, &inpSize[0], secondInpType); + randu(firstInp, 0, 255); + randu(secondInp, 0, 255); net.setInput(firstInp, "data"); net.setInput(secondInp, "second_input"); Mat out = net.forward(); - normAssert(out, firstInp + secondInp); + Mat ref; + cv::add(firstInp, secondInp, ref, Mat(), CV_32F); + normAssert(out, ref); } +TEST_P(Test_DLDT_two_inputs, as_backend) +{ + static const float kScale = 0.5f; + static const float kScaleInv = 1.0f / kScale; + + Net net; + LayerParams lp; + lp.type = "Eltwise"; + lp.name = "testLayer"; + lp.set("operation", "sum"); + int eltwiseId = net.addLayerToPrev(lp.name, lp.type, lp); // connect to a first input + net.connect(0, 1, eltwiseId, 1); // connect to a second input + + int inpSize[] = {1, 2, 3}; + Mat firstInp(3, &inpSize[0], get<0>(GetParam())); + Mat secondInp(3, &inpSize[0], get<1>(GetParam())); + randu(firstInp, 0, 255); + randu(secondInp, 0, 255); + + net.setInputsNames({"data", "second_input"}); + net.setInput(firstInp, "data", kScale); + net.setInput(secondInp, "second_input", kScaleInv); + net.setPreferableBackend(DNN_BACKEND_INFERENCE_ENGINE); + Mat out = net.forward(); + + Mat ref; + addWeighted(firstInp, kScale, secondInp, kScaleInv, 0, ref, CV_32F); + normAssert(out, ref); +} + +INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_DLDT_two_inputs, Combine( + Values(CV_8U, CV_32F), Values(CV_8U, CV_32F) +)); + class UnsupportedLayer : public Layer { public: diff --git a/modules/dnn/test/test_misc.cpp b/modules/dnn/test/test_misc.cpp index aff79bf0ec..ae7c7d00a7 100644 --- a/modules/dnn/test/test_misc.cpp +++ b/modules/dnn/test/test_misc.cpp @@ -138,4 +138,44 @@ TEST(LayerFactory, custom_layers) LayerFactory::unregisterLayer("CustomType"); } +typedef testing::TestWithParam > > setInput; +TEST_P(setInput, normalization) +{ + const float kScale = get<0>(GetParam()); + const Scalar kMean = get<1>(GetParam()); + const int dtype = get<2>(GetParam()); + const int backend = get<0>(get<3>(GetParam())); + const int target = get<1>(get<3>(GetParam())); + const bool kSwapRB = true; + + if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD && !checkMyriadTarget()) + throw SkipTestException("Myriad is not available/disabled in OpenCV"); + if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16 && dtype != CV_32F) + throw SkipTestException(""); + + Mat inp(5, 5, CV_8UC3); + randu(inp, 0, 255); + Mat ref = blobFromImage(inp, kScale, Size(), kMean, kSwapRB, /*crop*/false); + + LayerParams lp; + Net net; + net.addLayerToPrev("testLayer", "Identity", lp); + net.setPreferableBackend(backend); + net.setPreferableTarget(target); + + Mat blob = blobFromImage(inp, 1.0, Size(), Scalar(), kSwapRB, /*crop*/false, dtype); + ASSERT_EQ(blob.type(), dtype); + net.setInput(blob, "", kScale, kMean); + Mat out = net.forward(); + ASSERT_EQ(out.type(), CV_32F); + normAssert(ref, out, "", 4e-4, 1e-3); +} + +INSTANTIATE_TEST_CASE_P(/**/, setInput, Combine( + Values(1.0f, 1.0 / 127.5), + Values(Vec3f(), Vec3f(50, 50, 50), Vec3f(10, 50, 140)), + Values(CV_32F, CV_8U), + dnnBackendsAndTargets() +)); + }} // namespace From c213a3823e62381897e3796595b06d8d5176e16d Mon Sep 17 00:00:00 2001 From: Dmitry Kurtaev Date: Thu, 19 Jul 2018 17:05:56 +0300 Subject: [PATCH 05/19] Run entire SSDs from TensorFlow using Intel's Inference Engine --- modules/dnn/src/tensorflow/tf_importer.cpp | 45 +++++++++++++++++++--- modules/dnn/test/test_tf_importer.cpp | 2 +- samples/dnn/tf_text_graph_ssd.py | 22 +++++------ 3 files changed, 49 insertions(+), 20 deletions(-) diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp index 89732b45ad..6c16502aa2 100644 --- a/modules/dnn/src/tensorflow/tf_importer.cpp +++ b/modules/dnn/src/tensorflow/tf_importer.cpp @@ -771,6 +771,13 @@ void TFImporter::populateNet(Net dstNet) type = layer.op(); } + // For the object detection networks, TensorFlow Object Detection API + // predicts deltas for bounding boxes in yxYX (ymin, xmin, ymax, xmax) + // order. We can manage it at DetectionOutput layer parsing predictions + // or shuffle last convolution's weights. + bool locPredTransposed = hasLayerAttr(layer, "loc_pred_transposed") && + getLayerAttr(layer, "loc_pred_transposed").b(); + layerParams.set("bias_term", false); layerParams.blobs.resize(1); @@ -784,18 +791,32 @@ void TFImporter::populateNet(Net dstNet) blobFromTensor(getConstBlob(net.node(weights_layer_index), value_id), layerParams.blobs[1]); ExcludeLayer(net, weights_layer_index, 0, false); layers_to_ignore.insert(next_layers[0].first); + + // Shuffle bias from yxYX to xyXY. + if (locPredTransposed) + { + const int numWeights = layerParams.blobs[1].total(); + float* biasData = reinterpret_cast(layerParams.blobs[1].data); + CV_Assert(numWeights % 4 == 0); + for (int i = 0; i < numWeights; i += 2) + { + std::swap(biasData[i], biasData[i + 1]); + } + } } const tensorflow::TensorProto& kernelTensor = getConstBlob(layer, value_id); kernelFromTensor(kernelTensor, layerParams.blobs[0]); releaseTensor(const_cast(&kernelTensor)); int* kshape = layerParams.blobs[0].size.p; + const int outCh = kshape[0]; + const int inCh = kshape[1]; + const int height = kshape[2]; + const int width = kshape[3]; if (type == "DepthwiseConv2dNative") { + CV_Assert(!locPredTransposed); const int chMultiplier = kshape[0]; - const int inCh = kshape[1]; - const int height = kshape[2]; - const int width = kshape[3]; Mat copy = layerParams.blobs[0].clone(); float* src = (float*)copy.data; @@ -814,9 +835,21 @@ void TFImporter::populateNet(Net dstNet) size_t* kstep = layerParams.blobs[0].step.p; kstep[0] = kstep[1]; // fix steps too } - layerParams.set("kernel_h", kshape[2]); - layerParams.set("kernel_w", kshape[3]); - layerParams.set("num_output", kshape[0]); + layerParams.set("kernel_h", height); + layerParams.set("kernel_w", width); + layerParams.set("num_output", outCh); + + // Shuffle output channels from yxYX to xyXY. + if (locPredTransposed) + { + const int slice = height * width * inCh; + for (int i = 0; i < outCh; i += 2) + { + cv::Mat src(1, slice, CV_32F, layerParams.blobs[0].ptr(i)); + cv::Mat dst(1, slice, CV_32F, layerParams.blobs[0].ptr(i + 1)); + std::swap_ranges(src.begin(), src.end(), dst.begin()); + } + } setStrides(layerParams, layer); setPadding(layerParams, layer); diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp index 6ab0e41e18..8b0a20747b 100644 --- a/modules/dnn/test/test_tf_importer.cpp +++ b/modules/dnn/test/test_tf_importer.cpp @@ -309,7 +309,7 @@ TEST_P(Test_TensorFlow_nets, Inception_v2_SSD) 0, 10, 0.95932811, 0.38349164, 0.32528657, 0.40387636, 0.39165527, 0, 10, 0.93973452, 0.66561931, 0.37841269, 0.68074018, 0.42907384); double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 5e-3 : default_l1; - double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.025 : default_lInf; + double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.09 : default_lInf; normAssertDetections(ref, out, "", 0.5, scoreDiff, iouDiff); } diff --git a/samples/dnn/tf_text_graph_ssd.py b/samples/dnn/tf_text_graph_ssd.py index 851e0d881e..1bf4079113 100644 --- a/samples/dnn/tf_text_graph_ssd.py +++ b/samples/dnn/tf_text_graph_ssd.py @@ -208,12 +208,18 @@ for label in ['ClassPredictor', 'BoxEncodingPredictor']: graph_def.node.extend([flatten]) addConcatNode('%s/concat' % label, concatInputs, 'concat/axis_flatten') +idx = 0 +for node in graph_def.node: + if node.name == ('BoxPredictor_%d/BoxEncodingPredictor/Conv2D' % idx): + text_format.Merge('b: true', node.attr["loc_pred_transposed"]) + idx += 1 +assert(idx == args.num_layers) + # Add layers that generate anchors (bounding boxes proposals). scales = [args.min_scale + (args.max_scale - args.min_scale) * i / (args.num_layers - 1) for i in range(args.num_layers)] + [1.0] priorBoxes = [] -addConstNode('reshape_prior_boxes_to_4d', [1, 2, -1, 1]) for i in range(args.num_layers): priorBox = NodeDef() priorBox.name = 'PriorBox_%d' % i @@ -240,18 +246,9 @@ for i in range(args.num_layers): text_format.Merge(tensorMsg([0.1, 0.1, 0.2, 0.2]), priorBox.attr["variance"]) graph_def.node.extend([priorBox]) + priorBoxes.append(priorBox.name) - # Reshape from 1x2xN to 1x2xNx1 - reshape = NodeDef() - reshape.name = priorBox.name + '/4d' - reshape.op = 'Reshape' - reshape.input.append(priorBox.name) - reshape.input.append('reshape_prior_boxes_to_4d') - graph_def.node.extend([reshape]) - - priorBoxes.append(reshape.name) - -addConcatNode('PriorBox/concat', priorBoxes, 'PriorBox/concat/axis') +addConcatNode('PriorBox/concat', priorBoxes, 'concat/axis_flatten') # Sigmoid for classes predictions and DetectionOutput layer sigmoid = NodeDef() @@ -276,7 +273,6 @@ text_format.Merge('i: 100', detectionOut.attr['top_k']) text_format.Merge('s: "CENTER_SIZE"', detectionOut.attr['code_type']) text_format.Merge('i: 100', detectionOut.attr['keep_top_k']) text_format.Merge('f: 0.01', detectionOut.attr['confidence_threshold']) -text_format.Merge('b: true', detectionOut.attr['loc_pred_transposed']) graph_def.node.extend([detectionOut]) From 270cc3bcbc6de8549c3dd861ef392accee793397 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Thu, 19 Jul 2018 16:14:50 +0300 Subject: [PATCH 06/19] videoio: add routines to query information about backends API into cv::videoio_registry namespace --- modules/python/src2/cv2.cpp | 3 +- modules/python/test/test_videoio.py | 25 ++++++++ modules/videoio/include/opencv2/videoio.hpp | 1 + .../include/opencv2/videoio/registry.hpp | 44 +++++++++++++ .../videoio/misc/python/pyopencv_videoio.hpp | 50 +++++++++++++++ modules/videoio/src/videoio_registry.cpp | 56 ++++++++++++++++ modules/videoio/test/test_precomp.hpp | 27 ++++++++ modules/videoio/test/test_video_io.cpp | 64 +++---------------- 8 files changed, 213 insertions(+), 57 deletions(-) create mode 100644 modules/python/test/test_videoio.py create mode 100644 modules/videoio/include/opencv2/videoio/registry.hpp create mode 100644 modules/videoio/misc/python/pyopencv_videoio.hpp diff --git a/modules/python/src2/cv2.cpp b/modules/python/src2/cv2.cpp index 179bf678b2..03fd912104 100644 --- a/modules/python/src2/cv2.cpp +++ b/modules/python/src2/cv2.cpp @@ -1563,8 +1563,6 @@ PyObject* pyopencv_from(const Moments& m) "nu30", m.nu30, "nu21", m.nu21, "nu12", m.nu12, "nu03", m.nu03); } -#include "pyopencv_custom_headers.h" - static int OnError(int status, const char *func_name, const char *err_msg, const char *file_name, int line, void *userdata) { PyGILState_STATE gstate; @@ -1802,6 +1800,7 @@ static int convert_to_char(PyObject *o, char *dst, const char *name = "no_name") # pragma GCC diagnostic ignored "-Wmissing-field-initializers" #endif +#include "pyopencv_custom_headers.h" #include "pyopencv_generated_types.h" #include "pyopencv_generated_funcs.h" diff --git a/modules/python/test/test_videoio.py b/modules/python/test/test_videoio.py new file mode 100644 index 0000000000..2bbfeecda0 --- /dev/null +++ b/modules/python/test/test_videoio.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python +from __future__ import print_function + +import numpy as np +import cv2 as cv + +from tests_common import NewOpenCVTests + +class Bindings(NewOpenCVTests): + + def check_name(self, name): + #print(name) + self.assertFalse(name == None) + self.assertFalse(name == "") + + def test_registry(self): + self.check_name(cv.videoio_registry.getBackendName(cv.CAP_ANY)); + self.check_name(cv.videoio_registry.getBackendName(cv.CAP_FFMPEG)) + self.check_name(cv.videoio_registry.getBackendName(cv.CAP_OPENCV_MJPEG)) + backends = cv.videoio_registry.getBackends() + for backend in backends: + self.check_name(cv.videoio_registry.getBackendName(backend)) + +if __name__ == '__main__': + NewOpenCVTests.bootstrap() diff --git a/modules/videoio/include/opencv2/videoio.hpp b/modules/videoio/include/opencv2/videoio.hpp index d43e703984..eef840b6ec 100644 --- a/modules/videoio/include/opencv2/videoio.hpp +++ b/modules/videoio/include/opencv2/videoio.hpp @@ -59,6 +59,7 @@ @defgroup videoio_c C API for video I/O @defgroup videoio_ios iOS glue for video I/O @defgroup videoio_winrt WinRT glue for video I/O + @defgroup videoio_registry Query I/O API backends registry @} */ diff --git a/modules/videoio/include/opencv2/videoio/registry.hpp b/modules/videoio/include/opencv2/videoio/registry.hpp new file mode 100644 index 0000000000..7404c68116 --- /dev/null +++ b/modules/videoio/include/opencv2/videoio/registry.hpp @@ -0,0 +1,44 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_VIDEOIO_REGISTRY_HPP +#define OPENCV_VIDEOIO_REGISTRY_HPP + +#include + +namespace cv { namespace videoio_registry { +/** @addtogroup videoio_registry +This section contains API description how to query/configure available Video I/O backends. + +Runtime configuration options: +- enable debug mode: `OPENCV_VIDEOIO_DEBUG=1` +- change backend priority: `OPENCV_VIDEOIO_PRIORITY_=9999` +- disable backend: `OPENCV_VIDEOIO_PRIORITY_=0` +- specify list of backends with high priority (>100000): `OPENCV_VIDEOIO_PRIORITY_LIST=FFMPEG,GSTREAMER` + +@{ + */ + + +/** @brief Returns backend API name or "unknown" +@param api backend ID (#VideoCaptureAPIs) +*/ +CV_EXPORTS_W cv::String getBackendName(VideoCaptureAPIs api); + +/** @brief Returns list of all builtin backends */ +CV_EXPORTS_W std::vector getBackends(); + +/** @brief Returns list of available backends which works via `cv::VideoCapture(int index)` */ +CV_EXPORTS_W std::vector getCameraBackends(); + +/** @brief Returns list of available backends which works via `cv::VideoCapture(filename)` */ +CV_EXPORTS_W std::vector getStreamBackends(); + +/** @brief Returns list of available backends which works via `cv::VideoWriter()` */ +CV_EXPORTS_W std::vector getWriterBackends(); + +//! @} +}} // namespace + +#endif // OPENCV_VIDEOIO_REGISTRY_HPP diff --git a/modules/videoio/misc/python/pyopencv_videoio.hpp b/modules/videoio/misc/python/pyopencv_videoio.hpp new file mode 100644 index 0000000000..453a57a126 --- /dev/null +++ b/modules/videoio/misc/python/pyopencv_videoio.hpp @@ -0,0 +1,50 @@ +#ifdef HAVE_OPENCV_VIDEOIO +typedef std::vector vector_VideoCaptureAPIs; + +template<> +bool pyopencv_to(PyObject *o, cv::VideoCaptureAPIs &v, const char *name) +{ + (void)name; + v = CAP_ANY; + if (!o || o == Py_None) + return false; + else if (PyLong_Check(o)) + { + v = VideoCaptureAPIs((int64)PyLong_AsLongLong(o)); + return true; + } + else if (PyInt_Check(o)) + { + v = VideoCaptureAPIs((int64)PyInt_AS_LONG(o)); + return true; + } + else + return false; +} + +template<> +PyObject* pyopencv_from(const cv::VideoCaptureAPIs &v) +{ + return pyopencv_from((int)(v)); +} + +template<> struct pyopencvVecConverter +{ + static bool to(PyObject* obj, std::vector& value, const ArgInfo info) + { + return pyopencv_to_generic_vec(obj, value, info); + } + + static PyObject* from(const std::vector& value) + { + return pyopencv_from_generic_vec(value); + } +}; + +template<> +bool pyopencv_to(PyObject *o, std::vector& apis, const char *name) +{ + return pyopencvVecConverter::to(o, apis, ArgInfo(name, false)); +} + +#endif // HAVE_OPENCV_VIDEOIO diff --git a/modules/videoio/src/videoio_registry.cpp b/modules/videoio/src/videoio_registry.cpp index 9f0abc512b..85fc239ad9 100644 --- a/modules/videoio/src/videoio_registry.cpp +++ b/modules/videoio/src/videoio_registry.cpp @@ -6,6 +6,8 @@ #include "videoio_registry.hpp" +#include "opencv2/videoio/registry.hpp" + #include "cap_intelperc.hpp" #include "cap_dshow.hpp" @@ -247,6 +249,8 @@ public: return g_instance; } + inline std::vector getEnabledBackends() const { return enabledBackends; } + inline std::vector getAvailableBackends_CaptureByIndex() const { std::vector result; @@ -302,6 +306,58 @@ std::vector getAvailableBackends_Writer() return result; } +cv::String getBackendName(VideoCaptureAPIs api) +{ + if (api == CAP_ANY) + return "CAP_ANY"; // special case, not a part of backends list + const int N = sizeof(builtin_backends)/sizeof(builtin_backends[0]); + for (size_t i = 0; i < N; i++) + { + const VideoBackendInfo& backend = builtin_backends[i]; + if (backend.id == api) + return backend.name; + } + return cv::format("UnknownVideoAPI(%d)", (int)api); +} + +std::vector getBackends() +{ + std::vector backends = VideoBackendRegistry::getInstance().getEnabledBackends(); + std::vector result; + for (size_t i = 0; i < backends.size(); i++) + result.push_back((VideoCaptureAPIs)backends[i].id); + return result; +} + +std::vector getCameraBackends() +{ + const std::vector backends = VideoBackendRegistry::getInstance().getAvailableBackends_CaptureByIndex(); + std::vector result; + for (size_t i = 0; i < backends.size(); i++) + result.push_back((VideoCaptureAPIs)backends[i].id); + return result; + +} + +std::vector getStreamBackends() +{ + const std::vector backends = VideoBackendRegistry::getInstance().getAvailableBackends_CaptureByFilename(); + std::vector result; + for (size_t i = 0; i < backends.size(); i++) + result.push_back((VideoCaptureAPIs)backends[i].id); + return result; + +} + +std::vector getWriterBackends() +{ + const std::vector backends = VideoBackendRegistry::getInstance().getAvailableBackends_Writer(); + std::vector result; + for (size_t i = 0; i < backends.size(); i++) + result.push_back((VideoCaptureAPIs)backends[i].id); + return result; +} + } // namespace registry #define TRY_OPEN(backend_func) \ diff --git a/modules/videoio/test/test_precomp.hpp b/modules/videoio/test/test_precomp.hpp index 8d9f5e0358..e3612adc1e 100644 --- a/modules/videoio/test/test_precomp.hpp +++ b/modules/videoio/test/test_precomp.hpp @@ -6,10 +6,26 @@ #include "opencv2/ts.hpp" #include "opencv2/videoio.hpp" +#include "opencv2/videoio/registry.hpp" #include "opencv2/imgproc/imgproc_c.h" #include "opencv2/core/private.hpp" +namespace cv { + +inline std::ostream &operator<<(std::ostream &out, const VideoCaptureAPIs& api) +{ + out << cv::videoio_registry::getBackendName(api); return out; +} + +static inline void PrintTo(const cv::VideoCaptureAPIs& api, std::ostream* os) +{ + *os << cv::videoio_registry::getBackendName(api); +} + +} // namespace + + inline std::string fourccToString(int fourcc) { return cv::format("%c%c%c%c", fourcc & 255, (fourcc >> 8) & 255, (fourcc >> 16) & 255, (fourcc >> 24) & 255); @@ -55,4 +71,15 @@ public: } }; + +static inline bool isBackendAvailable(cv::VideoCaptureAPIs api, const std::vector& api_list) +{ + for (size_t i = 0; i < api_list.size(); i++) + { + if (api_list[i] == api) + return true; + } + return false; +} + #endif diff --git a/modules/videoio/test/test_video_io.cpp b/modules/videoio/test/test_video_io.cpp index c1834be5ec..7dcdc1d574 100644 --- a/modules/videoio/test/test_video_io.cpp +++ b/modules/videoio/test/test_video_io.cpp @@ -46,62 +46,12 @@ namespace opencv_test { -struct VideoCaptureAPI -{ - VideoCaptureAPIs api; - - inline const char * toString() const - { - switch (api) - { - case CAP_ANY: return "CAP_ANY"; - #ifdef __linux__ - case CAP_V4L2: return "CAP_V4L/CAP_V4L2"; - #else - case CAP_VFW: return "CAP_VFW"; - #endif - case CAP_FIREWIRE: return "CAP_FIREWIRE"; - case CAP_QT: return "CAP_QT"; - case CAP_UNICAP: return "CAP_UNICAP"; - case CAP_DSHOW: return "CAP_DSHOW"; - case CAP_PVAPI: return "CAP_PVAPI"; - case CAP_OPENNI: return "CAP_OPENNI"; - case CAP_OPENNI_ASUS: return "CAP_OPENNI_ASUS"; - case CAP_ANDROID: return "CAP_ANDROID"; - case CAP_XIAPI: return "CAP_XIAPI"; - case CAP_AVFOUNDATION: return "CAP_AVFOUNDATION"; - case CAP_GIGANETIX: return "CAP_GIGANETIX"; - case CAP_MSMF: return "CAP_MSMF"; - case CAP_WINRT: return "CAP_WINRT"; - case CAP_INTELPERC: return "CAP_INTELPERC"; - case CAP_OPENNI2: return "CAP_OPENNI2"; - case CAP_OPENNI2_ASUS: return "CAP_OPENNI2_ASUS"; - case CAP_GPHOTO2: return "CAP_GPHOTO2"; - case CAP_GSTREAMER: return "CAP_GSTREAMER"; - case CAP_FFMPEG: return "CAP_FFMPEG"; - case CAP_IMAGES: return "CAP_IMAGES"; - case CAP_ARAVIS: return "CAP_ARAVIS"; - case CAP_OPENCV_MJPEG: return "CAP_OPENCV_MJPEG"; - case CAP_INTEL_MFX: return "CAP_INTEL_MFX"; - case CAP_XINE: return "CAP_XINE"; - } - return "unknown"; - } - VideoCaptureAPI(int api_ = CAP_ANY) : api((VideoCaptureAPIs)api_) {} - operator int() { return api; } -}; - -inline std::ostream &operator<<(std::ostream &out, const VideoCaptureAPI & api) -{ - out << api.toString(); return out; -} - class Videoio_Test_Base { protected: string ext; string video_file; - VideoCaptureAPI apiPref; + VideoCaptureAPIs apiPref; protected: Videoio_Test_Base() {} virtual ~Videoio_Test_Base() {} @@ -131,6 +81,8 @@ protected: public: void doTest() { + if (!isBackendAvailable(apiPref, cv::videoio_registry::getStreamBackends())) + throw SkipTestException(cv::String("Backend is not available/disabled: ") + cv::videoio_registry::getBackendName(apiPref)); VideoCapture cap; ASSERT_NO_THROW(cap.open(video_file, apiPref)); if (!cap.isOpened()) @@ -200,7 +152,7 @@ public: }; //================================================================================================== -typedef tuple Backend_Type_Params; +typedef tuple Backend_Type_Params; class Videoio_Bunny : public Videoio_Test_Base, public testing::TestWithParam { @@ -214,6 +166,8 @@ public: } void doFrameCountTest() { + if (!isBackendAvailable(apiPref, cv::videoio_registry::getStreamBackends())) + throw SkipTestException(cv::String("Backend is not available/disabled: ") + cv::videoio_registry::getBackendName(apiPref)); VideoCapture cap; EXPECT_NO_THROW(cap.open(video_file, apiPref)); if (!cap.isOpened()) @@ -274,7 +228,7 @@ struct Ext_Fourcc_PSNR string ext; string fourcc; float PSNR; - VideoCaptureAPI api; + VideoCaptureAPIs api; }; typedef tuple Size_Ext_Fourcc_PSNR; @@ -348,7 +302,7 @@ public: //================================================================================================== -static VideoCaptureAPI backend_params[] = { +static const VideoCaptureAPIs backend_params[] = { #ifdef HAVE_QUICKTIME CAP_QT, #endif @@ -383,7 +337,7 @@ static VideoCaptureAPI backend_params[] = { // CAP_INTEL_MFX }; -static string bunny_params[] = { +static const string bunny_params[] = { #ifdef HAVE_VIDEO_INPUT string("wmv"), string("mov"), From a4060e15a4fd5dde58adec0c7db5b3202523b690 Mon Sep 17 00:00:00 2001 From: Maksim Shabunin Date: Thu, 19 Jul 2018 19:22:23 +0300 Subject: [PATCH 07/19] dnn, IE backend: updated to match new interface --- modules/dnn/src/op_inf_engine.cpp | 43 +++++++++++++++++++++++++++---- modules/dnn/src/op_inf_engine.hpp | 30 ++++++++++++++++----- 2 files changed, 62 insertions(+), 11 deletions(-) diff --git a/modules/dnn/src/op_inf_engine.cpp b/modules/dnn/src/op_inf_engine.cpp index f60efef95d..bcf2c2a3d9 100644 --- a/modules/dnn/src/op_inf_engine.cpp +++ b/modules/dnn/src/op_inf_engine.cpp @@ -180,10 +180,15 @@ InferenceEngine::Precision InfEngineBackendNet::getPrecision() noexcept return precision; } +InferenceEngine::Precision InfEngineBackendNet::getPrecision() const noexcept +{ + return precision; +} + // Assume that outputs of network is unconnected blobs. void InfEngineBackendNet::getOutputsInfo(InferenceEngine::OutputsDataMap &outputs_) noexcept { - outputs_ = outputs; + const_cast(this)->getOutputsInfo(outputs_); } void InfEngineBackendNet::getOutputsInfo(InferenceEngine::OutputsDataMap &outputs_) const noexcept { @@ -193,7 +198,7 @@ void InfEngineBackendNet::getOutputsInfo(InferenceEngine::OutputsDataMap &output // Returns input references that aren't connected to internal outputs. void InfEngineBackendNet::getInputsInfo(InferenceEngine::InputsDataMap &inputs_) noexcept { - inputs_ = inputs; + const_cast(this)->getInputsInfo(inputs_); } // Returns input references that aren't connected to internal outputs. @@ -204,7 +209,11 @@ void InfEngineBackendNet::getInputsInfo(InferenceEngine::InputsDataMap &inputs_) InferenceEngine::InputInfo::Ptr InfEngineBackendNet::getInput(const std::string &inputName) noexcept { - getInputsInfo(inputs); + return const_cast(this)->getInput(inputName); +} + +InferenceEngine::InputInfo::Ptr InfEngineBackendNet::getInput(const std::string &inputName) const noexcept +{ const auto& it = inputs.find(inputName); CV_Assert(it != inputs.end()); return it->second; @@ -218,7 +227,17 @@ void InfEngineBackendNet::getName(char*, size_t) const noexcept { } +const std::string& InfEngineBackendNet::getName() const noexcept +{ + return name; +} + size_t InfEngineBackendNet::layerCount() noexcept +{ + return const_cast(this)->layerCount(); +} + +size_t InfEngineBackendNet::layerCount() const noexcept { return layers.size(); } @@ -258,6 +277,13 @@ InfEngineBackendNet::addOutput(const std::string &layerName, size_t outputIndex, InferenceEngine::StatusCode InfEngineBackendNet::getLayerByName(const char *layerName, InferenceEngine::CNNLayerPtr &out, InferenceEngine::ResponseDesc *resp) noexcept +{ + return const_cast(this)->getLayerByName(layerName, out, resp); +} + +InferenceEngine::StatusCode InfEngineBackendNet::getLayerByName(const char *layerName, + InferenceEngine::CNNLayerPtr &out, + InferenceEngine::ResponseDesc *resp) const noexcept { for (auto& l : layers) { @@ -285,7 +311,12 @@ InferenceEngine::TargetDevice InfEngineBackendNet::getTargetDevice() noexcept return targetDevice; } -InferenceEngine::StatusCode InfEngineBackendNet::setBatchSize(const size_t size) noexcept +InferenceEngine::TargetDevice InfEngineBackendNet::getTargetDevice() const noexcept +{ + return targetDevice; +} + +InferenceEngine::StatusCode InfEngineBackendNet::setBatchSize(const size_t) noexcept { CV_Error(Error::StsNotImplemented, ""); return InferenceEngine::StatusCode::OK; @@ -374,7 +405,9 @@ void InfEngineBackendNet::init(int targetId) switch (targetId) { case DNN_TARGET_CPU: setTargetDevice(InferenceEngine::TargetDevice::eCPU); break; - case DNN_TARGET_OPENCL_FP16: setPrecision(InferenceEngine::Precision::FP16); // Fallback to the next. + case DNN_TARGET_OPENCL_FP16: + setPrecision(InferenceEngine::Precision::FP16); + /* Falls through. */ case DNN_TARGET_OPENCL: setTargetDevice(InferenceEngine::TargetDevice::eGPU); break; case DNN_TARGET_MYRIAD: { diff --git a/modules/dnn/src/op_inf_engine.hpp b/modules/dnn/src/op_inf_engine.hpp index 4295e10417..a33d93cb03 100644 --- a/modules/dnn/src/op_inf_engine.hpp +++ b/modules/dnn/src/op_inf_engine.hpp @@ -8,6 +8,8 @@ #ifndef __OPENCV_DNN_OP_INF_ENGINE_HPP__ #define __OPENCV_DNN_OP_INF_ENGINE_HPP__ +#include "opencv2/core/cvdef.h" + #ifdef HAVE_INF_ENGINE #if defined(__GNUC__) && __GNUC__ >= 5 //#pragma GCC diagnostic push @@ -34,7 +36,9 @@ public: void setPrecision(InferenceEngine::Precision p) noexcept; - virtual InferenceEngine::Precision getPrecision() noexcept CV_OVERRIDE; + virtual InferenceEngine::Precision getPrecision() noexcept; + + virtual InferenceEngine::Precision getPrecision() const noexcept; virtual void getOutputsInfo(InferenceEngine::OutputsDataMap &out) noexcept /*CV_OVERRIDE*/; @@ -44,13 +48,19 @@ public: virtual void getInputsInfo(InferenceEngine::InputsDataMap &inputs) const noexcept /*CV_OVERRIDE*/; - virtual InferenceEngine::InputInfo::Ptr getInput(const std::string &inputName) noexcept CV_OVERRIDE; + virtual InferenceEngine::InputInfo::Ptr getInput(const std::string &inputName) noexcept; + + virtual InferenceEngine::InputInfo::Ptr getInput(const std::string &inputName) const noexcept; virtual void getName(char *pName, size_t len) noexcept; virtual void getName(char *pName, size_t len) const noexcept; - virtual size_t layerCount() noexcept CV_OVERRIDE; + virtual const std::string& getName() const noexcept; + + virtual size_t layerCount() noexcept; + + virtual size_t layerCount() const noexcept; virtual InferenceEngine::DataPtr& getData(const char *dname) noexcept CV_OVERRIDE; @@ -58,15 +68,21 @@ public: virtual InferenceEngine::StatusCode addOutput(const std::string &layerName, size_t outputIndex = 0, - InferenceEngine::ResponseDesc *resp = nullptr) noexcept CV_OVERRIDE; + InferenceEngine::ResponseDesc *resp = nullptr) noexcept; virtual InferenceEngine::StatusCode getLayerByName(const char *layerName, InferenceEngine::CNNLayerPtr &out, - InferenceEngine::ResponseDesc *resp) noexcept CV_OVERRIDE; + InferenceEngine::ResponseDesc *resp) noexcept; + + virtual InferenceEngine::StatusCode getLayerByName(const char *layerName, + InferenceEngine::CNNLayerPtr &out, + InferenceEngine::ResponseDesc *resp) const noexcept; virtual void setTargetDevice(InferenceEngine::TargetDevice device) noexcept CV_OVERRIDE; - virtual InferenceEngine::TargetDevice getTargetDevice() noexcept CV_OVERRIDE; + virtual InferenceEngine::TargetDevice getTargetDevice() noexcept; + + virtual InferenceEngine::TargetDevice getTargetDevice() const noexcept; virtual InferenceEngine::StatusCode setBatchSize(const size_t size) noexcept CV_OVERRIDE; @@ -94,6 +110,8 @@ private: InferenceEngine::ExecutableNetwork netExec; InferenceEngine::InferRequest infRequest; + std::string name; + void initPlugin(InferenceEngine::ICNNNetwork& net); }; From 18abe54497af36e27331bec7a611190b8d9e510e Mon Sep 17 00:00:00 2001 From: Tomoaki Teshima Date: Fri, 20 Jul 2018 18:09:17 +0900 Subject: [PATCH 08/19] fix build error on Visual Studio 2013 * replace binary literal prefix to hexadecimal literal prefix --- .../include/opencv2/core/hal/intrin_avx.hpp | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index 7e983fd24f..8654f4f022 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -526,13 +526,13 @@ inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1) template inline _Tpvec v256_combine_diagonal(const _Tpvec& a, const _Tpvec& b) -{ return _Tpvec(_mm256_blend_epi32(a.val, b.val, 0b11110000)); } +{ return _Tpvec(_mm256_blend_epi32(a.val, b.val, 0xf0)); } inline v_float32x8 v256_combine_diagonal(const v_float32x8& a, const v_float32x8& b) -{ return v256_blend<0b11110000>(a, b); } +{ return v256_blend<0xf0>(a, b); } inline v_float64x4 v256_combine_diagonal(const v_float64x4& a, const v_float64x4& b) -{ return v256_blend<0b1100>(a, b); } +{ return v256_blend<0xc>(a, b); } template inline _Tpvec v256_alignr_128(const _Tpvec& a, const _Tpvec& b) @@ -1687,7 +1687,7 @@ inline void v256_store_interleave_l4(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, { _Tpvec ab0 = v256_unpacklo(a, b); _Tpvec bc1 = v256_unpackhi(b, c); - _Tpvec ca10 = v256_swap_halves(v256_blend<0b1010>(c, a)); + _Tpvec ca10 = v256_swap_halves(v256_blend<0xa>(c, a)); v_store(ptr, v256_combine_diagonal(ab0, ca10)); v_store(ptr + _Tpvec::nlanes, v256_combine_diagonal(bc1, ab0)); @@ -1765,10 +1765,10 @@ inline void v256_store_interleave_l8(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, v256_zip(a, b, ab0, ab1); v256_zip(b, c, bc0, bc1); - _Tpvec cazg = v256_blend<0b10101010>(c, a); + _Tpvec cazg = v256_blend<0xaa>(c, a); _Tpvec abc0abc1(_mm256_unpacklo_epi64(ab0.val, cazg.val)); _Tpvec abc1abc2(_mm256_unpackhi_epi64(cazg.val, bc1.val)); - _Tpvec abc2abc0 = v256_reverse_64(v256_blend<0b11001100>(ab1, bc0)); + _Tpvec abc2abc0 = v256_reverse_64(v256_blend<0xcc>(ab1, bc0)); _Tpvec abc0 = v256_combine_diagonal(abc0abc1, abc2abc0); _Tpvec abc1 = v256_combine_diagonal(abc1abc2, abc0abc1); @@ -1785,7 +1785,7 @@ inline void v256_store_interleave_l8(float* ptr, const v_float32x8& a, const v_f v256_zip(a, b, ab0, ab1); v256_zip(b, c, bc0, bc1); - v_float32x8 cazg = v256_blend<0b10101010>(c, a); + v_float32x8 cazg = v256_blend<0xaa>(c, a); v_float32x8 abc0abc1(_mm256_shuffle_ps(ab0.val, cazg.val, _MM_SHUFFLE(1, 0, 1, 0))); v_float32x8 abc1abc2(_mm256_shuffle_ps(cazg.val, bc1.val, _MM_SHUFFLE(3, 2, 3, 2))); @@ -1811,14 +1811,14 @@ inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpv _Tpvec abc2 = v256_alignr_128(abc02, abc20); _Tpvec abc0 = v256_combine_diagonal(abc02, abc20); - a = v256_blend<0b10010010>(abc0, abc1); - a = v256_blend<0b01000100>(a, abc2); + a = v256_blend<0x92>(abc0, abc1); + a = v256_blend<0x44>(a, abc2); - b = v256_blend<0b00100100>(abc0, abc1); - b = v256_blend<0b10011001>(b, abc2); + b = v256_blend<0x24>(abc0, abc1); + b = v256_blend<0x99>(b, abc2); - c = v256_blend<0b01001001>(abc0, abc1); - c = v256_blend<0b00100010>(c, abc2); + c = v256_blend<0x49>(abc0, abc1); + c = v256_blend<0x22>(c, abc2); a = v256_shuffle<_MM_SHUFFLE(1, 2, 3, 0)>(a); b = v256_shuffle<_MM_SHUFFLE(2, 3, 0, 1)>(b); @@ -1887,14 +1887,14 @@ inline void v256_store_interleave_l16(_Tp* ptr, const _Tpvec& a, const _Tpvec& b v_uint32x8 bc0 = v_reinterpret_as_u32(v256_unpacklo(b, c)); v_uint32x8 bc1 = v_reinterpret_as_u32(v256_unpackhi(b, c)); - v_uint32x8 cazg = v_reinterpret_as_u32(v256_blend<0b10101010>(c, a)); + v_uint32x8 cazg = v_reinterpret_as_u32(v256_blend<0xaa>(c, a)); cazg = v256_shuffle<_MM_SHUFFLE(2, 1, 0, 3)>(cazg); - v_uint32x8 ac1ab1 = v256_blend<0b10101010>(ab1, bc1); + v_uint32x8 ac1ab1 = v256_blend<0xaa>(ab1, bc1); ac1ab1 = v256_shuffle<_MM_SHUFFLE(2, 1, 0, 3)>(ac1ab1); - v_uint32x8 abc001 = v256_blend<0b10101010>(ab0, cazg); - v_uint32x8 cabc0 = v256_blend<0b10101010>(cazg, bc0); + v_uint32x8 abc001 = v256_blend<0xaa>(ab0, cazg); + v_uint32x8 cabc0 = v256_blend<0xaa>(cazg, bc0); v_uint32x8 cabc1 = v256_unpacklo(cabc0, ac1ab1); v_uint32x8 bcab0 = v256_unpackhi(cabc1, abc001); From ee743afebe501c73a7fe562855cc4f46469fe389 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Fri, 20 Jul 2018 17:26:35 +0300 Subject: [PATCH 09/19] dnn(ocl): don't use getUMat() for long live objects --- modules/dnn/src/layers/convolution_layer.cpp | 18 ++++++++++-------- .../src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp | 4 ++-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 0f3c99c4a7..23f99e7a6b 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -866,6 +866,16 @@ public: for (int i = 0; i < inputs.size(); ++i) CV_Assert(inputs[i].u != outputs[0].u); + if (umat_blobs.empty()) + { + size_t n = blobs.size(); + umat_blobs.resize(n); + for (size_t i = 0; i < n; i++) + { + blobs[i].copyTo(umat_blobs[i]); + } + } + if (convolutionOp.empty()) { OCL4DNNConvConfig config; @@ -1637,14 +1647,6 @@ public: Ptr ConvolutionLayer::create(const LayerParams ¶ms) { Ptr l(new ConvolutionLayerImpl(params)); - -#ifdef HAVE_OPENCL - size_t n = params.blobs.size(); - l->umat_blobs.resize(n); - for (int i = 0; i < n; i++) - l->umat_blobs[i] = params.blobs[i].getUMat(ACCESS_READ); -#endif - return l; } diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp index c889c7d85c..a3a0936bd4 100644 --- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp +++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp @@ -563,10 +563,10 @@ bool OCL4DNNConvSpatial::Forward(const UMat& bottom, } if (use_half_ && bias_half.empty() && !bias.empty()) - convertFp16((UMat&)bias, bias_half); + convertFp16(bias, bias_half); if (use_half_ && weights_half.empty()) - convertFp16((UMat&)weight, weights_half); + convertFp16(weight, weights_half); prepareKernel(bottom, top, weight, (use_half_) ? bias_half : bias, numImages); if (bestKernelConfig.empty()) From dadde75ef02ed82ea4941aca23a7046d9048d45e Mon Sep 17 00:00:00 2001 From: Teng Yiliang Date: Thu, 19 Jul 2018 16:40:27 +0800 Subject: [PATCH 10/19] use struct member width_set/height_set to replace static width/height. the static variables will cause race-condition when operating in multithread scenarios. Signed-off-by: Teng Yiliang Signed-off-by: Teng Yiliang --- modules/videoio/src/cap_v4l.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/modules/videoio/src/cap_v4l.cpp b/modules/videoio/src/cap_v4l.cpp index adf5524e39..830fefaa37 100644 --- a/modules/videoio/src/cap_v4l.cpp +++ b/modules/videoio/src/cap_v4l.cpp @@ -277,6 +277,7 @@ struct CvCaptureCAM_V4L CV_FINAL : public CvCapture __u32 palette; int width, height; + int width_set, height_set; int bufferSize; __u32 fps; bool convert_rgb; @@ -795,6 +796,7 @@ bool CvCaptureCAM_V4L::open(const char* _deviceName) FirstCapture = 1; width = DEFAULT_V4L_WIDTH; height = DEFAULT_V4L_HEIGHT; + width_set = height_set = 0; bufferSize = DEFAULT_V4L_BUFFERS; fps = DEFAULT_V4L_FPS; convert_rgb = true; @@ -1748,7 +1750,6 @@ static bool icvSetControl (CvCaptureCAM_V4L* capture, static int icvSetPropertyCAM_V4L( CvCaptureCAM_V4L* capture, int property_id, double value ){ - static int width = 0, height = 0; bool retval = false; bool possible; @@ -1757,6 +1758,9 @@ static int icvSetPropertyCAM_V4L( CvCaptureCAM_V4L* capture, switch (property_id) { case CV_CAP_PROP_FRAME_WIDTH: + { + int& width = capture->width_set; + int& height = capture->height_set; width = cvRound(value); retval = width != 0; if(width !=0 && height != 0) { @@ -1765,8 +1769,12 @@ static int icvSetPropertyCAM_V4L( CvCaptureCAM_V4L* capture, retval = v4l2_reset(capture); width = height = 0; } - break; + } + break; case CV_CAP_PROP_FRAME_HEIGHT: + { + int& width = capture->width_set; + int& height = capture->height_set; height = cvRound(value); retval = height != 0; if(width !=0 && height != 0) { @@ -1775,7 +1783,8 @@ static int icvSetPropertyCAM_V4L( CvCaptureCAM_V4L* capture, retval = v4l2_reset(capture); width = height = 0; } - break; + } + break; case CV_CAP_PROP_FPS: capture->fps = value; retval = v4l2_reset(capture); From 8c7555523b56bf503bf67b139cc036e0b62435ee Mon Sep 17 00:00:00 2001 From: Paul92 Date: Sun, 22 Jul 2018 15:08:29 +0100 Subject: [PATCH 11/19] Merge pull request #12032 from Paul92:mser-sample-improvments Mser sample improvments (#12032) * Fixed bug in detect_mser sample Wrong number of colors used to generate the synthetic images * Formatting improvements * Using safer casts * Improved readability of legend generation * Various readability fixes in detect_mser sample --- samples/cpp/detect_mser.cpp | 293 +++++++++++++++++------------------- 1 file changed, 142 insertions(+), 151 deletions(-) diff --git a/samples/cpp/detect_mser.cpp b/samples/cpp/detect_mser.cpp index a3c3856d2a..8d62b2b7e4 100644 --- a/samples/cpp/detect_mser.cpp +++ b/samples/cpp/detect_mser.cpp @@ -7,6 +7,9 @@ #include #include #include +#include +#include +#include #ifdef HAVE_OPENGL #ifdef _WIN32 #define WIN32_LEAN_AND_MEAN 1 @@ -36,17 +39,17 @@ static void help() cout << "\n This program demonstrates how to use MSER to detect extremal regions \n" "Usage: \n" " ./detect_mser \n" - "Press esc key when image window is active to change descriptor parameter\n" + "Press esc key when image window is active to change descriptor parameter\n" "Press 2, 8, 4, 6, +,- or 5 keys in openGL windows to change view or use mouse\n"; } struct MSERParams { MSERParams(int _delta = 5, int _min_area = 60, int _max_area = 14400, - double _max_variation = 0.25, double _min_diversity = .2, - int _max_evolution = 200, double _area_threshold = 1.01, - double _min_margin = 0.003, int _edge_blur_size = 5) - { + double _max_variation = 0.25, double _min_diversity = .2, + int _max_evolution = 200, double _area_threshold = 1.01, + double _min_margin = 0.003, int _edge_blur_size = 5) + { delta = _delta; minArea = _min_area; maxArea = _max_area; @@ -57,7 +60,7 @@ struct MSERParams minMargin = _min_margin; edgeBlurSize = _edge_blur_size; pass2Only = false; - } + } int delta; int minArea; @@ -72,30 +75,20 @@ struct MSERParams int edgeBlurSize; }; -static String Legende(MSERParams &pAct) +static String Legende(const MSERParams &pAct) { - String s=""; - String inf = static_cast(ostringstream() << pAct.minArea).str(); - String sup = static_cast(ostringstream() << pAct.maxArea).str(); - s = " Area[" + inf + "," + sup + "]"; - - inf = static_cast(ostringstream() << pAct.delta).str(); - s += " del. [" + inf + "]"; - inf = static_cast(ostringstream() << pAct.maxVariation).str(); - s += " var. [" + inf + "]"; - inf = static_cast(ostringstream() << (int)pAct.minDiversity).str(); - s += " div. [" + inf + "]"; - inf = static_cast(ostringstream() << (int)pAct.pass2Only).str(); - s += " pas. [" + inf + "]"; - inf = static_cast(ostringstream() << (int)pAct.maxEvolution).str(); - s += "RGb-> evo. [" + inf + "]"; - inf = static_cast(ostringstream() << (int)pAct.areaThreshold).str(); - s += " are. [" + inf + "]"; - inf = static_cast(ostringstream() << (int)pAct.minMargin).str(); - s += " mar. [" + inf + "]"; - inf = static_cast(ostringstream() << (int)pAct.edgeBlurSize).str(); - s += " siz. [" + inf + "]"; - return s; + ostringstream ss; + ss << "Area[" << pAct.minArea << "," << pAct.maxArea << "] "; + ss << "del. [" << pAct.delta << "] "; + ss << "var. [" << pAct.maxVariation << "] "; + ss << "div. [" << (int)pAct.minDiversity << "] "; + ss << "pas. [" << (int)pAct.pass2Only << "] "; + ss << "RGb->evo. [" << pAct.maxEvolution << "] "; + ss << "are. [" << (int)pAct.areaThreshold << "] "; + ss << "mar. [" << (int)pAct.minMargin << "] "; + ss << "siz. [" << pAct.edgeBlurSize << "]"; + + return ss.str(); } @@ -109,18 +102,28 @@ bool keyPressed=false; Vec4f rotAxis(1,0,1,0); Vec3f zoom(1,0,0); -float obsX = (float)0, obsY = (float)0, obsZ = (float)-10, tx = (float)0, ty = (float)0; -float thetaObs = (float)-1.570, phiObs = (float)1.570, rObs = (float)10; -int prevX=-1,prevY=-1,prevTheta=-1000,prevPhi=-1000; +float obsX = 0.f; +float obsY = 0.f; +float obsZ = -10.f; +float tx = 0.f; +float ty = 0.f; + +float thetaObs = -1.570f; +float phiObs = 1.570f; +float rObs = 10.f; + +int prevX = -1; +int prevY = -1; +int prevTheta = -1000; +int prevPhi = -1000; #ifdef HAVE_OPENGL struct DrawData - - { +{ ogl::Arrays arr; ogl::Texture2D tex; ogl::Buffer indices; - }; +}; static void draw(void* userdata) @@ -167,19 +170,19 @@ static void onMouse(int event, int x, int y, int flags, void*) { if (x - prevTheta<0) { - thetaObs +=(float)0.02; + thetaObs += 0.02f; } else if (x - prevTheta>0) { - thetaObs -= (float)0.02; + thetaObs -= 0.02f; } if (y - prevPhi<0) { - phiObs -= (float)0.02; + phiObs -= 0.02f; } else if (y - prevPhi>0) { - phiObs += (float)0.02; + phiObs += 0.02f; } prevTheta = x; prevPhi = y; @@ -187,9 +190,9 @@ static void onMouse(int event, int x, int y, int flags, void*) if (event==EVENT_MOUSEWHEEL) { if (getMouseWheelDelta(flags)>0) - rObs += (float)0.1; + rObs += 0.1f; else - rObs -= (float)0.1; + rObs -= 0.1f; } float pi = static_cast(CV_PI); if (thetaObs>pi) @@ -202,11 +205,11 @@ static void onMouse(int event, int x, int y, int flags, void*) } if (phiObs>pi / 2) { - phiObs = pi / 2 - (float)0.0001; + phiObs = pi / 2 - 0.0001f; } if (phiObs<-pi / 2) { - phiObs = -pi / 2 + (float)0.00001; + phiObs = -pi / 2 + 0.00001f; } if (rObs<0) { @@ -224,36 +227,37 @@ static void DrawOpenGLMSER(Mat img, Mat result) cvtColor(img, imgGray, COLOR_BGR2GRAY); else imgGray = img; + namedWindow("OpenGL", WINDOW_OPENGL); setMouseCallback("OpenGL", onMouse, NULL); Mat_ vertex(1, img.cols*img.rows); Mat_ texCoords(1, img.cols*img.rows); for (int i = 0, nbPix = 0; i(0, nbPix) = Vec3f(float(2 * (x - 0.5)), float(2 * (0.5 - y)), float(imgGray.at(i, j) / 512.0)); texCoords.at< Vec2f>(0, nbPix) = Vec2f(x, y); - } } + } Mat_ indices(1, (img.rows - 1)*(6 * img.cols)); for (int i = 1, nbPix = 0; i(0, nbPix++) = c ; + indices.at(0, nbPix++) = c; indices.at(0, nbPix++) = c - 1; - indices.at(0, nbPix++) = c- img.cols - 1; - indices.at(0, nbPix++) = c- img.cols - 1; + indices.at(0, nbPix++) = c - img.cols - 1; + indices.at(0, nbPix++) = c - img.cols - 1; indices.at(0, nbPix++) = c - img.cols; - indices.at(0, nbPix++) = c ; - } + indices.at(0, nbPix++) = c; } + } DrawData *data = new DrawData; @@ -279,7 +283,7 @@ static void DrawOpenGLMSER(Mat img, Mat result) setOpenGlDrawCallback("OpenGL", draw, data); for (;;) - { + { updateWindow("OpenGL"); char key = (char)waitKey(40); if (key == 27) @@ -292,27 +296,28 @@ static void DrawOpenGLMSER(Mat img, Mat result) case '5': obsX = 0, obsY = 0, obsZ = -10; thetaObs = -pi/2, phiObs = pi/2, rObs = 10; - tx=0;ty=0; + tx=0; ty=0; break; case '4': - thetaObs += (float)0.1; + thetaObs += 0.1f; break; case '6': - thetaObs -= (float)0.1; + thetaObs -= 0.1f; break; case '2': - phiObs -= (float).1; + phiObs -= 0.1f; break; case '8': - phiObs += (float).1; + phiObs += 0.1f; break; case '+': - rObs -= (float).1; + rObs -= 0.1f; break; case '-': - rObs += (float).1; + rObs += 0.1f; break; } + if (thetaObs>pi) { thetaObs = -2 * pi + thetaObs; @@ -320,9 +325,9 @@ static void DrawOpenGLMSER(Mat img, Mat result) if (thetaObs<-pi) thetaObs = 2 * pi + thetaObs; if (phiObs>pi / 2) - phiObs = pi / 2 - (float)0.0001; + phiObs = pi / 2 - 0.0001f; if (phiObs<-pi / 2) - phiObs = -pi / 2 + (float)0.00001; + phiObs = -pi / 2 + 0.00001f; if (rObs<0) rObs = 0; obsX = rObs*cos(thetaObs)*cos(phiObs); @@ -334,67 +339,59 @@ static void DrawOpenGLMSER(Mat img, Mat result) } #endif +// Add nested rectangles of different widths and colors to an image +static void addNestedRectangles(Mat &img, Point p0, int* width, int *color, int n) { + for (int i = 0; i val; - int fond = 0; img = Scalar(fond); - val[fond] = 1; - int width1[] = { 390, 380, 300, 290, 280, 270, 260, 250, 210, 190, 150, 100, 80, 70 }; - int color1[] = { 80, 180, 160, 140, 120, 100, 90, 110, 170, 150, 140, 100, 220 }; - Point p0(10, 10); - int *width, *color; - width = width1; - color = color1; - for (int i = 0; i<13; i++) - { - rectangle(img, Rect(p0, Size(width[i], width[i])), Scalar(color[i]), 1); - p0 += Point((width[i] - width[i + 1]) / 2, (width[i] - width[i + 1]) / 2); - floodFill(img, p0, Scalar(color[i])); + int width[] = { 390, 380, 300, 290, 280, 270, 260, 250, 210, 190, 150, 100, 80, 70 }; - } + int color1[] = { 80, 180, 160, 140, 120, 100, 90, 110, 170, 150, 140, 100, 220 }; int color2[] = { 81, 181, 161, 141, 121, 101, 91, 111, 171, 151, 141, 101, 221 }; - color = color2; - p0 = Point(200, 600); - for (int i = 0; i<13; i++) - { - circle(img, p0, width[i] / 2, Scalar(color[i]), 1); - floodFill(img, p0, Scalar(color[i])); + int color3[] = { 175, 75, 95, 115, 135, 155, 165, 145, 85, 105, 115, 155, 35 }; + int color4[] = { 173, 73, 93, 113, 133, 153, 163, 143, 83, 103, 113, 153, 33 }; - } - int color3[] = { 175,75,95,115,135,155,165,145,85,105,115,156 }; - color = color3; - p0 = Point(410, 10); - for (int i = 0; i<13; i++) - { - rectangle(img, Rect(p0, Size(width[i], width[i])), Scalar(color[i]), 1); - p0 += Point((width[i] - width[i + 1]) / 2, (width[i] - width[i + 1]) / 2); - floodFill(img, p0, Scalar(color[i])); + addNestedRectangles(img, Point(10, 10), width, color1, 13); + addNestedCircles(img, Point(200, 600), width, color2, 13); - } - int color4[] = { 173,73,93,113,133,153,163,143,83,103,114,154 }; - color = color4; + addNestedRectangles(img, Point(410, 10), width, color3, 13); + addNestedCircles(img, Point(600, 600), width, color4, 13); - p0 = Point(600, 600); - for (int i = 0; i<13; i++) - { - circle(img, p0, width[i] / 2, Scalar(color[i]), 1); - floodFill(img, p0, Scalar(color[i])); - } int histSize = 256; float range[] = { 0, 256 }; const float* histRange[] = { range }; Mat hist; + // we compute the histogram calcHist(&img, 1, 0, Mat(), hist, 1, &histSize, histRange, true, false); + cout << "****************Maximal region************************\n"; - for (int i = 0; i < hist.rows ; i++) + for (int i = 0; i < hist.rows; i++) { if (hist.at(i, 0)!=0) { - cout << "h" << i << "=\t" << hist.at(i, 0) << "\n"; + cout << "h" << setw(3) << left << i << "\t=\t" << hist.at(i, 0) << "\n"; } } @@ -403,68 +400,60 @@ static Mat MakeSyntheticImage() int main(int argc, char *argv[]) { - vector fileName; - Mat imgOrig,img; - Size blurSize(5,5); + Mat imgOrig, img; + Size blurSize(5, 5); cv::CommandLineParser parser(argc, argv, "{ help h | | }{ @input | | }"); if (parser.has("help")) { help(); return 0; } + string input = parser.get("@input"); if (!input.empty()) { - fileName.push_back(input); - imgOrig = imread(fileName[0], IMREAD_GRAYSCALE); + imgOrig = imread(input, IMREAD_GRAYSCALE); blur(imgOrig, img, blurSize); } else { - fileName.push_back("SyntheticImage.bmp"); imgOrig = MakeSyntheticImage(); - img=imgOrig; + img = imgOrig; } - MSERParams pDefaultMSER; // Descriptor array MSER vector typeDesc; // Param array for MSER vector pMSER; - vector::iterator itMSER; // Color palette - vector palette; - for (int i = 0; i<65536; i++) + vector palette; + for (int i = 0; i<=numeric_limits::max(); i++) palette.push_back(Vec3b((uchar)rand(), (uchar)rand(), (uchar)rand())); + help(); + MSERParams params; + + params.delta = 10; + params.minArea = 100; + params.maxArea = 5000; + params.maxVariation = 2; + params.minDiversity = 0; + params.pass2Only = true; + typeDesc.push_back("MSER"); - pMSER.push_back(pDefaultMSER); - pMSER.back().delta = 10; - pMSER.back().minArea = 100; - pMSER.back().maxArea = 5000; - pMSER.back().maxVariation = 2; - pMSER.back().minDiversity = 0; - pMSER.back().pass2Only = true; + pMSER.push_back(params); + + params.pass2Only = false; typeDesc.push_back("MSER"); - pMSER.push_back(pDefaultMSER); - pMSER.back().delta = 10; - pMSER.back().minArea = 100; - pMSER.back().maxArea = 5000; - pMSER.back().maxVariation = 2; - pMSER.back().minDiversity = 0; - pMSER.back().pass2Only = false; + pMSER.push_back(params); + + params.delta = 100; typeDesc.push_back("MSER"); - pMSER.push_back(pDefaultMSER); - pMSER.back().delta = 100; - pMSER.back().minArea = 100; - pMSER.back().maxArea = 5000; - pMSER.back().maxVariation = 2; - pMSER.back().minDiversity = 0; - pMSER.back().pass2Only = false; - itMSER = pMSER.begin(); - vector desMethCmp; + pMSER.push_back(params); + + vector::iterator itMSER = pMSER.begin(); Ptr b; String label; // Descriptor loop @@ -473,14 +462,14 @@ int main(int argc, char *argv[]) for (itDesc = typeDesc.begin(); itDesc != typeDesc.end(); ++itDesc) { vector keyImg1; - if (*itDesc == "MSER"){ + if (*itDesc == "MSER") + { if (img.type() == CV_8UC3) { b = MSER::create(itMSER->delta, itMSER->minArea, itMSER->maxArea, itMSER->maxVariation, itMSER->minDiversity, itMSER->maxEvolution, itMSER->areaThreshold, itMSER->minMargin, itMSER->edgeBlurSize); label = Legende(*itMSER); ++itMSER; - } else { @@ -490,6 +479,7 @@ int main(int argc, char *argv[]) ++itMSER; } } + if (img.type()==CV_8UC3) { img.copyTo(result); @@ -505,36 +495,37 @@ int main(int argc, char *argv[]) try { // We can detect regions using detectRegions method - vector keyImg; - vector zone; - vector > region; - Mat desc; + vector keyImg; + vector zone; + vector > region; + Mat desc; if (b.dynamicCast() != NULL) { Ptr sbd = b.dynamicCast(); sbd->detectRegions(img, region, zone); - int i = 0; //result = Scalar(0, 0, 0); int nbPixelInMSER=0; - for (vector >::iterator itr = region.begin(); itr != region.end(); ++itr, ++i) + for (vector >::iterator itr = region.begin(); itr != region.end(); ++itr) { - for (vector ::iterator itp = region[i].begin(); itp != region[i].end(); ++itp) + for (vector ::iterator itp = itr->begin(); itp != itr->end(); ++itp) { // all pixels belonging to region become blue result.at(itp->y, itp->x) = Vec3b(128, 0, 0); nbPixelInMSER++; } } - cout << "Number of MSER region " << region.size()<<" Number of pixels in all MSER region : "< Date: Mon, 23 Jul 2018 17:22:47 +0300 Subject: [PATCH 12/19] Fixed several issues found by static analysis tools --- modules/core/include/opencv2/core/types.hpp | 7 ++++++ modules/core/include/opencv2/core/types_c.h | 1 - modules/core/src/array.cpp | 4 +-- modules/core/src/persistence_types.cpp | 4 +-- modules/dnn/src/layers/resize_layer.cpp | 2 +- modules/imgproc/src/filter.cpp | 2 +- modules/imgproc/src/undistort.cpp | 1 + modules/videoio/src/cap_mjpeg_encoder.cpp | 28 ++++++++++----------- 8 files changed, 28 insertions(+), 21 deletions(-) diff --git a/modules/core/include/opencv2/core/types.hpp b/modules/core/include/opencv2/core/types.hpp index 503743470c..63232e324c 100644 --- a/modules/core/include/opencv2/core/types.hpp +++ b/modules/core/include/opencv2/core/types.hpp @@ -859,6 +859,13 @@ public: */ TermCriteria(int type, int maxCount, double epsilon); + inline bool isValid() const + { + const bool isCount = (type & COUNT) && maxCount > 0; + const bool isEps = (type & EPS) && !cvIsNaN(epsilon); + return isCount || isEps; + } + int type; //!< the type of termination criteria: COUNT, EPS or COUNT + EPS int maxCount; //!< the maximum number of iterations/elements double epsilon; //!< the desired accuracy diff --git a/modules/core/include/opencv2/core/types_c.h b/modules/core/include/opencv2/core/types_c.h index 81b24f0cc7..7e384a5c6f 100644 --- a/modules/core/include/opencv2/core/types_c.h +++ b/modules/core/include/opencv2/core/types_c.h @@ -629,7 +629,6 @@ CV_INLINE int cvIplDepth( int type ) #define CV_TYPE_NAME_MATND "opencv-nd-matrix" #define CV_MAX_DIM 32 -#define CV_MAX_DIM_HEAP 1024 /** @deprecated consider using cv::Mat instead diff --git a/modules/core/src/array.cpp b/modules/core/src/array.cpp index 11e9868617..45e6ee81d6 100644 --- a/modules/core/src/array.cpp +++ b/modules/core/src/array.cpp @@ -1725,8 +1725,8 @@ cvPtr1D( const CvArr* arr, int idx, int* _type ) else { int i, n = m->dims; - CV_DbgAssert( n <= CV_MAX_DIM_HEAP ); - int _idx[CV_MAX_DIM_HEAP]; + CV_DbgAssert( n <= CV_MAX_DIM ); + int _idx[CV_MAX_DIM]; for( i = n - 1; i >= 0; i-- ) { diff --git a/modules/core/src/persistence_types.cpp b/modules/core/src/persistence_types.cpp index d5732a7793..7ef115b5e3 100644 --- a/modules/core/src/persistence_types.cpp +++ b/modules/core/src/persistence_types.cpp @@ -302,7 +302,7 @@ static void* icvReadSparseMat( CvFileStorage* fs, CvFileNode* node ) CvFileNode* sizes_node; CvSeqReader reader; CvSeq* elements; - int sizes[CV_MAX_DIM_HEAP], dims, elem_type, cn; + int sizes[CV_MAX_DIM], dims, elem_type, cn; int i; sizes_node = cvGetFileNodeByName( fs, node, "sizes" ); @@ -327,7 +327,7 @@ static void* icvReadSparseMat( CvFileStorage* fs, CvFileNode* node ) mat = cvCreateSparseMat( dims, sizes, elem_type ); cn = CV_MAT_CN(elem_type); - int idx[CV_MAX_DIM_HEAP]; + int idx[CV_MAX_DIM]; elements = data->data.seq; cvStartReadRawData( fs, data, &reader ); diff --git a/modules/dnn/src/layers/resize_layer.cpp b/modules/dnn/src/layers/resize_layer.cpp index b26206694d..78362da778 100644 --- a/modules/dnn/src/layers/resize_layer.cpp +++ b/modules/dnn/src/layers/resize_layer.cpp @@ -14,7 +14,7 @@ namespace cv { namespace dnn { class ResizeLayerImpl : public ResizeLayer { public: - ResizeLayerImpl(const LayerParams& params) : scaleWidth(0), scaleHeight(0) + ResizeLayerImpl(const LayerParams& params) : zoomFactorWidth(0), zoomFactorHeight(0), scaleWidth(0), scaleHeight(0) { setParamsFrom(params); outWidth = params.get("width", 0); diff --git a/modules/imgproc/src/filter.cpp b/modules/imgproc/src/filter.cpp index a0866156a1..79c752bdd3 100644 --- a/modules/imgproc/src/filter.cpp +++ b/modules/imgproc/src/filter.cpp @@ -4284,7 +4284,7 @@ static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, size_t src_step = _src.step(), src_offset = _src.offset(); bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; - if (esz == 0 + if (esz == 0 || src_step == 0 || (src_offset % src_step) % esz != 0 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) || !(borderType == BORDER_CONSTANT diff --git a/modules/imgproc/src/undistort.cpp b/modules/imgproc/src/undistort.cpp index d083ceb77b..dc71bc42eb 100644 --- a/modules/imgproc/src/undistort.cpp +++ b/modules/imgproc/src/undistort.cpp @@ -370,6 +370,7 @@ static void cvUndistortPointsInternal( const CvMat* _src, CvMat* _dst, const CvM const CvMat* _distCoeffs, const CvMat* matR, const CvMat* matP, cv::TermCriteria criteria) { + CV_Assert(criteria.isValid()); double A[3][3], RR[3][3], k[14]={0,0,0,0,0,0,0,0,0,0,0,0,0,0}; CvMat matA=cvMat(3, 3, CV_64F, A), _Dk; CvMat _RR=cvMat(3, 3, CV_64F, RR); diff --git a/modules/videoio/src/cap_mjpeg_encoder.cpp b/modules/videoio/src/cap_mjpeg_encoder.cpp index fb1ded4997..b3d9b8f3bb 100644 --- a/modules/videoio/src/cap_mjpeg_encoder.cpp +++ b/modules/videoio/src/cap_mjpeg_encoder.cpp @@ -158,8 +158,9 @@ public: data.resize(size); } - void put(unsigned bits, int len) + inline void put_bits(unsigned bits, int len) { + CV_Assert(len >=0 && len < 32); if((m_pos == (data.size() - 1) && len > bits_free) || m_pos == data.size()) { resize(int(2*data.size())); @@ -182,6 +183,12 @@ public: } } + inline void put_val(int val, const unsigned * table) + { + unsigned code = table[(val) + 2]; + put_bits(code >> 8, (int)(code & 255)); + } + void finish() { if(bits_free == 32) @@ -1188,13 +1195,6 @@ public: void operator()( const cv::Range& range ) const CV_OVERRIDE { const int CAT_TAB_SIZE = 4096; - unsigned code = 0; - -#define JPUT_BITS(val, bits) output_buffer.put(val, bits) - -#define JPUT_HUFF(val, table) \ - code = table[(val) + 2]; \ - JPUT_BITS(code >> 8, (int)(code & 255)) int x, y; int i, j; @@ -1300,8 +1300,8 @@ public: int cat = cat_table[val + CAT_TAB_SIZE]; //CV_Assert( cat <= 11 ); - JPUT_HUFF( cat, huff_dc_tab[is_chroma] ); - JPUT_BITS( val - (val < 0 ? 1 : 0), cat ); + output_buffer.put_val(cat, huff_dc_tab[is_chroma] ); + output_buffer.put_bits( val - (val < 0 ? 1 : 0), cat ); } for( j = 1; j < 64; j++ ) @@ -1316,15 +1316,15 @@ public: { while( run >= 16 ) { - JPUT_HUFF( 0xF0, htable ); // encode 16 zeros + output_buffer.put_val( 0xF0, htable ); // encode 16 zeros run -= 16; } { int cat = cat_table[val + CAT_TAB_SIZE]; //CV_Assert( cat <= 10 ); - JPUT_HUFF( cat + run*16, htable ); - JPUT_BITS( val - (val < 0 ? 1 : 0), cat ); + output_buffer.put_val( cat + run*16, htable ); + output_buffer.put_bits( val - (val < 0 ? 1 : 0), cat ); } run = 0; @@ -1333,7 +1333,7 @@ public: if( run ) { - JPUT_HUFF( 0x00, htable ); // encode EOB + output_buffer.put_val( 0x00, htable ); // encode EOB } } } From 28e08ae0bd5ae3106700229c9cf94513833730c2 Mon Sep 17 00:00:00 2001 From: Dmitry Kurtaev Date: Fri, 20 Jul 2018 18:58:37 +0300 Subject: [PATCH 13/19] Add a sample which tests OpenVINO models --- modules/dnn/CMakeLists.txt | 6 + modules/dnn/src/op_inf_engine.cpp | 7 +- modules/dnn/test/test_ie_models.cpp | 220 ++++++++++++++++++++++++++++ 3 files changed, 229 insertions(+), 4 deletions(-) create mode 100644 modules/dnn/test/test_ie_models.cpp diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt index a2f741cda1..a4cdc18cf7 100644 --- a/modules/dnn/CMakeLists.txt +++ b/modules/dnn/CMakeLists.txt @@ -120,3 +120,9 @@ if(BUILD_PERF_TESTS) endif() endif() endif() + +# Test Intel's Inference Engine models +if(HAVE_INF_ENGINE AND TARGET opencv_test_dnn) + ocv_target_include_directories(opencv_test_dnn PRIVATE ${INF_ENGINE_INCLUDE_DIRS}) + ocv_target_link_libraries(opencv_test_dnn LINK_PRIVATE ${INF_ENGINE_LIBRARIES}) +endif() diff --git a/modules/dnn/src/op_inf_engine.cpp b/modules/dnn/src/op_inf_engine.cpp index bcf2c2a3d9..a7c13f3a13 100644 --- a/modules/dnn/src/op_inf_engine.cpp +++ b/modules/dnn/src/op_inf_engine.cpp @@ -428,9 +428,8 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net) try { - static std::map sharedPlugins; - std::string deviceName = InferenceEngine::getDeviceName(targetDevice); - auto pluginIt = sharedPlugins.find(deviceName); + static std::map sharedPlugins; + auto pluginIt = sharedPlugins.find(targetDevice); if (pluginIt != sharedPlugins.end()) { enginePtr = pluginIt->second; @@ -438,7 +437,7 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net) else { enginePtr = InferenceEngine::PluginDispatcher({""}).getSuitablePlugin(targetDevice); - sharedPlugins[deviceName] = enginePtr; + sharedPlugins[targetDevice] = enginePtr; if (targetDevice == InferenceEngine::TargetDevice::eCPU) { diff --git a/modules/dnn/test/test_ie_models.cpp b/modules/dnn/test/test_ie_models.cpp new file mode 100644 index 0000000000..80c8ef3bc1 --- /dev/null +++ b/modules/dnn/test/test_ie_models.cpp @@ -0,0 +1,220 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. +// +// Copyright (C) 2018, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +#include "test_precomp.hpp" + +#ifdef HAVE_INF_ENGINE +#include + +#include +#include +#include + +static std::string extraTestDataPath = +#ifdef WINRT + NULL; +#else + getenv("INTEL_CVSDK_DIR"); +#endif + +namespace opencv_test { namespace { + +using namespace cv; +using namespace cv::dnn; +using namespace InferenceEngine; + +static inline void genData(const std::vector& dims, Mat& m, Blob::Ptr& dataPtr) +{ + std::vector reversedDims(dims.begin(), dims.end()); + std::reverse(reversedDims.begin(), reversedDims.end()); + + m.create(reversedDims, CV_32F); + randu(m, -1, 1); + + dataPtr = make_shared_blob(Precision::FP32, dims, (float*)m.data); +} + +void runIE(Target target, const std::string& xmlPath, const std::string& binPath, + std::map& inputsMap, std::map& outputsMap) +{ + CNNNetReader reader; + reader.ReadNetwork(xmlPath); + reader.ReadWeights(binPath); + + CNNNetwork net = reader.getNetwork(); + + InferenceEnginePluginPtr enginePtr; + InferencePlugin plugin; + ExecutableNetwork netExec; + InferRequest infRequest; + TargetDevice targetDevice; + switch (target) + { + case DNN_TARGET_CPU: + targetDevice = TargetDevice::eCPU; + break; + case DNN_TARGET_OPENCL: + case DNN_TARGET_OPENCL_FP16: + targetDevice = TargetDevice::eGPU; + break; + case DNN_TARGET_MYRIAD: + targetDevice = TargetDevice::eMYRIAD; + break; + default: + CV_Error(Error::StsNotImplemented, "Unknown target"); + }; + + try + { + enginePtr = PluginDispatcher({""}).getSuitablePlugin(targetDevice); + + if (targetDevice == TargetDevice::eCPU) + { + std::string suffixes[] = {"_avx2", "_sse4", ""}; + bool haveFeature[] = { + checkHardwareSupport(CPU_AVX2), + checkHardwareSupport(CPU_SSE4_2), + true + }; + for (int i = 0; i < 3; ++i) + { + if (!haveFeature[i]) + continue; +#ifdef _WIN32 + std::string libName = "cpu_extension" + suffixes[i] + ".dll"; +#else + std::string libName = "libcpu_extension" + suffixes[i] + ".so"; +#endif // _WIN32 + try + { + IExtensionPtr extension = make_so_pointer(libName); + enginePtr->AddExtension(extension, 0); + break; + } + catch(...) {} + } + // Some of networks can work without a library of extra layers. + } + plugin = InferencePlugin(enginePtr); + + netExec = plugin.LoadNetwork(net, {}); + infRequest = netExec.CreateInferRequest(); + } + catch (const std::exception& ex) + { + CV_Error(Error::StsAssert, format("Failed to initialize Inference Engine backend: %s", ex.what())); + } + + // Fill input blobs. + inputsMap.clear(); + BlobMap inputBlobs; + for (auto& it : net.getInputsInfo()) + { + genData(it.second->getDims(), inputsMap[it.first], inputBlobs[it.first]); + } + infRequest.SetInput(inputBlobs); + + // Fill output blobs. + outputsMap.clear(); + BlobMap outputBlobs; + for (auto& it : net.getOutputsInfo()) + { + genData(it.second->dims, outputsMap[it.first], outputBlobs[it.first]); + } + infRequest.SetOutput(outputBlobs); + + infRequest.Infer(); +} + +std::vector getOutputsNames(const Net& net) +{ + std::vector names; + if (names.empty()) + { + std::vector outLayers = net.getUnconnectedOutLayers(); + std::vector layersNames = net.getLayerNames(); + names.resize(outLayers.size()); + for (size_t i = 0; i < outLayers.size(); ++i) + names[i] = layersNames[outLayers[i] - 1]; + } + return names; +} + +void runCV(Target target, const std::string& xmlPath, const std::string& binPath, + const std::map& inputsMap, + std::map& outputsMap) +{ + Net net = readNet(xmlPath, binPath); + for (auto& it : inputsMap) + net.setInput(it.second, it.first); + net.setPreferableTarget(target); + + std::vector outNames = getOutputsNames(net); + std::vector outs; + net.forward(outs, outNames); + + outputsMap.clear(); + EXPECT_EQ(outs.size(), outNames.size()); + for (int i = 0; i < outs.size(); ++i) + { + EXPECT_TRUE(outputsMap.insert({outNames[i], outs[i]}).second); + } +} + +typedef TestWithParam > DNNTestOpenVINO; +TEST_P(DNNTestOpenVINO, models) +{ + Target target = (dnn::Target)(int)get<0>(GetParam()); + std::string modelName = get<1>(GetParam()); + + if (modelName == "semantic-segmentation-adas-0001" && target == DNN_TARGET_OPENCL_FP16) + throw SkipTestException(""); + + std::string precision = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? "FP16" : "FP32"; + std::string prefix = utils::fs::join(extraTestDataPath, + utils::fs::join("deployment_tools", + utils::fs::join("intel_models", + utils::fs::join(modelName, + utils::fs::join(precision, modelName))))); + std::string xmlPath = prefix + ".xml"; + std::string binPath = prefix + ".bin"; + + std::map inputsMap; + std::map ieOutputsMap, cvOutputsMap; + runIE(target, xmlPath, binPath, inputsMap, ieOutputsMap); + runCV(target, xmlPath, binPath, inputsMap, cvOutputsMap); + + EXPECT_EQ(ieOutputsMap.size(), cvOutputsMap.size()); + for (auto& srcIt : ieOutputsMap) + { + auto dstIt = cvOutputsMap.find(srcIt.first); + CV_Assert(dstIt != cvOutputsMap.end()); + double normInf = cvtest::norm(srcIt.second, dstIt->second, cv::NORM_INF); + EXPECT_EQ(normInf, 0); + } +} + +static testing::internal::ParamGenerator intelModels() +{ + String path = utils::fs::join(utils::fs::join(extraTestDataPath, "deployment_tools"), "intel_models"); + + std::vector modelsNames; + cv::utils::fs::glob_relative(path, "", modelsNames, false, true); + + std::vector::iterator end = + std::remove_if(modelsNames.begin(), modelsNames.end(), + [&](const String& dir){ return !utils::fs::isDirectory(utils::fs::join(path, dir)); }); + modelsNames = std::vector(modelsNames.begin(), end); + + return testing::ValuesIn(modelsNames); +} + +INSTANTIATE_TEST_CASE_P(/**/, DNNTestOpenVINO, Combine( + Values(DNN_TARGET_CPU, DNN_TARGET_OPENCL, DNN_TARGET_OPENCL_FP16), intelModels() +)); + +}} +#endif // HAVE_INF_ENGINE From 0c4d5ffecd1585b0aa57cedfdfba0dd46d8f6041 Mon Sep 17 00:00:00 2001 From: Dmitry Kurtaev Date: Mon, 23 Jul 2018 23:34:50 +0300 Subject: [PATCH 14/19] Do not copy cv_cpu_helper.h to parent if OpenCV is a submodule --- cmake/OpenCVCompilerOptimizations.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake index def96723af..76f56ba422 100644 --- a/cmake/OpenCVCompilerOptimizations.cmake +++ b/cmake/OpenCVCompilerOptimizations.cmake @@ -740,7 +740,7 @@ macro(ocv_compiler_optimization_fill_cpu_config) ") - set(__file "${CMAKE_SOURCE_DIR}/modules/core/include/opencv2/core/cv_cpu_helper.h") + set(__file "${OpenCV_SOURCE_DIR}/modules/core/include/opencv2/core/cv_cpu_helper.h") if(EXISTS "${__file}") file(READ "${__file}" __content) endif() From 6e767e2376dd5d06f0c4e0da9cb54471ea284fda Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Mon, 23 Jul 2018 17:58:10 +0300 Subject: [PATCH 15/19] ts: add findDataDirectory() function --- modules/ts/include/opencv2/ts.hpp | 5 +++++ modules/ts/src/ts.cpp | 32 ++++++++++++++++++++++++------- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/modules/ts/include/opencv2/ts.hpp b/modules/ts/include/opencv2/ts.hpp index 7b3f732ce0..5d88396630 100644 --- a/modules/ts/include/opencv2/ts.hpp +++ b/modules/ts/include/opencv2/ts.hpp @@ -654,6 +654,11 @@ void addDataSearchSubDirectory(const std::string& subdir); */ std::string findDataFile(const std::string& relative_path, bool required = true); +/*! @brief Try to find requested data directory +@sa findDataFile + */ +std::string findDataDirectory(const std::string& relative_path, bool required = true); + #ifndef __CV_TEST_EXEC_ARGS #if defined(_MSC_VER) && (_MSC_VER <= 1400) diff --git a/modules/ts/src/ts.cpp b/modules/ts/src/ts.cpp index 06f9118a28..b1ea96bb15 100644 --- a/modules/ts/src/ts.cpp +++ b/modules/ts/src/ts.cpp @@ -772,16 +772,24 @@ void addDataSearchSubDirectory(const std::string& subdir) TS::ptr()->data_search_subdir.push_back(subdir); } -std::string findDataFile(const std::string& relative_path, bool required) +static std::string findData(const std::string& relative_path, bool required, bool findDirectory) { #define TEST_TRY_FILE_WITH_PREFIX(prefix) \ { \ std::string path = path_join(prefix, relative_path); \ /*printf("Trying %s\n", path.c_str());*/ \ - FILE* f = fopen(path.c_str(), "rb"); \ - if(f) { \ - fclose(f); \ - return path; \ + if (findDirectory) \ + { \ + if (isDirectory(path)) \ + return path; \ + } \ + else \ + { \ + FILE* f = fopen(path.c_str(), "rb"); \ + if(f) { \ + fclose(f); \ + return path; \ + } \ } \ } @@ -842,11 +850,21 @@ std::string findDataFile(const std::string& relative_path, bool required) } #endif #endif + const char* type = findDirectory ? "directory" : "data file"; if (required) - CV_Error(cv::Error::StsError, cv::format("OpenCV tests: Can't find required data file: %s", relative_path.c_str())); - throw SkipTestException(cv::format("OpenCV tests: Can't find data file: %s", relative_path.c_str())); + CV_Error(cv::Error::StsError, cv::format("OpenCV tests: Can't find required %s: %s", type, relative_path.c_str())); + throw SkipTestException(cv::format("OpenCV tests: Can't find %s: %s", type, relative_path.c_str())); +} + +std::string findDataFile(const std::string& relative_path, bool required) +{ + return findData(relative_path, required, false); } +std::string findDataDirectory(const std::string& relative_path, bool required) +{ + return findData(relative_path, required, true); +} } //namespace cvtest From 4283309daaa8dfca1701a4b6d3ff884f3eae1bca Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Mon, 23 Jul 2018 18:17:51 +0300 Subject: [PATCH 16/19] dnn: update tests for OpenVINO models --- modules/dnn/test/test_ie_models.cpp | 56 +++++++++++++++++++---------- modules/ts/include/opencv2/ts.hpp | 1 + 2 files changed, 38 insertions(+), 19 deletions(-) diff --git a/modules/dnn/test/test_ie_models.cpp b/modules/dnn/test/test_ie_models.cpp index 80c8ef3bc1..9013ce9774 100644 --- a/modules/dnn/test/test_ie_models.cpp +++ b/modules/dnn/test/test_ie_models.cpp @@ -13,15 +13,22 @@ #include #include -static std::string extraTestDataPath = -#ifdef WINRT - NULL; -#else - getenv("INTEL_CVSDK_DIR"); -#endif - namespace opencv_test { namespace { +static void initDLDTDataPath() +{ +#ifndef WINRT + static bool initialized = false; + if (!initialized) + { + const char* dldtTestDataPath = getenv("INTEL_CVSDK_DIR"); + if (dldtTestDataPath) + cvtest::addDataSearchPath(cv::utils::fs::join(dldtTestDataPath, "deployment_tools")); + initialized = true; + } +#endif +} + using namespace cv; using namespace cv::dnn; using namespace InferenceEngine; @@ -174,13 +181,11 @@ TEST_P(DNNTestOpenVINO, models) throw SkipTestException(""); std::string precision = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? "FP16" : "FP32"; - std::string prefix = utils::fs::join(extraTestDataPath, - utils::fs::join("deployment_tools", - utils::fs::join("intel_models", + std::string prefix = utils::fs::join("intel_models", utils::fs::join(modelName, - utils::fs::join(precision, modelName))))); - std::string xmlPath = prefix + ".xml"; - std::string binPath = prefix + ".bin"; + utils::fs::join(precision, modelName))); + std::string xmlPath = findDataFile(prefix + ".xml"); + std::string binPath = findDataFile(prefix + ".bin"); std::map inputsMap; std::map ieOutputsMap, cvOutputsMap; @@ -199,17 +204,30 @@ TEST_P(DNNTestOpenVINO, models) static testing::internal::ParamGenerator intelModels() { - String path = utils::fs::join(utils::fs::join(extraTestDataPath, "deployment_tools"), "intel_models"); - + initDLDTDataPath(); std::vector modelsNames; + + std::string path; + try + { + path = findDataDirectory("intel_models", false); + } + catch (...) + { + std::cerr << "ERROR: Can't find OpenVINO models. Check INTEL_CVSDK_DIR environment variable (run setup.sh)" << std::endl; + return ValuesIn(modelsNames); // empty list + } + cv::utils::fs::glob_relative(path, "", modelsNames, false, true); - std::vector::iterator end = + modelsNames.erase( std::remove_if(modelsNames.begin(), modelsNames.end(), - [&](const String& dir){ return !utils::fs::isDirectory(utils::fs::join(path, dir)); }); - modelsNames = std::vector(modelsNames.begin(), end); + [&](const String& dir){ return !utils::fs::isDirectory(utils::fs::join(path, dir)); }), + modelsNames.end() + ); + CV_Assert(!modelsNames.empty()); - return testing::ValuesIn(modelsNames); + return ValuesIn(modelsNames); } INSTANTIATE_TEST_CASE_P(/**/, DNNTestOpenVINO, Combine( diff --git a/modules/ts/include/opencv2/ts.hpp b/modules/ts/include/opencv2/ts.hpp index 5d88396630..3fbea894e8 100644 --- a/modules/ts/include/opencv2/ts.hpp +++ b/modules/ts/include/opencv2/ts.hpp @@ -103,6 +103,7 @@ using std::pair; using std::make_pair; using testing::TestWithParam; using testing::Values; +using testing::ValuesIn; using testing::Combine; using cv::Mat; From 8f80565d9cc7b0ec991ca68428baca95c6274dc3 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin Date: Tue, 24 Jul 2018 13:08:58 +0300 Subject: [PATCH 17/19] objdetect(qr): update test code improve error checks --- modules/objdetect/test/test_qrcode.cpp | 84 +++++++++++++------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/modules/objdetect/test/test_qrcode.cpp b/modules/objdetect/test/test_qrcode.cpp index 82e9990530..65d6afc7f0 100644 --- a/modules/objdetect/test/test_qrcode.cpp +++ b/modules/objdetect/test/test_qrcode.cpp @@ -5,10 +5,9 @@ #include "test_precomp.hpp" -namespace opencv_test -{ +namespace opencv_test { namespace { -String qrcode_images_name[] = { +std::string qrcode_images_name[] = { "20110817_030.jpg", "20110817_048.jpg", "img_20120226_161648.jpg", @@ -25,24 +24,25 @@ String qrcode_images_name[] = { TEST(Objdetect_QRCode, generate_test_data) { - String root = cvtest::TS::ptr()->get_data_path() + "qrcode/"; - String dataset_config = cvtest::TS::ptr()->get_data_path() + "qrcode/dataset_config.json"; + const std::string root = "qrcode/"; + const std::string dataset_config = findDataFile(root + "dataset_config.json"); FileStorage file_config(dataset_config, FileStorage::WRITE); file_config << "test_images" << "["; - size_t images_count = sizeof(qrcode_images_name) / sizeof(String); + size_t images_count = sizeof(qrcode_images_name) / sizeof(qrcode_images_name[0]); for (size_t i = 0; i < images_count; i++) { file_config << "{:" << "image_name" << qrcode_images_name[i]; - String image_path = root + qrcode_images_name[i]; - std::vector transform; + std::string image_path = findDataFile(root + qrcode_images_name[i]); + std::vector corners; Mat src = imread(image_path, IMREAD_GRAYSCALE); - EXPECT_TRUE(detectQRCode(src, transform)); + ASSERT_FALSE(src.empty()) << "Can't read image: " << image_path; + EXPECT_TRUE(detectQRCode(src, corners)); file_config << "x" << "[:"; - for (size_t j = 0; j < transform.size(); j++) { file_config << transform[j].x; } + for (size_t j = 0; j < corners.size(); j++) { file_config << corners[j].x; } file_config << "]"; file_config << "y" << "[:"; - for (size_t j = 0; j < transform.size(); j++) { file_config << transform[j].y; } + for (size_t j = 0; j < corners.size(); j++) { file_config << corners[j].y; } file_config << "]" << "}"; } file_config << "]"; @@ -51,65 +51,65 @@ TEST(Objdetect_QRCode, generate_test_data) #else -typedef testing::TestWithParam< String > Objdetect_QRCode; +typedef testing::TestWithParam< std::string > Objdetect_QRCode; TEST_P(Objdetect_QRCode, regression) { - String root = cvtest::TS::ptr()->get_data_path() + "qrcode/"; - String dataset_config = cvtest::TS::ptr()->get_data_path() + "qrcode/dataset_config.json"; - FileStorage file_config(dataset_config, FileStorage::READ); + const std::string name_current_image = GetParam(); + const std::string root = "qrcode/"; const int pixels_error = 3; - std::vector corners; - String image_path = root + String(GetParam()); + std::string image_path = findDataFile(root + name_current_image); Mat src = imread(image_path, IMREAD_GRAYSCALE); + ASSERT_FALSE(src.empty()) << "Can't read image: " << image_path; + + std::vector corners; EXPECT_TRUE(detectQRCode(src, corners)); - if (file_config.isOpened()) + const std::string dataset_config = findDataFile(root + "dataset_config.json", false); + FileStorage file_config(dataset_config, FileStorage::READ); + ASSERT_TRUE(file_config.isOpened()) << "Can't read validation data: " << dataset_config; { FileNode images_list = file_config["test_images"]; - int index = 0, images_count = static_cast(images_list.size()); - ASSERT_GT(images_count, 0); + size_t images_count = static_cast(images_list.size()); + ASSERT_GT(images_count, 0u) << "Can't find validation data entries in 'test_images': " << dataset_config; - bool runTestsFlag = false; - String name_current_image = String(GetParam()); - for (; index < images_count; index++) + for (size_t index = 0; index < images_count; index++) { - String name_test_image = images_list[index]["image_name"]; + FileNode config = images_list[(int)index]; + std::string name_test_image = config["image_name"]; if (name_test_image == name_current_image) { for (int i = 0; i < 4; i++) { - int x = images_list[index]["x"][i]; - int y = images_list[index]["y"][i]; + int x = config["x"][i]; + int y = config["y"][i]; EXPECT_NEAR(x, corners[i].x, pixels_error); EXPECT_NEAR(y, corners[i].y, pixels_error); } - runTestsFlag = true; + return; // done } } - if (!runTestsFlag) - { - std::cout << "Not found results for " << name_current_image; - std::cout << " image in dataset_config.json file." << std::endl; - } - - file_config.release(); - } - else - { - std::cout << " Not found dataset_config.json file." << std::endl; + std::cerr + << "Not found results for '" << name_current_image + << "' image in config file:" << dataset_config << std::endl + << "Re-run tests with enabled UPDATE_QRCODE_TEST_DATA macro to update test data." + << std::endl; } } -INSTANTIATE_TEST_CASE_P(objdetect, Objdetect_QRCode, testing::ValuesIn(qrcode_images_name)); +INSTANTIATE_TEST_CASE_P(/**/, Objdetect_QRCode, testing::ValuesIn(qrcode_images_name)); + -TEST(Objdetect_QRCode, not_found_qrcode) + +TEST(Objdetect_QRCode_basic, not_found_qrcode) { std::vector corners; Mat zero_image = Mat::zeros(256, 256, CV_8UC1); EXPECT_FALSE(detectQRCode(zero_image, corners)); } -#endif -} // namespace + +#endif // UPDATE_QRCODE_TEST_DATA + +}} // namespace From cbb1e867e5141412c62ff534def7f117e28e04e8 Mon Sep 17 00:00:00 2001 From: Maksim Shabunin Date: Tue, 24 Jul 2018 14:14:13 +0300 Subject: [PATCH 18/19] More issues found by static analysis --- apps/createsamples/utility.cpp | 4 +--- modules/calib3d/src/circlesgrid.cpp | 22 +++++++++---------- modules/calib3d/src/dls.cpp | 2 +- modules/calib3d/test/test_chesscorners.cpp | 8 +++---- modules/core/src/persistence_json.cpp | 9 ++++---- modules/core/test/test_mat.cpp | 4 ++-- modules/core/test/test_rand.cpp | 8 ++++--- modules/dnn/src/layers/convolution_layer.cpp | 3 +-- modules/dnn/src/layers/eltwise_layer.cpp | 2 +- .../dnn/src/layers/normalize_bbox_layer.cpp | 1 + modules/dnn/src/layers/recurrent_layers.cpp | 6 ++--- modules/features2d/src/brisk.cpp | 4 +--- modules/features2d/src/kaze/KAZEFeatures.cpp | 2 -- modules/imgcodecs/src/exif.cpp | 1 - modules/imgcodecs/src/grfmt_bmp.cpp | 7 ++---- modules/imgcodecs/src/grfmt_pam.cpp | 2 +- modules/imgcodecs/src/grfmt_tiff.cpp | 9 ++++---- modules/imgproc/src/contours.cpp | 6 ++--- modules/imgproc/src/drawing.cpp | 4 ---- modules/imgproc/src/filter.avx2.cpp | 7 ++---- modules/imgproc/src/floodfill.cpp | 7 +++--- modules/imgproc/src/histogram.cpp | 4 ++-- modules/imgproc/src/hough.cpp | 1 - .../imgproc/src/min_enclosing_triangle.cpp | 1 - modules/ml/src/ann_mlp.cpp | 2 +- modules/videoio/src/cap_ffmpeg_impl.hpp | 3 --- modules/videoio/src/container_avi.cpp | 3 --- 27 files changed, 51 insertions(+), 81 deletions(-) diff --git a/apps/createsamples/utility.cpp b/apps/createsamples/utility.cpp index 0ec7e8cb6e..ab1ca1c789 100644 --- a/apps/createsamples/utility.cpp +++ b/apps/createsamples/utility.cpp @@ -1044,12 +1044,10 @@ void cvCreateTrainingSamples( const char* filename, output = fopen( filename, "wb" ); if( output != NULL ) { - int hasbg; int i; int inverse; - hasbg = 0; - hasbg = (bgfilename != NULL && icvInitBackgroundReaders( bgfilename, + const int hasbg = (bgfilename != NULL && icvInitBackgroundReaders( bgfilename, Size( winwidth,winheight ) ) ); Mat sample( winheight, winwidth, CV_8UC1 ); diff --git a/modules/calib3d/src/circlesgrid.cpp b/modules/calib3d/src/circlesgrid.cpp index 25da6be03f..2d1a817629 100644 --- a/modules/calib3d/src/circlesgrid.cpp +++ b/modules/calib3d/src/circlesgrid.cpp @@ -224,7 +224,7 @@ void CirclesGridClusterFinder::findOutsideCorners(const std::vector CV_Assert(!corners.empty()); outsideCorners.clear(); //find two pairs of the most nearest corners - int i, j, n = (int)corners.size(); + const size_t n = corners.size(); #ifdef DEBUG_CIRCLES Mat cornersImage(1024, 1248, CV_8UC1, Scalar(0)); @@ -232,22 +232,22 @@ void CirclesGridClusterFinder::findOutsideCorners(const std::vector imshow("corners", cornersImage); #endif - std::vector tangentVectors(corners.size()); - for(size_t k=0; k tangentVectors(n); + for(size_t k=0; k < n; k++) { - Point2f diff = corners[(k + 1) % corners.size()] - corners[k]; + Point2f diff = corners[(k + 1) % n] - corners[k]; tangentVectors[k] = diff * (1.0f / norm(diff)); } //compute angles between all sides - Mat cosAngles(n, n, CV_32FC1, 0.0f); - for(i = 0; i < n; i++) + Mat cosAngles((int)n, (int)n, CV_32FC1, 0.0f); + for(size_t i = 0; i < n; i++) { - for(j = i + 1; j < n; j++) + for(size_t j = i + 1; j < n; j++) { float val = fabs(tangentVectors[i].dot(tangentVectors[j])); - cosAngles.at(i, j) = val; - cosAngles.at(j, i) = val; + cosAngles.at((int)i, (int)j) = val; + cosAngles.at((int)j, (int)i) = val; } } @@ -276,10 +276,10 @@ void CirclesGridClusterFinder::findOutsideCorners(const std::vector const int bigDiff = 4; if(maxIdx - minIdx == bigDiff) { - minIdx += n; + minIdx += (int)n; std::swap(maxIdx, minIdx); } - if(maxIdx - minIdx != n - bigDiff) + if(maxIdx - minIdx != (int)n - bigDiff) { return; } diff --git a/modules/calib3d/src/dls.cpp b/modules/calib3d/src/dls.cpp index b0334c4268..8f814f0d37 100644 --- a/modules/calib3d/src/dls.cpp +++ b/modules/calib3d/src/dls.cpp @@ -206,7 +206,7 @@ void dls::run_kernel(const cv::Mat& pp) void dls::build_coeff_matrix(const cv::Mat& pp, cv::Mat& Mtilde, cv::Mat& D) { - CV_Assert(!pp.empty()); + CV_Assert(!pp.empty() && N > 0); cv::Mat eye = cv::Mat::eye(3, 3, CV_64F); // build coeff matrix diff --git a/modules/calib3d/test/test_chesscorners.cpp b/modules/calib3d/test/test_chesscorners.cpp index 2da1b12e25..8303a8dcd4 100644 --- a/modules/calib3d/test/test_chesscorners.cpp +++ b/modules/calib3d/test/test_chesscorners.cpp @@ -334,19 +334,19 @@ bool validateData(const ChessBoardGenerator& cbg, const Size& imgSz, tmp = cv::norm(cur - mat(i + 1, j + 1)); // TODO cvtest if (tmp < minNeibDist) - tmp = minNeibDist; + minNeibDist = tmp; tmp = cv::norm(cur - mat(i - 1, j + 1)); // TODO cvtest if (tmp < minNeibDist) - tmp = minNeibDist; + minNeibDist = tmp; tmp = cv::norm(cur - mat(i + 1, j - 1)); // TODO cvtest if (tmp < minNeibDist) - tmp = minNeibDist; + minNeibDist = tmp; tmp = cv::norm(cur - mat(i - 1, j - 1)); // TODO cvtest if (tmp < minNeibDist) - tmp = minNeibDist; + minNeibDist = tmp; } const double threshold = 0.25; diff --git a/modules/core/src/persistence_json.cpp b/modules/core/src/persistence_json.cpp index ef0c3a49f5..fe87647337 100644 --- a/modules/core/src/persistence_json.cpp +++ b/modules/core/src/persistence_json.cpp @@ -123,7 +123,6 @@ static char* icvJSONParseKey( CvFileStorage* fs, char* ptr, CvFileNode* map, CvF CV_PARSE_ERROR( "Key must start with \'\"\'" ); char * beg = ptr + 1; - char * end = beg; do { ++ptr; @@ -133,7 +132,7 @@ static char* icvJSONParseKey( CvFileStorage* fs, char* ptr, CvFileNode* map, CvF if( *ptr != '"' ) CV_PARSE_ERROR( "Key must end with \'\"\'" ); - end = ptr; + const char * end = ptr; ptr++; ptr = icvJSONSkipSpaces( fs, ptr ); if ( ptr == 0 || fs->dummy_eof ) @@ -576,12 +575,12 @@ void icvJSONParse( CvFileStorage* fs ) if ( *ptr == '{' ) { CvFileNode* root_node = (CvFileNode*)cvSeqPush( fs->roots, 0 ); - ptr = icvJSONParseMap( fs, ptr, root_node ); + icvJSONParseMap( fs, ptr, root_node ); } else if ( *ptr == '[' ) { CvFileNode* root_node = (CvFileNode*)cvSeqPush( fs->roots, 0 ); - ptr = icvJSONParseSeq( fs, ptr, root_node ); + icvJSONParseSeq( fs, ptr, root_node ); } else { @@ -668,7 +667,7 @@ void icvJSONWrite( CvFileStorage* fs, const char* key, const char* data ) *ptr++ = '\n'; *ptr++ = '\0'; ::icvPuts( fs, fs->buffer_start ); - ptr = fs->buffer = fs->buffer_start; + fs->buffer = fs->buffer_start; } ptr = icvFSFlush(fs); } diff --git a/modules/core/test/test_mat.cpp b/modules/core/test/test_mat.cpp index ad480eb8d3..7e20f153c8 100644 --- a/modules/core/test/test_mat.cpp +++ b/modules/core/test/test_mat.cpp @@ -1014,8 +1014,8 @@ protected: Size mSize(rng.uniform(minMSize, maxMSize), rng.uniform(minMSize, maxMSize)); size_t mvSize = rng.uniform(1, maxMvSize); - int res = cvtest::TS::OK, curRes = res; - curRes = run_case(CV_8U, mvSize, mSize, rng); + int res = cvtest::TS::OK; + int curRes = run_case(CV_8U, mvSize, mSize, rng); res = curRes != cvtest::TS::OK ? curRes : res; curRes = run_case(CV_8S, mvSize, mSize, rng); diff --git a/modules/core/test/test_rand.cpp b/modules/core/test/test_rand.cpp index 34b32a7e53..6504649842 100644 --- a/modules/core/test/test_rand.cpp +++ b/modules/core/test/test_rand.cpp @@ -375,9 +375,11 @@ TEST(Core_Rand, Regression_Stack_Corruption) int bufsz = 128; //enough for 14 doubles AutoBuffer buffer(bufsz); size_t offset = 0; - cv::Mat_ x(2, 3, (cv::Point2d*)(buffer.data()+offset)); offset += x.total()*x.elemSize(); - double& param1 = *(double*)(buffer.data()+offset); offset += sizeof(double); - double& param2 = *(double*)(buffer.data()+offset); offset += sizeof(double); + cv::Mat_ x(2, 3, (cv::Point2d*)(buffer.data()+offset)); + offset += x.total()*x.elemSize(); + double& param1 = *(double*)(buffer.data()+offset); + offset += sizeof(double); + double& param2 = *(double*)(buffer.data()+offset); param1 = -9; param2 = 2; cv::theRNG().fill(x, cv::RNG::NORMAL, param1, param2); diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 23f99e7a6b..d08dec548b 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -560,7 +560,7 @@ public: int ngroups = ngroups_, batchSize = input_->size[0]*ngroups; int outW = output_->size[3], outH = output_->size[2], outCn = output_->size[1]/ngroups; int width = input_->size[3], height = input_->size[2], inpCn = input_->size[1]/ngroups; - int nstripes = nstripes_; + const int nstripes = nstripes_; int kernel_w = kernel_.width, kernel_h = kernel_.height; int pad_w = pad_.width, pad_h = pad_.height; int stride_w = stride_.width, stride_h = stride_.height; @@ -587,7 +587,6 @@ public: int samplesPerStripe = std::max((batchSize + nstripes - 1)/nstripes, 1); r.start *= samplesPerStripe; r.end *= samplesPerStripe; - nstripes *= samplesPerStripe; stripeSize = outPlaneSize; } diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp index 8eb3ff4a68..442bfa7aff 100644 --- a/modules/dnn/src/layers/eltwise_layer.cpp +++ b/modules/dnn/src/layers/eltwise_layer.cpp @@ -187,7 +187,7 @@ public: int c, j, k, n = nsrcs; const float* coeffsptr = coeffs && !coeffs->empty() ? &coeffs->at(0) : 0; float* dstptr0 = dst->ptr(); - int blockSize0 = 1 << 12, blockSize = blockSize0; + int blockSize0 = 1 << 12, blockSize; for( size_t ofs = stripeStart; ofs < stripeEnd; ofs += blockSize ) { diff --git a/modules/dnn/src/layers/normalize_bbox_layer.cpp b/modules/dnn/src/layers/normalize_bbox_layer.cpp index a846dabfb0..86a56915a2 100644 --- a/modules/dnn/src/layers/normalize_bbox_layer.cpp +++ b/modules/dnn/src/layers/normalize_bbox_layer.cpp @@ -190,6 +190,7 @@ public: size_t num = total(shape(inp0.size), 0, startAxis); size_t numPlanes = total(shape(inp0.size), startAxis, endAxis + 1); + CV_Assert(num * numPlanes != 0); size_t planeSize = inp0.total() / (num * numPlanes); for (size_t n = 0; n < num; ++n) { diff --git a/modules/dnn/src/layers/recurrent_layers.cpp b/modules/dnn/src/layers/recurrent_layers.cpp index 46796778a9..b356b7627c 100644 --- a/modules/dnn/src/layers/recurrent_layers.cpp +++ b/modules/dnn/src/layers/recurrent_layers.cpp @@ -189,18 +189,16 @@ public: else outTailShape_.assign(1, _numOut); - int _numTimeStamps, _numSamples; + int _numSamples; if (useTimestampDim) { CV_Assert(inp0.size() >= 2 && total(inp0, 2) == _numInp); - _numTimeStamps = inp0[0]; _numSamples = inp0[1]; - outResShape.push_back(_numTimeStamps); + outResShape.push_back(inp0[0]); } else { CV_Assert(inp0.size() >= 2 && total(inp0, 1) == _numInp); - _numTimeStamps = 1; _numSamples = inp0[0]; } diff --git a/modules/features2d/src/brisk.cpp b/modules/features2d/src/brisk.cpp index 5e233d0d8f..4038279d75 100644 --- a/modules/features2d/src/brisk.cpp +++ b/modules/features2d/src/brisk.cpp @@ -1236,7 +1236,6 @@ BriskScaleSpace::isMax2D(const int layer, const int x_layer, const int y_layer) { // in this case, we have to analyze the situation more carefully: // the values are gaussian blurred and then we really decide - data = scores.ptr() + y_layer * scorescols + x_layer; int smoothedcenter = 4 * center + 2 * (s_10 + s10 + s0_1 + s01) + s_1_1 + s1_1 + s_11 + s11; for (unsigned int i = 0; i < deltasize; i += 2) { @@ -1312,8 +1311,7 @@ BriskScaleSpace::refine3D(const int layer, const int x_layer, const int y_layer, int s_2_2 = l.getAgastScore_5_8(x_layer + 1, y_layer + 1, 1); max_below = std::max(s_2_2, max_below); - max_below_float = subpixel2D(s_0_0, s_0_1, s_0_2, s_1_0, s_1_1, s_1_2, s_2_0, s_2_1, s_2_2, delta_x_below, - delta_y_below); + subpixel2D(s_0_0, s_0_1, s_0_2, s_1_0, s_1_1, s_1_2, s_2_0, s_2_1, s_2_2, delta_x_below, delta_y_below); max_below_float = (float)max_below; } else diff --git a/modules/features2d/src/kaze/KAZEFeatures.cpp b/modules/features2d/src/kaze/KAZEFeatures.cpp index 0cc52d9117..58f0937f00 100644 --- a/modules/features2d/src/kaze/KAZEFeatures.cpp +++ b/modules/features2d/src/kaze/KAZEFeatures.cpp @@ -373,8 +373,6 @@ void KAZEFeatures::Determinant_Hessian(std::vector& kpts) is_out = true; } - is_out = false; - if (is_out == false) { if (is_repeated == false) { kpts.push_back(kpts_par_[i][j]); diff --git a/modules/imgcodecs/src/exif.cpp b/modules/imgcodecs/src/exif.cpp index 350123431c..22b4f224ce 100644 --- a/modules/imgcodecs/src/exif.cpp +++ b/modules/imgcodecs/src/exif.cpp @@ -175,7 +175,6 @@ std::map ExifReader::getExif() CV_THROW (ExifParsingError()); } m_stream.read( reinterpret_cast(&m_data[0]), exifSize - offsetToTiffHeader ); - count = m_stream.gcount(); exifFound = true; break; diff --git a/modules/imgcodecs/src/grfmt_bmp.cpp b/modules/imgcodecs/src/grfmt_bmp.cpp index fcfd834e4a..798b2d9446 100644 --- a/modules/imgcodecs/src/grfmt_bmp.cpp +++ b/modules/imgcodecs/src/grfmt_bmp.cpp @@ -265,7 +265,7 @@ bool BmpDecoder::readData( Mat& img ) for(;;) { int code = m_strm.getWord(); - int len = code & 255; + const int len = code & 255; code >>= 8; if( len != 0 ) // encoded mode { @@ -304,16 +304,13 @@ bool BmpDecoder::readData( Mat& img ) else { int x_shift3 = (int)(line_end - data); - int y_shift = m_height - y; if( code == 2 ) { x_shift3 = m_strm.getByte()*nch; - y_shift = m_strm.getByte(); + m_strm.getByte(); } - len = x_shift3 + ((y_shift * width3) & ((code == 0) - 1)); - if( color ) data = FillUniColor( data, line_end, step, width3, y, m_height, x_shift3, diff --git a/modules/imgcodecs/src/grfmt_pam.cpp b/modules/imgcodecs/src/grfmt_pam.cpp index 9c9aa9e79e..d1a2161733 100644 --- a/modules/imgcodecs/src/grfmt_pam.cpp +++ b/modules/imgcodecs/src/grfmt_pam.cpp @@ -689,7 +689,7 @@ bool PAMEncoder::write( const Mat& img, const std::vector& params ) tmp += sprintf( buffer + tmp, "MAXVAL %d\n", (1 << img.elemSize1()*8) - 1); if (fmt) tmp += sprintf( buffer + tmp, "TUPLTYPE %s\n", fmt->name ); - tmp += sprintf( buffer + tmp, "ENDHDR\n" ); + sprintf( buffer + tmp, "ENDHDR\n" ); strm.putBytes( buffer, (int)strlen(buffer) ); /* write data */ diff --git a/modules/imgcodecs/src/grfmt_tiff.cpp b/modules/imgcodecs/src/grfmt_tiff.cpp index 8dcbe5b672..69dc4261ac 100644 --- a/modules/imgcodecs/src/grfmt_tiff.cpp +++ b/modules/imgcodecs/src/grfmt_tiff.cpp @@ -255,22 +255,21 @@ bool TiffDecoder::readHeader() { case 8: m_type = CV_MAKETYPE(CV_8U, photometric > 1 ? wanted_channels : 1); + result = true; break; case 16: m_type = CV_MAKETYPE(CV_16U, photometric > 1 ? wanted_channels : 1); + result = true; break; - case 32: m_type = CV_MAKETYPE(CV_32F, photometric > 1 ? 3 : 1); + result = true; break; case 64: m_type = CV_MAKETYPE(CV_64F, photometric > 1 ? 3 : 1); + result = true; break; - - default: - result = false; } - result = true; } } diff --git a/modules/imgproc/src/contours.cpp b/modules/imgproc/src/contours.cpp index 10f7744c1f..eb47816c86 100644 --- a/modules/imgproc/src/contours.cpp +++ b/modules/imgproc/src/contours.cpp @@ -855,7 +855,6 @@ icvTraceContour_32s( int *ptr, int step, int *stop_ptr, int is_hole ) for( ;; ) { CV_Assert(i3 != NULL); - s_end = s; s = std::min(s, MAX_SIZE - 1); while( s < MAX_SIZE - 1 ) @@ -1479,7 +1478,7 @@ icvFindContoursInInterval( const CvArr* src, cv::Ptr storage01; CvSeq* first = 0; - int i, j, k, n; + int j, k, n; uchar* src_data = 0; int img_step = 0; @@ -1547,7 +1546,6 @@ icvFindContoursInInterval( const CvArr* src, // First line. None of runs is binded tmp.pt.y = 0; - i = 0; CV_WRITE_SEQ_ELEM( tmp, writer ); upper_line = (CvLinkedRunPoint*)CV_GET_WRITTEN_ELEM( writer ); @@ -1580,7 +1578,7 @@ icvFindContoursInInterval( const CvArr* src, last_elem = tmp_prev; tmp_prev->next = 0; - for( i = 1; i < img_size.height; i++ ) + for( int i = 1; i < img_size.height; i++ ) { //------// Find runs in next line src_data += img_step; diff --git a/modules/imgproc/src/drawing.cpp b/modules/imgproc/src/drawing.cpp index d58b438a89..0d44e44424 100644 --- a/modules/imgproc/src/drawing.cpp +++ b/modules/imgproc/src/drawing.cpp @@ -338,7 +338,6 @@ LineAA( Mat& img, Point2l pt1, Point2l pt2, const void* color ) if( ax > ay ) { - dx = ax; dy = (dy ^ j) - j; pt1.x ^= pt2.x & j; pt2.x ^= pt1.x & j; @@ -362,7 +361,6 @@ LineAA( Mat& img, Point2l pt1, Point2l pt2, const void* color ) } else { - dy = ay; dx = (dx ^ i) - i; pt1.x ^= pt2.x & i; pt2.x ^= pt1.x & i; @@ -677,7 +675,6 @@ Line2( Mat& img, Point2l pt1, Point2l pt2, const void* color) if( ax > ay ) { - dx = ax; dy = (dy ^ j) - j; pt1.x ^= pt2.x & j; pt2.x ^= pt1.x & j; @@ -692,7 +689,6 @@ Line2( Mat& img, Point2l pt1, Point2l pt2, const void* color) } else { - dy = ay; dx = (dx ^ i) - i; pt1.x ^= pt2.x & i; pt2.x ^= pt1.x & i; diff --git a/modules/imgproc/src/filter.avx2.cpp b/modules/imgproc/src/filter.avx2.cpp index b469329598..e9ced20e36 100644 --- a/modules/imgproc/src/filter.avx2.cpp +++ b/modules/imgproc/src/filter.avx2.cpp @@ -128,8 +128,6 @@ int SymmColumnVec_32f_Symm_AVX(const float** src, const float* ky, float* dst, f for( k = 1; k <= ksize2; k++ ) { f = _mm_set1_ps(ky[k]); - S = src[k] + i; - S2 = src[-k] + i; x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i)); s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); } @@ -144,7 +142,7 @@ int SymmColumnVec_32f_Symm_AVX(const float** src, const float* ky, float* dst, f int SymmColumnVec_32f_Unsymm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2) { int i = 0, k; - const float *S, *S2; + const float *S2; const __m128 d4 = _mm_set1_ps(delta); const __m256 d8 = _mm256_set1_ps(delta); @@ -152,11 +150,10 @@ int SymmColumnVec_32f_Unsymm_AVX(const float** src, const float* ky, float* dst, { __m256 f, s0 = d8, s1 = d8; __m256 x0; - S = src[0] + i; for (k = 1; k <= ksize2; k++) { - S = src[k] + i; + const float *S = src[k] + i; S2 = src[-k] + i; f = _mm256_set1_ps(ky[k]); x0 = _mm256_sub_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2)); diff --git a/modules/imgproc/src/floodfill.cpp b/modules/imgproc/src/floodfill.cpp index 953b4bba9e..0509c61267 100644 --- a/modules/imgproc/src/floodfill.cpp +++ b/modules/imgproc/src/floodfill.cpp @@ -467,7 +467,7 @@ int cv::floodFill( InputOutputArray _image, InputOutputArray _mask, if( rect ) *rect = Rect(); - int i, connectivity = flags & 255; + int i; union { uchar b[4]; int i[4]; @@ -491,9 +491,8 @@ int cv::floodFill( InputOutputArray _image, InputOutputArray _mask, CV_Error( CV_StsBadArg, "Number of channels in input image must be 1 or 3" ); } - if( connectivity == 0 ) - connectivity = 4; - else if( connectivity != 4 && connectivity != 8 ) + const int connectivity = flags & 255; + if( connectivity != 0 && connectivity != 4 && connectivity != 8 ) CV_Error( CV_StsBadFlag, "Connectivity must be 4, 0(=4) or 8" ); bool is_simple = mask.empty() && (flags & FLOODFILL_MASK_ONLY) == 0; diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp index f3ddeaf78c..6eb848068b 100644 --- a/modules/imgproc/src/histogram.cpp +++ b/modules/imgproc/src/histogram.cpp @@ -1930,7 +1930,7 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) Mat planes[2]; NAryMatIterator it(arrays, planes); double result = 0; - int j, len = (int)it.size; + int j; CV_Assert( H1.type() == H2.type() && H1.depth() == CV_32F ); @@ -1946,7 +1946,7 @@ double cv::compareHist( InputArray _H1, InputArray _H2, int method ) { const float* h1 = it.planes[0].ptr(); const float* h2 = it.planes[1].ptr(); - len = it.planes[0].rows*it.planes[0].cols*H1.channels(); + const int len = it.planes[0].rows*it.planes[0].cols*H1.channels(); j = 0; if( (method == CV_COMP_CHISQR) || (method == CV_COMP_CHISQR_ALT)) diff --git a/modules/imgproc/src/hough.cpp b/modules/imgproc/src/hough.cpp index 18c532253f..36f2f6739f 100644 --- a/modules/imgproc/src/hough.cpp +++ b/modules/imgproc/src/hough.cpp @@ -413,7 +413,6 @@ HoughLinesSDiv( InputArray image, OutputArray lines, int type, // Find peaks in maccum... for( index = 0; index < sfn; index++ ) { - i = 0; int pos = (int)(lst.size() - 1); if( pos < 0 || lst[pos].value < mcaccum[index] ) { diff --git a/modules/imgproc/src/min_enclosing_triangle.cpp b/modules/imgproc/src/min_enclosing_triangle.cpp index bb372fe338..4853a755d9 100644 --- a/modules/imgproc/src/min_enclosing_triangle.cpp +++ b/modules/imgproc/src/min_enclosing_triangle.cpp @@ -401,7 +401,6 @@ static void findMinimumAreaEnclosingTriangle(const std::vector &pol a = 1; b = 2; - c = 0; // Main algorithm steps diff --git a/modules/ml/src/ann_mlp.cpp b/modules/ml/src/ann_mlp.cpp index a83c615578..1e4691e69d 100644 --- a/modules/ml/src/ann_mlp.cpp +++ b/modules/ml/src/ann_mlp.cpp @@ -1259,7 +1259,7 @@ public: prev_dEdw_sign[i] = Mat::zeros(weights[i].size(), CV_8S); dEdw[i] = Mat::zeros(weights[i].size(), CV_64F); } - + CV_Assert(total > 0); int dcount0 = max_buf_size/(2*total); dcount0 = std::max( dcount0, 1 ); dcount0 = std::min( dcount0, count ); diff --git a/modules/videoio/src/cap_ffmpeg_impl.hpp b/modules/videoio/src/cap_ffmpeg_impl.hpp index 9e858add0e..de852abd2e 100644 --- a/modules/videoio/src/cap_ffmpeg_impl.hpp +++ b/modules/videoio/src/cap_ffmpeg_impl.hpp @@ -2351,9 +2351,6 @@ AVStream* OutputMediaStream_FFMPEG::addVideoStream(AVFormatContext *oc, CV_CODEC c->codec_type = AVMEDIA_TYPE_VIDEO; // put sample parameters - unsigned long long lbit_rate = static_cast(bitrate); - lbit_rate += (bitrate / 4); - lbit_rate = std::min(lbit_rate, static_cast(std::numeric_limits::max())); c->bit_rate = bitrate; // took advice from diff --git a/modules/videoio/src/container_avi.cpp b/modules/videoio/src/container_avi.cpp index c2a93698bd..d6a7a059e4 100644 --- a/modules/videoio/src/container_avi.cpp +++ b/modules/videoio/src/container_avi.cpp @@ -325,9 +325,6 @@ bool AVIReadContainer::parseStrl(char stream_id, Codecs codec_) if(m_file_stream && strh.m_four_cc == STRH_CC) { - uint64_t next_strl_list = m_file_stream->tellg(); - next_strl_list += strh.m_size; - AviStreamHeader strm_hdr; *m_file_stream >> strm_hdr; From 9c7040802cf3001ffee551d563640ee6ab2af1dd Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Tue, 24 Jul 2018 17:27:56 +0300 Subject: [PATCH 19/19] converted split() & merge() to wide univ intrinsics (#12044) * fixed/updated v_load_deinterleave and v_store_interleave intrinsics; modified split() and merge() functions to use those intrinsics * fixed a few compile errors and bug in v_load_deinterleave(ptr, v_uint32x4& a, v_uint32x4& b) * fixed few more compile errors --- .../include/opencv2/core/hal/intrin_avx.hpp | 822 +++++++++++------- .../include/opencv2/core/hal/intrin_neon.hpp | 77 ++ .../include/opencv2/core/hal/intrin_sse.hpp | 327 ++++--- .../include/opencv2/core/hal/intrin_vsx.hpp | 2 + modules/core/src/merge.cpp | 347 ++------ modules/core/src/split.cpp | 361 ++------ 6 files changed, 941 insertions(+), 995 deletions(-) diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index 8654f4f022..4ea66f5c0b 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -1609,392 +1609,592 @@ OPENCV_HAL_IMPL_AVX_EXTRACT(v_float32x8) OPENCV_HAL_IMPL_AVX_EXTRACT(v_float64x4) -/** Reinterpret **/ -// its up there with load and store operations - -/* de&interleave */ -#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_2CH(_Tpvec, _Tp, suffix) \ - inline void v_load_deinterleave(const _Tp* ptr, _Tpvec& a, _Tpvec& b) \ - { return v256_load_deinterleave_##suffix(ptr, a, b); } \ - inline void v_store_interleave(_Tp* ptr, const _Tpvec& a, const _Tpvec& b) \ - { return v256_store_interleave_2ch(ptr, a, b); } +///////////////////// load deinterleave ///////////////////////////// -#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_3CH(_Tpvec, _Tp, suffix) \ - inline void v_load_deinterleave \ - (const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c) \ - { return v256_load_deinterleave_##suffix(ptr, a, b, c); } \ - inline void v_store_interleave \ - (_Tp* ptr, const _Tpvec& a,const _Tpvec& b, const _Tpvec& c) \ - { return v256_store_interleave_##suffix(ptr, a, b, c); } +inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b ) +{ + __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 32)); -#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_4CH(_Tpvec, _Tp, suffix) \ - inline void v_load_deinterleave \ - (const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) \ - { return v256_load_deinterleave_##suffix(ptr, a, b, c, d); } \ - inline void v_store_interleave \ - (_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) \ - { return v256_store_interleave_##suffix(ptr, a, b, c, d); } + static const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + __m256i p0 = _mm256_shuffle_epi8(ab0, sh); + __m256i p1 = _mm256_shuffle_epi8(ab1, sh); + __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16); + __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16); + __m256i a0 = _mm256_unpacklo_epi64(pl, ph); + __m256i b0 = _mm256_unpackhi_epi64(pl, ph); + a = v_uint8x32(a0); + b = v_uint8x32(b0); +} + +inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b ) +{ + __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 16)); + + static const __m256i sh = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + __m256i p0 = _mm256_shuffle_epi8(ab0, sh); + __m256i p1 = _mm256_shuffle_epi8(ab1, sh); + __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16); + __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16); + __m256i a0 = _mm256_unpacklo_epi64(pl, ph); + __m256i b0 = _mm256_unpackhi_epi64(pl, ph); + a = v_uint16x16(a0); + b = v_uint16x16(b0); +} + +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b ) +{ + __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 8)); + + const int sh = 0+2*4+1*16+3*64; + __m256i p0 = _mm256_shuffle_epi32(ab0, sh); + __m256i p1 = _mm256_shuffle_epi32(ab1, sh); + __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16); + __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16); + __m256i a0 = _mm256_unpacklo_epi64(pl, ph); + __m256i b0 = _mm256_unpackhi_epi64(pl, ph); + a = v_uint32x8(a0); + b = v_uint32x8(b0); +} -#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_3n4CH(_Tpvec, _Tp, suffix) \ - OPENCV_HAL_IMPL_AVX_INTERLEAVE_3CH(_Tpvec, _Tp, suffix) \ - OPENCV_HAL_IMPL_AVX_INTERLEAVE_4CH(_Tpvec, _Tp, suffix) +inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b ) +{ + __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 4)); -#define OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(_Tpvec, _Tp, suffix) \ - OPENCV_HAL_IMPL_AVX_INTERLEAVE_2CH(_Tpvec, _Tp, suffix) \ - OPENCV_HAL_IMPL_AVX_INTERLEAVE_3n4CH(_Tpvec, _Tp, suffix) + __m256i pl = _mm256_permute2x128_si256(ab0, ab1, 0 + 2*16); + __m256i ph = _mm256_permute2x128_si256(ab0, ab1, 1 + 3*16); + __m256i a0 = _mm256_unpacklo_epi64(pl, ph); + __m256i b0 = _mm256_unpackhi_epi64(pl, ph); + a = v_uint64x4(a0); + b = v_uint64x4(b0); +} -/* **** */ -// -template -inline void v256_store_interleave_2ch(_Tp* ptr, const _Tpvec& a, const _Tpvec& b) +inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g, v_uint8x32& r ) { - _Tpvec ab0, ab1; - v_zip(a, b, ab0, ab1); - v_store(ptr, ab0); - v_store(ptr + _Tpvec::nlanes, ab1); -} + __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32)); + __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64)); -template -inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b) -{ - _Tpvec ab0 = v256_load(ptr); - _Tpvec ab1 = v256_load(ptr + _Tpvec::nlanes); - _Tpvec ab00, ab11; - v_recombine(ab0, ab1, ab00, ab11); - v256_zip(ab00, ab11, a, b); -} + __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16); + __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16); -/// -template -inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c) -{ - _Tpvec abc0 = v256_load(ptr); - _Tpvec abc1 = v256_load(ptr + _Tpvec::nlanes); - _Tpvec abc2 = v256_load(ptr + _Tpvec::nlanes * 2); + static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, + 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); + static const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, + -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1); - _Tpvec ab0 = v256_combine_diagonal(abc0, abc1); - _Tpvec bc1 = v256_combine_diagonal(abc1, abc2); - _Tpvec ac1 = v256_reverse_64(v256_combine_diagonal(abc2, abc0)); + __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1); + __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0); + __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1); - a = v256_unpacklo(ab0, ac1); - c = v256_unpackhi(ac1, bc1); - b = v256_alignr_64(bc1, ab0); -} + static const __m256i + sh_b = _mm256_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, + 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13), + sh_g = _mm256_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, + 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14), + sh_r = _mm256_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, + 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15); + b0 = _mm256_shuffle_epi8(b0, sh_b); + g0 = _mm256_shuffle_epi8(g0, sh_g); + r0 = _mm256_shuffle_epi8(r0, sh_r); + b = v_uint8x32(b0); + g = v_uint8x32(g0); + r = v_uint8x32(r0); +} -template -inline void v256_store_interleave_l4(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) -{ - _Tpvec ab0 = v256_unpacklo(a, b); - _Tpvec bc1 = v256_unpackhi(b, c); - _Tpvec ca10 = v256_swap_halves(v256_blend<0xa>(c, a)); +inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16& g, v_uint16x16& r ) +{ + __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16)); + __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32)); + + __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16); + __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16); + + static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, + 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0); + static const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, + -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0); + __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1); + __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1); + __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0); + static const __m256i sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, + 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); + static const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, + 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13); + static const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, + 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); + b0 = _mm256_shuffle_epi8(b0, sh_b); + g0 = _mm256_shuffle_epi8(g0, sh_g); + r0 = _mm256_shuffle_epi8(r0, sh_r); + + b = v_uint16x16(b0); + g = v_uint16x16(g0); + r = v_uint16x16(r0); +} + +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& b, v_uint32x8& g, v_uint32x8& r ) +{ + __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 8)); + __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 16)); + + __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16); + __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16); + + __m256i b0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_low, s02_high, 0x24), bgr1, 0x92); + __m256i g0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_high, s02_low, 0x92), bgr1, 0x24); + __m256i r0 = _mm256_blend_epi32(_mm256_blend_epi32(bgr1, s02_low, 0x24), s02_high, 0x92); + + b0 = _mm256_shuffle_epi32(b0, 0x6c); + g0 = _mm256_shuffle_epi32(g0, 0xb1); + r0 = _mm256_shuffle_epi32(r0, 0xc6); + + b = v_uint32x8(b0); + g = v_uint32x8(g0); + r = v_uint32x8(r0); +} + +inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& b, v_uint64x4& g, v_uint64x4& r ) +{ + __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 4)); + __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 8)); + + __m256i s01 = _mm256_blend_epi32(bgr0, bgr1, 0xf0); + __m256i s12 = _mm256_blend_epi32(bgr1, bgr2, 0xf0); + __m256i s20r = _mm256_permute4x64_epi64(_mm256_blend_epi32(bgr2, bgr0, 0xf0), 0x1b); + __m256i b0 = _mm256_unpacklo_epi64(s01, s20r); + __m256i g0 = _mm256_alignr_epi8(s12, s01, 8); + __m256i r0 = _mm256_unpackhi_epi64(s20r, s12); + + b = v_uint64x4(b0); + g = v_uint64x4(g0); + r = v_uint64x4(r0); +} + +inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g, v_uint8x32& r, v_uint8x32& a ) +{ + __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32)); + __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64)); + __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 96)); + static const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, + 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); + + __m256i p0 = _mm256_shuffle_epi8(bgr0, sh); + __m256i p1 = _mm256_shuffle_epi8(bgr1, sh); + __m256i p2 = _mm256_shuffle_epi8(bgr2, sh); + __m256i p3 = _mm256_shuffle_epi8(bgr3, sh); + + __m256i p01l = _mm256_unpacklo_epi32(p0, p1); + __m256i p01h = _mm256_unpackhi_epi32(p0, p1); + __m256i p23l = _mm256_unpacklo_epi32(p2, p3); + __m256i p23h = _mm256_unpackhi_epi32(p2, p3); + + __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16); + __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16); + __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16); + __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16); + + __m256i b0 = _mm256_unpacklo_epi32(pll, plh); + __m256i g0 = _mm256_unpackhi_epi32(pll, plh); + __m256i r0 = _mm256_unpacklo_epi32(phl, phh); + __m256i a0 = _mm256_unpackhi_epi32(phl, phh); - v_store(ptr, v256_combine_diagonal(ab0, ca10)); - v_store(ptr + _Tpvec::nlanes, v256_combine_diagonal(bc1, ab0)); - v_store(ptr + _Tpvec::nlanes * 2, v256_combine_diagonal(ca10, bc1)); + b = v_uint8x32(b0); + g = v_uint8x32(g0); + r = v_uint8x32(r0); + a = v_uint8x32(a0); } -//// -template -inline void v256_load_deinterleave_l4(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) +inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16& g, v_uint16x16& r, v_uint16x16& a ) { - _Tpvec abcd0 = v256_load(ptr); - _Tpvec abcd1 = v256_load(ptr + _Tpvec::nlanes); - _Tpvec abcd2 = v256_load(ptr + _Tpvec::nlanes * 2); - _Tpvec abcd3 = v256_load(ptr + _Tpvec::nlanes * 3); + __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16)); + __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32)); + __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 48)); + static const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); + __m256i p0 = _mm256_shuffle_epi8(bgr0, sh); + __m256i p1 = _mm256_shuffle_epi8(bgr1, sh); + __m256i p2 = _mm256_shuffle_epi8(bgr2, sh); + __m256i p3 = _mm256_shuffle_epi8(bgr3, sh); + + __m256i p01l = _mm256_unpacklo_epi32(p0, p1); + __m256i p01h = _mm256_unpackhi_epi32(p0, p1); + __m256i p23l = _mm256_unpacklo_epi32(p2, p3); + __m256i p23h = _mm256_unpackhi_epi32(p2, p3); - _Tpvec cd0ab0 = v256_alignr_128(abcd0, abcd2); - _Tpvec cd1ab1 = v256_alignr_128(abcd1, abcd3); + __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16); + __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16); + __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16); + __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16); - _Tpvec ab0 = v256_combine_diagonal(abcd0, cd0ab0); - _Tpvec ab1 = v256_combine_diagonal(abcd1, cd1ab1); - _Tpvec cd0 = v256_combine_diagonal(cd0ab0, abcd2); - _Tpvec cd1 = v256_combine_diagonal(cd1ab1, abcd3); + __m256i b0 = _mm256_unpacklo_epi32(pll, plh); + __m256i g0 = _mm256_unpackhi_epi32(pll, plh); + __m256i r0 = _mm256_unpacklo_epi32(phl, phh); + __m256i a0 = _mm256_unpackhi_epi32(phl, phh); - v256_zip(ab0, ab1, a, b); - v256_zip(cd0, cd1, c, d); + b = v_uint16x16(b0); + g = v_uint16x16(g0); + r = v_uint16x16(r0); + a = v_uint16x16(a0); } -template -inline void v256_store_interleave_l4(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& b, v_uint32x8& g, v_uint32x8& r, v_uint32x8& a ) { - _Tpvec ab0, ab1, cd0, cd1; - v256_zip(a, b, ab0, ab1); - v256_zip(c, d, cd0, cd1); - - _Tpvec ab0cd0 = v256_alignr_128(ab0, cd0); - _Tpvec ab1cd1 = v256_alignr_128(ab1, cd1); - - v_store(ptr, v256_combine_diagonal(ab0, ab0cd0)); - v_store(ptr + _Tpvec::nlanes, v256_combine_diagonal(ab1, ab1cd1)); - v_store(ptr + _Tpvec::nlanes * 2, v256_combine_diagonal(ab0cd0, cd0)); - v_store(ptr + _Tpvec::nlanes * 3, v256_combine_diagonal(ab1cd1, cd1)); -} + __m256i p0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i p1 = _mm256_loadu_si256((const __m256i*)(ptr + 8)); + __m256i p2 = _mm256_loadu_si256((const __m256i*)(ptr + 16)); + __m256i p3 = _mm256_loadu_si256((const __m256i*)(ptr + 24)); -OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint64x4, uint64, l4) -OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int64x4, int64, l4) -OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_float64x4, double, l4) + __m256i p01l = _mm256_unpacklo_epi32(p0, p1); + __m256i p01h = _mm256_unpackhi_epi32(p0, p1); + __m256i p23l = _mm256_unpacklo_epi32(p2, p3); + __m256i p23h = _mm256_unpackhi_epi32(p2, p3); -/* **** **** */ -// -inline void v256_load_deinterleave_l8(const float* ptr, v_float32x8& a, v_float32x8& b) -{ - v_float32x8 ab0 = v256_load(ptr); - v_float32x8 ab1 = v256_load(ptr + 8); + __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16); + __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16); + __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16); + __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16); - v_float32x8 ab0ab2, ab1ab3; - v_recombine(ab0, ab1, ab0ab2, ab1ab3); + __m256i b0 = _mm256_unpacklo_epi32(pll, plh); + __m256i g0 = _mm256_unpackhi_epi32(pll, plh); + __m256i r0 = _mm256_unpacklo_epi32(phl, phh); + __m256i a0 = _mm256_unpackhi_epi32(phl, phh); - a.val = _mm256_shuffle_ps(ab0ab2.val, ab1ab3.val, _MM_SHUFFLE(2, 0, 2, 0)); - b.val = _mm256_shuffle_ps(ab0ab2.val, ab1ab3.val, _MM_SHUFFLE(3, 1, 3, 1)); + b = v_uint32x8(b0); + g = v_uint32x8(g0); + r = v_uint32x8(r0); + a = v_uint32x8(a0); } -template -inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b) +inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& b, v_uint64x4& g, v_uint64x4& r, v_uint64x4& a ) { - v_float32x8 fa, fb; - v256_load_deinterleave_l8((float*)ptr, fa, fb); - a.val = v_reinterpret_as_u32(fa).val; - b.val = v_reinterpret_as_u32(fb).val; -} -/// -template -inline void v256_store_interleave_l8(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) -{ - _Tpvec ab0, ab1, bc0, bc1; - v256_zip(a, b, ab0, ab1); - v256_zip(b, c, bc0, bc1); + __m256i bgra0 = _mm256_loadu_si256((const __m256i*)ptr); + __m256i bgra1 = _mm256_loadu_si256((const __m256i*)(ptr + 4)); + __m256i bgra2 = _mm256_loadu_si256((const __m256i*)(ptr + 8)); + __m256i bgra3 = _mm256_loadu_si256((const __m256i*)(ptr + 12)); - _Tpvec cazg = v256_blend<0xaa>(c, a); - _Tpvec abc0abc1(_mm256_unpacklo_epi64(ab0.val, cazg.val)); - _Tpvec abc1abc2(_mm256_unpackhi_epi64(cazg.val, bc1.val)); - _Tpvec abc2abc0 = v256_reverse_64(v256_blend<0xcc>(ab1, bc0)); + __m256i l02 = _mm256_permute2x128_si256(bgra0, bgra2, 0 + 2*16); + __m256i h02 = _mm256_permute2x128_si256(bgra0, bgra2, 1 + 3*16); + __m256i l13 = _mm256_permute2x128_si256(bgra1, bgra3, 0 + 2*16); + __m256i h13 = _mm256_permute2x128_si256(bgra1, bgra3, 1 + 3*16); - _Tpvec abc0 = v256_combine_diagonal(abc0abc1, abc2abc0); - _Tpvec abc1 = v256_combine_diagonal(abc1abc2, abc0abc1); - _Tpvec abc2 = v256_combine_diagonal(abc2abc0, abc1abc2); + __m256i b0 = _mm256_unpacklo_epi64(l02, l13); + __m256i g0 = _mm256_unpackhi_epi64(l02, l13); + __m256i r0 = _mm256_unpacklo_epi64(h02, h13); + __m256i a0 = _mm256_unpackhi_epi64(h02, h13); - v_store(ptr, abc0); - v_store(ptr + _Tpvec::nlanes, abc1); - v_store(ptr + _Tpvec::nlanes * 2, abc2); + b = v_uint64x4(b0); + g = v_uint64x4(g0); + r = v_uint64x4(r0); + a = v_uint64x4(a0); } -inline void v256_store_interleave_l8(float* ptr, const v_float32x8& a, const v_float32x8& b, const v_float32x8& c) -{ - v_float32x8 ab0, ab1, bc0, bc1; - v256_zip(a, b, ab0, ab1); - v256_zip(b, c, bc0, bc1); +///////////////////////////// store interleave ///////////////////////////////////// - v_float32x8 cazg = v256_blend<0xaa>(c, a); - v_float32x8 abc0abc1(_mm256_shuffle_ps(ab0.val, cazg.val, _MM_SHUFFLE(1, 0, 1, 0))); - v_float32x8 abc1abc2(_mm256_shuffle_ps(cazg.val, bc1.val, _MM_SHUFFLE(3, 2, 3, 2))); - - v_float32x8 abc0abc2(_mm256_shuffle_ps(bc0.val, ab1.val, _MM_SHUFFLE(1, 0, 3, 2))); - v_float32x8 abc2abc0 = v256_swap_halves(abc0abc2); +inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y ) +{ + __m256i xy_l = _mm256_unpacklo_epi8(x.val, y.val); + __m256i xy_h = _mm256_unpackhi_epi8(x.val, y.val); - v_float32x8 abc0 = v256_combine_diagonal(abc0abc1, abc2abc0); - v_float32x8 abc1 = v256_combine_diagonal(abc1abc2, abc0abc1); - v_float32x8 abc2 = v256_combine_diagonal(abc2abc0, abc1abc2); + __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16); + __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16); - v_store(ptr, abc0); - v_store(ptr + 8, abc1); - v_store(ptr + 16, abc2); + _mm256_storeu_si256((__m256i*)ptr, xy0); + _mm256_storeu_si256((__m256i*)(ptr + 32), xy1); } -template -inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c) +inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y ) { - _Tpvec abc02 = v256_load(ptr); - _Tpvec abc1 = v256_load(ptr + _Tpvec::nlanes); - _Tpvec abc20 = v256_load(ptr + _Tpvec::nlanes * 2); + __m256i xy_l = _mm256_unpacklo_epi16(x.val, y.val); + __m256i xy_h = _mm256_unpackhi_epi16(x.val, y.val); - _Tpvec abc2 = v256_alignr_128(abc02, abc20); - _Tpvec abc0 = v256_combine_diagonal(abc02, abc20); + __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16); + __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16); - a = v256_blend<0x92>(abc0, abc1); - a = v256_blend<0x44>(a, abc2); + _mm256_storeu_si256((__m256i*)ptr, xy0); + _mm256_storeu_si256((__m256i*)(ptr + 16), xy1); +} - b = v256_blend<0x24>(abc0, abc1); - b = v256_blend<0x99>(b, abc2); +inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y ) +{ + __m256i xy_l = _mm256_unpacklo_epi32(x.val, y.val); + __m256i xy_h = _mm256_unpackhi_epi32(x.val, y.val); - c = v256_blend<0x49>(abc0, abc1); - c = v256_blend<0x22>(c, abc2); + __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16); + __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16); - a = v256_shuffle<_MM_SHUFFLE(1, 2, 3, 0)>(a); - b = v256_shuffle<_MM_SHUFFLE(2, 3, 0, 1)>(b); - c = v256_shuffle<_MM_SHUFFLE(3, 0, 1, 2)>(c); -} -///// -template -inline void v256_load_deinterleave_l8(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) -{ - _Tpvec ab0, ab1, cd0, cd1; - v256_load_deinterleave_l4(ptr, ab0, cd0, ab1, cd1); - v256_zip(ab0, ab1, a, b); - v256_zip(cd0, cd1, c, d); + _mm256_storeu_si256((__m256i*)ptr, xy0); + _mm256_storeu_si256((__m256i*)(ptr + 8), xy1); } -template -inline void v256_store_interleave_l8(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) +inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y ) { - _Tpvec ac0, ac1, bd0, bd1; - v256_zip(a, c, ac0, ac1); - v256_zip(b, d, bd0, bd1); - - _Tpvec abcd0, abcd1, abcd2, abcd3; - v256_zip(ac0, bd0, abcd0, abcd1); - v256_zip(ac1, bd1, abcd2, abcd3); + __m256i xy_l = _mm256_unpacklo_epi64(x.val, y.val); + __m256i xy_h = _mm256_unpackhi_epi64(x.val, y.val); - _Tpvec abcd01, abcd23, abcd45, abcd67; - v_recombine(abcd0, abcd1, abcd01, abcd45); - v_recombine(abcd2, abcd3, abcd23, abcd67); + __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16); + __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16); - v_store(ptr, abcd01); - v_store(ptr + _Tpvec::nlanes, abcd23); - v_store(ptr + _Tpvec::nlanes * 2, abcd45); - v_store(ptr + _Tpvec::nlanes * 3, abcd67); + _mm256_storeu_si256((__m256i*)ptr, xy0); + _mm256_storeu_si256((__m256i*)(ptr + 4), xy1); } -OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint32x8, unsigned, l8) -OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int32x8, int, l8) -OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_float32x8, float, l8) - -/* ******** ******** */ -// -template -inline void v256_load_deinterleave_l16(const _Tp* ptr, _Tpvec& a, _Tpvec& b) +inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r ) { - const __m256i sep = _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 - ); + static const __m256i sh_b = _mm256_setr_epi8( + 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, + 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5); + static const __m256i sh_g = _mm256_setr_epi8( + 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, + 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10); + static const __m256i sh_r = _mm256_setr_epi8( + 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, + 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15); - _Tpvec ab0, ab1; - v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes), ab0, ab1); + __m256i b0 = _mm256_shuffle_epi8(b.val, sh_b); + __m256i g0 = _mm256_shuffle_epi8(g.val, sh_g); + __m256i r0 = _mm256_shuffle_epi8(r.val, sh_r); - __m256i a0b0 = _mm256_shuffle_epi8(ab0.val, sep); - __m256i a1b1 = _mm256_shuffle_epi8(ab1.val, sep); - - a.val = _mm256_unpacklo_epi64(a0b0, a1b1); - b.val = _mm256_unpackhi_epi64(a0b0, a1b1); -} -/// -template -inline void v256_store_interleave_l16(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) -{ - v_uint32x8 ab0 = v_reinterpret_as_u32(v256_unpacklo(a, b)); - v_uint32x8 ab1 = v_reinterpret_as_u32(v256_unpackhi(a, b)); - v_uint32x8 bc0 = v_reinterpret_as_u32(v256_unpacklo(b, c)); - v_uint32x8 bc1 = v_reinterpret_as_u32(v256_unpackhi(b, c)); + static const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, + 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); + static const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, + 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); - v_uint32x8 cazg = v_reinterpret_as_u32(v256_blend<0xaa>(c, a)); - cazg = v256_shuffle<_MM_SHUFFLE(2, 1, 0, 3)>(cazg); + __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1); + __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1); + __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1); - v_uint32x8 ac1ab1 = v256_blend<0xaa>(ab1, bc1); - ac1ab1 = v256_shuffle<_MM_SHUFFLE(2, 1, 0, 3)>(ac1ab1); + __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16); + __m256i bgr1 = _mm256_permute2x128_si256(p2, p0, 0 + 3*16); + __m256i bgr2 = _mm256_permute2x128_si256(p1, p2, 1 + 3*16); - v_uint32x8 abc001 = v256_blend<0xaa>(ab0, cazg); - v_uint32x8 cabc0 = v256_blend<0xaa>(cazg, bc0); + _mm256_storeu_si256((__m256i*)ptr, bgr0); + _mm256_storeu_si256((__m256i*)(ptr + 32), bgr1); + _mm256_storeu_si256((__m256i*)(ptr + 64), bgr2); +} - v_uint32x8 cabc1 = v256_unpacklo(cabc0, ac1ab1); - v_uint32x8 bcab0 = v256_unpackhi(cabc1, abc001); +inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g, const v_uint16x16& r ) +{ + static const __m256i sh_b = _mm256_setr_epi8( + 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, + 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); + static const __m256i sh_g = _mm256_setr_epi8( + 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, + 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5); + static const __m256i sh_r = _mm256_setr_epi8( + 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, + 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); - v_uint64x4 abc01 = v256_unpacklo(v_reinterpret_as_u64(abc001), v_reinterpret_as_u64(bcab0)); - v_uint64x4 abc21 = v256_unpackhi(v_reinterpret_as_u64(cabc0), v_reinterpret_as_u64(bcab0)); - abc21 = v256_swap_halves(abc21); - v_uint64x4 abc12 = v_reinterpret_as_u64(v256_alignr_64(cabc1, ac1ab1)); + __m256i b0 = _mm256_shuffle_epi8(b.val, sh_b); + __m256i g0 = _mm256_shuffle_epi8(g.val, sh_g); + __m256i r0 = _mm256_shuffle_epi8(r.val, sh_r); - v_uint64x4 abc0 = v256_combine_diagonal(abc01, abc21); - v_uint64x4 abc1 = v256_combine_diagonal(abc12, abc01); - v_uint64x4 abc2 = v256_combine_diagonal(abc21, abc12); + static const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, + 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0); + static const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, + -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0); - v_store(ptr, _Tpvec(abc0.val)); - v_store(ptr + _Tpvec::nlanes, _Tpvec(abc1.val)); - v_store(ptr + _Tpvec::nlanes * 2, _Tpvec(abc2.val)); -} -// todo: -template -inline void v256_load_deinterleave_l16(const _Tp*, _Tpvec&, _Tpvec&, _Tpvec&) -{} -//// -template -inline void v256_load_deinterleave_l16(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) -{ - _Tpvec ab0, ab1, cd0, cd1; - v256_load_deinterleave_l8(ptr, ab0, cd0, ab1, cd1); - v256_zip(ab0, ab1, a, b); - v256_zip(cd0, cd1, c, d); -} + __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1); + __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1); + __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1); -template -inline void v256_store_interleave_l16(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) -{ v256_store_interleave_l8(ptr, a, b, c, d); } + __m256i bgr0 = _mm256_permute2x128_si256(p0, p2, 0 + 2*16); + //__m256i bgr1 = p1; + __m256i bgr2 = _mm256_permute2x128_si256(p0, p2, 1 + 3*16); -OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint16x16, ushort, l16) -OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int16x16, short, l16) + _mm256_storeu_si256((__m256i*)ptr, bgr0); + _mm256_storeu_si256((__m256i*)(ptr + 16), p1); + _mm256_storeu_si256((__m256i*)(ptr + 32), bgr2); +} -/* **************** **************** */ -// -template -inline void v256_load_deinterleave_l32(const _Tp* ptr, _Tpvec& a, _Tpvec& b) +inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g, const v_uint32x8& r ) { - const __m256i sep = _mm256_setr_epi8( - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 - ); + __m256i b0 = _mm256_shuffle_epi32(b.val, 0x6c); + __m256i g0 = _mm256_shuffle_epi32(g.val, 0xb1); + __m256i r0 = _mm256_shuffle_epi32(r.val, 0xc6); - _Tpvec ab0, ab1; - v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes), ab0, ab1); + __m256i p0 = _mm256_blend_epi32(_mm256_blend_epi32(b0, g0, 0x92), r0, 0x24); + __m256i p1 = _mm256_blend_epi32(_mm256_blend_epi32(g0, r0, 0x92), b0, 0x24); + __m256i p2 = _mm256_blend_epi32(_mm256_blend_epi32(r0, b0, 0x92), g0, 0x24); - __m256i a0b0 = _mm256_shuffle_epi8(ab0.val, sep); - __m256i a1b1 = _mm256_shuffle_epi8(ab1.val, sep); + __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16); + //__m256i bgr1 = p2; + __m256i bgr2 = _mm256_permute2x128_si256(p0, p1, 1 + 3*16); - a.val = _mm256_unpacklo_epi64(a0b0, a1b1); - b.val = _mm256_unpackhi_epi64(a0b0, a1b1); + _mm256_storeu_si256((__m256i*)ptr, bgr0); + _mm256_storeu_si256((__m256i*)(ptr + 8), p2); + _mm256_storeu_si256((__m256i*)(ptr + 16), bgr2); } -/// todo -template -inline void v256_store_interleave_l32(_Tp*, const _Tpvec&, const _Tpvec&, const _Tpvec&) -{} -template -inline void v256_load_deinterleave_l32(const _Tp*, _Tpvec&, _Tpvec&, _Tpvec&) -{} -//// -template -inline void v256_load_deinterleave_l32(const _Tp* ptr, _Tpvec& a, _Tpvec& b, _Tpvec& c, _Tpvec& d) +inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g, const v_uint64x4& r ) { - const __m256i sep = _mm256_setr_epi8( - 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, - 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 - ); - - _Tpvec abcd0, abcd1, abcd2, abcd3; - v_recombine(v256_load(ptr), v256_load(ptr + _Tpvec::nlanes * 2), abcd0, abcd1); - v_recombine(v256_load(ptr + _Tpvec::nlanes), v256_load(ptr + _Tpvec::nlanes * 3), abcd2, abcd3); + __m256i s01 = _mm256_unpacklo_epi64(b.val, g.val); + __m256i s12 = _mm256_unpackhi_epi64(g.val, r.val); + __m256i s20 = _mm256_blend_epi32(r.val, b.val, 0xcc); - __m256i ab0cd0 = _mm256_shuffle_epi8(abcd0.val, sep); - __m256i ab1cd1 = _mm256_shuffle_epi8(abcd1.val, sep); - __m256i ab2cd2 = _mm256_shuffle_epi8(abcd2.val, sep); - __m256i ab3cd3 = _mm256_shuffle_epi8(abcd3.val, sep); + __m256i bgr0 = _mm256_permute2x128_si256(s01, s20, 0 + 2*16); + __m256i bgr1 = _mm256_blend_epi32(s01, s12, 0x0f); + __m256i bgr2 = _mm256_permute2x128_si256(s20, s12, 1 + 3*16); - __m256i ab0 = _mm256_unpacklo_epi32(ab0cd0, ab1cd1); - __m256i ab1 = _mm256_unpacklo_epi32(ab2cd2, ab3cd3); - __m256i cd0 = _mm256_unpackhi_epi32(ab0cd0, ab1cd1); - __m256i cd1 = _mm256_unpackhi_epi32(ab2cd2, ab3cd3); - - a.val = _mm256_unpacklo_epi64(ab0, ab1); - b.val = _mm256_unpackhi_epi64(ab0, ab1); - c.val = _mm256_unpacklo_epi64(cd0, cd1); - d.val = _mm256_unpackhi_epi64(cd0, cd1); -} - -template -inline void v256_store_interleave_l32(_Tp* ptr, const _Tpvec& a, const _Tpvec& b, const _Tpvec& c, const _Tpvec& d) -{ v256_store_interleave_l8(ptr, a, b, c, d); } + _mm256_storeu_si256((__m256i*)ptr, bgr0); + _mm256_storeu_si256((__m256i*)(ptr + 4), bgr1); + _mm256_storeu_si256((__m256i*)(ptr + 8), bgr2); +} -OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_uint8x32, uchar, l32) -OPENCV_HAL_IMPL_AVX_INTERLEAVE_ACH(v_int8x32, schar, l32) +inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r, const v_uint8x32& a ) +{ + __m256i bg0 = _mm256_unpacklo_epi8(b.val, g.val); + __m256i bg1 = _mm256_unpackhi_epi8(b.val, g.val); + __m256i ra0 = _mm256_unpacklo_epi8(r.val, a.val); + __m256i ra1 = _mm256_unpackhi_epi8(r.val, a.val); + + __m256i bgra0_ = _mm256_unpacklo_epi16(bg0, ra0); + __m256i bgra1_ = _mm256_unpackhi_epi16(bg0, ra0); + __m256i bgra2_ = _mm256_unpacklo_epi16(bg1, ra1); + __m256i bgra3_ = _mm256_unpackhi_epi16(bg1, ra1); + + __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16); + __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16); + __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16); + __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16); + + _mm256_storeu_si256((__m256i*)ptr, bgra0); + _mm256_storeu_si256((__m256i*)(ptr + 32), bgra1); + _mm256_storeu_si256((__m256i*)(ptr + 64), bgra2); + _mm256_storeu_si256((__m256i*)(ptr + 96), bgra3); +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g, + const v_uint16x16& r, const v_uint16x16& a ) +{ + __m256i bg0 = _mm256_unpacklo_epi16(b.val, g.val); + __m256i bg1 = _mm256_unpackhi_epi16(b.val, g.val); + __m256i ra0 = _mm256_unpacklo_epi16(r.val, a.val); + __m256i ra1 = _mm256_unpackhi_epi16(r.val, a.val); + + __m256i bgra0_ = _mm256_unpacklo_epi32(bg0, ra0); + __m256i bgra1_ = _mm256_unpackhi_epi32(bg0, ra0); + __m256i bgra2_ = _mm256_unpacklo_epi32(bg1, ra1); + __m256i bgra3_ = _mm256_unpackhi_epi32(bg1, ra1); + + __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16); + __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16); + __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16); + __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16); + + _mm256_storeu_si256((__m256i*)ptr, bgra0); + _mm256_storeu_si256((__m256i*)(ptr + 16), bgra1); + _mm256_storeu_si256((__m256i*)(ptr + 32), bgra2); + _mm256_storeu_si256((__m256i*)(ptr + 48), bgra3); +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g, + const v_uint32x8& r, const v_uint32x8& a ) +{ + __m256i bg0 = _mm256_unpacklo_epi32(b.val, g.val); + __m256i bg1 = _mm256_unpackhi_epi32(b.val, g.val); + __m256i ra0 = _mm256_unpacklo_epi32(r.val, a.val); + __m256i ra1 = _mm256_unpackhi_epi32(r.val, a.val); + + __m256i bgra0_ = _mm256_unpacklo_epi64(bg0, ra0); + __m256i bgra1_ = _mm256_unpackhi_epi64(bg0, ra0); + __m256i bgra2_ = _mm256_unpacklo_epi64(bg1, ra1); + __m256i bgra3_ = _mm256_unpackhi_epi64(bg1, ra1); + + __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16); + __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16); + __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16); + __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16); + + _mm256_storeu_si256((__m256i*)ptr, bgra0); + _mm256_storeu_si256((__m256i*)(ptr + 8), bgra1); + _mm256_storeu_si256((__m256i*)(ptr + 16), bgra2); + _mm256_storeu_si256((__m256i*)(ptr + 24), bgra3); +} + +inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g, + const v_uint64x4& r, const v_uint64x4& a ) +{ + __m256i bg0 = _mm256_unpacklo_epi64(b.val, g.val); + __m256i bg1 = _mm256_unpackhi_epi64(b.val, g.val); + __m256i ra0 = _mm256_unpacklo_epi64(r.val, a.val); + __m256i ra1 = _mm256_unpackhi_epi64(r.val, a.val); + + __m256i bgra0 = _mm256_permute2x128_si256(bg0, ra0, 0 + 2*16); + __m256i bgra1 = _mm256_permute2x128_si256(bg1, ra1, 0 + 2*16); + __m256i bgra2 = _mm256_permute2x128_si256(bg0, ra0, 1 + 3*16); + __m256i bgra3 = _mm256_permute2x128_si256(bg1, ra1, 1 + 3*16); + + _mm256_storeu_si256((__m256i*)ptr, bgra0); + _mm256_storeu_si256((__m256i*)(ptr + 4), bgra1); + _mm256_storeu_si256((__m256i*)(ptr + 8), bgra2); + _mm256_storeu_si256((__m256i*)(ptr + 12), bgra3); +} + +#define OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \ +{ \ + _Tpvec1 a1, b1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ +} \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \ +{ \ + _Tpvec1 a1, b1, c1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ +} \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \ +{ \ + _Tpvec1 a1, b1, c1, d1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ + d0 = v_reinterpret_as_##suffix0(d1); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0 ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + v_store_interleave((_Tp1*)ptr, a1, b1); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0 ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + const _Tpvec0& c0, const _Tpvec0& d0 ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \ +} + +OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int8x32, schar, s8, v_uint8x32, uchar, u8) +OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int16x16, short, s16, v_uint16x16, ushort, u16) +OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int32x8, int, s32, v_uint32x8, unsigned, u32) +OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, unsigned, u32) +OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64) +OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64) inline void v256_cleanup() { _mm256_zeroupper(); } diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp index fdb3ec09cb..d8067306a5 100644 --- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp @@ -1318,6 +1318,80 @@ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& vst4q_##suffix(ptr, v); \ } +#define OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(tp, suffix) \ +inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b ) \ +{ \ + tp##x1_t a0 = vld1_##suffix(ptr); \ + tp##x1_t b0 = vld1_##suffix(ptr + 1); \ + tp##x1_t a1 = vld1_##suffix(ptr + 2); \ + tp##x1_t b1 = vld1_##suffix(ptr + 3); \ + a = v_##tp##x2(vcombine_##suffix(a0, a1)); \ + b = v_##tp##x2(vcombine_##suffix(b0, b1)); \ +} \ + \ +inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, \ + v_##tp##x2& b, v_##tp##x2& c ) \ +{ \ + tp##x1_t a0 = vld1_##suffix(ptr); \ + tp##x1_t b0 = vld1_##suffix(ptr + 1); \ + tp##x1_t c0 = vld1_##suffix(ptr + 2); \ + tp##x1_t a1 = vld1_##suffix(ptr + 3); \ + tp##x1_t b1 = vld1_##suffix(ptr + 4); \ + tp##x1_t c1 = vld1_##suffix(ptr + 5); \ + a = v_##tp##x2(vcombine_##suffix(a0, a1)); \ + b = v_##tp##x2(vcombine_##suffix(b0, b1)); \ + c = v_##tp##x2(vcombine_##suffix(c0, c1)); \ +} \ + \ +inline void v_load_deinterleave( const tp* ptr, v_##tp##x2& a, v_##tp##x2& b, \ + v_##tp##x2& c, v_##tp##x2& d ) \ +{ \ + tp##x1_t a0 = vld1_##suffix(ptr); \ + tp##x1_t b0 = vld1_##suffix(ptr + 1); \ + tp##x1_t c0 = vld1_##suffix(ptr + 2); \ + tp##x1_t d0 = vld1_##suffix(ptr + 3); \ + tp##x1_t a1 = vld1_##suffix(ptr + 4); \ + tp##x1_t b1 = vld1_##suffix(ptr + 5); \ + tp##x1_t c1 = vld1_##suffix(ptr + 6); \ + tp##x1_t d1 = vld1_##suffix(ptr + 7); \ + a = v_##tp##x2(vcombine_##suffix(a0, a1)); \ + b = v_##tp##x2(vcombine_##suffix(b0, b1)); \ + c = v_##tp##x2(vcombine_##suffix(c0, c1)); \ + d = v_##tp##x2(vcombine_##suffix(d0, d1)); \ +} \ + \ +inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b ) \ +{ \ + vst1_##suffix(ptr, vget_low_##suffix(a.val)); \ + vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \ + vst1_##suffix(ptr + 2, vget_high_##suffix(a.val)); \ + vst1_##suffix(ptr + 3, vget_high_##suffix(b.val)); \ +} \ + \ +inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, \ + const v_##tp##x2& b, const v_##tp##x2& c ) \ +{ \ + vst1_##suffix(ptr, vget_low_##suffix(a.val)); \ + vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \ + vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \ + vst1_##suffix(ptr + 3, vget_high_##suffix(a.val)); \ + vst1_##suffix(ptr + 4, vget_high_##suffix(b.val)); \ + vst1_##suffix(ptr + 5, vget_high_##suffix(c.val)); \ +} \ + \ +inline void v_store_interleave( tp* ptr, const v_##tp##x2& a, const v_##tp##x2& b, \ + const v_##tp##x2& c, const v_##tp##x2& d ) \ +{ \ + vst1_##suffix(ptr, vget_low_##suffix(a.val)); \ + vst1_##suffix(ptr + 1, vget_low_##suffix(b.val)); \ + vst1_##suffix(ptr + 2, vget_low_##suffix(c.val)); \ + vst1_##suffix(ptr + 3, vget_low_##suffix(d.val)); \ + vst1_##suffix(ptr + 4, vget_high_##suffix(a.val)); \ + vst1_##suffix(ptr + 5, vget_high_##suffix(b.val)); \ + vst1_##suffix(ptr + 6, vget_high_##suffix(c.val)); \ + vst1_##suffix(ptr + 7, vget_high_##suffix(d.val)); \ +} + OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint8x16, uchar, u8) OPENCV_HAL_IMPL_NEON_INTERLEAVED(int8x16, schar, s8) OPENCV_HAL_IMPL_NEON_INTERLEAVED(uint16x8, ushort, u16) @@ -1329,6 +1403,9 @@ OPENCV_HAL_IMPL_NEON_INTERLEAVED(float32x4, float, f32) OPENCV_HAL_IMPL_NEON_INTERLEAVED(float64x2, double, f64) #endif +OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(int64, s64) +OPENCV_HAL_IMPL_NEON_INTERLEAVED_INT64(uint64, u64) + inline v_float32x4 v_cvt_f32(const v_int32x4& a) { return v_float32x4(vcvtq_f32_s32(a.val)); diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp index b79ea16a4d..4971c777e4 100644 --- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp @@ -58,17 +58,6 @@ namespace cv CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN -struct v_uint8x16; -struct v_int8x16; -struct v_uint16x8; -struct v_int16x8; -struct v_uint32x4; -struct v_int32x4; -struct v_float32x4; -struct v_uint64x2; -struct v_int64x2; -struct v_float64x2; - struct v_uint8x16 { typedef uchar lane_type; @@ -1660,7 +1649,7 @@ OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_N OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps) -// adopted from sse_utils.hpp +// load deinterleave inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b) { __m128i t00 = _mm_loadu_si128((const __m128i*)ptr); @@ -1681,7 +1670,25 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b) inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c) { -#if CV_SSSE3 +#if CV_SSE4_1 + static const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); + static const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); + __m128i s0 = _mm_loadu_si128((const __m128i*)ptr); + __m128i s1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); + __m128i s2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); + __m128i a0 = _mm_blendv_epi8(_mm_blendv_epi8(s0, s1, m0), s2, m1); + __m128i b0 = _mm_blendv_epi8(_mm_blendv_epi8(s1, s2, m0), s0, m1); + __m128i c0 = _mm_blendv_epi8(_mm_blendv_epi8(s2, s0, m0), s1, m1); + static const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13); + static const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14); + static const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15); + a0 = _mm_shuffle_epi8(a0, sh_b); + b0 = _mm_shuffle_epi8(b0, sh_g); + c0 = _mm_shuffle_epi8(c0, sh_r); + a.val = a0; + b.val = b0; + c.val = c0; +#elif CV_SSSE3 static const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14); static const __m128i m1 = _mm_alignr_epi8(m0, m0, 11); static const __m128i m2 = _mm_alignr_epi8(m0, m0, 6); @@ -1753,8 +1760,41 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, d.val = _mm_unpackhi_epi8(v2, v3); } +inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b) +{ + __m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1 a2 b2 a3 b3 + __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7 + + __m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5 + __m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7 + __m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6 + __m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7 + + a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7 + b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 ab b3 b4 b5 b6 b7 +} + inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c) { +#if CV_SSE4_1 + __m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); + __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); + __m128i v2 = _mm_loadu_si128((__m128i*)(ptr + 16)); + __m128i a0 = _mm_blend_epi16(_mm_blend_epi16(v0, v1, 0x92), v2, 0x24); + __m128i b0 = _mm_blend_epi16(_mm_blend_epi16(v2, v0, 0x92), v1, 0x24); + __m128i c0 = _mm_blend_epi16(_mm_blend_epi16(v1, v2, 0x92), v0, 0x24); + + static const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); + static const __m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13); + static const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); + a0 = _mm_shuffle_epi8(a0, sh_a); + b0 = _mm_shuffle_epi8(b0, sh_b); + c0 = _mm_shuffle_epi8(c0, sh_c); + + a.val = a0; + b.val = b0; + c.val = c0; +#else __m128i t00 = _mm_loadu_si128((const __m128i*)ptr); __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8)); __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16)); @@ -1770,6 +1810,7 @@ inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21)); b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22); c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22)); +#endif } inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d) @@ -1795,6 +1836,18 @@ inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, d.val = _mm_unpackhi_epi16(u2, u3); } +inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b) +{ + __m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1 + __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 4)); // a2 b2 a3 b3 + + __m128i v2 = _mm_unpacklo_epi32(v0, v1); // a0 a2 b0 b2 + __m128i v3 = _mm_unpackhi_epi32(v0, v1); // a1 a3 b1 b3 + + a.val = _mm_unpacklo_epi32(v2, v3); // a0 a1 a2 a3 + b.val = _mm_unpackhi_epi32(v2, v3); // b0 b1 ab b3 +} + inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c) { __m128i t00 = _mm_loadu_si128((const __m128i*)ptr); @@ -1812,12 +1865,23 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d) { - v_uint32x4 u0(_mm_loadu_si128((const __m128i*)ptr)); // a0 b0 c0 d0 - v_uint32x4 u1(_mm_loadu_si128((const __m128i*)(ptr + 4))); // a1 b1 c1 d1 - v_uint32x4 u2(_mm_loadu_si128((const __m128i*)(ptr + 8))); // a2 b2 c2 d2 - v_uint32x4 u3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3 + v_uint32x4 s0(_mm_loadu_si128((const __m128i*)ptr)); // a0 b0 c0 d0 + v_uint32x4 s1(_mm_loadu_si128((const __m128i*)(ptr + 4))); // a1 b1 c1 d1 + v_uint32x4 s2(_mm_loadu_si128((const __m128i*)(ptr + 8))); // a2 b2 c2 d2 + v_uint32x4 s3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3 - v_transpose4x4(u0, u1, u2, u3, a, b, c, d); + v_transpose4x4(s0, s1, s2, s3, a, b, c, d); +} + +inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b) +{ + const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1); + + __m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1 + __m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3 + + a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3 + b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 ab b3 } inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c) @@ -1853,77 +1917,43 @@ inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b d.val = _mm_unpackhi_ps(t02hi, t13hi); } -inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c) +inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b) { __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); - __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); - a = v_uint64x2(_mm_unpacklo_epi64(t0, _mm_unpackhi_epi64(t1, t1))); - b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2)); - c = v_uint64x2(_mm_unpacklo_epi64(t1, _mm_unpackhi_epi64(t2, t2))); -} - -inline void v_load_deinterleave(const int64 *ptr, v_int64x2& a, v_int64x2& b, v_int64x2& c) -{ - v_uint64x2 t0, t1, t2; - v_load_deinterleave((const uint64*)ptr, t0, t1, t2); - a = v_reinterpret_as_s64(t0); - b = v_reinterpret_as_s64(t1); - c = v_reinterpret_as_s64(t2); -} - -inline void v_load_deinterleave(const double *ptr, v_float64x2& a, v_float64x2& b, v_float64x2& c) -{ - v_uint64x2 t0, t1, t2; - v_load_deinterleave((const uint64*)ptr, t0, t1, t2); - a = v_reinterpret_as_f64(t0); - b = v_reinterpret_as_f64(t1); - c = v_reinterpret_as_f64(t2); + a = v_uint64x2(_mm_unpacklo_epi64(t0, t1)); + b = v_uint64x2(_mm_unpackhi_epi64(t0, t1)); } -// 2-channel -inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b) +inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c) { - const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1); + __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0, b0 + __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0, a1 + __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // b1, c1 - __m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1 - __m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3 + t1 = _mm_shuffle_epi32(t1, 0x4e); // a1, c0 - a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3 - b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 ab b3 + a = v_uint64x2(_mm_unpacklo_epi64(t0, t1)); + b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2)); + c = v_uint64x2(_mm_unpackhi_epi64(t1, t2)); } -inline void v_load_deinterleave(const short* ptr, v_int16x8& a, v_int16x8& b) +inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, + v_uint64x2& b, v_uint64x2& c, v_uint64x2& d) { - __m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1 a2 b2 a3 b3 - __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7 - - __m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5 - __m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7 - __m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6 - __m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7 + __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 + __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0 d0 + __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // a1 b1 + __m128i t3 = _mm_loadu_si128((const __m128i*)(ptr + 6)); // c1 d1 - a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7 - b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 ab b3 b4 b5 b6 b7 + a = v_uint64x2(_mm_unpacklo_epi64(t0, t2)); + b = v_uint64x2(_mm_unpackhi_epi64(t0, t2)); + c = v_uint64x2(_mm_unpacklo_epi64(t1, t3)); + d = v_uint64x2(_mm_unpackhi_epi64(t1, t3)); } -inline void v_load_deinterleave(const ushort*ptr, v_uint16x8& a, v_uint16x8& b) -{ - v_int16x8 sa, sb; - v_load_deinterleave((const short*)ptr, sa, sb); - a = v_reinterpret_as_u16(sa); - b = v_reinterpret_as_u16(sb); -} - -inline void v_store_interleave(short* ptr, const v_int16x8& a, const v_int16x8& b) -{ - __m128i t0, t1; - t0 = _mm_unpacklo_epi16(a.val, b.val); - t1 = _mm_unpackhi_epi16(a.val, b.val); - _mm_storeu_si128((__m128i*)(ptr), t0); - _mm_storeu_si128((__m128i*)(ptr + 8), t1); -} +// store interleave inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b) { @@ -1937,7 +1967,24 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x1 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, const v_uint8x16& c ) { -#if CV_SSSE3 +#if CV_SSE4_1 + static const __m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5); + static const __m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10); + static const __m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15); + __m128i a0 = _mm_shuffle_epi8(a.val, sh_a); + __m128i b0 = _mm_shuffle_epi8(b.val, sh_b); + __m128i c0 = _mm_shuffle_epi8(c.val, sh_c); + + static const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0); + static const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0); + __m128i v0 = _mm_blendv_epi8(_mm_blendv_epi8(a0, b0, m1), c0, m0); + __m128i v1 = _mm_blendv_epi8(_mm_blendv_epi8(b0, c0, m1), a0, m0); + __m128i v2 = _mm_blendv_epi8(_mm_blendv_epi8(c0, a0, m1), b0, m0); + + _mm_storeu_si128((__m128i*)(ptr), v0); + _mm_storeu_si128((__m128i*)(ptr + 16), v1); + _mm_storeu_si128((__m128i*)(ptr + 32), v2); +#elif CV_SSSE3 static const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5); static const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10); static const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15); @@ -2025,10 +2072,35 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x1 _mm_storeu_si128((__m128i*)(ptr + 48), v3); } +inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b ) +{ + __m128i t0, t1; + t0 = _mm_unpacklo_epi16(a.val, b.val); + t1 = _mm_unpackhi_epi16(a.val, b.val); + _mm_storeu_si128((__m128i*)(ptr), t0); + _mm_storeu_si128((__m128i*)(ptr + 8), t1); +} + inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b, const v_uint16x8& c ) { +#if CV_SSE4_1 + static const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11); + static const __m128i sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5); + static const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15); + __m128i a0 = _mm_shuffle_epi8(a.val, sh_a); + __m128i b0 = _mm_shuffle_epi8(b.val, sh_b); + __m128i c0 = _mm_shuffle_epi8(c.val, sh_c); + + __m128i v0 = _mm_blend_epi16(_mm_blend_epi16(a0, b0, 0x92), c0, 0x24); + __m128i v1 = _mm_blend_epi16(_mm_blend_epi16(c0, a0, 0x92), b0, 0x24); + __m128i v2 = _mm_blend_epi16(_mm_blend_epi16(b0, c0, 0x92), a0, 0x24); + + _mm_storeu_si128((__m128i*)ptr, v0); + _mm_storeu_si128((__m128i*)(ptr + 8), v1); + _mm_storeu_si128((__m128i*)(ptr + 16), v2); +#else __m128i z = _mm_setzero_si128(); __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val); __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val); @@ -2060,6 +2132,7 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, _mm_storeu_si128((__m128i*)(ptr), v0); _mm_storeu_si128((__m128i*)(ptr + 8), v1); _mm_storeu_si128((__m128i*)(ptr + 16), v2); +#endif } inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b, @@ -2085,6 +2158,15 @@ inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16 _mm_storeu_si128((__m128i*)(ptr + 24), v3); } +inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b ) +{ + __m128i t0 = _mm_unpacklo_epi32(a.val, b.val); + __m128i t1 = _mm_unpackhi_epi32(a.val, b.val); + + _mm_storeu_si128((__m128i*)ptr, t0); + _mm_storeu_si128((__m128i*)(ptr + 4), t1); +} + inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b, const v_uint32x4& c ) { @@ -2158,6 +2240,15 @@ inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32 _mm_storeu_ps(ptr + 12, v3); } +inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b) +{ + __m128i t0 = _mm_unpacklo_epi64(a.val, b.val); + __m128i t1 = _mm_unpackhi_epi64(a.val, b.val); + + _mm_storeu_si128((__m128i*)ptr, t0); + _mm_storeu_si128((__m128i*)(ptr + 2), t1); +} + inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c) { __m128i t0 = _mm_unpacklo_epi64(a.val, b.val); @@ -2169,58 +2260,72 @@ inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x _mm_storeu_si128((__m128i*)(ptr + 4), t2); } -inline void v_store_interleave(int64 *ptr, const v_int64x2& a, const v_int64x2& b, const v_int64x2& c) +inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, const v_uint64x2& d) { - v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c)); -} + __m128i t0 = _mm_unpacklo_epi64(a.val, b.val); + __m128i t1 = _mm_unpacklo_epi64(c.val, d.val); + __m128i t2 = _mm_unpackhi_epi64(a.val, b.val); + __m128i t3 = _mm_unpackhi_epi64(c.val, d.val); -inline void v_store_interleave(double *ptr, const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) -{ - v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c)); + _mm_storeu_si128((__m128i*)ptr, t0); + _mm_storeu_si128((__m128i*)(ptr + 2), t1); + _mm_storeu_si128((__m128i*)(ptr + 4), t2); + _mm_storeu_si128((__m128i*)(ptr + 6), t3); } -#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \ -inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \ - _Tpvec& b0, _Tpvec& c0 ) \ +#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \ +{ \ + _Tpvec1 a1, b1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ +} \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \ +{ \ + _Tpvec1 a1, b1, c1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ +} \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \ { \ - _Tpuvec a1, b1, c1; \ - v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1); \ - a0 = v_reinterpret_as_##suffix(a1); \ - b0 = v_reinterpret_as_##suffix(b1); \ - c0 = v_reinterpret_as_##suffix(c1); \ + _Tpvec1 a1, b1, c1, d1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ + d0 = v_reinterpret_as_##suffix0(d1); \ } \ -inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \ - _Tpvec& b0, _Tpvec& c0, _Tpvec& d0 ) \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0 ) \ { \ - _Tpuvec a1, b1, c1, d1; \ - v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1, d1); \ - a0 = v_reinterpret_as_##suffix(a1); \ - b0 = v_reinterpret_as_##suffix(b1); \ - c0 = v_reinterpret_as_##suffix(c1); \ - d0 = v_reinterpret_as_##suffix(d1); \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + v_store_interleave((_Tp1*)ptr, a1, b1); \ } \ -inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, \ - const _Tpvec& b0, const _Tpvec& c0 ) \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0 ) \ { \ - _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \ - _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \ - _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \ - v_store_interleave((_Tpu*)ptr, a1, b1, c1); \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1); \ } \ -inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, const _Tpvec& b0, \ - const _Tpvec& c0, const _Tpvec& d0 ) \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + const _Tpvec0& c0, const _Tpvec0& d0 ) \ { \ - _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \ - _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \ - _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \ - _Tpuvec d1 = v_reinterpret_as_##usuffix(d0); \ - v_store_interleave((_Tpu*)ptr, a1, b1, c1, d1); \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \ } OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8) OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16) OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32) -//OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64) +OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64) inline v_float32x4 v_cvt_f32(const v_int32x4& a) { diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index 069e9578eb..9ad8234895 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -298,6 +298,8 @@ OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint, v_uint32x4) OPENCV_HAL_IMPL_VSX_INTERLEAVE(int, v_int32x4) OPENCV_HAL_IMPL_VSX_INTERLEAVE(float, v_float32x4) OPENCV_HAL_IMPL_VSX_INTERLEAVE(double, v_float64x2) +OPENCV_HAL_IMPL_VSX_INTERLEAVE(int64, v_int64x2) +OPENCV_HAL_IMPL_VSX_INTERLEAVE(uint64, v_uint64x2) /* Expand */ #define OPENCV_HAL_IMPL_VSX_EXPAND(_Tpvec, _Tpwvec, _Tp, fl, fh) \ diff --git a/modules/core/src/merge.cpp b/modules/core/src/merge.cpp index e1fe6ad1e9..a57d3bbb6e 100644 --- a/modules/core/src/merge.cpp +++ b/modules/core/src/merge.cpp @@ -8,223 +8,49 @@ namespace cv { namespace hal { -#if CV_NEON -template struct VMerge2; -template struct VMerge3; -template struct VMerge4; - -#define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name{ \ - void operator()(const data_type* src0, const data_type* src1, \ - data_type* dst){ \ - reg_type r; \ - r.val[0] = load_func(src0); \ - r.val[1] = load_func(src1); \ - store_func(dst, r); \ - } \ - } +#if CV_SIMD +template static void +vecmerge_( const T** src, T* dst, int len, int cn ) +{ + int i; + const T* src0 = src[0]; + const T* src1 = src[1]; -#define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name{ \ - void operator()(const data_type* src0, const data_type* src1, \ - const data_type* src2, data_type* dst){ \ - reg_type r; \ - r.val[0] = load_func(src0); \ - r.val[1] = load_func(src1); \ - r.val[2] = load_func(src2); \ - store_func(dst, r); \ - } \ + const int VECSZ = VecT::nlanes; + if( cn == 2 ) + { + for( i = 0; i < len; i += VECSZ ) + { + i = std::min( len - VECSZ, i ); + VecT a = vx_load(src0 + i), b = vx_load(src1 + i); + v_store_interleave(dst + i*cn, a, b); + } } - -#define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name{ \ - void operator()(const data_type* src0, const data_type* src1, \ - const data_type* src2, const data_type* src3, \ - data_type* dst){ \ - reg_type r; \ - r.val[0] = load_func(src0); \ - r.val[1] = load_func(src1); \ - r.val[2] = load_func(src2); \ - r.val[3] = load_func(src3); \ - store_func(dst, r); \ - } \ + else if( cn == 3 ) + { + const T* src2 = src[2]; + for( i = 0; i < len; i += VECSZ ) + { + i = std::min( len - VECSZ, i ); + VecT a = vx_load(src0 + i), b = vx_load(src1 + i), c = vx_load(src2 + i); + v_store_interleave(dst + i*cn, a, b, c); + } } - -MERGE2_KERNEL_TEMPLATE(VMerge2, uchar , uint8x16x2_t, vld1q_u8 , vst2q_u8 ); -MERGE2_KERNEL_TEMPLATE(VMerge2, ushort, uint16x8x2_t, vld1q_u16, vst2q_u16); -MERGE2_KERNEL_TEMPLATE(VMerge2, int , int32x4x2_t, vld1q_s32, vst2q_s32); -MERGE2_KERNEL_TEMPLATE(VMerge2, int64 , int64x1x2_t, vld1_s64 , vst2_s64 ); - -MERGE3_KERNEL_TEMPLATE(VMerge3, uchar , uint8x16x3_t, vld1q_u8 , vst3q_u8 ); -MERGE3_KERNEL_TEMPLATE(VMerge3, ushort, uint16x8x3_t, vld1q_u16, vst3q_u16); -MERGE3_KERNEL_TEMPLATE(VMerge3, int , int32x4x3_t, vld1q_s32, vst3q_s32); -MERGE3_KERNEL_TEMPLATE(VMerge3, int64 , int64x1x3_t, vld1_s64 , vst3_s64 ); - -MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 ); -MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16); -MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, vst4q_s32); -MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 ); - -#elif CV_SSE2 - -template -struct VMerge2 -{ - VMerge2() : support(false) { } - void operator()(const T *, const T *, T *) const { } - - bool support; -}; - -template -struct VMerge3 -{ - VMerge3() : support(false) { } - void operator()(const T *, const T *, const T *, T *) const { } - - bool support; -}; - -template -struct VMerge4 -{ - VMerge4() : support(false) { } - void operator()(const T *, const T *, const T *, const T *, T *) const { } - - bool support; -}; - -#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ -template <> \ -struct VMerge2 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VMerge2() \ - { \ - support = checkHardwareSupport(se); \ - } \ - \ - void operator()(const data_type * src0, const data_type * src1, \ - data_type * dst) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ - reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ - reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ - \ - _mm_interleave(v_src0, v_src1, v_src2, v_src3); \ - \ - _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ - } \ - \ - bool support; \ -} - -#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ -template <> \ -struct VMerge3 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VMerge3() \ - { \ - support = checkHardwareSupport(se); \ - } \ - \ - void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\ - data_type * dst) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ - reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ - reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ - reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \ - reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \ - \ - _mm_interleave(v_src0, v_src1, v_src2, \ - v_src3, v_src4, v_src5); \ - \ - _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \ - } \ - \ - bool support; \ -} - -#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ -template <> \ -struct VMerge4 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VMerge4() \ - { \ - support = checkHardwareSupport(se); \ - } \ - \ - void operator()(const data_type * src0, const data_type * src1, \ - const data_type * src2, const data_type * src3, \ - data_type * dst) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ - reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ - reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ - reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \ - reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \ - reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3)); \ - reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC)); \ - \ - _mm_interleave(v_src0, v_src1, v_src2, v_src3, \ - v_src4, v_src5, v_src6, v_src7); \ - \ - _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7); \ - } \ - \ - bool support; \ + else + { + CV_Assert( cn == 4 ); + const T* src2 = src[2]; + const T* src3 = src[3]; + for( i = 0; i < len; i += VECSZ ) + { + i = std::min( len - VECSZ, i ); + VecT a = vx_load(src0 + i), b = vx_load(src1 + i); + VecT c = vx_load(src2 + i), d = vx_load(src3 + i); + v_store_interleave(dst + i*cn, a, b, c, d); + } + } + vx_cleanup(); } - -MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); -MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); -MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); - -#if CV_SSE4_1 -MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); -MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); -MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); -#endif - -MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); -MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); -MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); - #endif template static void @@ -242,28 +68,6 @@ merge_( const T** src, T* dst, int len, int cn ) { const T *src0 = src[0], *src1 = src[1]; i = j = 0; -#if CV_NEON - if(cn == 2) - { - int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); - int inc_j = 2 * inc_i; - - VMerge2 vmerge; - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, dst + j); - } -#elif CV_SSE2 - if(cn == 2) - { - int inc_i = 32/sizeof(T); - int inc_j = 2 * inc_i; - - VMerge2 vmerge; - if (vmerge.support) - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, dst + j); - } -#endif for( ; i < len; i++, j += cn ) { dst[j] = src0[i]; @@ -274,28 +78,6 @@ merge_( const T** src, T* dst, int len, int cn ) { const T *src0 = src[0], *src1 = src[1], *src2 = src[2]; i = j = 0; -#if CV_NEON - if(cn == 3) - { - int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); - int inc_j = 3 * inc_i; - - VMerge3 vmerge; - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, src2 + i, dst + j); - } -#elif CV_SSE2 - if(cn == 3) - { - int inc_i = 32/sizeof(T); - int inc_j = 3 * inc_i; - - VMerge3 vmerge; - if (vmerge.support) - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, src2 + i, dst + j); - } -#endif for( ; i < len; i++, j += cn ) { dst[j] = src0[i]; @@ -307,28 +89,6 @@ merge_( const T** src, T* dst, int len, int cn ) { const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3]; i = j = 0; -#if CV_NEON - if(cn == 4) - { - int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); - int inc_j = 4 * inc_i; - - VMerge4 vmerge; - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j); - } -#elif CV_SSE2 - if(cn == 4) - { - int inc_i = 32/sizeof(T); - int inc_j = 4 * inc_i; - - VMerge4 vmerge; - if (vmerge.support) - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j); - } -#endif for( ; i < len; i++, j += cn ) { dst[j] = src0[i]; dst[j+1] = src1[i]; @@ -347,29 +107,48 @@ merge_( const T** src, T* dst, int len, int cn ) } } - void merge8u(const uchar** src, uchar* dst, int len, int cn ) { CALL_HAL(merge8u, cv_hal_merge8u, src, dst, len, cn) - merge_(src, dst, len, cn); +#if CV_SIMD + if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 ) + vecmerge_(src, dst, len, cn); + else +#endif + merge_(src, dst, len, cn); } void merge16u(const ushort** src, ushort* dst, int len, int cn ) { CALL_HAL(merge16u, cv_hal_merge16u, src, dst, len, cn) - merge_(src, dst, len, cn); +#if CV_SIMD + if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 ) + vecmerge_(src, dst, len, cn); + else +#endif + merge_(src, dst, len, cn); } void merge32s(const int** src, int* dst, int len, int cn ) { CALL_HAL(merge32s, cv_hal_merge32s, src, dst, len, cn) - merge_(src, dst, len, cn); +#if CV_SIMD + if( len >= v_int32::nlanes && 2 <= cn && cn <= 4 ) + vecmerge_(src, dst, len, cn); + else +#endif + merge_(src, dst, len, cn); } void merge64s(const int64** src, int64* dst, int len, int cn ) { CALL_HAL(merge64s, cv_hal_merge64s, src, dst, len, cn) - merge_(src, dst, len, cn); +#if CV_SIMD + if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 ) + vecmerge_(src, dst, len, cn); + else +#endif + merge_(src, dst, len, cn); } }} // cv::hal:: diff --git a/modules/core/src/split.cpp b/modules/core/src/split.cpp index 43896455e1..6f7b61ac7e 100644 --- a/modules/core/src/split.cpp +++ b/modules/core/src/split.cpp @@ -8,222 +8,57 @@ namespace cv { namespace hal { -#if CV_NEON -template struct VSplit2; -template struct VSplit3; -template struct VSplit4; - -#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name \ - { \ - void operator()(const data_type* src, data_type* dst0, \ - data_type* dst1) const \ - { \ - reg_type r = load_func(src); \ - store_func(dst0, r.val[0]); \ - store_func(dst1, r.val[1]); \ - } \ - } +#if CV_SIMD +template static void +vecsplit_( const T* src, T** dst, int len, int cn ) +{ + int i; + T* dst0 = dst[0]; + T* dst1 = dst[1]; -#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name \ - { \ - void operator()(const data_type* src, data_type* dst0, data_type* dst1, \ - data_type* dst2) const \ - { \ - reg_type r = load_func(src); \ - store_func(dst0, r.val[0]); \ - store_func(dst1, r.val[1]); \ - store_func(dst2, r.val[2]); \ - } \ + const int VECSZ = VecT::nlanes; + if( cn == 2 ) + { + for( i = 0; i < len; i += VECSZ ) + { + i = std::min( len - VECSZ, i ); + VecT a, b; + v_load_deinterleave(src + i*cn, a, b); + v_store(dst0 + i, a); + v_store(dst1 + i, b); + } } - -#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name \ - { \ - void operator()(const data_type* src, data_type* dst0, data_type* dst1, \ - data_type* dst2, data_type* dst3) const \ - { \ - reg_type r = load_func(src); \ - store_func(dst0, r.val[0]); \ - store_func(dst1, r.val[1]); \ - store_func(dst2, r.val[2]); \ - store_func(dst3, r.val[3]); \ - } \ + else if( cn == 3 ) + { + T* dst2 = dst[2]; + for( i = 0; i < len; i += VECSZ ) + { + i = std::min( len - VECSZ, i ); + VecT a, b, c; + v_load_deinterleave(src + i*cn, a, b, c); + v_store(dst0 + i, a); + v_store(dst1 + i, b); + v_store(dst2 + i, c); + } } - -SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar , uint8x16x2_t, vld2q_u8 , vst1q_u8 ); -SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort, uint16x8x2_t, vld2q_u16, vst1q_u16); -SPLIT2_KERNEL_TEMPLATE(VSplit2, int , int32x4x2_t, vld2q_s32, vst1q_s32); -SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 , int64x1x2_t, vld2_s64 , vst1_s64 ); - -SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar , uint8x16x3_t, vld3q_u8 , vst1q_u8 ); -SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort, uint16x8x3_t, vld3q_u16, vst1q_u16); -SPLIT3_KERNEL_TEMPLATE(VSplit3, int , int32x4x3_t, vld3q_s32, vst1q_s32); -SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 , int64x1x3_t, vld3_s64 , vst1_s64 ); - -SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar , uint8x16x4_t, vld4q_u8 , vst1q_u8 ); -SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort, uint16x8x4_t, vld4q_u16, vst1q_u16); -SPLIT4_KERNEL_TEMPLATE(VSplit4, int , int32x4x4_t, vld4q_s32, vst1q_s32); -SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 , int64x1x4_t, vld4_s64 , vst1_s64 ); - -#elif CV_SSE2 - -template -struct VSplit2 -{ - VSplit2() : support(false) { } - void operator()(const T *, T *, T *) const { } - - bool support; -}; - -template -struct VSplit3 -{ - VSplit3() : support(false) { } - void operator()(const T *, T *, T *, T *) const { } - - bool support; -}; - -template -struct VSplit4 -{ - VSplit4() : support(false) { } - void operator()(const T *, T *, T *, T *, T *) const { } - - bool support; -}; - -#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ -template <> \ -struct VSplit2 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VSplit2() \ - { \ - support = checkHardwareSupport(CV_CPU_SSE2); \ - } \ - \ - void operator()(const data_type * src, \ - data_type * dst0, data_type * dst1) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ - reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ - reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ - \ - _mm_deinterleave(v_src0, v_src1, v_src2, v_src3); \ - \ - _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ - } \ - \ - bool support; \ -} - -#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ -template <> \ -struct VSplit3 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VSplit3() \ - { \ - support = checkHardwareSupport(CV_CPU_SSE2); \ - } \ - \ - void operator()(const data_type * src, \ - data_type * dst0, data_type * dst1, data_type * dst2) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ - reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ - reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ - reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \ - reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \ - \ - _mm_deinterleave(v_src0, v_src1, v_src2, \ - v_src3, v_src4, v_src5); \ - \ - _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ - _mm_storeu_##flavor((cast_type *)(dst2), v_src4); \ - _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \ - } \ - \ - bool support; \ -} - -#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ -template <> \ -struct VSplit4 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VSplit4() \ - { \ - support = checkHardwareSupport(CV_CPU_SSE2); \ - } \ - \ - void operator()(const data_type * src, data_type * dst0, data_type * dst1, \ - data_type * dst2, data_type * dst3) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ - reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ - reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ - reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \ - reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \ - reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \ - reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \ - \ - _mm_deinterleave(v_src0, v_src1, v_src2, v_src3, \ - v_src4, v_src5, v_src6, v_src7); \ - \ - _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ - _mm_storeu_##flavor((cast_type *)(dst2), v_src4); \ - _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \ - _mm_storeu_##flavor((cast_type *)(dst3), v_src6); \ - _mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7); \ - } \ - \ - bool support; \ + else + { + CV_Assert( cn == 4 ); + T* dst2 = dst[2]; + T* dst3 = dst[3]; + for( i = 0; i < len; i += VECSZ ) + { + i = std::min( len - VECSZ, i ); + VecT a, b, c, d; + v_load_deinterleave(src + i*cn, a, b, c, d); + v_store(dst0 + i, a); + v_store(dst1 + i, b); + v_store(dst2 + i, c); + v_store(dst3 + i, d); + } + } + vx_cleanup(); } - -SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); -SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); -SPLIT2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); - -SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); -SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); -SPLIT3_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); - -SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); -SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); -SPLIT4_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); - #endif template static void @@ -250,30 +85,6 @@ split_( const T* src, T** dst, int len, int cn ) T *dst0 = dst[0], *dst1 = dst[1]; i = j = 0; -#if CV_NEON - if(cn == 2) - { - int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); - int inc_j = 2 * inc_i; - - VSplit2 vsplit; - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i); - } -#elif CV_SSE2 - if (cn == 2) - { - int inc_i = 32/sizeof(T); - int inc_j = 2 * inc_i; - - VSplit2 vsplit; - if (vsplit.support) - { - for( ; i <= len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i); - } - } -#endif for( ; i < len; i++, j += cn ) { dst0[i] = src[j]; @@ -285,31 +96,6 @@ split_( const T* src, T** dst, int len, int cn ) T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2]; i = j = 0; -#if CV_NEON - if(cn == 3) - { - int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); - int inc_j = 3 * inc_i; - - VSplit3 vsplit; - for( ; i <= len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); - } -#elif CV_SSE2 - if (cn == 3) - { - int inc_i = 32/sizeof(T); - int inc_j = 3 * inc_i; - - VSplit3 vsplit; - - if (vsplit.support) - { - for( ; i <= len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); - } - } -#endif for( ; i < len; i++, j += cn ) { dst0[i] = src[j]; @@ -322,30 +108,6 @@ split_( const T* src, T** dst, int len, int cn ) T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3]; i = j = 0; -#if CV_NEON - if(cn == 4) - { - int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); - int inc_j = 4 * inc_i; - - VSplit4 vsplit; - for( ; i <= len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); - } -#elif CV_SSE2 - if (cn == 4) - { - int inc_i = 32/sizeof(T); - int inc_j = 4 * inc_i; - - VSplit4 vsplit; - if (vsplit.support) - { - for( ; i <= len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); - } - } -#endif for( ; i < len; i++, j += cn ) { dst0[i] = src[j]; dst1[i] = src[j+1]; @@ -367,25 +129,46 @@ split_( const T* src, T** dst, int len, int cn ) void split8u(const uchar* src, uchar** dst, int len, int cn ) { CALL_HAL(split8u, cv_hal_split8u, src,dst, len, cn) - split_(src, dst, len, cn); + +#if CV_SIMD + if( len >= v_uint8::nlanes && 2 <= cn && cn <= 4 ) + vecsplit_(src, dst, len, cn); + else +#endif + split_(src, dst, len, cn); } void split16u(const ushort* src, ushort** dst, int len, int cn ) { CALL_HAL(split16u, cv_hal_split16u, src,dst, len, cn) - split_(src, dst, len, cn); +#if CV_SIMD + if( len >= v_uint16::nlanes && 2 <= cn && cn <= 4 ) + vecsplit_(src, dst, len, cn); + else +#endif + split_(src, dst, len, cn); } void split32s(const int* src, int** dst, int len, int cn ) { CALL_HAL(split32s, cv_hal_split32s, src,dst, len, cn) - split_(src, dst, len, cn); +#if CV_SIMD + if( len >= v_uint32::nlanes && 2 <= cn && cn <= 4 ) + vecsplit_(src, dst, len, cn); + else +#endif + split_(src, dst, len, cn); } void split64s(const int64* src, int64** dst, int len, int cn ) { CALL_HAL(split64s, cv_hal_split64s, src,dst, len, cn) - split_(src, dst, len, cn); +#if CV_SIMD + if( len >= v_int64::nlanes && 2 <= cn && cn <= 4 ) + vecsplit_(src, dst, len, cn); + else +#endif + split_(src, dst, len, cn); } }} // cv::hal::