diff --git a/.github/workflows/PR-5.x.yaml b/.github/workflows/PR-5.x.yaml
index 3d4f9f8147..505078a325 100644
--- a/.github/workflows/PR-5.x.yaml
+++ b/.github/workflows/PR-5.x.yaml
@@ -25,14 +25,16 @@ jobs:
   Windows10-x64:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-5.x-W10.yaml@main
-  Windows10-x64-Vulkan:
-    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-5.x-W10-Vulkan.yaml@main
+# Vulkan configuration disabled as Vulkan backend for DNN does not support int/int64 for now
+# Details: https://github.com/opencv/opencv/issues/25110
+#  Windows10-x64-Vulkan:
+#    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-5.x-W10-Vulkan.yaml@main
   macOS-ARM64:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-5.x-macOS-ARM64.yaml@main
-  macOS-ARM64-Vulkan:
-    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-5.x-macOS-ARM64-Vulkan.yaml@main
+#  macOS-ARM64-Vulkan:
+#    uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-5.x-macOS-ARM64-Vulkan.yaml@main
   macOS-x64:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-PR-5.x-macOS-x86_64.yaml@main
diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp
index 9d65ac162a..4f2ea3b063 100644
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@@ -62,6 +62,7 @@ CV__DNN_INLINE_NS_BEGIN
 //! @{
 
     typedef std::vector<int> MatShape;
+    typedef int MatType;
 
     /**
      * @brief Enum of computation backends supported by layers.
@@ -205,8 +206,16 @@ CV__DNN_INLINE_NS_BEGIN
          */
         virtual void setHostDirty() = 0;
 
+        int getHostMatDepth() {
+            CV_Assert(hostMatDepth != -1);
+            return hostMatDepth;
+        }
+
         int backendId;  //!< Backend identifier.
         int targetId;   //!< Target identifier.
+
+    protected:
+        int hostMatDepth = -1;
     };
 
     class CV_EXPORTS ActivationLayer;
@@ -397,6 +406,12 @@ CV__DNN_INLINE_NS_BEGIN
                                      std::vector<MatShape> &outputs,
                                      std::vector<MatShape> &internals) const;
 
+        virtual void getTypes(const std::vector<MatType>& inputs,
+                              const int requiredOutputs,
+                              const int requiredInternals,
+                              std::vector<MatType>& outputs,
+                              std::vector<MatType>& internals) const;
+
         virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                                const std::vector<MatShape> &outputs) const {CV_UNUSED(inputs); CV_UNUSED(outputs); return 0;}
 
@@ -675,6 +690,7 @@ CV__DNN_INLINE_NS_BEGIN
     /** @brief Returns input and output shapes for all layers in loaded model;
      * preliminary inferencing isn't necessary.
      * @param netInputShapes shapes for all input blobs in net input layer.
+     * @param netInputTypes types for all input blobs in net input layer.
      * @param layersIds output parameter for layer IDs.
      * @param inLayersShapes output parameter for input layers shapes;
      * order is the same as in layersIds
     * @param outLayersShapes output parameter for output layers shapes;
     * order is the same as in layersIds
     */
     CV_WRAP void getLayersShapes(const std::vector<MatShape>& netInputShapes,
+                                 const std::vector<MatType>& netInputTypes,
                                  CV_OUT std::vector<int>& layersIds,
                                  CV_OUT std::vector<std::vector<MatShape> >& inLayersShapes,
                                  CV_OUT std::vector<std::vector<MatShape> >& outLayersShapes) const;

     /** @overload */
     CV_WRAP void getLayersShapes(const MatShape& netInputShape,
+                                 const int& netInputType,
                                  CV_OUT std::vector<int>& layersIds,
                                  CV_OUT std::vector<std::vector<MatShape> >& inLayersShapes,
                                  CV_OUT std::vector<std::vector<MatShape> >& outLayersShapes) const;

@@ -695,6 +713,7 @@ CV__DNN_INLINE_NS_BEGIN
     /** @brief Returns input and output shapes for layer with specified
      * id in loaded model; preliminary inferencing isn't necessary.
      * @param netInputShape shape input blob in net input layer.
+     * @param netInputType input type in net input layer.
      * @param layerId id for layer.
     * @param inLayerShapes output parameter for input layers shapes;
     * order is the same as in layersIds
@@ -702,29 +721,36 @@ CV__DNN_INLINE_NS_BEGIN
     * order is the same as in layersIds
     */
     void getLayerShapes(const MatShape& netInputShape,
+                        const int& netInputType,
                         const int layerId,
                         CV_OUT std::vector<MatShape>& inLayerShapes,
                         CV_OUT std::vector<MatShape>& outLayerShapes) const; // FIXIT: CV_WRAP

     /** @overload */
     void getLayerShapes(const std::vector<MatShape>& netInputShapes,
+                        const std::vector<MatType>& netInputTypes,
                         const int layerId,
                         CV_OUT std::vector<MatShape>& inLayerShapes,
                         CV_OUT std::vector<MatShape>& outLayerShapes) const; // FIXIT: CV_WRAP

     /** @brief Computes FLOP for whole loaded model with specified input shapes.
      * @param netInputShapes vector of shapes for all net inputs.
+     * @param netInputTypes vector of types for all net inputs.
      * @returns computed FLOP.
      */
-    CV_WRAP int64 getFLOPS(const std::vector<MatShape>& netInputShapes) const;
+    CV_WRAP int64 getFLOPS(const std::vector<MatShape>& netInputShapes,
+                           const std::vector<MatType>& netInputTypes) const;
     /** @overload */
-    CV_WRAP int64 getFLOPS(const MatShape& netInputShape) const;
+    CV_WRAP int64 getFLOPS(const MatShape& netInputShape,
+                           const int& netInputType) const;
     /** @overload */
     CV_WRAP int64 getFLOPS(const int layerId,
-                           const std::vector<MatShape>& netInputShapes) const;
+                           const std::vector<MatShape>& netInputShapes,
+                           const std::vector<MatType>& netInputTypes) const;
     /** @overload */
     CV_WRAP int64 getFLOPS(const int layerId,
-                           const MatShape& netInputShape) const;
+                           const MatShape& netInputShape,
+                           const int& netInputType) const;

     /** @brief Returns list of types for layer used in model.
      * @param layersTypes output parameter for returning types.
@@ -740,36 +766,44 @@ CV__DNN_INLINE_NS_BEGIN
     /** @brief Computes bytes number which are required to store
      * all weights and intermediate blobs for model.
      * @param netInputShapes vector of shapes for all net inputs.
+     * @param netInputTypes vector of types for all net inputs.
      * @param weights output parameter to store resulting bytes for weights.
      * @param blobs output parameter to store resulting bytes for intermediate blobs.
      */
     void getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
+                              const std::vector<MatType>& netInputTypes,
                               CV_OUT size_t& weights,
                               CV_OUT size_t& blobs) const; // FIXIT: CV_WRAP
     /** @overload */
     CV_WRAP void getMemoryConsumption(const MatShape& netInputShape,
+                                      const int& netInputType,
                                       CV_OUT size_t& weights,
                                       CV_OUT size_t& blobs) const;
     /** @overload */
     CV_WRAP void getMemoryConsumption(const int layerId,
                                       const std::vector<MatShape>& netInputShapes,
+                                      const std::vector<MatType>& netInputTypes,
                                       CV_OUT size_t& weights,
                                       CV_OUT size_t& blobs) const;
     /** @overload */
     CV_WRAP void getMemoryConsumption(const int layerId,
                                       const MatShape& netInputShape,
+                                      const int& netInputType,
                                       CV_OUT size_t& weights,
                                       CV_OUT size_t& blobs) const;

     /** @brief Computes bytes number which are required to store
      * all weights and intermediate blobs for each layer.
      * @param netInputShapes vector of shapes for all net inputs.
+     * @param netInputTypes vector of types for all net inputs.
      * @param layerIds output vector to save layer IDs.
      * @param weights output parameter to store resulting bytes for weights.
      * @param blobs output parameter to store resulting bytes for intermediate blobs.
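For orientation, the extended queries above now take an input type alongside each input shape. A minimal usage sketch, not part of the patch itself; the model file name and the 1x3x224x224 float input are placeholders:

    cv::dnn::Net net = cv::dnn::readNet("model.onnx");  // placeholder model path
    cv::dnn::MatShape inputShape = {1, 3, 224, 224};    // NCHW input blob shape
    cv::dnn::MatType inputType = CV_32F;                // e.g. CV_16F for DNN_TARGET_OPENCL_FP16
    size_t weights = 0, blobs = 0;
    net.getMemoryConsumption(inputShape, inputType, weights, blobs);
    int64_t flops = net.getFLOPS(inputShape, inputType);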
      */
     void getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
+                              const std::vector<MatType>& netInputTypes,
                               CV_OUT std::vector<int>& layerIds,
                               CV_OUT std::vector<size_t>& weights,
                               CV_OUT std::vector<size_t>& blobs) const; // FIXIT: CV_WRAP
     /** @overload */
     void getMemoryConsumption(const MatShape& netInputShape,
+                              const int& netInputType,
                               CV_OUT std::vector<int>& layerIds,
                               CV_OUT std::vector<size_t>& weights,
                               CV_OUT std::vector<size_t>& blobs) const; // FIXIT: CV_WRAP
diff --git a/modules/dnn/misc/java/test/DnnListRegressionTest.java b/modules/dnn/misc/java/test/DnnListRegressionTest.java
index 4c357aff86..d30c9fcc86 100644
--- a/modules/dnn/misc/java/test/DnnListRegressionTest.java
+++ b/modules/dnn/misc/java/test/DnnListRegressionTest.java
@@ -97,10 +97,11 @@ public class DnnListRegressionTest extends OpenCVTestCase {
         int layerId = 1;
         List<MatOfInt> netInputShapes = new ArrayList();
         netInputShapes.add(new MatOfInt(1, 3, 224, 224));
+        MatOfInt netInputTypes = new MatOfInt(5);
         long[] weights=null;
         long[] blobs=null;
         try {
-            net.getMemoryConsumption(layerId, netInputShapes, weights, blobs);
+            net.getMemoryConsumption(layerId, netInputShapes, netInputTypes, weights, blobs);
         } catch(Exception e) {
             fail("Net getMemoryConsumption failed: " + e.getMessage());
         }
@@ -110,8 +111,9 @@ public class DnnListRegressionTest extends OpenCVTestCase {
         int layerId = 1;
         List<MatOfInt> netInputShapes = new ArrayList();
         netInputShapes.add(new MatOfInt(1, 3, 224, 224));
+        MatOfInt netInputTypes = new MatOfInt(5);
         try {
-            net.getFLOPS(layerId, netInputShapes);
+            net.getFLOPS(layerId, netInputShapes, netInputTypes);
         } catch(Exception e) {
             fail("Net getFLOPS failed: " + e.getMessage());
         }
diff --git a/modules/dnn/perf/perf_convolution.cpp b/modules/dnn/perf/perf_convolution.cpp
index 2c33969a76..65318a0437 100644
--- a/modules/dnn/perf/perf_convolution.cpp
+++ b/modules/dnn/perf/perf_convolution.cpp
@@ -886,9 +886,17 @@ Net build_net(
     Mat output = net.forward();

     MatShape netInputShape = shape(input);
+    cv::dnn::MatType netInputType = input.depth();
+
+    bool fp16 = false;
+#ifdef HAVE_OPENCL
+    fp16 = ocl::Device::getDefault().isExtensionSupported("cl_khr_fp16");
+#endif
+    if (netInputType == CV_32F && fp16 && targetId == DNN_TARGET_OPENCL_FP16)
+        netInputType = CV_16F;
     size_t weightsMemory = 0, blobsMemory = 0;
-    net.getMemoryConsumption(netInputShape, weightsMemory, blobsMemory);
-    int64 flops = net.getFLOPS(netInputShape);
+    net.getMemoryConsumption(netInputShape, netInputType, weightsMemory, blobsMemory);
+    int64 flops = net.getFLOPS(netInputShape, netInputType);
     CV_Assert(flops > 0);

     std::cout
diff --git a/modules/dnn/perf/perf_convolution1d.cpp b/modules/dnn/perf/perf_convolution1d.cpp
index 76e409475c..5ee8a515aa 100644
--- a/modules/dnn/perf/perf_convolution1d.cpp
+++ b/modules/dnn/perf/perf_convolution1d.cpp
@@ -136,9 +136,17 @@ PERF_TEST_P_(Conv1D, conv1d)
     Mat output = net.forward();

     MatShape netInputShape = shape(input);
+    cv::dnn::MatType netInputType = input.depth();
+
+    bool fp16 = false;
+#ifdef HAVE_OPENCL
+    fp16 = ocl::Device::getDefault().isExtensionSupported("cl_khr_fp16");
+#endif
+    if (netInputType == CV_32F && fp16 && targetId == DNN_TARGET_OPENCL_FP16)
+        netInputType = CV_16F;
     size_t weightsMemory = 0, blobsMemory = 0;
-    net.getMemoryConsumption(netInputShape, weightsMemory, blobsMemory);
-    int64 flops = net.getFLOPS(netInputShape);
+    net.getMemoryConsumption(netInputShape, netInputType, weightsMemory, blobsMemory);
+    int64 flops = net.getFLOPS(netInputShape, netInputType);
     CV_Assert(flops > 0);

     std::cout
diff --git a/modules/dnn/perf/perf_convolution3d.cpp
b/modules/dnn/perf/perf_convolution3d.cpp index 728bf4062a..2d69e72cc5 100644 --- a/modules/dnn/perf/perf_convolution3d.cpp +++ b/modules/dnn/perf/perf_convolution3d.cpp @@ -155,9 +155,17 @@ PERF_TEST_P_(Conv3D, conv3d) Mat output = net.forward(); MatShape netInputShape = shape(input); + cv::dnn::MatType netInputType = input.depth(); + + bool fp16 = false; +#ifdef HAVE_OPENCL + fp16 = ocl::Device::getDefault().isExtensionSupported("cl_khr_fp16"); +#endif + if (netInputType == CV_32F && fp16 && targetId == DNN_TARGET_OPENCL_FP16) + netInputType = CV_16F; size_t weightsMemory = 0, blobsMemory = 0; - net.getMemoryConsumption(netInputShape, weightsMemory, blobsMemory); - int64 flops = net.getFLOPS(netInputShape); + net.getMemoryConsumption(netInputShape, netInputType, weightsMemory, blobsMemory); + int64 flops = net.getFLOPS(netInputShape, netInputType); CV_Assert(flops > 0); std::cout diff --git a/modules/dnn/perf/perf_layer.cpp b/modules/dnn/perf/perf_layer.cpp index acdc778b3c..d436f95cc6 100644 --- a/modules/dnn/perf/perf_layer.cpp +++ b/modules/dnn/perf/perf_layer.cpp @@ -267,15 +267,13 @@ PERF_TEST_P_(Layer_Scatter, scatter) { int target_id = get<1>(get<3>(GetParam())); Mat data(shape, CV_32FC1); - Mat indices(shape, CV_32FC1); + Mat indices(shape, CV_64SC1); Mat updates(shape, CV_32FC1); randn(data, 0.f, 1.f); randu(indices, 0, shape[axis]); randn(updates, 0.f, 1.f); - indices.convertTo(indices, CV_32SC1, 1, -1); - Net net; LayerParams lp; lp.type = "Scatter"; @@ -334,7 +332,7 @@ PERF_TEST_P_(Layer_ScatterND, scatterND) { std::vector indices_shape(shape); indices_shape.push_back(int(shape.size())); Mat data(shape, CV_32FC1); - Mat indices(indices_shape, CV_32FC1); + Mat indices(indices_shape, CV_32SC1); Mat updates(shape, CV_32FC1); randn(data, 0.f, 1.f); @@ -346,11 +344,11 @@ PERF_TEST_P_(Layer_ScatterND, scatterND) { std::vector indices_step; for (int i = 0; i < indices.dims; i++) { - int step = indices.step.p[i] / sizeof(float); + int step = indices.step.p[i] / sizeof(int32_t); indices_step.push_back(step); } int t, j, idx, offset_at_idx, offset; - auto *indices_ptr = indices.ptr(); + auto *indices_ptr = indices.ptr(); for (int i = 0; i < total; i++) { t = i; @@ -629,7 +627,7 @@ struct Layer_GatherElements : public TestBaseWithParam > int targetId = get<1>(GetParam()); Mat data(data_shape, CV_32FC1); - Mat indices(indices_shape, CV_32FC1); + Mat indices(indices_shape, CV_64SC1); randu(data, 0.f, 1.f); randu(indices, 0, data_shape[axis]); diff --git a/modules/dnn/perf/perf_net.cpp b/modules/dnn/perf/perf_net.cpp index 1334327a9f..6280f4f6a9 100644 --- a/modules/dnn/perf/perf_net.cpp +++ b/modules/dnn/perf/perf_net.cpp @@ -47,13 +47,25 @@ public: for(auto &inp: inputs){ netMatShapes.push_back(shape(std::get<0>(inp))); } - size_t weightsMemory = 0, blobsMemory = 0; - net.getMemoryConsumption(netMatShapes, weightsMemory, blobsMemory); - int64 flops = net.getFLOPS(netMatShapes); - CV_Assert(flops > 0); + + bool fp16 = false; +#ifdef HAVE_OPENCL + fp16 = ocl::Device::getDefault().isExtensionSupported("cl_khr_fp16"); +#endif + std::vector netMatTypes; + for (auto& inp : inputs) { + cv::dnn::MatType t = std::get<0>(inp).depth(); + if (t == CV_32F && fp16 && target == DNN_TARGET_OPENCL_FP16) + t = CV_16F; + netMatTypes.push_back(t); + } net.forward(outputLayer); // warmup + size_t weightsMemory = 0, blobsMemory = 0; + net.getMemoryConsumption(netMatShapes, netMatTypes, weightsMemory, blobsMemory); + int64 flops = net.getFLOPS(netMatShapes, netMatTypes); + CV_Assert(flops > 0); std::cout << 
"Memory consumption:" << std::endl; std::cout << " Weights(parameters): " << divUp(weightsMemory, 1u<<20) << " Mb" << std::endl; std::cout << " Blobs: " << divUp(blobsMemory, 1u<<20) << " Mb" << std::endl; diff --git a/modules/dnn/src/cuda/concat.cu b/modules/dnn/src/cuda/concat.cu index 5250b59518..40491f48a9 100644 --- a/modules/dnn/src/cuda/concat.cu +++ b/modules/dnn/src/cuda/concat.cu @@ -152,6 +152,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { template void concat<__half>(const Stream&, TensorSpan<__half>, std::size_t, TensorView<__half>, std::size_t); #endif template void concat(const Stream&, TensorSpan, std::size_t, TensorView, std::size_t); + template void concat(const Stream&, TensorSpan, std::size_t, TensorView, std::size_t); + template void concat(const Stream&, TensorSpan, std::size_t, TensorView, std::size_t); template static void launch_concat_with_offsets( @@ -271,7 +273,11 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { concat_with_offsets_dispatcher(rank, stream, output, outStride, offsets, input, inStride); } +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) template void concat_with_offsets(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector); +#endif template void concat_with_offsets(const Stream&, TensorSpan, TensorView, std::vector); + template void concat_with_offsets(const Stream&, TensorSpan, TensorView, std::vector); + template void concat_with_offsets(const Stream&, TensorSpan, TensorView, std::vector); }}}} /* namespace cv::dnn::cuda4dnn::kernels */ diff --git a/modules/dnn/src/cuda/eltwise_ops.cu b/modules/dnn/src/cuda/eltwise_ops.cu index e2a7cc9a67..9bd0889371 100644 --- a/modules/dnn/src/cuda/eltwise_ops.cu +++ b/modules/dnn/src/cuda/eltwise_ops.cu @@ -371,4 +371,25 @@ void eltwise_fmod_2(const Stream& stream, TensorSpan output, TensorView x, template void eltwise_max_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y); template void eltwise_min_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y); + template void eltwise_mod_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y); + template void eltwise_fmod_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y); + template void eltwise_sub_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y); + template void eltwise_div_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y); + template void eltwise_prod_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y); + template void eltwise_sum_coeff_2(const Stream&, TensorSpan, int32_t, TensorView, int32_t, TensorView); + template void eltwise_sum_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y); + template void eltwise_max_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y); + template void eltwise_min_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y); + + template void eltwise_mod_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y); + template void eltwise_fmod_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y); + template void eltwise_sub_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y); + template void eltwise_div_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y); + template void eltwise_prod_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y); + template void eltwise_sum_coeff_2(const Stream&, 
TensorSpan, int64_t, TensorView, int64_t, TensorView); + template void eltwise_sum_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y); + template void eltwise_max_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y); + template void eltwise_min_2(const Stream& stream, TensorSpan output, TensorView x, TensorView y); + + }}}} /* namespace cv::dnn::cuda4dnn::kernels */ diff --git a/modules/dnn/src/cuda/fill_copy.cu b/modules/dnn/src/cuda/fill_copy.cu index 61cc63443a..9f8345e3cf 100644 --- a/modules/dnn/src/cuda/fill_copy.cu +++ b/modules/dnn/src/cuda/fill_copy.cu @@ -68,6 +68,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { #endif template void fill(const Stream&, Span, float); template void fill(const Stream&, Span, int); + template void fill(const Stream&, Span, int64_t); template static void launch_vectorized_copy(const Stream& stream, Span output, View input) { @@ -94,5 +95,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { template void copy(const Stream&, Span<__half>, View<__half>); #endif template void copy(const Stream&, Span, View); + template void copy(const Stream&, Span, View); + template void copy(const Stream&, Span, View); }}}} /* namespace cv::dnn::cuda4dnn::kernels */ diff --git a/modules/dnn/src/cuda/limits.hpp b/modules/dnn/src/cuda/limits.hpp index 7b7656a2c0..66a32c72e2 100644 --- a/modules/dnn/src/cuda/limits.hpp +++ b/modules/dnn/src/cuda/limits.hpp @@ -31,6 +31,20 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace de __device__ static float lowest() { return -FLT_MAX; } }; + template <> + struct numeric_limits { + __device__ static int32_t min() { return 1; } + __device__ static int32_t max() { return INT_MAX; } + __device__ static int32_t lowest() { return INT_MIN; } + }; + + template <> + struct numeric_limits { + __device__ static int64_t min() { return 1; } + __device__ static int64_t max() { return LLONG_MAX; } + __device__ static int64_t lowest() { return LLONG_MIN; } + }; + }}}}} /* namespace cv::dnn::cuda4dnn::csl::device */ #endif /* OPENCV_DNN_SRC_CUDA_LIMITS_HPP */ diff --git a/modules/dnn/src/cuda/max_unpooling.cu b/modules/dnn/src/cuda/max_unpooling.cu index 3bfd75f926..6884cf1005 100644 --- a/modules/dnn/src/cuda/max_unpooling.cu +++ b/modules/dnn/src/cuda/max_unpooling.cu @@ -30,10 +30,10 @@ using namespace cv::dnn::cuda4dnn::csl::device; namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { namespace raw { - template ::type = true> /* Order has been hardcoded; see code */ __global__ void max_pooling_with_indices( - Span output, Span indices, View input, size_type channels, + Span output, Span indices, View input, size_type channels, array out_spatial_dims, array in_spatial_dims, array window_size, array strides, array padding_left) { @@ -130,9 +130,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { } } - template + template __global__ void max_unpooling( - Span output, View input, View indices, size_type channels, + Span output, View input, View indices, size_type channels, array out_spatial_dims, array in_spatial_dims, array window_size, array strides, array padding_left) { @@ -164,15 +164,15 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { out_spatial_size *= out_spatial_dims[i]; index_type outer_offset = (n * channels + c) * out_spatial_size; - output[outer_offset + static_cast(indices[idx])] = input[idx]; + output[outer_offset + indices[idx]] = input[idx]; } 
} } - template static + template static void launch_max_pooling_kernel( const Stream& stream, - Span output, Span indices, View input, std::size_t channels, + Span output, Span indices, View input, std::size_t channels, const std::vector& out_spatial_dims, const std::vector& in_spatial_dims, const std::vector& window_size, const std::vector& strides, const std::vector& padding_left) @@ -193,16 +193,16 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { strides_k.assign(std::begin(strides), std::end(strides)); padding_left_k.assign(std::begin(padding_left), std::end(padding_left)); - auto kernel = raw::max_pooling_with_indices; + auto kernel = raw::max_pooling_with_indices; auto policy = make_policy(kernel, output.size(), 0, stream); launch_kernel(kernel, policy, output, indices, input, channels, out_spatial_dims_k, in_spatial_dims_k, window_size_k, strides_k, padding_left_k); } - template + template void max_pooling_with_indices( const Stream& stream, - TensorSpan output, TensorSpan indices, TensorView input, + TensorSpan output, TensorSpan indices, TensorView input, const std::vector& window_size, const std::vector& strides, const std::vector& padding_left) { @@ -224,33 +224,63 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { CV_Assert(1 <= order && order <= 3); std::size_t channels = input.get_axis_size(1); if (order == 3) { - launch_max_pooling_kernel(stream, output, indices, input, channels, + launch_max_pooling_kernel(stream, output, indices, input, channels, out_spatial_dims, in_spatial_dims, window_size, strides, padding_left); } else if (order == 2) { - launch_max_pooling_kernel(stream, output, indices, input, channels, + launch_max_pooling_kernel(stream, output, indices, input, channels, out_spatial_dims, in_spatial_dims, window_size, strides, padding_left); } else if (order == 1) { - launch_max_pooling_kernel(stream, output, indices, input, channels, + launch_max_pooling_kernel(stream, output, indices, input, channels, out_spatial_dims, in_spatial_dims, window_size, strides, padding_left); } } #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) template void max_pooling_with_indices(const Stream&, - TensorSpan<__half>, TensorSpan<__half>, TensorView<__half>, + TensorSpan<__half>, TensorSpan, TensorView<__half>, + const std::vector&, const std::vector&, + const std::vector&); + + template void max_pooling_with_indices(const Stream&, + TensorSpan<__half>, TensorSpan, TensorView<__half>, const std::vector&, const std::vector&, const std::vector&); #endif template void max_pooling_with_indices(const Stream&, - TensorSpan, TensorSpan, TensorView, + TensorSpan, TensorSpan, TensorView, + const std::vector&, const std::vector&, + const std::vector&); + + template void max_pooling_with_indices(const Stream&, + TensorSpan, TensorSpan, TensorView, + const std::vector&, const std::vector&, + const std::vector&); + + template void max_pooling_with_indices(const Stream&, + TensorSpan, TensorSpan, TensorView, + const std::vector&, const std::vector&, + const std::vector&); + + template void max_pooling_with_indices(const Stream&, + TensorSpan, TensorSpan, TensorView, + const std::vector&, const std::vector&, + const std::vector&); + + template void max_pooling_with_indices(const Stream&, + TensorSpan, TensorSpan, TensorView, + const std::vector&, const std::vector&, + const std::vector&); + + template void max_pooling_with_indices(const Stream&, + TensorSpan, TensorSpan, TensorView, const std::vector&, const std::vector&, const std::vector&); - 
template static + template static void launch_max_unpooling_kernel( const Stream& stream, - Span output, View input, View indices, std::size_t channels, + Span output, View input, View indices, std::size_t channels, const std::vector& out_spatial_dims, const std::vector& in_spatial_dims, const std::vector& window_size, const std::vector& strides, const std::vector& padding_left) @@ -271,16 +301,16 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { strides_k.assign(std::begin(strides), std::end(strides)); padding_left_k.assign(std::begin(padding_left), std::end(padding_left)); - auto kernel = raw::max_unpooling; + auto kernel = raw::max_unpooling; auto policy = make_policy(kernel, input.size(), 0, stream); launch_kernel(kernel, policy, output, input, indices, channels, out_spatial_dims_k, in_spatial_dims_k, window_size_k, strides_k, padding_left_k); } - template + template void max_unpooling( const Stream& stream, - TensorSpan output, TensorView input, TensorView indices, + TensorSpan output, TensorView input, TensorView indices, const std::vector& window_size, const std::vector& strides, const std::vector& padding_left) { @@ -305,23 +335,53 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { CV_Assert(2 <= order && order <= 3); std::size_t channels = input.get_axis_size(1); if (order == 3) { - launch_max_unpooling_kernel(stream, output, input, indices, channels, + launch_max_unpooling_kernel(stream, output, input, indices, channels, out_spatial_dims, in_spatial_dims, window_size, strides, padding_left); } else if (order == 2) { - launch_max_unpooling_kernel(stream, output, input, indices, channels, + launch_max_unpooling_kernel(stream, output, input, indices, channels, out_spatial_dims, in_spatial_dims, window_size, strides, padding_left); } } #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) template void max_unpooling(const Stream&, - TensorSpan<__half>, TensorView<__half>, TensorView<__half>, + TensorSpan<__half>, TensorView<__half>, TensorView, + const std::vector&, const std::vector&, + const std::vector&); + + template void max_unpooling(const Stream&, + TensorSpan<__half>, TensorView<__half>, TensorView, const std::vector&, const std::vector&, const std::vector&); #endif template void max_unpooling(const Stream&, - TensorSpan, TensorView, TensorView, + TensorSpan, TensorView, TensorView, + const std::vector&, const std::vector&, + const std::vector&); + + template void max_unpooling(const Stream&, + TensorSpan, TensorView, TensorView, + const std::vector&, const std::vector&, + const std::vector&); + + template void max_unpooling(const Stream&, + TensorSpan, TensorView, TensorView, + const std::vector&, const std::vector&, + const std::vector&); + + template void max_unpooling(const Stream&, + TensorSpan, TensorView, TensorView, + const std::vector&, const std::vector&, + const std::vector&); + + template void max_unpooling(const Stream&, + TensorSpan, TensorView, TensorView, + const std::vector&, const std::vector&, + const std::vector&); + + template void max_unpooling(const Stream&, + TensorSpan, TensorView, TensorView, const std::vector&, const std::vector&, const std::vector&); diff --git a/modules/dnn/src/cuda4dnn/kernels/max_unpooling.hpp b/modules/dnn/src/cuda4dnn/kernels/max_unpooling.hpp index 6fe4d61aaa..2b18edd20a 100644 --- a/modules/dnn/src/cuda4dnn/kernels/max_unpooling.hpp +++ b/modules/dnn/src/cuda4dnn/kernels/max_unpooling.hpp @@ -13,17 +13,17 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels 
{ - template + template void max_pooling_with_indices( const csl::Stream& stream, - csl::TensorSpan output, csl::TensorSpan indices, csl::TensorView input, + csl::TensorSpan output, csl::TensorSpan indices, csl::TensorView input, const std::vector& kernel_size, const std::vector& strides, const std::vector& padding_left); - template + template void max_unpooling( const csl::Stream& stream, - csl::TensorSpan output, csl::TensorView input, csl::TensorView indices, + csl::TensorSpan output, csl::TensorView input, csl::TensorView indices, const std::vector& window_size, const std::vector& strides, const std::vector& padding_left); diff --git a/modules/dnn/src/cuda4dnn/primitives/max_unpooling.hpp b/modules/dnn/src/cuda4dnn/primitives/max_unpooling.hpp index fc1002fc4e..759cf65b16 100644 --- a/modules/dnn/src/cuda4dnn/primitives/max_unpooling.hpp +++ b/modules/dnn/src/cuda4dnn/primitives/max_unpooling.hpp @@ -39,7 +39,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { std::vector input_shape; }; - template + template class MaxPoolingOp final : public CUDABackendNode { public: using wrapper_type = GetCUDABackendWrapperType; @@ -103,10 +103,10 @@ namespace cv { namespace dnn { namespace cuda4dnn { auto output_wrapper = outputs[0].dynamicCast(); auto output_data = output_wrapper->getSpan(); - auto indices_wrapper = outputs[1].dynamicCast(); + auto indices_wrapper = outputs[1].dynamicCast>(); auto output_indices = indices_wrapper->getSpan(); - kernels::max_pooling_with_indices( + kernels::max_pooling_with_indices( stream, output_data, output_indices, input_data, window_size, strides, padding_left ); } @@ -124,7 +124,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { std::vector pads_begin; }; - template + template class MaxUnpoolingOp final : public CUDABackendNode { public: using wrapper_type = GetCUDABackendWrapperType; @@ -160,13 +160,13 @@ namespace cv { namespace dnn { namespace cuda4dnn { auto input_wrapper = inputs[0].dynamicCast(); auto input_data = input_wrapper->getView(); - auto indices_wrapper = inputs[1].dynamicCast(); + auto indices_wrapper = inputs[1].dynamicCast>(); auto input_indices = indices_wrapper->getView(); auto output_wrapper = outputs[i].dynamicCast(); auto output_data = output_wrapper->getSpan(); - kernels::max_unpooling(stream, output_data, input_data, input_indices, window_size, strides, padding_left); + kernels::max_unpooling(stream, output_data, input_data, input_indices, window_size, strides, padding_left); } } diff --git a/modules/dnn/src/dnn_common.hpp b/modules/dnn/src/dnn_common.hpp index 82b7e845ac..83709443ad 100644 --- a/modules/dnn/src/dnn_common.hpp +++ b/modules/dnn/src/dnn_common.hpp @@ -46,10 +46,12 @@ bool getParam_DNN_CHECK_NAN_INF_RAISE_ERROR(); inline namespace detail { typedef std::vector ShapesVec; +typedef std::vector TypesVec; struct LayerShapes { ShapesVec in, out, internal; + TypesVec inTypes, outTypes, internalTypes; // No guarantees that layer which support in-place computations // will be computed in-place (input.data_ptr == output.data_ptr). 
// If layer said that it could work in-place and layers after it diff --git a/modules/dnn/src/int8layers/quantization_utils.cpp b/modules/dnn/src/int8layers/quantization_utils.cpp index 146ad68257..e4b7cbaba9 100644 --- a/modules/dnn/src/int8layers/quantization_utils.cpp +++ b/modules/dnn/src/int8layers/quantization_utils.cpp @@ -113,6 +113,16 @@ public: return false; } + void getTypes(const std::vector& inputs, + const int requiredOutputs, + const int requiredInternals, + std::vector& outputs, + std::vector& internals) const CV_OVERRIDE + { + outputs.assign(requiredOutputs, CV_8S); + } + + virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE { std::vector inputs, outputs; @@ -239,6 +249,19 @@ public: return false; } + void getTypes(const std::vector& inputs, + const int requiredOutputs, + const int requiredInternals, + std::vector& outputs, + std::vector& internals) const CV_OVERRIDE + { + if (preferableTarget == DNN_TARGET_OPENCL_FP16) + outputs.assign(requiredOutputs, CV_16F); + else + outputs.assign(requiredOutputs, CV_32F); + } + + virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE { std::vector inputs, outputs; diff --git a/modules/dnn/src/layer.cpp b/modules/dnn/src/layer.cpp index d00a220ff5..f1f1a42e59 100644 --- a/modules/dnn/src/layer.cpp +++ b/modules/dnn/src/layer.cpp @@ -181,20 +181,32 @@ void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays inputs.resize(orig_inputs.size()); for (size_t i = 0; i < orig_inputs.size(); i++) - orig_inputs[i].convertTo(inputs[i], CV_32F); + if (orig_inputs[i].depth() == CV_16F) + orig_inputs[i].convertTo(inputs[i], CV_32F); + else + inputs[i] = orig_inputs[i]; outputs.resize(orig_outputs.size()); for (size_t i = 0; i < orig_outputs.size(); i++) - outputs[i].create(shape(orig_outputs[i]), CV_32F); + if (orig_outputs[i].depth() == CV_16F) + outputs[i].create(shape(orig_outputs[i]), CV_32F); + else + outputs[i] = orig_outputs[i]; internals.resize(orig_internals.size()); for (size_t i = 0; i < orig_internals.size(); i++) - internals[i].create(shape(orig_internals[i]), CV_32F); + if (orig_internals[i].depth() == CV_16F) + internals[i].create(shape(orig_internals[i]), CV_32F); + else + internals[i] = orig_internals[i]; forward(inputs, outputs, internals); for (size_t i = 0; i < outputs.size(); i++) - outputs[i].convertTo(orig_outputs[i], CV_16F); + if (orig_outputs[i].depth() == CV_16F) + outputs[i].convertTo(orig_outputs[i], CV_16F); + else + outputs[i] = orig_outputs[i]; // sync results back outputs_arr.assign(orig_outputs); @@ -240,6 +252,25 @@ bool Layer::getMemoryShapes(const std::vector& inputs, return false; } +void Layer::getTypes(const std::vector&inputs, + const int requiredOutputs, + const int requiredInternals, + std::vector&outputs, + std::vector&internals) const +{ + CV_Assert(inputs.size()); + for (auto input : inputs) + if (preferableTarget == DNN_TARGET_CUDA_FP16 || preferableTarget == DNN_TARGET_CUDA) + CV_CheckTypeEQ(input, CV_32F, ""); + else if (preferableTarget == DNN_TARGET_OPENCL_FP16) + CV_CheckType(input, input == CV_16F || input == CV_8S, ""); + else + CV_CheckType(input, input == CV_32F || input == CV_8S, ""); + + outputs.assign(requiredOutputs, inputs[0]); + internals.assign(requiredInternals, inputs[0]); +} + bool Layer::updateMemoryShapes(const std::vector& inputs) { return true; diff --git a/modules/dnn/src/layer_internals.hpp b/modules/dnn/src/layer_internals.hpp index 149fb14866..eefb06a230 100644 --- 
a/modules/dnn/src/layer_internals.hpp +++ b/modules/dnn/src/layer_internals.hpp @@ -146,14 +146,19 @@ struct DataLayer : public Layer CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget), forward_ocl(inputs_arr, outputs_arr, internals_arr)) - bool isFP16 = outputs_arr.depth() == CV_16F; - std::vector outputs, internals; outputs_arr.getMatVector(outputs); internals_arr.getMatVector(internals); for (int i = 0; i < inputsData.size(); ++i) { + bool isFP16 = outputs[i].depth() == CV_16F; + if (inputsData[i].type() == CV_32S || inputsData[i].type() == CV_64S) { + CV_CheckTypeEQ(outputs[i].type(), inputsData[i].type(), ""); + CV_Assert(means[i] == Scalar() && scaleFactors[i] == 1.0); + inputsData[i].copyTo(outputs[i]); + continue; + } double scale = scaleFactors[i]; Scalar& mean = means[i]; @@ -209,13 +214,18 @@ struct DataLayer : public Layer #ifdef HAVE_OPENCL bool forward_ocl(InputArrayOfArrays, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_) { - bool isFP16 = outputs_.depth() == CV_16F; - std::vector outputs; outputs_.getUMatVector(outputs); for (int i = 0; i < inputsData.size(); ++i) { + bool isFP16 = outputs[i].depth() == CV_16F; + if (inputsData[i].type() == CV_32S || inputsData[i].type() == CV_64S) { + CV_CheckTypeEQ(outputs[i].type(), inputsData[i].type(), ""); + CV_Assert(means[i] == Scalar() && scaleFactors[i] == 1.0); + inputsData[i].copyTo(outputs[i]); + continue; + } Mat inputData = inputsData[i]; double scale = scaleFactors[i]; @@ -228,9 +238,12 @@ struct DataLayer : public Layer CV_CheckTypeEQ(outputs[i].type(), CV_32FC1, ""); bool singleMean = true; - for (int j = 1; j < std::min(4, inputData.size[1]) && singleMean; ++j) + if (mean != Scalar()) { - singleMean = mean[j] == mean[j - 1]; + for (int j = 1; j < std::min(4, inputData.size[1]) && singleMean; ++j) + { + singleMean = mean[j] == mean[j - 1]; + } } if (singleMean) @@ -311,6 +324,16 @@ struct DataLayer : public Layer return false; } + void getTypes(const std::vector& inputs, + const int requiredOutputs, + const int requiredInternals, + std::vector& outputs, + std::vector& internals) const CV_OVERRIDE + { + CV_Assert(inputs.size()); + outputs = inputs; + } + virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays outputs_arr) CV_OVERRIDE { std::vector outputs; diff --git a/modules/dnn/src/layers/arg_layer.cpp b/modules/dnn/src/layers/arg_layer.cpp index 94af45882a..f605012b78 100644 --- a/modules/dnn/src/layers/arg_layer.cpp +++ b/modules/dnn/src/layers/arg_layer.cpp @@ -72,6 +72,15 @@ public: return false; } + virtual void getTypes(const std::vector& inputs, + const int requiredOutputs, + const int requiredInternals, + std::vector& outputs, + std::vector& internals) const CV_OVERRIDE + { + outputs.assign(1, CV_64S); + } + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE { CV_TRACE_FUNCTION(); @@ -98,7 +107,7 @@ public: } output = output.reshape(1, outShape); - output.convertTo(outputs[0], CV_32FC1); + output.convertTo(outputs[0], CV_64SC1); } private: diff --git a/modules/dnn/src/layers/blank_layer.cpp b/modules/dnn/src/layers/blank_layer.cpp index 957130a218..1216ad1012 100644 --- a/modules/dnn/src/layers/blank_layer.cpp +++ b/modules/dnn/src/layers/blank_layer.cpp @@ -82,6 +82,17 @@ public: return true; } + void getTypes(const std::vector& inputs, + const int requiredOutputs, + const int requiredInternals, + std::vector& outputs, + std::vector& internals) const CV_OVERRIDE + { + CV_Assert(inputs.size()); + outputs = inputs; + } + + 
#ifdef HAVE_OPENCL bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_) { @@ -165,7 +176,7 @@ public: ) override { auto context = reinterpret_cast(context_); - return make_cuda_node(preferableTarget, std::move(context->stream)); + return make_cuda_node_with_type(preferableTarget, inputs[0]->getHostMatDepth(), std::move(context->stream)); } #endif }; diff --git a/modules/dnn/src/layers/concat_layer.cpp b/modules/dnn/src/layers/concat_layer.cpp index d1e2a8cae2..2f4985d446 100644 --- a/modules/dnn/src/layers/concat_layer.cpp +++ b/modules/dnn/src/layers/concat_layer.cpp @@ -115,6 +115,19 @@ public: return false; } + virtual void getTypes(const std::vector& inputs, + const int requiredOutputs, + const int requiredInternals, + std::vector& outputs, + std::vector& internals) const CV_OVERRIDE + { + CV_Assert(inputs.size()); + for (int i = 1; i < inputs.size(); i++) + CV_CheckTypeEQ(inputs[i], inputs[0], "All input types should be equal"); + outputs.assign(1, inputs[0]); + } + + virtual bool supportBackend(int backendId) CV_OVERRIDE { #ifdef HAVE_TIMVX @@ -273,7 +286,7 @@ public: CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && - inputs_arr.depth() != CV_8S, + (inputs_arr.depth() == CV_32F || inputs_arr.depth() == CV_16F), forward_ocl(inputs_arr, outputs_arr, internals_arr)) std::vector inputs, outputs; @@ -286,7 +299,7 @@ public: if (padding) outMat.setTo(paddingValue); - if( cAxis == 1 && outMat.dims == 4 && !padding) + if(cAxis == 1 && outMat.dims == 4 && !padding && (inputs[0].depth() == CV_32F || inputs[0].depth() == CV_8S)) { int nstripes = getNumThreads(); if (outMat.type() == CV_8S) @@ -325,7 +338,7 @@ public: auto input_wrapper = inputs[0].dynamicCast(); auto concat_axis = normalize_axis(axis, input_wrapper->getRank()); - return make_cuda_node(preferableTarget, std::move(context->stream), concat_axis, padding); + return make_cuda_node_with_type(preferableTarget, inputs[0]->getHostMatDepth(), std::move(context->stream), concat_axis, padding); } #endif diff --git a/modules/dnn/src/layers/const_layer.cpp b/modules/dnn/src/layers/const_layer.cpp index 45d524ae8e..970ca04a89 100644 --- a/modules/dnn/src/layers/const_layer.cpp +++ b/modules/dnn/src/layers/const_layer.cpp @@ -57,6 +57,20 @@ public: return false; } + void getTypes(const std::vector& inputs, + const int requiredOutputs, + const int requiredInternals, + std::vector& outputs, + std::vector& internals) const CV_OVERRIDE + { + if (preferableTarget == DNN_TARGET_OPENCL_FP16 + && blobs[0].type() == CV_32F) + outputs.assign(1, CV_16F); + else + outputs.assign(1, blobs[0].depth()); + } + + #ifdef HAVE_OPENCL bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals) { @@ -171,10 +185,7 @@ public: CV_Assert(blobs.size() == 1); Mat blob = blobs[0]; - if (blob.type() != CV_32F) { - blob.convertTo(blob, CV_32F); - } - return make_cuda_node(preferableTarget, std::move(context->stream), blob); + return make_cuda_node_with_type(preferableTarget, blob.type(), std::move(context->stream), blob); } #endif }; diff --git a/modules/dnn/src/layers/gather_elements_layer.cpp b/modules/dnn/src/layers/gather_elements_layer.cpp index da3ae939df..327c017f64 100644 --- a/modules/dnn/src/layers/gather_elements_layer.cpp +++ b/modules/dnn/src/layers/gather_elements_layer.cpp @@ -57,6 +57,18 @@ public: return false; } + virtual void getTypes(const std::vector& inputs, + const int requiredOutputs, + const int 
requiredInternals, + std::vector& outputs, + std::vector& internals) const CV_OVERRIDE + { + CV_CheckEQ(inputs.size(), (size_t)2, ""); + CV_CheckType(inputs[0], inputs[0] == CV_32F || inputs[0] == CV_32S || inputs[0] == CV_16F || inputs[0] == CV_8U, ""); + CV_CheckType(inputs[1], inputs[1] == CV_64S || inputs[1] == CV_32S, ""); + outputs.assign(1, inputs[0]); + } + virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE { std::vector inputs; inputs_arr.getMatVector(inputs); @@ -70,12 +82,6 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - if (inputs_arr.depth() == CV_16F) - { - forward_fallback(inputs_arr, outputs_arr, internals_arr); - return; - } - std::vector inputs, outputs; inputs_arr.getMatVector(inputs); outputs_arr.getMatVector(outputs); @@ -84,14 +90,14 @@ public: const Mat& indices = inputs[1]; Mat& out = outputs[0]; - typeDispatch(outputs[0].type(), data, indices, out); + indexTypeDispatch(out.type(), indices.type(), data, indices, out); } - template + template void forward_impl(const Mat& data_, const Mat& indices_, Mat& out_) { const auto *ptr_data = data_.ptr(); - const auto *ptr_indices = indices_.ptr(); + const auto *ptr_indices = indices_.ptr(); auto *ptr_out = out_.ptr(); const auto shape_data = shape(data_); @@ -112,12 +118,12 @@ public: if (innermost_axis) { for (int j = 0; j < inner_most_dim; j++) { - int index = static_cast((indices[j] + axis_dim)) % axis_dim; // TODO: Check out-of-range index + int index = (indices[j] + axis_dim) % axis_dim; // TODO: Check out-of-range index out[j] = data[index]; } } else { for (int j = 0; j < inner_most_dim; j++) { - int index = static_cast(indices[j] + axis_dim) % axis_dim; // TODO: Check out-of-range index + int index = (indices[j] + axis_dim) % axis_dim; // TODO: Check out-of-range index out[j] = data[index * axis_step + j]; } } @@ -130,18 +136,37 @@ public: } template + inline void indexTypeDispatch(const int type, const int index_type, Args&&... args) + { + switch (index_type) + { + case CV_32S: + typeDispatch(type, std::forward(args)...); + break; + case CV_64S: + typeDispatch(type, std::forward(args)...); + break; + default: + CV_Error(cv::Error::BadDepth, "Unsupported type."); + }; + } + + template inline void typeDispatch(const int type, Args&&... 
args) { switch (type) { case CV_8U: - forward_impl(std::forward(args)...); + forward_impl(std::forward(args)...); + break; + case CV_16F: + forward_impl(std::forward(args)...); break; case CV_32S: - forward_impl(std::forward(args)...); + forward_impl(std::forward(args)...); break; case CV_32F: - forward_impl(std::forward(args)...); + forward_impl(std::forward(args)...); break; default: CV_Error(cv::Error::BadDepth, "DNN/GatherElements: Unsupported type."); diff --git a/modules/dnn/src/layers/gather_layer.cpp b/modules/dnn/src/layers/gather_layer.cpp index 32b76886a3..4542385deb 100644 --- a/modules/dnn/src/layers/gather_layer.cpp +++ b/modules/dnn/src/layers/gather_layer.cpp @@ -40,6 +40,19 @@ public: return false; } + virtual void getTypes(const std::vector& inputs, + const int requiredOutputs, + const int requiredInternals, + std::vector& outputs, + std::vector& internals) const CV_OVERRIDE + { + CV_CheckEQ(inputs.size(), (size_t)2, ""); + CV_CheckType(inputs[0], inputs[0] == CV_32F || inputs[0] == CV_32S || inputs[0] == CV_16F || inputs[0] == CV_8U, ""); + CV_CheckType(inputs[1], inputs[1] == CV_64S || inputs[1] == CV_32S, ""); + outputs.assign(1, inputs[0]); + } + + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE { CV_TRACE_FUNCTION(); @@ -57,17 +70,15 @@ public: const Mat& inp = inputs[0]; int indicesType = inputs[1].type(); - CV_CheckType(indicesType, indicesType == CV_32FC1 || indicesType == CV_16FC1, ""); + CV_CheckType(indicesType, indicesType == CV_32SC1 || indicesType == CV_64SC1, ""); Mat indices32S; - if (indicesType == CV_16F/*FP16*/) + if (indicesType == CV_64SC1) { - Mat indicesF32; - inputs[1].convertTo(indicesF32, CV_32F); - indicesF32.convertTo(indices32S, CV_32S); + inputs[1].convertTo(indices32S, CV_32S); } else { - inputs[1].convertTo(indices32S, CV_32S); + indices32S = inputs[1]; } const size_t indices_total = indices32S.total(); indices32S = indices32S.reshape(1, indices_total); diff --git a/modules/dnn/src/layers/max_unpooling_layer.cpp b/modules/dnn/src/layers/max_unpooling_layer.cpp index f4c3d8cb8b..fe359ef293 100644 --- a/modules/dnn/src/layers/max_unpooling_layer.cpp +++ b/modules/dnn/src/layers/max_unpooling_layer.cpp @@ -68,17 +68,24 @@ public: return false; } + virtual void getTypes(const std::vector& inputs, + const int requiredOutputs, + const int requiredInternals, + std::vector& outputs, + std::vector& internals) const CV_OVERRIDE + { + CV_CheckGE(inputs.size(), (size_t)2, ""); + CV_CheckType(inputs[0], inputs[0] == CV_32F || inputs[0] == CV_16F, ""); + CV_CheckType(inputs[1], inputs[1] == CV_64S || inputs[1] == CV_32S, ""); + outputs.assign(1, inputs[0]); + } + + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE { CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - if (inputs_arr.depth() == CV_16F) - { - forward_fallback(inputs_arr, outputs_arr, internals_arr); - return; - } - std::vector inputs, outputs; inputs_arr.getMatVector(inputs); outputs_arr.getMatVector(outputs); @@ -87,6 +94,19 @@ public: Mat& input = inputs[0]; Mat& indices = inputs[1]; + if (input.type() == CV_32F && indices.type() == CV_32S) + run(input, indices, outputs); + else if (input.type() == CV_32F && indices.type() == CV_64S) + run(input, indices, outputs); + else if (input.type() == CV_16F && indices.type() == CV_32S) + run(input, indices, outputs); + else if (input.type() == CV_16F && indices.type() == CV_64S) + 
run(input, indices, outputs); + } + + template + void run(cv::Mat& input, cv::Mat& indices, std::vector& outputs) + { CV_Assert(input.total() == indices.total()); CV_Assert(input.size[0] == 1); CV_Assert(input.isContinuous()); @@ -102,9 +122,9 @@ public: { Mat outPlane = getPlane(outBlob, 0, i_c); int wh_area = input.size[2]*input.size[3]; - const float* inptr = input.ptr(0, i_c); - const float* idxptr = indices.ptr(0, i_c); - float* outptr = outPlane.ptr(); + const T* inptr = input.ptr(0, i_c); + const INDEX_TYPE* idxptr = indices.ptr(0, i_c); + T* outptr = outPlane.ptr(); for(int i_wh = 0; i_wh < wh_area; i_wh++) { @@ -112,8 +132,8 @@ public: if (!(0 <= index && index < outPlaneTotal)) { CV_LOG_ERROR(NULL, cv::format( - "i_n=%d\ni_c=%d\ni_wh=%d\nindex=%d\nmaxval=%lf\noutPlaneTotal=%d\n", - i_n, i_c, i_wh, index, inptr[i_wh], outPlaneTotal)); + "i_n=%d\ni_c=%d\ni_wh=%d\nindex=%d\noutPlaneTotal=%d\n", + i_n, i_c, i_wh, index, outPlaneTotal)); CV_LOG_ERROR(NULL, "input.size=" << input.size); CV_LOG_ERROR(NULL, "indices.size=" << indices.size); CV_LOG_ERROR(NULL, "outBlob=" << outBlob.size); @@ -125,6 +145,7 @@ public: } } + #ifdef HAVE_CUDA Ptr initCUDA( void *context_, @@ -150,7 +171,16 @@ public: pads_begin[0] = poolPad.height; pads_begin[1] = poolPad.width; - return make_cuda_node(preferableTarget, std::move(context->stream), config); + int indicesType = inputs[1]->getHostMatDepth(); + CV_CheckType(indicesType, indicesType == CV_32S || indicesType == CV_64S, "Unsupported indices type"); + + if (indicesType == CV_32S) + return make_cuda_node_with_indices(preferableTarget, inputs[0]->getHostMatDepth(), std::move(context->stream), config); + else if (indicesType == CV_64S) + return make_cuda_node_with_indices(preferableTarget, inputs[0]->getHostMatDepth(), std::move(context->stream), config); + + CV_Error(Error::BadDepth, "Unsupported indices type"); + return Ptr(); } #endif diff --git a/modules/dnn/src/layers/nary_eltwise_layers.cpp b/modules/dnn/src/layers/nary_eltwise_layers.cpp index 70fb4a35e7..f0cce639e1 100644 --- a/modules/dnn/src/layers/nary_eltwise_layers.cpp +++ b/modules/dnn/src/layers/nary_eltwise_layers.cpp @@ -349,6 +349,28 @@ public: return false; } + void getTypes(const std::vector& inputs, + const int requiredOutputs, + const int requiredInternals, + std::vector& outputs, + std::vector& internals) const CV_OVERRIDE + { + CV_Assert(inputs.size()); + for (auto input : inputs) + { + CV_CheckTypeEQ(inputs[0], input, "All inputs should have equal types"); + if (preferableTarget == DNN_TARGET_CUDA_FP16 || preferableTarget == DNN_TARGET_CUDA) + CV_CheckType(input, input == CV_32F || input == CV_32S || input == CV_64S, "Unsupported type"); + else if (preferableTarget == DNN_TARGET_OPENCL_FP16) + CV_CheckType(input, input == CV_16F || input == CV_8S || input == CV_8U || input == CV_32S || input == CV_64S, ""); + else + CV_CheckType(input, input == CV_32F || input == CV_8S || input == CV_8U || input == CV_32S || input == CV_64S, ""); + } + + outputs.assign(requiredOutputs, inputs[0]); + } + + template void binary_forward_impl( int ndims, const std::vector& shape, @@ -773,11 +795,17 @@ public: helper.reInit(sizeof(uint8_t)); opDispatch(std::forward(args)...); break; + case CV_8S: + opDispatch(std::forward(args)...); + break; case CV_32S: // TODO: integrate with type inference helper.reInit(sizeof(int32_t)); opDispatch(std::forward(args)...); break; + case CV_64S: + opDispatch(std::forward(args)...); + break; case CV_32F: CV_Assert(op != OPERATION::BITSHIFT && op != OPERATION::AND && 
op != OPERATION::OR && op != OPERATION::XOR); @@ -829,7 +857,7 @@ public: default: return Ptr(); // return empty cuda_node if the EltwiseOpType is unsupported type. }; - return make_cuda_node(preferableTarget, std::move(context->stream), op_, std::vector()); + return make_cuda_node_with_type(preferableTarget, inputs[0]->getHostMatDepth(), std::move(context->stream), op_, std::vector()); } #endif diff --git a/modules/dnn/src/layers/permute_layer.cpp b/modules/dnn/src/layers/permute_layer.cpp index 824e8d56a2..df2706b8f0 100644 --- a/modules/dnn/src/layers/permute_layer.cpp +++ b/modules/dnn/src/layers/permute_layer.cpp @@ -178,6 +178,24 @@ public: return false; } + void getTypes(const std::vector& inputs, + const int requiredOutputs, + const int requiredInternals, + std::vector& outputs, + std::vector& internals) const CV_OVERRIDE + { + CV_Assert(inputs.size()); + for (auto input : inputs) + if (preferableTarget == DNN_TARGET_CUDA_FP16 || preferableTarget == DNN_TARGET_CUDA) + CV_CheckTypeEQ(input, CV_32F, "Unsupported type"); + else if (preferableTarget == DNN_TARGET_OPENCL_FP16) + CV_CheckType(input, input == CV_16F || input == CV_8S || input == CV_32S, ""); + else + CV_CheckType(input, input == CV_32F || input == CV_8S || input == CV_32S, ""); + + outputs.assign(requiredOutputs, inputs[0]); + } + void computeStrides(const MatShape &shapeBefore, const MatShape &shapeAfter) { _oldStride.resize(_numAxes); @@ -347,7 +365,7 @@ public: CV_TRACE_ARG_VALUE(name, "name", name.c_str()); CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && - inputs_arr.depth() != CV_8S, + inputs_arr.depth() != CV_8S && inputs_arr.depth() != CV_32S, forward_ocl(inputs_arr, outputs_arr, internals_arr)) if (inputs_arr.depth() == CV_16F) diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp index 2a9981b5ae..6984c41bf8 100644 --- a/modules/dnn/src/layers/pooling_layer.cpp +++ b/modules/dnn/src/layers/pooling_layer.cpp @@ -320,11 +320,25 @@ public: CV_Assert_N(inputs.size() == 1, !outputs.empty(), !computeMaxIdx || outputs.size() == 2); UMat& inpMat = inputs[0]; UMat& outMat = outputs[0]; - UMat maskMat = computeMaxIdx ? outputs[1] : UMat(); + UMat maskMat; + if (computeMaxIdx) + maskMat.create(shape(outputs[1]), use_half ? CV_16F : CV_32F); CV_Assert(inpMat.offset == 0 && outMat.offset == 0); - return poolOp->Forward(inpMat, outMat, maskMat); + bool result = poolOp->Forward(inpMat, outMat, maskMat); + + if (computeMaxIdx) { + if (use_half) { + UMat maskMat32F; + maskMat.convertTo(maskMat32F, CV_32F); + maskMat32F.convertTo(outputs[1], CV_64S); + } + else + maskMat.convertTo(outputs[1], CV_64S); + } + + return result; } #endif @@ -353,8 +367,12 @@ public: case MAX: { CV_Assert_N(inputs.size() == 1, !computeMaxIdx || outputs.size() == 2); - Mat mask = computeMaxIdx ? 
outputs[1] : Mat(); + Mat mask; + if (computeMaxIdx) + mask.create(shape(outputs[1]), CV_32F); maxPooling(inputs[0], outputs[0], mask); + if (computeMaxIdx) + mask.convertTo(outputs[1], CV_64S); break; } case AVE: case SUM: @@ -413,7 +431,16 @@ public: config.input_shape.assign(std::begin(input_shape), std::end(input_shape)); - return make_cuda_node(preferableTarget, std::move(context->stream), config); + int indicesType = outputs[1]->getHostMatDepth(); + CV_CheckType(indicesType, indicesType == CV_32S || indicesType == CV_64S, "Unsupported indices type"); + + if (indicesType == CV_32S) + return make_cuda_node_with_indices(preferableTarget, inputs[0]->getHostMatDepth(), std::move(context->stream), config); + else if (indicesType == CV_64S) + return make_cuda_node_with_indices(preferableTarget, inputs[0]->getHostMatDepth(), std::move(context->stream), config); + + CV_Error(Error::BadDepth, "Unsupported indices type"); + return Ptr(); } if (input_shape.size() == 3) @@ -1251,6 +1278,26 @@ public: return false; } + virtual void getTypes(const std::vector& inputs, + const int requiredOutputs, + const int requiredInternals, + std::vector& outputs, + std::vector& internals) const CV_OVERRIDE + { + CV_Assert(inputs.size()); + if (preferableTarget == DNN_TARGET_CUDA_FP16 || preferableTarget == DNN_TARGET_CUDA) + CV_CheckTypeEQ(inputs[0], CV_32F, "Unsupported type"); + else if (preferableTarget == DNN_TARGET_OPENCL_FP16) + CV_CheckType(inputs[0], inputs[0] == CV_16F || inputs[0] == CV_8S, ""); + else + CV_CheckType(inputs[0], inputs[0] == CV_32F || inputs[0] == CV_8S, ""); + + outputs.push_back(inputs[0]); + if (type == MAX && requiredOutputs == 2) { + outputs.push_back(CV_64S); + } + } + bool updateMemoryShapes(const std::vector &inputs) CV_OVERRIDE { int dims = inputs[0].size(); diff --git a/modules/dnn/src/layers/reorg_layer.cpp b/modules/dnn/src/layers/reorg_layer.cpp index 52c18f0723..489234ae1e 100644 --- a/modules/dnn/src/layers/reorg_layer.cpp +++ b/modules/dnn/src/layers/reorg_layer.cpp @@ -101,6 +101,24 @@ public: return false; } + void getTypes(const std::vector& inputs, + const int requiredOutputs, + const int requiredInternals, + std::vector& outputs, + std::vector& internals) const CV_OVERRIDE + { + CV_Assert(inputs.size()); + for (auto input : inputs) + if (preferableTarget == DNN_TARGET_CUDA_FP16 || preferableTarget == DNN_TARGET_CUDA) + CV_CheckTypeEQ(input, CV_32F, "Unsupported type for CUDA"); + else if (preferableTarget == DNN_TARGET_OPENCL_FP16) + CV_CheckType(input, input == CV_16F || input == CV_8S || input == CV_32S || input == CV_64S, ""); + else + CV_CheckType(input, input == CV_32F || input == CV_8S || input == CV_32S || input == CV_64S, ""); + + outputs.assign(requiredOutputs, inputs[0]); + } + virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE { std::vector inputs, outputs; @@ -181,7 +199,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget), + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && inputs_arr.depth() != CV_32S && inputs_arr.depth() != CV_64S, forward_ocl(inputs_arr, outputs_arr, internals_arr)) if (inputs_arr.depth() == CV_16F) diff --git a/modules/dnn/src/layers/reshape_layer.cpp b/modules/dnn/src/layers/reshape_layer.cpp index 9d7255ba4e..2e35f050fc 100644 --- a/modules/dnn/src/layers/reshape_layer.cpp +++ b/modules/dnn/src/layers/reshape_layer.cpp @@ -259,6 +259,25 @@ public: return true; } + void getTypes(const std::vector& 
inputs, + const int requiredOutputs, + const int requiredInternals, + std::vector& outputs, + std::vector& internals) const CV_OVERRIDE + { + CV_Assert(inputs.size()); + for (auto input : inputs) + if (preferableTarget == DNN_TARGET_CUDA_FP16 || preferableTarget == DNN_TARGET_CUDA) + CV_CheckTypeEQ(input, CV_32F, "Unsupported type"); + else if (preferableTarget == DNN_TARGET_OPENCL_FP16) + CV_CheckType(input, input == CV_16F || input == CV_8S || input == CV_32S || input == CV_64S, ""); + else + CV_CheckType(input, input == CV_32F || input == CV_8S || input == CV_32S || input == CV_64S, ""); + + outputs.assign(requiredOutputs, inputs[0]); + } + + bool updateMemoryShapes(const std::vector &inputs) CV_OVERRIDE { if (hasDynamicShapes) @@ -312,7 +331,7 @@ public: CV_TRACE_FUNCTION(); CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget), + CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) && inputs_arr.depth() != CV_32S && inputs_arr.depth() != CV_64S, forward_ocl(inputs_arr, outputs_arr, internals_arr)) std::vector inputs, outputs; diff --git a/modules/dnn/src/layers/scatterND_layer.cpp b/modules/dnn/src/layers/scatterND_layer.cpp index 64ddcd0c4f..f9dcb41647 100644 --- a/modules/dnn/src/layers/scatterND_layer.cpp +++ b/modules/dnn/src/layers/scatterND_layer.cpp @@ -69,6 +69,19 @@ public: return false; } + virtual void getTypes(const std::vector& inputs, + const int requiredOutputs, + const int requiredInternals, + std::vector& outputs, + std::vector& internals) const CV_OVERRIDE + { + CV_CheckEQ(inputs.size(), (size_t)3, ""); + CV_CheckType(inputs[0], inputs[0] == CV_32F || inputs[0] == CV_32S || inputs[0] == CV_16F || inputs[0] == CV_8U, ""); + CV_CheckType(inputs[1], inputs[1] == CV_64S || inputs[1] == CV_32S, ""); + CV_CheckTypeEQ(inputs[2], inputs[0], ""); + outputs.assign(1, inputs[0]); + } + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE { CV_TRACE_FUNCTION(); @@ -88,12 +101,12 @@ public: const Mat& updates = inputs[2]; Mat& out = outputs[0]; - typeDispatch(outputs[0].type(), data, indices, updates, out); + indexTypeDispatch(outputs[0].type(), indices.type(), data, indices, updates, out); } // NOTE: This impl does not check whether indices have duplicate entries. // The last duplicate entry will overwrite the previous. - template + template void forward_impl(const Functor &reduce_operation, const Mat &input_mat, const Mat &indices_mat, const Mat &updates_mat, Mat& output_mat) { input_mat.copyTo(output_mat); @@ -120,14 +133,14 @@ public: indices_offset = r.start * indices_last_dim, updates_offset = r.start * updates_size; for (int i = r.start; i < r.end; i++) { - const T* indices = indices_mat.ptr(); + const T_INDEX* indices = indices_mat.ptr(); const T* updates = updates_mat.ptr(); T* output = output_mat.ptr(); input_offset = 0; indices += indices_offset; for (int j = 0; j < indices_last_dim; j++) { - int index = static_cast(*(indices + j)); + int index = *(indices + j); index = (index + input_mat_shape[j]) % input_mat_shape[j]; CV_Assert(index < input_mat_shape[j] && index >= 0); input_offset += index * input_mat_step[j]; @@ -150,25 +163,42 @@ public: } template + inline void indexTypeDispatch(const int type, const int index_type, Args&&... 
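To make the ScatterND change concrete, here is a standalone sketch of the 1-D case of the operation (plain C++, not using the layer itself): with this patch the indices are consumed directly as int64 instead of being converted through float first.

    #include <cassert>
    #include <cstdint>
    #include <utility>
    #include <vector>

    // 1-D scatter with REDUCTION::NONE semantics (last write wins).
    static void scatter1D(std::vector<float> data,                 // copied: acts as the output
                          const std::vector<int64_t>& indices,
                          const std::vector<float>& updates,
                          std::vector<float>& out)
    {
        out = std::move(data);
        for (size_t i = 0; i < indices.size(); ++i)
        {
            int64_t idx = indices[i];
            if (idx < 0)
                idx += (int64_t)out.size();            // negative indices wrap, as in the layer
            assert(idx >= 0 && idx < (int64_t)out.size());
            out[(size_t)idx] = updates[i];
        }
    }
    // scatter1D({1, 2, 3, 4}, {3, 1}, {10, 20}, out)  ->  out == {1, 20, 3, 10}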
args) + { + switch (index_type) + { + case CV_32S: + typeDispatch(type, std::forward(args)...); + break; + case CV_64S: + typeDispatch(type, std::forward(args)...); + break; + default: + CV_Error(cv::Error::BadDepth, "Unsupported type."); + }; + } + + + template inline void typeDispatch(const int type, Args&&... args) { switch (type) { case CV_8U: - reductionDispatch(std::forward(args)...); + reductionDispatch(std::forward(args)...); break; case CV_32S: - reductionDispatch(std::forward(args)...); + reductionDispatch(std::forward(args)...); break; case CV_32F: - reductionDispatch(std::forward(args)...); + reductionDispatch(std::forward(args)...); break; default: CV_Error(cv::Error::BadDepth, "Unsupported type."); }; } - template + template inline void reductionDispatch(Args&&... args) { switch (reduction) @@ -176,31 +206,31 @@ public: case REDUCTION::NONE: { auto rd = [](const T& a, const T& b) { return b; }; // a from input data, b from updates - forward_impl(rd, std::forward(args)...); + forward_impl(rd, std::forward(args)...); break; } case REDUCTION::ADD: { auto rd = [](const T& a, const T& b) { return a + b; }; - forward_impl(rd, std::forward(args)...); + forward_impl(rd, std::forward(args)...); break; } case REDUCTION::MUL: { auto rd = [](const T& a, const T& b) { return a * b; }; - forward_impl(rd, std::forward(args)...); + forward_impl(rd, std::forward(args)...); break; } case REDUCTION::MAX: { auto rd = [](const T& a, const T& b) { return std::max(a, b); }; - forward_impl(rd, std::forward(args)...); + forward_impl(rd, std::forward(args)...); break; } case REDUCTION::MIN: { auto rd = [](const T& a, const T& b) { return std::min(a, b); }; - forward_impl(rd, std::forward(args)...); + forward_impl(rd, std::forward(args)...); break; } default: diff --git a/modules/dnn/src/layers/scatter_layer.cpp b/modules/dnn/src/layers/scatter_layer.cpp index b4bcdee82e..e0e6f630b6 100644 --- a/modules/dnn/src/layers/scatter_layer.cpp +++ b/modules/dnn/src/layers/scatter_layer.cpp @@ -63,6 +63,19 @@ public: return false; } + virtual void getTypes(const std::vector& inputs, + const int requiredOutputs, + const int requiredInternals, + std::vector& outputs, + std::vector& internals) const CV_OVERRIDE + { + CV_CheckEQ(inputs.size(), (size_t)3, ""); + CV_CheckType(inputs[0], inputs[0] == CV_32F || inputs[0] == CV_32S || inputs[0] == CV_16F || inputs[0] == CV_8U, ""); + CV_CheckType(inputs[1], inputs[1] == CV_64S || inputs[1] == CV_32S, ""); + CV_CheckTypeEQ(inputs[2], inputs[0], ""); + outputs.assign(1, inputs[0]); + } + void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE { CV_TRACE_FUNCTION(); @@ -82,10 +95,10 @@ public: const Mat& updates = inputs[2]; Mat& out = outputs[0]; - typeDispatch(outputs[0].type(), data, indices, updates, out); + indexTypeDispatch(outputs[0].type(), indices.type(), data, indices, updates, out); } - template + template void forward_impl(const Functor &reduce_operation, const Mat &input_mat, const Mat &indices_mat, const Mat &updates_mat, Mat &output_mat) { input_mat.copyTo(output_mat); @@ -99,7 +112,7 @@ public: for (int i = 0; i < ndims; i++) { input_mat_step[i] = static_cast(input_mat.step.p[i] / sizeof(T)); - indices_mat_step[i] = static_cast(indices_mat.step.p[i] / sizeof(T)); + indices_mat_step[i] = static_cast(indices_mat.step.p[i] / sizeof(T_INDEX)); } auto fn = [&](const Range &r) { @@ -108,7 +121,7 @@ public: int indices_index, index; size_t axis_offset, tmp_index, j_index; for (int i = r.start; i < 
r.end; i++) { - const T* indices = indices_mat.ptr(); + const T_INDEX* indices = indices_mat.ptr(); const T* updates = updates_mat.ptr(); T* output = output_mat.ptr(); @@ -128,7 +141,7 @@ public: } // get index and overwrite current indices - index = static_cast(*(indices + indices_offset)); + index = *(indices + indices_offset); index = (index + input_mat_shape[axis]) % input_mat_shape[axis]; CV_Assert(index < input_mat_shape[axis] && index >= 0); input_offset = input_offset - axis_offset + index * input_mat_step[axis]; @@ -145,25 +158,42 @@ public: } template + inline void indexTypeDispatch(const int type, const int index_type, Args&&... args) + { + switch (index_type) + { + case CV_32S: + typeDispatch(type, std::forward(args)...); + break; + case CV_64S: + typeDispatch(type, std::forward(args)...); + break; + default: + CV_Error(cv::Error::BadDepth, "Unsupported type."); + }; + } + + + template inline void typeDispatch(const int type, Args&&... args) { switch (type) { case CV_8U: - reductionDispatch(std::forward(args)...); + reductionDispatch(std::forward(args)...); break; case CV_32S: - reductionDispatch(std::forward(args)...); + reductionDispatch(std::forward(args)...); break; case CV_32F: - reductionDispatch(std::forward(args)...); + reductionDispatch(std::forward(args)...); break; default: CV_Error(cv::Error::BadDepth, "Unsupported type."); }; } - template + template inline void reductionDispatch(Args&&... args) { switch (reduction) @@ -171,31 +201,31 @@ public: case REDUCTION::NONE: { auto rd = [](const T& a, const T& b) { return b; }; // a from input data, b from updates - forward_impl(rd, std::forward(args)...); + forward_impl(rd, std::forward(args)...); break; } case REDUCTION::ADD: { auto rd = [](const T& a, const T& b) { return a + b; }; - forward_impl(rd, std::forward(args)...); + forward_impl(rd, std::forward(args)...); break; } case REDUCTION::MUL: { auto rd = [](const T& a, const T& b) { return a * b; }; - forward_impl(rd, std::forward(args)...); + forward_impl(rd, std::forward(args)...); break; } case REDUCTION::MAX: { auto rd = [](const T& a, const T& b) { return std::max(a, b); }; - forward_impl(rd, std::forward(args)...); + forward_impl(rd, std::forward(args)...); break; } case REDUCTION::MIN: { auto rd = [](const T& a, const T& b) { return std::min(a, b); }; - forward_impl(rd, std::forward(args)...); + forward_impl(rd, std::forward(args)...); break; } default: diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp index 8b082f09da..d1d77631f4 100644 --- a/modules/dnn/src/layers/slice_layer.cpp +++ b/modules/dnn/src/layers/slice_layer.cpp @@ -278,6 +278,25 @@ public: return false; } + void getTypes(const std::vector& inputs, + const int requiredOutputs, + const int requiredInternals, + std::vector& outputs, + std::vector& internals) const CV_OVERRIDE + { + CV_CheckEQ(inputs.size(), (size_t)1, ""); + for (auto input : inputs) + if (preferableTarget == DNN_TARGET_CUDA_FP16 || preferableTarget == DNN_TARGET_CUDA) + CV_CheckEQ(input, CV_32F, "Unsupported type"); + else if (preferableTarget == DNN_TARGET_OPENCL_FP16) + CV_CheckType(input, input == CV_16F || input == CV_8S || input == CV_32S || input == CV_64S, ""); + else + CV_CheckType(input, input == CV_32F || input == CV_8S || input == CV_32S || input == CV_64S, ""); + + outputs.assign(requiredOutputs, inputs[0]); + } + + bool updateMemoryShapes(const std::vector &inputs) CV_OVERRIDE { shapesInitialized = true; @@ -596,13 +615,14 @@ public: CV_TRACE_FUNCTION(); 
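The Scatter and ScatterND changes above share a two-level dispatch: the index depth selects T_INDEX first, then the data depth selects T, so the inner kernel is instantiated once per (data type, index type) pair. A condensed sketch with hypothetical names (kernel, dataDispatch, dispatch):

    #include <opencv2/core.hpp>
    #include <cstdint>
    #include <utility>
    using namespace cv;

    template <typename T, typename T_INDEX>
    static void kernel(const Mat& data, const Mat& indices)
    {
        CV_UNUSED(data); CV_UNUSED(indices);           // real per-type work goes here
    }

    template <typename T_INDEX, typename... Args>
    static void dataDispatch(int dataDepth, Args&&... args)
    {
        switch (dataDepth)
        {
            case CV_8U:  kernel<uint8_t, T_INDEX>(std::forward<Args>(args)...); break;
            case CV_32S: kernel<int32_t, T_INDEX>(std::forward<Args>(args)...); break;
            case CV_32F: kernel<float,   T_INDEX>(std::forward<Args>(args)...); break;
            default:     CV_Error(Error::BadDepth, "Unsupported data type");
        }
    }

    static void dispatch(int dataDepth, int indexDepth, const Mat& data, const Mat& indices)
    {
        switch (indexDepth)
        {
            case CV_32S: dataDispatch<int32_t>(dataDepth, data, indices); break;
            case CV_64S: dataDispatch<int64_t>(dataDepth, data, indices); break;
            default:     CV_Error(Error::BadDepth, "Unsupported index type");
        }
    }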
CV_TRACE_ARG_VALUE(name, "name", name.c_str()); - CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget), - forward_ocl(inputs_arr, outputs_arr, internals_arr)) - std::vector inputs, outputs; inputs_arr.getMatVector(inputs); outputs_arr.getMatVector(outputs); + CV_OCL_RUN((IS_DNN_OPENCL_TARGET(preferableTarget) && + (outputs[0].type() != CV_32S && outputs[0].type() != CV_64S)), + forward_ocl(inputs_arr, outputs_arr, internals_arr)) + const Mat& inpMat = inputs[0]; CV_Assert(outputs.size() == finalSliceRanges.size()); @@ -621,7 +641,11 @@ public: { std::vector inpIdx(dimsNum, 0); std::vector outIdx(dimsNum, 0); - if (inpMat.type() == CV_16F) + if (inpMat.type() == CV_32S) + getSliceRecursive(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx); + else if (inpMat.type() == CV_64S) + getSliceRecursive(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx); + else if (inpMat.type() == CV_16F) getSliceRecursive(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx); else if (inpMat.type() == CV_8S) getSliceRecursive(inpMat, inpIdx, finalSliceRanges[i], sliceSteps[i], 0, dimsNum, outputs[i], outIdx); @@ -876,6 +900,25 @@ public: return false; } + void getTypes(const std::vector& inputs, + const int requiredOutputs, + const int requiredInternals, + std::vector& outputs, + std::vector& internals) const CV_OVERRIDE + { + CV_CheckEQ(inputs.size(), (size_t)2, ""); + for (auto input : inputs) + if (preferableTarget == DNN_TARGET_CUDA_FP16 || preferableTarget == DNN_TARGET_CUDA) + CV_CheckTypeEQ(input, CV_32F, "Unsupported type"); + else if (preferableTarget == DNN_TARGET_OPENCL_FP16) + CV_CheckType(input, input == CV_16F || input == CV_8S || input == CV_32S || input == CV_64S, ""); + else + CV_CheckType(input, input == CV_32F || input == CV_8S || input == CV_32S || input == CV_64S, ""); + + outputs.assign(requiredOutputs, inputs[0]); + } + + void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE { std::vector inputs; diff --git a/modules/dnn/src/legacy_backend.cpp b/modules/dnn/src/legacy_backend.cpp index 6f216f75fc..f3eb288bc3 100644 --- a/modules/dnn/src/legacy_backend.cpp +++ b/modules/dnn/src/legacy_backend.cpp @@ -90,14 +90,21 @@ Ptr wrapMat(int backendId, int targetId, cv::Mat& m) CV_Assert(haveCUDA()); #ifdef HAVE_CUDA - switch (targetId) + CV_CheckType(m.depth(), m.depth() == CV_32F || m.depth() == CV_32S || m.depth() == CV_64S, "Unsupported type for CUDA"); + CV_Assert(IS_DNN_CUDA_TARGET(targetId)); + switch (m.depth()) { - case DNN_TARGET_CUDA: - return CUDABackendWrapperFP32::create(m); - case DNN_TARGET_CUDA_FP16: - return CUDABackendWrapperFP16::create(m); + case CV_32F: + if (targetId == DNN_TARGET_CUDA_FP16) + return CUDABackendWrapperFP16::create(m); + else + return CUDABackendWrapperFP32::create(m); + case CV_32S: + return CUDABackendWrapperINT32::create(m); + case CV_64S: + return CUDABackendWrapperINT64::create(m); default: - CV_Assert(IS_DNN_CUDA_TARGET(targetId)); + CV_Error(Error::BadDepth, "Unsupported mat type for CUDA"); } #endif } diff --git a/modules/dnn/src/legacy_backend.hpp b/modules/dnn/src/legacy_backend.hpp index e9ca3fecb3..3709c1f7f2 100644 --- a/modules/dnn/src/legacy_backend.hpp +++ b/modules/dnn/src/legacy_backend.hpp @@ -237,6 +237,10 @@ public: const ShapesVec &outShapes = layerShapes.out, internalShapes = layerShapes.internal; + const TypesVec &outTypes = layerShapes.outTypes, + &internalTypes = layerShapes.internalTypes; + CV_CheckEQ(outShapes.size(), 
outTypes.size(), "Numbers shapes and types shoud be equal"); + CV_CheckEQ(internalShapes.size(), internalTypes.size(), "Numbers shapes and types shoud be equal"); outputBlobs.resize(std::max((size_t)1, outShapes.size())); // layer produce at least one output blob internalBlobs.resize(internalShapes.size()); @@ -257,7 +261,9 @@ public: } ShapesVec shapes(outShapes); + TypesVec types(outTypes); shapes.insert(shapes.end(), internalShapes.begin(), internalShapes.end()); + types.insert(types.end(), internalTypes.begin(), internalTypes.end()); std::vector blobs; for (int i = 0; i < outputBlobs.size(); i++) { @@ -292,12 +298,13 @@ public: LayerPin blobPin(ld.id, index); if (index < outShapes.size() && inPlace) { - CV_Assert(ld.inputBlobs[0]->total() == total(shapes[index])); + CV_CheckEQ((int)ld.inputBlobs[0]->total(), total(shapes[index]), ""); + CV_CheckTypeEQ(ld.inputBlobs[0]->type(), types[index], "blob can't be reused if it has different type"); ld.outputBlobs[index] = ld.inputBlobs[0]->reshape(1, shapes[index]); reuse(ld.inputBlobsId[0], blobPin); } else - reuseOrCreate(shapes[index], blobPin, *blobs[index], ld.dtype); + reuseOrCreate(shapes[index], blobPin, *blobs[index], types[index]); } } } diff --git a/modules/dnn/src/model.cpp b/modules/dnn/src/model.cpp index bc8e2ebe33..5c261f84ed 100644 --- a/modules/dnn/src/model.cpp +++ b/modules/dnn/src/model.cpp @@ -48,7 +48,7 @@ public: outNames = net.getUnconnectedOutLayersNames(); std::vector inLayerShapes; std::vector outLayerShapes; - net.getLayerShapes(MatShape(), 0, inLayerShapes, outLayerShapes); + net.getLayerShapes(MatShape(), CV_32F, 0, inLayerShapes, outLayerShapes); if (!inLayerShapes.empty() && inLayerShapes[0].size() == 4) size = Size(inLayerShapes[0][3], inLayerShapes[0][2]); else diff --git a/modules/dnn/src/net.cpp b/modules/dnn/src/net.cpp index ad1445425f..1e7a25b29c 100644 --- a/modules/dnn/src/net.cpp +++ b/modules/dnn/src/net.cpp @@ -234,68 +234,77 @@ std::vector Net::getUnconnectedOutLayersNames() const } void Net::getLayersShapes(const ShapesVec& netInputShapes, + const TypesVec& netInputTypes, std::vector& layersIds, std::vector& inLayersShapes, std::vector& outLayersShapes) const { CV_Assert(impl); - return impl->getLayersShapes(netInputShapes, layersIds, inLayersShapes, outLayersShapes); + return impl->getLayersShapes(netInputShapes, netInputTypes, layersIds, inLayersShapes, outLayersShapes); } void Net::getLayersShapes(const MatShape& netInputShape, + const MatType& netInputType, std::vector& layerIds, std::vector& inLayersShapes, std::vector& outLayersShapes) const { getLayersShapes(ShapesVec(1, netInputShape), + TypesVec(1, netInputType), layerIds, inLayersShapes, outLayersShapes); } void Net::getLayerShapes(const MatShape& netInputShape, + const MatType& netInputType, const int layerId, ShapesVec& inLayerShapes, ShapesVec& outLayerShapes) const { - getLayerShapes(ShapesVec(1, netInputShape), + getLayerShapes(ShapesVec(1, netInputShape), TypesVec(1, netInputType), layerId, inLayerShapes, outLayerShapes); } void Net::getLayerShapes(const ShapesVec& netInputShapes, + const TypesVec& netInputTypes, const int layerId, ShapesVec& inLayerShapes, ShapesVec& outLayerShapes) const { CV_Assert(impl); LayerShapes shapes; - impl->getLayerShapes(netInputShapes, layerId, shapes); + impl->getLayerShapes(netInputShapes, netInputTypes, layerId, shapes); inLayerShapes = shapes.in; outLayerShapes = shapes.out; } -int64 Net::getFLOPS(const std::vector& netInputShapes) const +int64 Net::getFLOPS(const std::vector& netInputShapes, const 
std::vector& netInputTypes) const { CV_TRACE_FUNCTION(); CV_Assert(impl); - return impl->getFLOPS(netInputShapes); + return impl->getFLOPS(netInputShapes, netInputTypes); } -int64 Net::getFLOPS(const MatShape& netInputShape) const +int64 Net::getFLOPS(const MatShape& netInputShape, const MatType& netInputType) const { - return getFLOPS(std::vector(1, netInputShape)); + return getFLOPS(std::vector(1, netInputShape), + std::vector(1, netInputType)); } int64 Net::getFLOPS(const int layerId, - const std::vector& netInputShapes) const + const std::vector& netInputShapes, + const std::vector& netInputTypes) const { CV_TRACE_FUNCTION(); CV_Assert(impl); - return impl->getFLOPS(layerId, netInputShapes); + return impl->getFLOPS(layerId, netInputShapes, netInputTypes); } int64 Net::getFLOPS(const int layerId, - const MatShape& netInputShape) const + const MatShape& netInputShape, + const MatType& netInputType) const { - return getFLOPS(layerId, std::vector(1, netInputShape)); + return getFLOPS(layerId, std::vector(1, netInputShape), + std::vector(1, netInputType)); } void Net::getLayerTypes(std::vector& layersTypes) const @@ -314,50 +323,59 @@ int Net::getLayersCount(const String& layerType) const void Net::getMemoryConsumption(const int layerId, const std::vector& netInputShapes, + const std::vector& netInputTypes, size_t& weights, size_t& blobs) const { CV_TRACE_FUNCTION(); CV_Assert(impl); - return impl->getMemoryConsumption(layerId, netInputShapes, weights, blobs); + return impl->getMemoryConsumption(layerId, netInputShapes, netInputTypes, weights, blobs); } void Net::getMemoryConsumption(const std::vector& netInputShapes, + const std::vector& netInputTypes, size_t& weights, size_t& blobs) const { CV_TRACE_FUNCTION(); CV_Assert(impl); - return impl->getMemoryConsumption(netInputShapes, weights, blobs); + return impl->getMemoryConsumption(netInputShapes, netInputTypes, weights, blobs); } void Net::getMemoryConsumption(const int layerId, const MatShape& netInputShape, + const MatType& netInputType, size_t& weights, size_t& blobs) const { getMemoryConsumption(layerId, std::vector(1, netInputShape), - weights, blobs); + std::vector(1, netInputType), + weights, blobs); } void Net::getMemoryConsumption(const MatShape& netInputShape, + const MatType& netInputType, size_t& weights, size_t& blobs) const { getMemoryConsumption(std::vector(1, netInputShape), + std::vector(1, netInputType), weights, blobs); } void Net::getMemoryConsumption(const std::vector& netInputShapes, + const std::vector& netInputTypes, std::vector& layerIds, std::vector& weights, std::vector& blobs) const { CV_TRACE_FUNCTION(); CV_Assert(impl); - return impl->getMemoryConsumption(netInputShapes, layerIds, weights, blobs); + return impl->getMemoryConsumption(netInputShapes, netInputTypes, layerIds, weights, blobs); } -void Net::getMemoryConsumption(const MatShape& netInputShape, std::vector& layerIds, +void Net::getMemoryConsumption(const MatShape& netInputShape, const MatType& netInputType, + std::vector& layerIds, std::vector& weights, std::vector& blobs) const { - getMemoryConsumption(std::vector(1, netInputShape), layerIds, - weights, blobs); + getMemoryConsumption(std::vector(1, netInputShape), + std::vector(1, netInputType), + layerIds, weights, blobs); } // FIXIT return old value or add get method diff --git a/modules/dnn/src/net_impl.cpp b/modules/dnn/src/net_impl.cpp index 2f644ad420..472a73d787 100644 --- a/modules/dnn/src/net_impl.cpp +++ b/modules/dnn/src/net_impl.cpp @@ -186,11 +186,6 @@ void Net::Impl::setUpNet(const 
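Since the public introspection API changes here, a short usage sketch of the new signatures may help (the model path is a placeholder; signatures follow the declarations in this patch): every shape-based query now takes the input type(s) alongside the shapes.

    #include <opencv2/dnn.hpp>
    #include <vector>
    using namespace cv;
    using namespace cv::dnn;

    int main()
    {
        Net net = readNetFromONNX("model.onnx");       // placeholder path
        MatShape inShape = {1, 3, 224, 224};
        MatType  inType  = CV_32F;

        int64 flops = net.getFLOPS(inShape, inType);

        size_t weights = 0, blobs = 0;
        net.getMemoryConsumption(inShape, inType, weights, blobs);

        std::vector<int> layerIds;
        std::vector<std::vector<MatShape> > inShapes, outShapes;
        net.getLayersShapes(inShape, inType, layerIds, inShapes, outShapes);

        (void)flops;
        return 0;
    }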
std::vector& blobsToKeep_) clear(); - if (hasDynamicShapes) - { - updateLayersShapes(); - } - this->blobsToKeep = blobsToKeep_; allocateLayers(blobsToKeep_); @@ -475,7 +470,7 @@ void Net::Impl::allocateLayer(int lid, const LayersShapesMap& layersShapes) allocateLayer(*i, layersShapes); // bind inputs - if (ld.id == 0) // DataLayer + if (ld.id == 0 && netInputLayer->supportBackend(preferableBackend)) // DataLayer { ninputs = netInputLayer->inputsData.size(); ld.inputBlobsWrappers.resize(ninputs); @@ -500,7 +495,8 @@ void Net::Impl::allocateLayer(int lid, const LayersShapesMap& layersShapes) CV_Assert(layerShapesIt != layersShapes.end()); - if (preferableBackend == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_OPENCL_FP16 && ld.dtype == CV_32F) + if (preferableBackend == DNN_BACKEND_OPENCV && ld.dtype == CV_32F + && preferableTarget == DNN_TARGET_OPENCL_FP16) ld.dtype = CV_16F; std::vector pinsForInternalBlobs; @@ -522,7 +518,6 @@ void Net::Impl::allocateLayer(int lid, const LayersShapesMap& layersShapes) inps[i] = *ld.inputBlobs[i]; } layerPtr->finalize(inps, ld.outputBlobs); - layerPtr->preferableTarget = preferableTarget; #if 0 std::cout << "\toutputs:"; size_t noutputs = ld.outputBlobs.size(); @@ -551,20 +546,39 @@ void Net::Impl::allocateLayers(const std::vector& blobsToKeep_) CV_Assert(!layers[0].outputBlobs.empty()); ShapesVec inputShapes; + TypesVec inputTypes; for (int i = 0; i < layers[0].outputBlobs.size(); i++) { Mat& inp = layers[0].outputBlobs[i]; CV_Assert(inp.total()); - if (preferableBackend == DNN_BACKEND_OPENCV && - preferableTarget == DNN_TARGET_OPENCL_FP16 && - layers[0].dtype == CV_32F) + int type = inp.type(); + if (type != CV_32S && type != CV_64S) { - layers[0].outputBlobs[i].create(inp.dims, inp.size, CV_16F); + type = CV_32F; + if (preferableBackend == DNN_BACKEND_OPENCV && + preferableTarget == DNN_TARGET_OPENCL_FP16) + { + type = CV_16F; + if (layers[0].dtype == CV_32F) + layers[0].outputBlobs[i].create(inp.dims, inp.size, CV_16F); + } + if (netWasQuantized && inp.type() == CV_8S) { + type = CV_8S; + } } inputShapes.push_back(shape(inp)); + inputTypes.push_back(type); } + + for (auto& layer : layers) + { + auto& ld = layer.second; + Ptr layerPtr = getLayerInstance(ld); + layerPtr->preferableTarget = preferableTarget; + } + LayersShapesMap layersShapes; - getLayersShapes(inputShapes, layersShapes); + getLayersShapes(inputShapes, inputTypes, layersShapes); blobManager.reset(); backendWrappers.clear(); @@ -969,7 +983,12 @@ void Net::Impl::forward(OutputArrayOfArrays outputBlobs, const String& outputNam std::vector& outputvec = *(std::vector*)outputBlobs.getObj(); outputvec.resize(ld.outputBlobs.size()); for (int i = 0; i < outputvec.size(); i++) - ld.outputBlobs[i].convertTo(outputvec[i], CV_32F); + { + if (ld.outputBlobs[i].depth() == CV_32S || ld.outputBlobs[i].depth() == CV_64S) + outputvec[i] = ld.outputBlobs[i]; + else + ld.outputBlobs[i].convertTo(outputvec[i], CV_32F); + } } else { @@ -1079,13 +1098,16 @@ void Net::Impl::getLayerShapesRecursively(int id, LayersShapesMap& inOutShapes) if (!layerData.outputBlobs.empty()) { ShapesVec shapes; + TypesVec types; for (int i = 0; i < layerData.outputBlobs.size(); i++) { Mat& inp = layerData.outputBlobs[i]; CV_Assert(!inp.empty()); shapes.push_back(shape(inp)); + types.push_back(inp.type()); } layerShapes.in = shapes; + layerShapes.inTypes = types; } else { @@ -1102,11 +1124,13 @@ void Net::Impl::getLayerShapesRecursively(int id, LayersShapesMap& inOutShapes) if (none) { layerShapes.out.clear(); + 
layerShapes.outTypes.clear(); return; } else { layerShapes.in = inputShapes; + layerShapes.inTypes.assign(inputShapes.size(), layerData.dtype); } } } @@ -1126,7 +1150,9 @@ void Net::Impl::getLayerShapesRecursively(int id, LayersShapesMap& inOutShapes) const int out_port = inputLayerIds[i].oid; CV_CheckLT(out_port, (int)it->second.out.size(), ""); const MatShape& shape = it->second.out[out_port]; + const MatType& type = it->second.outTypes[out_port]; layerShapes.in.push_back(shape); + layerShapes.inTypes.push_back(type); } } const ShapesVec& is = layerShapes.in; @@ -1138,7 +1164,11 @@ void Net::Impl::getLayerShapesRecursively(int id, LayersShapesMap& inOutShapes) bool layerSupportInPlace = false; try { + l->updateMemoryShapes(layerShapes.in); layerSupportInPlace = l->getMemoryShapes(is, requiredOutputs, os, ints); + l->getTypes(layerShapes.inTypes, os.size(), ints.size(), layerShapes.outTypes, layerShapes.internalTypes); + CV_CheckEQ(layerShapes.out.size(), layerShapes.outTypes.size(), "Number of shapes and types should be equal"); + CV_CheckEQ(layerShapes.internal.size(), layerShapes.internalTypes.size(), "Number of shapes and types should be equal"); } catch (const cv::Exception& e) { @@ -1197,6 +1227,7 @@ void Net::Impl::getLayerShapesRecursively(int id, LayersShapesMap& inOutShapes) void Net::Impl::getLayersShapes( const ShapesVec& netInputShapes, + const TypesVec& netInputTypes, std::vector& layersIds, std::vector& inLayersShapes, std::vector& outLayersShapes) /*const*/ @@ -1206,7 +1237,7 @@ void Net::Impl::getLayersShapes( outLayersShapes.clear(); Impl::LayersShapesMap inOutShapes; - getLayersShapes(netInputShapes, inOutShapes); + getLayersShapes(netInputShapes, netInputTypes, inOutShapes); for (Impl::LayersShapesMap::const_iterator it = inOutShapes.begin(); it != inOutShapes.end(); it++) @@ -1219,11 +1250,13 @@ void Net::Impl::getLayersShapes( void Net::Impl::getLayersShapes(const ShapesVec& netInputShapes, + const TypesVec& netInputTypes, LayersShapesMap& inOutShapes) { inOutShapes.clear(); inOutShapes[0].in = netInputShapes; // insert shape for first input layer + inOutShapes[0].inTypes = netInputTypes; for (MapIdToLayerData::const_iterator it = layers.begin(); it != layers.end(); it++) { @@ -1232,11 +1265,13 @@ void Net::Impl::getLayersShapes(const ShapesVec& netInputShapes, } void Net::Impl::getLayerShapes(const ShapesVec& netInputShapes, + const TypesVec& netInputTypes, const int layerId, LayerShapes& shapes) { LayersShapesMap inOutShapes; inOutShapes[0].in = netInputShapes; // insert shape for first input layer + inOutShapes[0].inTypes = netInputTypes; getLayerShapesRecursively(layerId, inOutShapes); shapes = inOutShapes[layerId]; } @@ -1250,6 +1285,7 @@ void Net::Impl::updateLayersShapes() CV_Assert(inputLayerData.layerInstance.get() == &inputLayer); CV_Assert(!inputLayerData.outputBlobs.empty()); ShapesVec inputShapes; + TypesVec inputTypes; for (int i = 0; i < inputLayerData.outputBlobs.size(); i++) { Mat& inp = inputLayerData.outputBlobs[i]; @@ -1261,10 +1297,12 @@ void Net::Impl::updateLayersShapes() inp.create(inp.dims, inp.size, CV_16F); } inputShapes.push_back(shape(inp)); + inputTypes.push_back(inp.type()); } CV_LOG_DEBUG(NULL, toString(inputShapes, "Network input shapes")); LayersShapesMap layersShapes; layersShapes[0].in = inputShapes; + layersShapes[0].inTypes = inputTypes; for (MapIdToLayerData::iterator it = layers.begin(); it != layers.end(); it++) { int layerId = it->first; @@ -1285,7 +1323,9 @@ void Net::Impl::updateLayersShapes() 
getLayerShapesRecursively(inputLayerId, layersShapes); } const MatShape& shape = layersShapes[inputLayerId].out[inputPin.oid]; + const MatType& type = layersShapes[inputLayerId].outTypes[inputPin.oid]; layerShapes.in.push_back(shape); + layerShapes.inTypes.push_back(type); } getLayerInstance(layerData)->updateMemoryShapes(layerShapes.in); } @@ -1910,12 +1950,13 @@ std::vector Net::Impl::getUnconnectedOutLayersNames() /*const*/ } -int64 Net::Impl::getFLOPS(const std::vector& netInputShapes) /*const*/ +int64 Net::Impl::getFLOPS(const std::vector& netInputShapes, + const std::vector& netInputTypes) /*const*/ { int64 flops = 0; std::vector ids; std::vector> inShapes, outShapes; - getLayersShapes(netInputShapes, ids, inShapes, outShapes); + getLayersShapes(netInputShapes, netInputTypes, ids, inShapes, outShapes); CV_Assert(inShapes.size() == outShapes.size()); CV_Assert(inShapes.size() == ids.size()); @@ -1930,13 +1971,14 @@ int64 Net::Impl::getFLOPS(const std::vector& netInputShapes) /*const*/ int64 Net::Impl::getFLOPS( const int layerId, - const std::vector& netInputShapes) /*const*/ + const std::vector& netInputShapes, + const std::vector& netInputTypes) /*const*/ { Impl::MapIdToLayerData::const_iterator layer = layers.find(layerId); CV_Assert(layer != layers.end()); LayerShapes shapes; - getLayerShapes(netInputShapes, layerId, shapes); + getLayerShapes(netInputShapes, netInputTypes, layerId, shapes); return getLayerInstance(const_cast(layer->second))->getFLOPS(shapes.in, shapes.out); } @@ -1945,6 +1987,7 @@ int64 Net::Impl::getFLOPS( void Net::Impl::getMemoryConsumption( const int layerId, const std::vector& netInputShapes, + const std::vector& netInputTypes, size_t& weights, size_t& blobs) /*const*/ { Impl::MapIdToLayerData::const_iterator layer = layers.find(layerId); @@ -1959,25 +2002,22 @@ void Net::Impl::getMemoryConsumption( } LayerShapes shapes; - getLayerShapes(netInputShapes, layerId, shapes); + getLayerShapes(netInputShapes, netInputTypes, layerId, shapes); const ShapesVec& outLayerShapes = shapes.out; - // FIXIT netWasQuantized check is not enough - per layer check should be done - size_t elemSize = netWasQuantized ? sizeof(char) : sizeof(float); for (int i = 0; i < outLayerShapes.size(); i++) - { - blobs += total(outLayerShapes[i]) * elemSize; - } + blobs += total(outLayerShapes[i]) * CV_ELEM_SIZE(shapes.outTypes[i]); } void Net::Impl::getMemoryConsumption( const std::vector& netInputShapes, + const std::vector& netInputTypes, size_t& weights, size_t& blobs) /*const*/ { std::vector layerIds; std::vector w, b; - getMemoryConsumption(netInputShapes, layerIds, w, b); + getMemoryConsumption(netInputShapes, netInputTypes, layerIds, w, b); weights = blobs = 0; for (int i = 0; i < layerIds.size(); i++) @@ -1997,6 +2037,7 @@ int64 Net::Impl::getPerfProfile(std::vector& timings) const void Net::Impl::getMemoryConsumption( const std::vector& netInputShapes, + const std::vector& netInputTypes, std::vector& layerIds, std::vector& weights, std::vector& blobs) /*const*/ { @@ -2006,7 +2047,7 @@ void Net::Impl::getMemoryConsumption( std::vector> inLayerShapes, outLayerShapes; - getLayersShapes(netInputShapes, layerIds, inLayerShapes, outLayerShapes); + getLayersShapes(netInputShapes, netInputTypes, layerIds, inLayerShapes, outLayerShapes); // FIXIT netWasQuantized check is not enough - per layer check should be done size_t elemSize = netWasQuantized ? 
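The memory estimate above now multiplies each output blob's element count by the element size of its inferred type instead of assuming sizeof(float). A small worked example, assuming one 1x3x224x224 output blob:

    #include <opencv2/dnn.hpp>
    #include <opencv2/dnn/shape_utils.hpp>
    using namespace cv;
    using namespace cv::dnn;

    MatShape s = {1, 3, 224, 224};
    size_t int64Bytes = (size_t)total(s) * CV_ELEM_SIZE(CV_64S);  // 150528 * 8 = 1204224
    size_t fp32Bytes  = (size_t)total(s) * CV_ELEM_SIZE(CV_32F);  // 150528 * 4 =  602112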
sizeof(char) : sizeof(float); for (int i = 0; i < layerIds.size(); i++) diff --git a/modules/dnn/src/net_impl.hpp b/modules/dnn/src/net_impl.hpp index facee0da55..972cc0f9b4 100644 --- a/modules/dnn/src/net_impl.hpp +++ b/modules/dnn/src/net_impl.hpp @@ -227,33 +227,41 @@ struct Net::Impl : public detail::NetImplBase void getLayersShapes( const ShapesVec& netInputShapes, + const TypesVec& netInputTypes, std::vector& layersIds, std::vector& inLayersShapes, std::vector& outLayersShapes) /*const*/; void getLayersShapes(const ShapesVec& netInputShapes, + const TypesVec& netInputTypes, LayersShapesMap& inOutShapes); void getLayerShapes(const ShapesVec& netInputShapes, + const TypesVec& netInputTypes, const int layerId, LayerShapes& shapes); void updateLayersShapes(); - int64 getFLOPS(const std::vector& netInputShapes) /*const*/; + int64 getFLOPS(const std::vector& netInputShapes, + const std::vector& netInputTypes) /*const*/; int64 getFLOPS( const int layerId, - const std::vector& netInputShapes) /*const*/; + const std::vector& netInputShapes, + const std::vector& netInputTypes) /*const*/; void getMemoryConsumption( const int layerId, const std::vector& netInputShapes, + const std::vector& netInputTypes, size_t& weights, size_t& blobs) /*const*/; void getMemoryConsumption( const std::vector& netInputShapes, + const std::vector& netInputTypes, size_t& weights, size_t& blobs) /*const*/; void getMemoryConsumption( const std::vector& netInputShapes, + const std::vector& netInputTypes, std::vector& layerIds, std::vector& weights, std::vector& blobs) /*const*/; int64 getPerfProfile(std::vector& timings) const; diff --git a/modules/dnn/src/net_impl_backend.cpp b/modules/dnn/src/net_impl_backend.cpp index 2751a41782..f2bd9138b5 100644 --- a/modules/dnn/src/net_impl_backend.cpp +++ b/modules/dnn/src/net_impl_backend.cpp @@ -62,14 +62,21 @@ Ptr Net::Impl::wrap(Mat& host) { CV_Assert(haveCUDA()); #ifdef HAVE_CUDA - switch (preferableTarget) + CV_CheckType(host.depth(), host.depth() == CV_32F || host.depth() == CV_32S || host.depth() == CV_64S, "Unsupported type for CUDA"); + CV_Assert(IS_DNN_CUDA_TARGET(preferableTarget)); + switch (host.depth()) { - case DNN_TARGET_CUDA: - return CUDABackendWrapperFP32::create(baseBuffer, shape); - case DNN_TARGET_CUDA_FP16: - return CUDABackendWrapperFP16::create(baseBuffer, shape); + case CV_32F: + if (preferableTarget == DNN_TARGET_CUDA_FP16) + return CUDABackendWrapperFP16::create(baseBuffer, shape); + else + return CUDABackendWrapperFP32::create(baseBuffer, shape); + case CV_32S: + return CUDABackendWrapperINT32::create(baseBuffer, shape); + case CV_64S: + return CUDABackendWrapperINT64::create(baseBuffer, shape); default: - CV_Assert(IS_DNN_CUDA_TARGET(preferableTarget)); + CV_Error(Error::BadDepth, "Unsupported mat type for CUDA"); } #endif } diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp index 8776bee38b..31de39c30a 100644 --- a/modules/dnn/src/onnx/onnx_importer.cpp +++ b/modules/dnn/src/onnx/onnx_importer.cpp @@ -381,27 +381,24 @@ void runLayer(LayerParams& params, const std::vector& inputs, CV_Assert((bool)layer); std::vector inpShapes(inputs.size()); - int ddepth = params.get("depth", CV_32F); + std::vector inpTypes(inputs.size()); for (size_t i = 0; i < inputs.size(); ++i) { inpShapes[i] = shape(inputs[i]); - if (i > 0 && ddepth != inputs[i].depth()) - CV_Error(Error::StsNotImplemented, cv::format("Mixed input data types. 
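For quick reference, the CUDA wrapper selection in both wrapMat() (legacy_backend.cpp) and Net::Impl::wrap() (net_impl_backend.cpp) is now keyed on the Mat depth rather than on the target alone, with FP16 applying only to float data: CV_32F maps to CUDABackendWrapperFP16 on DNN_TARGET_CUDA_FP16 and to CUDABackendWrapperFP32 otherwise, CV_32S maps to CUDABackendWrapperINT32, CV_64S maps to CUDABackendWrapperINT64, and any other depth raises Error::BadDepth.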
Required type: %d, actual type: %d", ddepth, inputs[i].depth())); - - // Quantize and Dequantize layer have different output type than input. - if (params.type != "Quantize" && params.type != "Dequantize") - ddepth = inputs[i].depth(); + inpTypes[i] = inputs[i].type(); } std::vector outShapes, internalShapes; + std::vector outTypes, internalTypes; layer->getMemoryShapes(inpShapes, 0, outShapes, internalShapes); + layer->getTypes(inpTypes, outShapes.size(), internalShapes.size(), outTypes, internalTypes); std::vector internals(internalShapes.size()); outputs.resize(outShapes.size()); for (size_t i = 0; i < outShapes.size(); ++i) - outputs[i].create(outShapes[i], ddepth); + outputs[i].create(outShapes[i], outTypes[i]); for (size_t i = 0; i < internalShapes.size(); ++i) - internals[i].create(internalShapes[i], ddepth); + internals[i].create(internalShapes[i], internalTypes[i]); layer->finalize(inputs, outputs); layer->forward(inputs, outputs, internals); @@ -2506,7 +2503,6 @@ void ONNXImporter::parseGather(LayerParams& layerParams, const opencv_onnx::Node inputs.push_back(input); Mat indices = getBlob(node_proto, 1); - indices.convertTo(indices, CV_32FC1); inputs.push_back(indices); runLayer(layerParams, inputs, output); @@ -2525,10 +2521,6 @@ void ONNXImporter::parseGather(LayerParams& layerParams, const opencv_onnx::Node constParams.name = node_proto.input(i); constParams.type = "Const"; Mat blob = getBlob(node_proto, i); - if (i == 1) - { - blob.convertTo(blob, CV_32FC1); - } constParams.blobs.push_back(blob); opencv_onnx::NodeProto proto; @@ -3037,8 +3029,6 @@ void ONNXImporter::parseScatter(LayerParams& layerParams, const opencv_onnx::Nod for (size_t i = 0; i < node_proto.input_size(); i++) { Mat blob = getBlob(node_proto, i); - if (i == 1) // indices - blob.convertTo(blob, CV_32F); inputs.push_back(blob); } runLayer(layerParams, inputs, output); diff --git a/modules/dnn/src/op_cuda.cpp b/modules/dnn/src/op_cuda.cpp index 46e68f7689..954864936d 100644 --- a/modules/dnn/src/op_cuda.cpp +++ b/modules/dnn/src/op_cuda.cpp @@ -51,7 +51,8 @@ void Net::Impl::initCUDABackend(const std::vector& blobsToKeep_) for (auto& layer : layers) { auto& ld = layer.second; - if (ld.id == 0) + + if (ld.id == 0 && netInputLayer->supportBackend(preferableBackend)) { for (auto& wrapper : ld.inputBlobsWrappers) { diff --git a/modules/dnn/src/op_cuda.hpp b/modules/dnn/src/op_cuda.hpp index 0ce4d469fc..a0f346c496 100644 --- a/modules/dnn/src/op_cuda.hpp +++ b/modules/dnn/src/op_cuda.hpp @@ -58,6 +58,16 @@ namespace cv { namespace dnn { return Tensor(std::begin(sizes), std::end(sizes)); } + template inline + void copyMatToTensorImpl(const Mat& srcMat, const TensorSpan destTensor, const Stream& stream) { + CV_Assert(srcMat.total() >= destTensor.size()); + + Mat temp = srcMat.isContinuous() ? 
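The importer's runLayer() above now sizes and types its temporaries from getMemoryShapes() and getTypes() together. The same pair can be exercised on a standalone layer instance; a self-contained sketch, with Permute chosen arbitrarily and the default CPU target assumed:

    #include <opencv2/dnn.hpp>
    #include <opencv2/dnn/layer.hpp>
    #include <vector>
    using namespace cv;
    using namespace cv::dnn;

    int main()
    {
        LayerParams params;
        params.type = "Permute";
        int order[] = {0, 2, 3, 1};
        params.set("order", DictValue::arrayInt(order, 4));
        Ptr<Layer> layer = LayerFactory::createLayerInstance(params.type, params);

        std::vector<MatShape> inShapes = {MatShape({1, 3, 4, 5})}, outShapes, internals;
        std::vector<MatType>  inTypes  = {CV_32S}, outTypes, internalTypes;
        layer->getMemoryShapes(inShapes, 1, outShapes, internals);
        layer->getTypes(inTypes, (int)outShapes.size(), (int)internals.size(),
                        outTypes, internalTypes);
        // Expected: outShapes[0] == {1, 4, 5, 3} and outTypes[0] == CV_32S, so the
        // output blob can be allocated as outMat.create(outShapes[0], outTypes[0]).
        return 0;
    }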
srcMat : srcMat.clone(); + CV_Assert(temp.isContinuous()); + + memcpy(destTensor.get(), reinterpret_cast(temp.data), destTensor.size(), stream); + } + /** @brief copies data from a cv::Mat to TensorType * * \tparam T the type of the elements contained in TensorType object @@ -81,8 +91,7 @@ namespace cv { namespace dnn { template <> inline void copyMatToTensor(const Mat& srcMat, const TensorSpan destTensor, const Stream& stream) { - /* should perhaps convert cv::Mat of different type to the required type and copy */ - CV_Assert(srcMat.type() == CV_32F); + CV_CheckTypeEQ(srcMat.type(), CV_32F, ""); CV_Assert(srcMat.total() >= destTensor.size()); Mat temp; @@ -94,14 +103,20 @@ namespace cv { namespace dnn { template <> inline void copyMatToTensor(const Mat& srcMat, const TensorSpan destTensor, const Stream& stream) { - /* should perhaps convert cv::Mat of different type to the required type and copy */ - CV_Assert(srcMat.type() == CV_32F); - CV_Assert(srcMat.total() >= destTensor.size()); + CV_CheckTypeEQ(srcMat.type(), CV_32F, ""); + copyMatToTensorImpl(srcMat, destTensor, stream); + } - Mat temp = srcMat.isContinuous() ? srcMat : srcMat.clone(); - CV_Assert(temp.isContinuous()); + template <> inline + void copyMatToTensor(const Mat& srcMat, const TensorSpan destTensor, const Stream& stream) { + CV_CheckTypeEQ(srcMat.type(), CV_32S, ""); + copyMatToTensorImpl(srcMat, destTensor, stream); + } - memcpy(destTensor.get(), reinterpret_cast(temp.data), destTensor.size(), stream); + template <> inline + void copyMatToTensor(const Mat& srcMat, const TensorSpan destTensor, const Stream& stream) { + CV_CheckTypeEQ(srcMat.type(), CV_64S, ""); + copyMatToTensorImpl(srcMat, destTensor, stream); } /** @brief copies data from a TensorType to a cv::Mat @@ -126,7 +141,7 @@ namespace cv { namespace dnn { template <> inline void copyTensorToMat(TensorView srcTensor, Mat& destMat, const Stream& stream) { - CV_Assert(destMat.type() == CV_32F); + CV_CheckTypeEQ(destMat.type(), CV_32F, "Unsupported type"); CV_Assert(destMat.total() >= srcTensor.size()); Mat temp(shape(destMat), CV_16F); @@ -139,7 +154,7 @@ namespace cv { namespace dnn { template <> inline void copyTensorToMat(TensorView srcTensor, Mat& destMat, const Stream& stream) { - CV_Assert(destMat.type() == CV_32F); + CV_CheckTypeEQ(destMat.type(), CV_32F, "Unsupported type"); CV_Assert(destMat.total() >= srcTensor.size()); Mat temp = destMat.isContinuous() ? destMat : destMat.clone(); @@ -200,6 +215,44 @@ namespace cv { namespace dnn { return Ptr(); } + template