diff --git a/modules/dnn/src/cuda/fill.cu b/modules/dnn/src/cuda/fill_copy.cu
similarity index 56%
rename from modules/dnn/src/cuda/fill.cu
rename to modules/dnn/src/cuda/fill_copy.cu
index d08884facb..2304e42346 100644
--- a/modules/dnn/src/cuda/fill.cu
+++ b/modules/dnn/src/cuda/fill_copy.cu
@@ -21,7 +21,6 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
         template <class T, std::size_t N>
         __global__ void fill_vec(Span<T> output, T value) {
             using vector_type = get_vector_type_t<T, N>;
-
             auto output_vPtr = vector_type::get_pointer(output.data());
             for (auto i : grid_stride_range(output.size() / vector_type::size())) {
                 vector_type vec;
@@ -30,6 +29,18 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
                 v_store(output_vPtr[i], vec);
             }
         }
+
+        template <class T, std::size_t N>
+        __global__ void copy_vec(Span<T> output, View<T> input) {
+            using vector_type = get_vector_type_t<T, N>;
+            auto input_vPtr = vector_type::get_pointer(input.data());
+            auto output_vPtr = vector_type::get_pointer(output.data());
+            for (auto i : grid_stride_range(output.size() / vector_type::size())) {
+                vector_type vec;
+                v_load(vec, input_vPtr[i]);
+                v_store(output_vPtr[i], vec);
+            }
+        }
     }
 
     template <class T, std::size_t N> static
@@ -55,4 +66,28 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
     template void fill(const Stream&, Span<__half>, __half);
     template void fill(const Stream&, Span<float>, float);
 
+    template <class T, std::size_t N> static
+    void launch_vectorized_copy(const Stream& stream, Span<T> output, View<T> input) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(input, N));
+
+        auto kernel = raw::copy_vec<T, N>;
+        auto policy = make_policy(kernel, output.size() / N, 0, stream);
+        launch_kernel(kernel, policy, output, input);
+    }
+
+    template <class T>
+    void copy(const Stream& stream, Span<T> output, View<T> input) {
+        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+            launch_vectorized_copy<T, 4>(stream, output, input);
+        } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+            launch_vectorized_copy<T, 2>(stream, output, input);
+        } else {
+            launch_vectorized_copy<T, 1>(stream, output, input);
+        }
+    }
+
+    template void copy(const Stream&, Span<__half>, View<__half>);
+    template void copy(const Stream&, Span<float>, View<float>);
+
 }}}} /* namespace cv::dnn::cuda4dnn::kernels */
diff --git a/modules/dnn/src/cuda/max_unpooling.cu b/modules/dnn/src/cuda/max_unpooling.cu
index e388c9528e..ed3aa70dda 100644
--- a/modules/dnn/src/cuda/max_unpooling.cu
+++ b/modules/dnn/src/cuda/max_unpooling.cu
@@ -16,7 +16,7 @@
 #include "../cuda4dnn/csl/tensor.hpp"
 #include "../cuda4dnn/csl/span.hpp"
 
-#include "../cuda4dnn/kernels/fill.hpp"
+#include "../cuda4dnn/kernels/fill_copy.hpp"
 
 #include <cstddef>
 
diff --git a/modules/dnn/src/cuda/normalize.cu b/modules/dnn/src/cuda/normalize.cu
index 49dff9b559..326a9ae2d9 100644
--- a/modules/dnn/src/cuda/normalize.cu
+++ b/modules/dnn/src/cuda/normalize.cu
@@ -15,7 +15,7 @@
 #include "../cuda4dnn/csl/stream.hpp"
 #include "../cuda4dnn/csl/span.hpp"
 
-#include "../cuda4dnn/kernels/fill.hpp"
+#include "../cuda4dnn/kernels/fill_copy.hpp"
 #include "../cuda4dnn/kernels/scale_shift.hpp"
 
 #include <cstddef>
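The new copy path mirrors fill: a vectorized kernel plus a launcher that picks the widest vector width (4, 2, or 1 elements) the spans' alignment allows. As a rough standalone illustration of the same idea, written without cuda4dnn's Span/View and vector_traits helpers (the names below are illustrative, not the library's API), a plain CUDA sketch for float might look like this:

    #include <cuda_runtime.h>
    #include <cstddef>
    #include <cstdint>

    /* Illustrative only: vectorized copy in the spirit of copy_vec/launch_vectorized_copy.
     * A float4 load/store moves 16 bytes per thread per iteration; the scalar kernel is the
     * fallback for buffers whose address or length is not 16-byte aligned.
     */
    __global__ void copy_vec4(float4* out, const float4* in, std::size_t n_vec) {
        for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n_vec; i += gridDim.x * blockDim.x)
            out[i] = in[i];
    }

    __global__ void copy_scalar(float* out, const float* in, std::size_t n) {
        for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
            out[i] = in[i];
    }

    void copy_float(float* out, const float* in, std::size_t n, cudaStream_t stream) {
        const bool aligned = reinterpret_cast<std::uintptr_t>(out) % 16 == 0
                          && reinterpret_cast<std::uintptr_t>(in) % 16 == 0
                          && n % 4 == 0;
        if (aligned)
            copy_vec4<<<256, 256, 0, stream>>>(reinterpret_cast<float4*>(out),
                                               reinterpret_cast<const float4*>(in), n / 4);
        else
            copy_scalar<<<256, 256, 0, stream>>>(out, in, n);
    }

The actual launcher derives its grid configuration from make_policy rather than the fixed <<<256, 256>>> used here, and the library's is_fully_aligned test plays the role of the pointer/size check above.
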
"../cuda4dnn/kernels/fill_copy.hpp" + #include #include @@ -46,8 +49,88 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { output[i] = input[oldPosition]; } } + + template + __global__ void transpose(Span output, View input, size_type in_width, size_type out_width) + { + using vector_type = get_vector_type_t; + + __shared__ T tile[TILE_SIZE][TILE_SIZE + 1]; + + /* blockDim.y = TILE_SIZE, blockDim.x = TILE_SIZE/N */ + const index_type in_x = blockIdx.x * TILE_SIZE + threadIdx.x * N; + const index_type in_y = blockIdx.y * TILE_SIZE + threadIdx.y; + + /* Every valid input location has a corresponding output location and vice versa. + * Hence, if we do not load values into the shared memory for a given location, we + * also won't read them for storing in the output. + */ + if (in_x < in_width && in_y < out_width) + { + vector_type vec; + auto input_vPtr = vector_type::get_pointer(input.data()); + v_load(vec, input_vPtr[(in_y * in_width + in_x) / N]); + + for (int i = 0; i < vector_type::size(); i++) + tile[threadIdx.y][threadIdx.x * N + i] = vec.data[i]; + } + + __syncthreads(); + + /* Note that `blockDim.x * N` is equal to `blockDim.y`. Since there are an equal + * number of them, we can interchange `threadIdx.x` and `threadIdx.y` without changing + * result. The advantage of interchanging is that consecutive output indices map to + * consecutive threads. This would allow writes across threds in a warp to be coalesced. + */ + const index_type out_x = blockIdx.y * TILE_SIZE + threadIdx.x * N; + const index_type out_y = blockIdx.x * TILE_SIZE + threadIdx.y; + + if (out_x < out_width && out_y < in_width) + { + vector_type vec; + for (int i = 0; i < vector_type::size(); i++) + vec.data[i] = tile[threadIdx.x * N + i][threadIdx.y]; + + auto output_vPtr = vector_type::get_pointer(output.data()); + v_store(output_vPtr[(out_y * out_width + out_x) / N], vec); + } + } } + template static + void launch_transpose_kernel(const Stream& stream, Span output, View input, size_type in_width, size_type out_width) + { + CV_Assert(is_fully_aligned(output, N)); + CV_Assert(is_fully_aligned(input, N)); + CV_Assert(in_width % N == 0); + CV_Assert(out_width % N == 0); + + constexpr int TILE_SIZE = 32; + constexpr int TILE_SIZE_X = TILE_SIZE/N, TILE_SIZE_Y = TILE_SIZE; + auto kernel = raw::transpose; + + dim3 grid_size((in_width/N + TILE_SIZE_X - 1)/TILE_SIZE_X, (out_width + TILE_SIZE_Y - 1)/TILE_SIZE_Y); + dim3 block_size(TILE_SIZE_X, TILE_SIZE_Y); + auto policy = execution_policy(grid_size, block_size, stream); + + launch_kernel(kernel, policy, output, input, in_width, out_width); + } + + template + void transpose(const Stream& stream, Span output, View input, std::size_t in_width, std::size_t out_width) + { + if (is_fully_aligned(output, 4) && is_fully_aligned(input, 4) && in_width % 4 == 0 && out_width % 4 == 0) { + launch_transpose_kernel(stream, output, input, in_width, out_width); + } else if (is_fully_aligned(output, 2) && is_fully_aligned(input, 2) && in_width % 2 == 0 && out_width % 2 == 0) { + launch_transpose_kernel(stream, output, input, in_width, out_width); + } else { + launch_transpose_kernel(stream, output, input, in_width, out_width); + } + } + + template void transpose(const Stream&, Span<__half>, View<__half>, std::size_t, std::size_t); + template void transpose(const Stream&, Span, View, std::size_t, std::size_t); + template static void launch_permute_kernel( const Stream& stream, @@ -83,7 +166,11 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { 
@@ -83,7 +166,11 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
         CV_Assert(input.rank() == order.size());
         CV_Assert(input.size() == output.size());
 
-        /* squeezable axes at the beginning of both tensors which aren't permuted can be eliminated
+        auto rank = output.rank();
+        auto inShape = input.shape_as_vector();
+        auto outShape = output.shape_as_vector();
+
+        /* singleton axes do not contribute towards address calculation
          *
          * Reasoning:
          * ----------
@@ -92,33 +179,93 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
          * tensor indices be [o1, o2, ...]. The permutation operation essentially copies items
          * from the input tensor to new locations in the output tensor as dictated by the indices.
          *
-         * If the size of the first axis of the input and output tensor is one and these axes are
-         * not involved in any permutation, i.e. order[0] = 0, the input and output indicies for
-         * all the elements will be of the form be [0, i2, ...] and [0, o2, ...] respectively.
-         * The first index does not contribute to the element's address calculation and hence does
-         * nothing apart from eating up few cycles.
+         * If the size of the nth axis (say i2) of the input is one, the input and output indices for
+         * all the elements will be of the form [i1, 0, ...] and [..., 0, ...] respectively.
+         * The index does not contribute to the element's address calculation and hence would give
+         * an identical result if it weren't there.
          */
-        while (order[0] == 0 && input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) {
-            /* remove the axes */
-            input.squeeze(0);
-            output.squeeze(0);
+        for (int i = 0; i < rank; i++)
+        {
+            /* index `i` corresponds to the axis index in the output; order[i] has the corresponding axis index in the input */
+            while (i < rank && outShape[i] == 1)
+            {
+                int in_i = order[i];
+                CV_Assert(inShape[in_i] == 1);
 
-            /* when we remove axis zero, the axis index will be one less than the previous index
-             * for the remaining axes
-             */
-            order.erase(order.begin());
-            for (auto& axis : order)
-                axis--;
-
-            /* optimizations should not break the invariants */
-            CV_Assert(output.rank() == input.rank());
-            CV_Assert(input.rank() == order.size());
-            CV_Assert(input.size() == output.size());
+                /* delete axis `i` */
+                inShape.erase(std::begin(inShape) + in_i);
+                outShape.erase(std::begin(outShape) + i);
+
+                /* deleting an axis from the input tensor causes the indices of the axes
+                 * that come after the deleted axis to decrease by one
+                 */
+                order.erase(order.begin() + i);
+                for (auto& axis : order)
+                    if (axis > in_i)
+                        axis--;
+
+                rank--;
+
+                /* optimizations should not break the invariants */
+                CV_Assert(rank == order.size());
+                CV_Assert(inShape.size() == order.size());
+                CV_Assert(outShape.size() == order.size());
+                CV_Assert(input.size() == output.size());
+            }
         }
 
-        auto rank = output.rank();
-        auto inShape = input.shape_as_vector();
-        auto outShape = output.shape_as_vector();
+        /* contiguous axes whose relative ordering stays the same before and after permutation can be merged into one axis
+         * example: in permute order 0 2 3 1, axes 2 and 3 can be grouped into a single axis
+         *
+         * Reasoning:
+         * ----------
+         * Suppose an item's indices in the input tensor are [i0, i1, i2, i3, ...]. Let the permutation order be [0, 3, 1, 2, ...].
+         * Note that i1 and i2 are adjacent axes in the same order in input as well as output. The indices in the output tensor
+         * will be [i0, i3, i1, i2, ...].
+         *
+         * Each axis in the contiguous axes sequence will add an offset of iN * strideN. In the above example,
+         * the two axes add a total offset of `i1 * (size2 * stride2) + i2 * stride2` which is `(i1 * size2 + i2) * stride2`,
+         * in both input and output. Note stride2 can be different in the input and output. We can merge the two axes into one axis
+         * with a size of `size1 * size2`. The new offset added will be `i12 * stride12` as the kernel iterates through `i12`. Note
+         * that `i12` is actually `(i1 * size2 + i2)` and `stride12` is `stride2`.
+         */
+        for (int i = 0; i < rank; i++) {
+            /* the indices used in the loops such as `i` and `j` are axis indices in the output tensor */
+            /* the corresponding input axis indices are `order[i]` and `order[j]` */
+
+            /* loop invariant: `i` is the first axis in the contiguous unpermuted axis sequence */
+
+            int j = i + 1; /* `j` is the axis which we will attempt to merge */
+            while (j < rank && (order[i] + 1) == order[j]) {
+                /* axis `i` and axis `j` do not change relative order */
+
+                auto in_i = order[i], in_j = order[j];
+
+                auto new_size = inShape[in_i] * inShape[in_j];
+                inShape[in_i] = new_size;
+                outShape[i] = new_size;
+
+                /* delete axis `j` */
+                inShape.erase(std::begin(inShape) + in_j);
+                outShape.erase(std::begin(outShape) + j);
+
+                /* deleting an axis from the input tensor causes the indices of the axes
+                 * that come after the deleted axis to decrease by one
+                 */
+                order.erase(order.begin() + j);
+                for (auto& axis : order)
+                    if (axis > order[i])
+                        axis--;
+
+                rank--;
+
+                /* optimizations should not break the invariants */
+                CV_Assert(rank == order.size());
+                CV_Assert(inShape.size() == order.size());
+                CV_Assert(outShape.size() == order.size());
+                CV_Assert(input.size() == output.size());
+            }
+        }
 
         std::vector<std::size_t> inStride(rank), outStride(rank);
         inStride.back() = 1;
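Taken together, the two passes above often collapse a permutation into a much smaller problem. As a standalone illustration on plain std::vector (no TensorSpan/TensorView involved), the common NCHW -> NHWC case with shape [1, 8, 32, 32] and order [0, 2, 3, 1] reduces to a rank-2 transpose:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    /* Illustrative only: the singleton-elimination and axis-merging passes applied to plain
     * vectors. Dropping the singleton batch axis and merging the contiguous (H, W) pair
     * leaves a rank-2 problem {8, 1024} -> {1024, 8}, i.e. a plain transpose.
     */
    int main() {
        std::vector<std::size_t> inShape  = {1, 8, 32, 32};
        std::vector<std::size_t> outShape = {1, 32, 32, 8};
        std::vector<std::size_t> order    = {0, 2, 3, 1};   /* output axis -> input axis */
        std::size_t rank = 4;

        /* drop singleton output axes */
        for (std::size_t i = 0; i < rank; i++) {
            while (i < rank && outShape[i] == 1) {
                const std::size_t in_i = order[i];
                inShape.erase(inShape.begin() + in_i);
                outShape.erase(outShape.begin() + i);
                order.erase(order.begin() + i);
                for (auto& axis : order)
                    if (axis > in_i)
                        axis--;
                rank--;
            }
        }

        /* merge contiguous axes that keep their relative order */
        for (std::size_t i = 0; i < rank; i++) {
            std::size_t j = i + 1;
            while (j < rank && order[i] + 1 == order[j]) {
                const std::size_t in_i = order[i], in_j = order[j];
                inShape[in_i] *= inShape[in_j];
                outShape[i]   *= outShape[j];
                inShape.erase(inShape.begin() + in_j);
                outShape.erase(outShape.begin() + j);
                order.erase(order.begin() + j);
                for (auto& axis : order)
                    if (axis > in_i)
                        axis--;
                rank--;
            }
        }

        assert(rank == 2);
        assert((inShape  == std::vector<std::size_t>{8, 1024}));
        assert((outShape == std::vector<std::size_t>{1024, 8}));
        assert((order    == std::vector<std::size_t>{1, 0}));
        return 0;
    }

This is exactly the shape of problem the rank == 2 branch in the next hunk hands to the transpose kernel.
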
@@ -133,8 +280,27 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
         std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>());
         /* stride[0], stride[1], ..., stride[-2], 1 */
 
-        CV_Assert(2 <= rank && rank <= CSL_MAX_TENSOR_RANK);
-        permute_dispatcher<T, 2, CSL_MAX_TENSOR_RANK>(rank, stream, order, output, outStride, input, inStride);
+        const bool is_in_order = [&order] {
+            for (int i = 0; i < order.size(); i++)
+                if (order[i] != i)
+                    return false;
+            return true;
+        }();
+
+        if (is_in_order)
+        {
+            kernels::copy<T>(stream, output, input);
+        }
+        else if (rank == 2)
+        {
+            /* use the more efficient transpose kernel */
+            transpose<T>(stream, output, input, inShape[1], outShape[1]);
+        }
+        else
+        {
+            CV_Assert(3 <= rank && rank <= CSL_MAX_TENSOR_RANK);
+            permute_dispatcher<T, 3, CSL_MAX_TENSOR_RANK>(rank, stream, order, output, outStride, input, inStride);
+        }
     }
 
     template void permute(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
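The stride vectors fed to permute_dispatcher are suffix products of the reduced shapes. A small self-contained sketch of that computation, assuming (as the partial_sum context above suggests) that each stride vector is seeded with the shape shifted left by one before the suffix product:

    #include <algorithm>
    #include <cstddef>
    #include <functional>
    #include <numeric>
    #include <vector>

    /* Illustrative only: row-major strides as suffix products. For a shape
     * {s0, s1, ..., s_{n-1}}, stride[k] is the product of the sizes after axis k.
     */
    int main() {
        const std::vector<std::size_t> shape = {8, 32, 4};
        std::vector<std::size_t> stride(shape.size());

        stride.back() = 1;                                           /* stride[-1] = 1          */
        std::copy(shape.begin() + 1, shape.end(), stride.begin());   /* stride[k] = size[k + 1] */
        std::partial_sum(stride.rbegin(), stride.rend(), stride.rbegin(),
                         std::multiplies<std::size_t>());            /* suffix products         */

        /* shape {8, 32, 4} -> stride {128, 4, 1}: element (i, j, k) lives at i*128 + j*4 + k */
        return stride == std::vector<std::size_t>{128, 4, 1} ? 0 : 1;
    }

With the reduced rank-2 shapes from the earlier example, the same computation gives inStride {1024, 1} and outStride {8, 1}; these are only consumed by the general rank >= 3 path.
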
diff --git a/modules/dnn/src/cuda4dnn/kernels/fill.hpp b/modules/dnn/src/cuda4dnn/kernels/fill_copy.hpp
similarity index 63%
rename from modules/dnn/src/cuda4dnn/kernels/fill.hpp
rename to modules/dnn/src/cuda4dnn/kernels/fill_copy.hpp
index 4101481aad..8a6b9e112b 100644
--- a/modules/dnn/src/cuda4dnn/kernels/fill.hpp
+++ b/modules/dnn/src/cuda4dnn/kernels/fill_copy.hpp
@@ -2,8 +2,8 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 
-#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_HPP
-#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_HPP
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_COPY_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_COPY_HPP
 
 #include "../csl/stream.hpp"
 #include "../csl/span.hpp"
@@ -13,6 +13,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
     template <class T>
     void fill(const csl::Stream& stream, csl::Span<T> output, T value);
 
+    template <class T>
+    void copy(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
+
 }}}} /* namespace cv::dnn::cuda4dnn::kernels */
 
-#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_HPP */
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_COPY_HPP */
diff --git a/modules/dnn/src/cuda4dnn/kernels/permute.hpp b/modules/dnn/src/cuda4dnn/kernels/permute.hpp
index 65fe46ba26..57a654c9f1 100644
--- a/modules/dnn/src/cuda4dnn/kernels/permute.hpp
+++ b/modules/dnn/src/cuda4dnn/kernels/permute.hpp
@@ -16,6 +16,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
     template <class T>
     void permute(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, std::vector<std::size_t> order);
 
+    template <class T>
+    void transpose(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, std::size_t in_width, std::size_t out_width);
+
 }}}} /* namespace cv::dnn::cuda4dnn::kernels */
 
 #endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PERMUTE_HPP */
diff --git a/modules/dnn/src/cuda4dnn/primitives/concat.hpp b/modules/dnn/src/cuda4dnn/primitives/concat.hpp
index 96556275eb..af79530f6f 100644
--- a/modules/dnn/src/cuda4dnn/primitives/concat.hpp
+++ b/modules/dnn/src/cuda4dnn/primitives/concat.hpp
@@ -10,7 +10,7 @@
 #include "../csl/stream.hpp"
 #include "../csl/pointer.hpp"
 
-#include "../kernels/fill.hpp"
+#include "../kernels/fill_copy.hpp"
 #include "../kernels/concat.hpp"
 
 #include <cstddef>
diff --git a/modules/dnn/src/cuda4dnn/primitives/padding.hpp b/modules/dnn/src/cuda4dnn/primitives/padding.hpp
index 97953786b4..cbf75131a1 100644
--- a/modules/dnn/src/cuda4dnn/primitives/padding.hpp
+++ b/modules/dnn/src/cuda4dnn/primitives/padding.hpp
@@ -10,7 +10,7 @@
 #include "../csl/stream.hpp"
 #include "../csl/tensor.hpp"
 
-#include "../kernels/fill.hpp"
+#include "../kernels/fill_copy.hpp"
 #include "../kernels/concat.hpp"
 #include "../kernels/padding.hpp"