Merge pull request #14827 from YashasSamaga:cuda4dnn-csl-low
CUDA backend for the DNN module

* stub cuda4dnn design
* minor fixes for tests and doxygen
* add csl public api directory to module headers
* add low-level CSL components
* add high-level CSL components
* integrate csl::Tensor into backbone code
* switch to CPU iff unsupported; otherwise, fail on error
* add fully connected layer
* add softmax layer
* add activation layers
* support arbitrary rank TensorDescriptor
* pass input wrappers to `initCUDA()`
* add 1d/2d/3d-convolution
* add pooling layer
* reorganize and refactor code
* fixes for gcc, clang and doxygen; remove cxx14/17 code
* add blank_layer
* add LRN layer
* add rounding modes for pooling layer
* split tensor.hpp into tensor.hpp and tensor_ops.hpp
* add concat layer
* add scale layer
* add batch normalization layer
* split math.cu into activations.cu and math.hpp
* add eltwise layer
* add flatten layer
* add tensor transform api
* add asymmetric padding support for convolution layer
* add reshape layer
* fix rebase issues
* add permute layer
* add padding support for concat layer
* refactor and reorganize code
* add normalize layer
* optimize bias addition in scale layer
* add prior box layer
* fix and optimize normalize layer
* add asymmetric padding support for pooling layer
* add event API
* improve pooling performance for some padding scenarios
* avoid over-allocation of compute resources to kernels
* improve prior box performance
* enable layer fusion
* add const layer
* add resize layer
* add slice layer
* add padding layer
* add deconvolution layer
* fix channelwise ReLU initialization
* add vector traits
* add vectorized versions of relu, clipped_relu, power
* add vectorized concat kernels
* improve concat_with_offsets performance
* vectorize scale and bias kernels
* add support for multi-billion element tensors
* vectorize prior box kernels
* fix address alignment check
* improve bias addition performance of conv/deconv/fc layers
* restructure code for supporting multiple targets
* add DNN_TARGET_CUDA_FP64
* add DNN_TARGET_FP16
* improve vectorization
* add region layer
* improve tensor API, add dynamic ranks
  1. use ManagedPtr instead of a Tensor in backend wrapper
  2. add new methods to tensor classes
     - size_range: computes the combined size for a given axis range
     - tensor span/view can be constructed from a raw pointer and shape
  3. the tensor classes can change their rank at runtime (previously rank was fixed at compile-time)
  4. remove device code from tensor classes (as they are unused)
  5. enforce strict conditions on tensor class APIs to improve debugging ability
* fix parametric relu activation
* add squeeze/unsqueeze tensor API
* add reorg layer
* optimize permute and enable 2d permute
* enable 1d and 2d slice
* add split layer
* add shuffle channel layer
* allow tensors of different ranks in reshape primitive
* patch SliceOp to allow Crop Layer
* allow extra shape inputs in reshape layer
* use `std::move_backward` instead of `std::move` for insert in resizable_static_array
* improve workspace management
* add spatial LRN
* add nms (cpu) to region layer
* add max pooling with argmax (and a fix to limits.hpp)
* add max unpooling layer
* rename DNN_TARGET_CUDA_FP32 to DNN_TARGET_CUDA
* update supportBackend to be more rigorous
* remove stray include from preventing non-cuda build
* include op_cuda.hpp outside condition #if
* refactoring, fixes and many optimizations
* drop DNN_TARGET_CUDA_FP64
* fix gcc errors
* increase max. tensor rank limit to six
* add Interp layer
* drop custom layers; use BackendNode
* vectorize activation kernels
* fixes for gcc
* remove wrong assertion
* fix broken assertion in unpooling primitive
* fix build errors in non-CUDA build
* completely remove workspace from public API
* fix permute layer
* enable accuracy and perf. tests for DNN_TARGET_CUDA
* add asynchronous forward
* vectorize eltwise ops
* vectorize fill kernel
* fixes for gcc
* remove CSL headers from public API
* remove csl header source group from cmake
* update min. cudnn version in cmake
* add numerically stable FP32 log1pexp
* refactor code
* add FP16 specialization to cudnn based tensor addition
* vectorize scale1 and bias1 + minor refactoring
* fix doxygen build
* fix invalid alignment assertion
* clear backend wrappers before allocateLayers
* ignore memory lock failures
* do not allocate internal blobs
* integrate NVTX
* add numerically stable half precision log1pexp
* fix indentation, following coding style, improve docs
* remove accidental modification of IE code
* Revert "add asynchronous forward"
  This reverts commit 1154b9da9da07e9b52f8a81bdcea48cf31c56f70.
* [cmake] throw error for unsupported CC versions
* fix rebase issues
* add more docs, refactor code, fix bugs
* minor refactoring and fixes
* resolve warnings/errors from clang
* remove haveCUDA() checks from supportBackend()
* remove NVTX integration
* changes based on review comments
* avoid exception when no CUDA device is present
* add color code for CUDA in Net::dump
parent 8ec6544624 · commit 613c12e590
122 changed files with 13024 additions and 99 deletions
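For context before the new files: from application code, the backend introduced by this PR is selected through the existing Net backend/target API. The snippet below is an illustrative sketch rather than part of the diff; the model path, image path and blob parameters are placeholders, and it assumes OpenCV was built with CUDA/cuDNN support for the DNN module enabled.

#include <opencv2/dnn.hpp>
#include <opencv2/imgcodecs.hpp>

int main()
{
    // Placeholder model and input; any network readable by cv::dnn works the same way.
    cv::dnn::Net net = cv::dnn::readNet("model.onnx");

    // Route inference through the new CUDA backend.
    net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
    net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);      // or DNN_TARGET_CUDA_FP16 on GPUs with fast FP16

    cv::Mat image = cv::imread("input.jpg");
    cv::Mat blob = cv::dnn::blobFromImage(image, 1.0, cv::Size(224, 224));
    net.setInput(blob);
    cv::Mat out = net.forward();
    return 0;
}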
@@ -0,0 +1,432 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "math.hpp"
#include "types.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"

#include "../cuda4dnn/kernels/scale_shift.hpp"

#include <opencv2/core.hpp>

#include <cstddef>

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    namespace raw {
        template <class T, std::size_t N>
        __global__ void abs_vec(Span<T> output, View<T> input) {
            using vector_type = get_vector_type_t<T, N>;

            auto output_vPtr = vector_type::get_pointer(output.data());
            auto input_vPtr = vector_type::get_pointer(input.data());

            for (auto i : grid_stride_range(output.size() / vector_type::size())) {
                vector_type vec;
                v_load(vec, input_vPtr[i]);
                for (int j = 0; j < vector_type::size(); j++) {
                    using device::abs;
                    vec.data[j] = abs(vec.data[j]);
                }
                v_store(output_vPtr[i], vec);
            }
        }

        template <class T, std::size_t N>
        __global__ void tanh_vec(Span<T> output, View<T> input) {
            using vector_type = get_vector_type_t<T, N>;

            auto output_vPtr = vector_type::get_pointer(output.data());
            auto input_vPtr = vector_type::get_pointer(input.data());

            for (auto i : grid_stride_range(output.size() / vector_type::size())) {
                vector_type vec;
                v_load(vec, input_vPtr[i]);
                for (int j = 0; j < vector_type::size(); j++) {
                    using device::tanh;
                    vec.data[j] = tanh(vec.data[j]);
                }
                v_store(output_vPtr[i], vec);
            }
        }

        template <class T, std::size_t N>
        __global__ void sigmoid_vec(Span<T> output, View<T> input) {
            using vector_type = get_vector_type_t<T, N>;

            auto output_vPtr = vector_type::get_pointer(output.data());
            auto input_vPtr = vector_type::get_pointer(input.data());

            for (auto i : grid_stride_range(output.size() / vector_type::size())) {
                vector_type vec;
                v_load(vec, input_vPtr[i]);
                for (int j = 0; j < vector_type::size(); j++) {
                    using device::sigmoid;
                    vec.data[j] = sigmoid(vec.data[j]);
                }
                v_store(output_vPtr[i], vec);
            }
        }

        template <class T, std::size_t N>
        __global__ void bnll_vec(Span<T> output, View<T> input) {
            using vector_type = get_vector_type_t<T, N>;

            auto output_vPtr = vector_type::get_pointer(output.data());
            auto input_vPtr = vector_type::get_pointer(input.data());

            for (auto i : grid_stride_range(output.size() / vector_type::size())) {
                vector_type vec;
                v_load(vec, input_vPtr[i]);
                for (int j = 0; j < vector_type::size(); j++) {
                    using device::log1pexp;
                    vec.data[j] = vec.data[j] > T(0) ? vec.data[j] + log1pexp(-vec.data[j]) : log1pexp(vec.data[j]);
                }
                v_store(output_vPtr[i], vec);
            }
        }

        template <class T, std::size_t N>
        __global__ void elu_vec(Span<T> output, View<T> input) {
            using vector_type = get_vector_type_t<T, N>;

            auto output_vPtr = vector_type::get_pointer(output.data());
            auto input_vPtr = vector_type::get_pointer(input.data());

            for (auto i : grid_stride_range(output.size() / vector_type::size())) {
                vector_type vec;
                v_load(vec, input_vPtr[i]);
                for (int j = 0; j < vector_type::size(); j++) {
                    using device::expm1;
                    vec.data[j] = vec.data[j] >= T(0) ? vec.data[j] : expm1(vec.data[j]);
                }
                v_store(output_vPtr[i], vec);
            }
        }

        template <class T, std::size_t N>
        __global__ void relu_vec(Span<T> output, View<T> input, T slope) {
            using vector_type = get_vector_type_t<T, N>;

            auto output_vPtr = vector_type::get_pointer(output.data());
            auto input_vPtr = vector_type::get_pointer(input.data());

            for (auto i : grid_stride_range(output.size() / vector_type::size())) {
                vector_type vec;
                v_load(vec, input_vPtr[i]);
                for (int j = 0; j < vector_type::size(); j++)
                    vec.data[j] = vec.data[j] >= T(0) ? vec.data[j] : slope * vec.data[j];
                v_store(output_vPtr[i], vec);
            }
        }

        template <class T, std::size_t N>
        __global__ void clipped_relu_vec(Span<T> output, View<T> input, T floor, T ceiling) {
            using vector_type = get_vector_type_t<T, N>;

            auto output_vPtr = vector_type::get_pointer(output.data());
            auto input_vPtr = vector_type::get_pointer(input.data());

            for (auto i : grid_stride_range(output.size() / vector_type::size())) {
                using device::clamp;

                vector_type vec;
                v_load(vec, input_vPtr[i]);
                for (int j = 0; j < vector_type::size(); j++)
                    vec.data[j] = clamp(vec.data[j], floor, ceiling);
                v_store(output_vPtr[i], vec);
            }
        }

        template <class T, std::size_t N>
        __global__ void axiswise_relu_vec(Span<T> output, View<T> input, size_type inner_size, View<T> slope) {
            using vector_type = get_vector_type_t<T, N>;

            auto output_vPtr = vector_type::get_pointer(output.data());
            auto input_vPtr = vector_type::get_pointer(input.data());

            inner_size /= vector_type::size();
            for (auto i : grid_stride_range(output.size() / vector_type::size())) {
                const index_type c = (i / inner_size) % static_cast<size_type>(slope.size());

                vector_type vec;
                v_load(vec, input_vPtr[i]);
                for (int j = 0; j < vector_type::size(); j++)
                    vec.data[j] = vec.data[j] > T(0) ? vec.data[j] : vec.data[j] * slope[c];
                v_store(output_vPtr[i], vec);
            }
        }

        template <class T, std::size_t N>
        __global__ void power_vec(Span<T> output, View<T> input, T exp, T scale, T shift) {
            using vector_type = get_vector_type_t<T, N>;

            auto output_vPtr = vector_type::get_pointer(output.data());
            auto input_vPtr = vector_type::get_pointer(input.data());

            for (auto i : grid_stride_range(output.size() / vector_type::size())) {
                using device::pow;

                vector_type vec;
                v_load(vec, input_vPtr[i]);
                for (int j = 0; j < vector_type::size(); j++)
                    vec.data[j] = pow(shift + scale * vec.data[j], exp);
                v_store(output_vPtr[i], vec);
            }
        }
    }

    template <class T, std::size_t N>
    void launch_vectorized_abs(const Stream& stream, Span<T> output, View<T> input) {
        CV_Assert(is_fully_aligned<T>(output, N));
        CV_Assert(is_fully_aligned<T>(input, N));

        auto kernel = raw::abs_vec<T, N>;
        auto policy = make_policy(kernel, output.size() / N, 0, stream);
        launch_kernel(kernel, policy, output, input);
    }

    template <class T>
    void abs(const Stream& stream, Span<T> output, View<T> input) {
        CV_Assert(input.size() == output.size());

        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
            launch_vectorized_abs<T, 4>(stream, output, input);
        } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
            launch_vectorized_abs<T, 2>(stream, output, input);
        } else {
            launch_vectorized_abs<T, 1>(stream, output, input);
        }
    }

    template void abs<__half>(const Stream& stream, Span<__half> output, View<__half> input);
    template void abs<float>(const Stream& stream, Span<float> output, View<float> input);

    template <class T, std::size_t N>
    void launch_vectorized_tanh(const Stream& stream, Span<T> output, View<T> input) {
        CV_Assert(is_fully_aligned<T>(output, N));
        CV_Assert(is_fully_aligned<T>(input, N));

        auto kernel = raw::tanh_vec<T, N>;
        auto policy = make_policy(kernel, output.size() / N, 0, stream);
        launch_kernel(kernel, policy, output, input);
    }

    template <class T>
    void tanh(const Stream& stream, Span<T> output, View<T> input) {
        CV_Assert(input.size() == output.size());

        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
            launch_vectorized_tanh<T, 4>(stream, output, input);
        } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
            launch_vectorized_tanh<T, 2>(stream, output, input);
        } else {
            launch_vectorized_tanh<T, 1>(stream, output, input);
        }
    }

    template void tanh<__half>(const Stream&, Span<__half>, View<__half>);
    template void tanh<float>(const Stream&, Span<float>, View<float>);

    template <class T, std::size_t N>
    void launch_vectorized_sigmoid(const Stream& stream, Span<T> output, View<T> input) {
        CV_Assert(is_fully_aligned<T>(output, N));
        CV_Assert(is_fully_aligned<T>(input, N));

        auto kernel = raw::sigmoid_vec<T, N>;
        auto policy = make_policy(kernel, output.size() / N, 0, stream);
        launch_kernel(kernel, policy, output, input);
    }

    template <class T>
    void sigmoid(const Stream& stream, Span<T> output, View<T> input) {
        CV_Assert(input.size() == output.size());

        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
            launch_vectorized_sigmoid<T, 4>(stream, output, input);
        } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
            launch_vectorized_sigmoid<T, 2>(stream, output, input);
        } else {
            launch_vectorized_sigmoid<T, 1>(stream, output, input);
        }
    }

    template void sigmoid<__half>(const Stream&, Span<__half>, View<__half>);
    template void sigmoid<float>(const Stream&, Span<float>, View<float>);

    template <class T, std::size_t N>
    void launch_vectorized_bnll(const Stream& stream, Span<T> output, View<T> input) {
        CV_Assert(is_fully_aligned<T>(output, N));
        CV_Assert(is_fully_aligned<T>(input, N));

        auto kernel = raw::bnll_vec<T, N>;
        auto policy = make_policy(kernel, output.size() / N, 0, stream);
        launch_kernel(kernel, policy, output, input);
    }

    template <class T>
    void bnll(const Stream& stream, Span<T> output, View<T> input) {
        CV_Assert(input.size() == output.size());

        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
            launch_vectorized_bnll<T, 4>(stream, output, input);
        } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
            launch_vectorized_bnll<T, 2>(stream, output, input);
        } else {
            launch_vectorized_bnll<T, 1>(stream, output, input);
        }
    }

    template void bnll<__half>(const Stream&, Span<__half>, View<__half>);
    template void bnll<float>(const Stream&, Span<float>, View<float>);

    template <class T, std::size_t N>
    void launch_vectorized_elu(const Stream& stream, Span<T> output, View<T> input) {
        CV_Assert(is_fully_aligned<T>(output, N));
        CV_Assert(is_fully_aligned<T>(input, N));

        auto kernel = raw::elu_vec<T, N>;
        auto policy = make_policy(kernel, output.size() / N, 0, stream);
        launch_kernel(kernel, policy, output, input);
    }

    template <class T>
    void elu(const Stream& stream, Span<T> output, View<T> input) {
        CV_Assert(input.size() == output.size());

        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
            launch_vectorized_elu<T, 4>(stream, output, input);
        } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
            launch_vectorized_elu<T, 2>(stream, output, input);
        } else {
            launch_vectorized_elu<T, 1>(stream, output, input);
        }
    }

    template void elu<__half>(const Stream&, Span<__half>, View<__half>);
    template void elu<float>(const Stream&, Span<float>, View<float>);

    template <class T, std::size_t N>
    void launch_vectorized_relu(const Stream& stream, Span<T> output, View<T> input, T slope) {
        CV_Assert(is_fully_aligned<T>(output, N));
        CV_Assert(is_fully_aligned<T>(input, N));

        auto kernel = raw::relu_vec<T, N>;
        auto policy = make_policy(kernel, output.size() / N, 0, stream);
        launch_kernel(kernel, policy, output, input, slope);
    }

    template <class T>
    void relu(const Stream& stream, Span<T> output, View<T> input, T slope) {
        CV_Assert(input.size() == output.size());

        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
            launch_vectorized_relu<T, 4>(stream, output, input, slope);
        } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
            launch_vectorized_relu<T, 2>(stream, output, input, slope);
        } else {
            launch_vectorized_relu<T, 1>(stream, output, input, slope);
        }
    }

    template void relu<__half>(const Stream&, Span<__half>, View<__half>, __half);
    template void relu<float>(const Stream&, Span<float>, View<float>, float);

    template <class T, std::size_t N>
    void launch_vectorized_clipped_relu(const Stream& stream, Span<T> output, View<T> input, T floor, T ceiling) {
        CV_Assert(is_fully_aligned<T>(output, N));
        CV_Assert(is_fully_aligned<T>(input, N));

        auto kernel = raw::clipped_relu_vec<T, N>;
        auto policy = make_policy(kernel, output.size() / N, 0, stream);
        launch_kernel(kernel, policy, output, input, floor, ceiling);
    }

    template <class T>
    void clipped_relu(const Stream& stream, Span<T> output, View<T> input, T floor, T ceiling) {
        CV_Assert(input.size() == output.size());
        CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));

        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
            launch_vectorized_clipped_relu<T, 4>(stream, output, input, floor, ceiling);
        } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
            launch_vectorized_clipped_relu<T, 2>(stream, output, input, floor, ceiling);
        } else {
            launch_vectorized_clipped_relu<T, 1>(stream, output, input, floor, ceiling);
        }
    }

    template void clipped_relu<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
    template void clipped_relu<float>(const Stream&, Span<float>, View<float>, float, float);

    template <class T, std::size_t N>
    void launch_vectorized_axiswise_relu(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> slope) {
        CV_Assert(is_fully_aligned<T>(output, N));
        CV_Assert(is_fully_aligned<T>(input, N));
        CV_Assert(inner_size % N == 0);

        auto kernel = raw::axiswise_relu_vec<T, N>;
        auto policy = make_policy(kernel, output.size() / N, 0, stream);
        launch_kernel(kernel, policy, output, input, inner_size, slope);
    }

    template <class T>
    void axiswise_relu(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> slope) {
        CV_Assert(input.size() == output.size());

        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
            launch_vectorized_axiswise_relu<T, 4>(stream, output, input, inner_size, slope);
        } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
            launch_vectorized_axiswise_relu<T, 2>(stream, output, input, inner_size, slope);
        } else {
            launch_vectorized_axiswise_relu<T, 1>(stream, output, input, inner_size, slope);
        }
    }

    template void axiswise_relu<__half>(const Stream&, Span<__half>, View<__half>, std::size_t, View<__half>);
    template void axiswise_relu<float>(const Stream&, Span<float>, View<float>, std::size_t, View<float>);

    template <class T, std::size_t N>
    void launch_vectorized_power(const Stream& stream, Span<T> output, View<T> input, T exp, T scale, T shift) {
        CV_Assert(is_fully_aligned<T>(output, N));
        CV_Assert(is_fully_aligned<T>(input, N));

        auto kernel = raw::power_vec<T, N>;
        auto policy = make_policy(kernel, output.size() / N, 0, stream);
        launch_kernel(kernel, policy, output, input, exp, scale, shift);
    }

    template <class T>
    void power(const Stream& stream, Span<T> output, View<T> input, T exp, T scale, T shift) {
        CV_Assert(input.size() == output.size());

        if (static_cast<float>(exp) == 1.0f) {
            scale1_with_bias1(stream, output, input, scale, shift);
            return;
        }

        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && output.size()) {
            launch_vectorized_power<T, 4>(stream, output, input, exp, scale, shift);
        } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && output.size()) {
            launch_vectorized_power<T, 2>(stream, output, input, exp, scale, shift);
        } else {
            launch_vectorized_power<T, 1>(stream, output, input, exp, scale, shift);
        }
    }

    template void power<__half>(const Stream&, Span<__half>, View<__half>, __half, __half, __half);
    template void power<float>(const Stream&, Span<float>, View<float>, float, float, float);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */
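Every activation in the file above follows one pattern: an elementwise kernel over vectorized loads/stores, a launcher that asserts alignment, and a host function that picks the widest usable vector width. As a sketch of how a further activation could be added by the same recipe (hypothetical, not part of this PR; it assumes the same includes and namespaces as the file above and reuses device::sigmoid from math.hpp):

    namespace raw {
        template <class T, std::size_t N>
        __global__ void swish_vec(Span<T> output, View<T> input) {
            using vector_type = get_vector_type_t<T, N>;

            auto output_vPtr = vector_type::get_pointer(output.data());
            auto input_vPtr = vector_type::get_pointer(input.data());

            for (auto i : grid_stride_range(output.size() / vector_type::size())) {
                vector_type vec;
                v_load(vec, input_vPtr[i]);
                for (int j = 0; j < vector_type::size(); j++) {
                    using device::sigmoid;
                    vec.data[j] = vec.data[j] * sigmoid(vec.data[j]); /* swish(x) = x * sigmoid(x) */
                }
                v_store(output_vPtr[i], vec);
            }
        }
    }

    template <class T, std::size_t N>
    void launch_vectorized_swish(const Stream& stream, Span<T> output, View<T> input) {
        CV_Assert(is_fully_aligned<T>(output, N));
        CV_Assert(is_fully_aligned<T>(input, N));

        auto kernel = raw::swish_vec<T, N>;
        auto policy = make_policy(kernel, output.size() / N, 0, stream);
        launch_kernel(kernel, policy, output, input);
    }

    template <class T>
    void swish(const Stream& stream, Span<T> output, View<T> input) {
        CV_Assert(input.size() == output.size());

        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
            launch_vectorized_swish<T, 4>(stream, output, input);
        } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
            launch_vectorized_swish<T, 2>(stream, output, input);
        } else {
            launch_vectorized_swish<T, 1>(stream, output, input);
        }
    }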
@@ -0,0 +1,73 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA_ARRAY_HPP
#define OPENCV_DNN_SRC_CUDA_ARRAY_HPP

#include <cuda_runtime.h>

#include "types.hpp"

#include <cstddef>
#include <type_traits>
#include <iterator>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {

    template <class T, std::size_t N>
    struct array {
        using value_type = T;
        using size_type = device::size_type;
        using difference_type = std::ptrdiff_t;
        using reference = typename std::add_lvalue_reference<value_type>::type;
        using const_reference = typename std::add_lvalue_reference<typename std::add_const<value_type>::type>::type;
        using pointer = typename std::add_pointer<value_type>::type;
        using const_pointer = typename std::add_pointer<typename std::add_const<value_type>::type>::type;
        using iterator = pointer;
        using const_iterator = const_pointer;
        using reverse_iterator = std::reverse_iterator<iterator>;
        using const_reverse_iterator = std::reverse_iterator<const_iterator>;

        __host__ __device__ bool empty() const noexcept { return N == 0; }
        __host__ __device__ size_type size() const noexcept { return N; }

        __host__ __device__ iterator begin() noexcept { return ptr; }
        __host__ __device__ iterator end() noexcept { return ptr + N; }
        __host__ __device__ const_iterator begin() const noexcept { return ptr; }
        __host__ __device__ const_iterator end() const noexcept { return ptr + N; }

        __host__ __device__ const_iterator cbegin() const noexcept { return ptr; }
        __host__ __device__ const_iterator cend() const noexcept { return ptr + N; }

        __host__ __device__ reverse_iterator rbegin() noexcept { return ptr + N; }
        __host__ __device__ reverse_iterator rend() noexcept { return ptr; }
        __host__ __device__ const_reverse_iterator rbegin() const noexcept { return ptr + N; }
        __host__ __device__ const_reverse_iterator rend() const noexcept { return ptr; }

        __host__ __device__ const_reverse_iterator crbegin() const noexcept { return ptr + N; }
        __host__ __device__ const_reverse_iterator crend() const noexcept { return ptr; }

        template <class InputItr>
        __host__ void assign(InputItr first, InputItr last) {
            std::copy(first, last, std::begin(ptr));
        }

        __host__ __device__ reference operator[](int idx) { return ptr[idx]; }
        __host__ __device__ const_reference operator[](int idx) const { return ptr[idx]; }

        __host__ __device__ reference front() { return ptr[0]; }
        __host__ __device__ const_reference front() const { return ptr[0]; }

        __host__ __device__ reference back() { return ptr[N - 1]; }
        __host__ __device__ const_reference back() const { return ptr[N - 1]; }

        __host__ __device__ pointer data() noexcept { return ptr; }
        __host__ __device__ const_pointer data() const noexcept { return ptr; }

        T ptr[N];
    };

}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */

#endif /* OPENCV_DNN_SRC_CUDA_ARRAY_HPP */
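This container exists because kernel arguments must be trivially copyable and usable from device code: the raw kernels in this PR (for example concat_with_offsets in concat.cu, shown further below) receive shapes, strides and offsets by value as device::array. A minimal host-side usage sketch, hypothetical and assuming only what this header declares:

    // Illustration (not part of this diff): packing runtime strides into a fixed-rank
    // device::array so the values can be passed by value to a __global__ kernel.
    #include <algorithm>
    #include <cstddef>
    #include <vector>
    #include "array.hpp"

    using cv::dnn::cuda4dnn::csl::device::array;

    void pack_strides_example()
    {
        std::vector<std::size_t> strides{ 20, 5, 1 };   // rank known only at runtime

        array<std::size_t, 3> strides_k;                // rank fixed at compile time
        strides_k.assign(std::begin(strides), std::end(strides));

        // strides_k is trivially copyable and has no host-only members, so it can
        // appear directly in a kernel parameter list and be indexed on the device,
        // e.g. some_kernel<<<grid, block>>>(strides_k).
    }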
@@ -0,0 +1,32 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA_ATOMICS_HPP
#define OPENCV_DNN_SRC_CUDA_ATOMICS_HPP

#include <cuda_runtime.h>
#include <cuda_fp16.h>

/* devices of compute capability 7.0 and above provide a native half-precision atomicAdd,
 * so nothing needs to be defined for them; older architectures emulate it below with a
 * 32-bit compare-and-swap on the word that contains the target __half
 */
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
#else
inline __device__ void atomicAdd(__half* address, __half val) {
    /* align down to the enclosing 32-bit word */
    unsigned int* address_as_ui = (unsigned int *)((char *)address - ((size_t)address & 2));
    unsigned int old = *address_as_ui;
    unsigned int assumed;

    do {
        assumed = old;

        /* extract the half we are adding to: upper or lower 16 bits of the word */
        __half_raw hsum;
        hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
        __half tmpres = hsum + val;
        hsum = __half_raw(tmpres);

        /* splice the updated half back into the word and try to publish it */
        old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
        old = atomicCAS(address_as_ui, assumed, old);
    } while (assumed != old);
}
#endif

#endif /* OPENCV_DNN_SRC_CUDA_ATOMICS_HPP */
@@ -0,0 +1,259 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "array.hpp"
#include "types.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "kernel_dispatcher.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include "../cuda4dnn/csl/span.hpp"

#include <algorithm>
#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    namespace raw {
        template <class T, std::size_t N>
        __global__ void concat_vec(
            Span<T> output, size_type output_axis_size, index_type output_axis_offset,
            View<T> input, size_type input_axis_size, size_type concat_size)
        {
            using vector_type = get_vector_type_t<T, N>;

            auto output_vPtr = vector_type::get_pointer(output.data());
            auto input_vPtr = vector_type::get_pointer(input.data());

            /* we need to copy all the elements of input to some location in the output
             * we copy blocks of size `total_concat_size` to some location in the output
             */
            const auto total_concat_size = concat_size * input_axis_size;

            for (auto in_idx : grid_stride_range(input.size() / vector_type::size())) {
                const index_type idx = in_idx * vector_type::size();
                const index_type concat_num = idx / total_concat_size;
                const index_type concat_index = idx % total_concat_size;
                const index_type top_index = concat_index +
                    (concat_num * output_axis_size + output_axis_offset) * concat_size;

                const auto out_idx = top_index / vector_type::size();

                vector_type vec;
                v_load(vec, input_vPtr[in_idx]);
                v_store(output_vPtr[out_idx], vec);
            }
        }

        template <class T, std::size_t Rank>
        __global__ void concat_with_offsets(
            Span<T> output, array<size_type, Rank> out_strides, array<index_type, Rank> out_offset,
            View<T> input, array<size_type, Rank> in_strides)
        {
            for (auto i : grid_stride_range(input.size())) {
                index_type in_index = i / in_strides[0];
                index_type out_index = out_offset[0] + in_index;
                index_type oidx = out_index * out_strides[0];
                for (int j = 1; j < Rank; j++) {
                    in_index = (i % in_strides[j - 1]) / in_strides[j];
                    out_index = out_offset[j] + in_index;
                    oidx += out_index * out_strides[j];
                }

                output[oidx] = input[i];
            }
        }
    }

    template <class T, std::size_t N> static
    void launch_vectorized_concat(const Stream& stream,
        Span<T> output, size_type output_axis_size, index_type output_axis_offset,
        View<T> input, size_type input_axis_size, size_type concat_size)
    {
        CV_Assert(is_fully_aligned<T>(output, N));
        CV_Assert(is_fully_aligned<T>(input, N));
        /* more assertions are required to fully check for vectorization possibility; check concat() */

        auto kernel = raw::concat_vec<T, N>;
        auto policy = make_policy(kernel, input.size() / N, 0, stream);
        launch_kernel(kernel, policy, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
    }

    template <class T>
    void concat(
        const Stream& stream,
        TensorSpan<T> output, std::size_t output_axis_offset,
        TensorView<T> input, std::size_t axis)
    {
        /* let's call the axis of interest the channel axis for the purpose of the following discussion,
         * even though it can be any axis
         *
         * for each batch item:
         * we move all the channels of the input (which together, for a single batch item, are contiguous)
         * to their corresponding contiguous place in the output
         *
         * for a valid vector operation:
         * - the size of each copy block must be aligned
         * - input must be aligned
         * - all the destination locations in the output must be aligned
         */
        std::size_t concat_size = output.size_range(axis + 1, output.rank());

        std::size_t input_axis_size = input.get_axis_size(axis);
        std::size_t output_axis_size = output.get_axis_size(axis);

        std::size_t copy_block_size = concat_size * input_axis_size;
        std::size_t copy_block_stride = concat_size * output_axis_size;
        std::size_t starting_offset = output_axis_offset * concat_size;

        /* in a nutshell, all this concat operation does is copy several blocks of size `copy_block_size`
         * to the output starting from `starting_offset` with blocks in the output strided by `copy_block_stride`
         */

        bool is_aligned_4 = copy_block_size % 4 == 0 && copy_block_stride % 4 == 0 && starting_offset % 4 == 0;
        bool is_aligned_2 = copy_block_size % 2 == 0 && copy_block_stride % 2 == 0 && starting_offset % 2 == 0;

        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && is_aligned_4) {
            launch_vectorized_concat<T, 4>(stream, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
        } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && is_aligned_2) {
            launch_vectorized_concat<T, 2>(stream, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
        } else {
            launch_vectorized_concat<T, 1>(stream, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
        }
    }

    template void concat<__half>(const Stream&, TensorSpan<__half>, std::size_t, TensorView<__half>, std::size_t);
    template void concat<float>(const Stream&, TensorSpan<float>, std::size_t, TensorView<float>, std::size_t);

    template <class T, std::size_t Rank> static
    void launch_concat_with_offsets(
        const Stream& stream,
        Span<T> output, const std::vector<std::size_t>& outStride, const std::vector<std::size_t>& outOffset,
        View<T> input, const std::vector<std::size_t>& inStride)
    {
        CV_Assert(outStride.size() == Rank);
        CV_Assert(outOffset.size() == Rank);
        CV_Assert(inStride.size() == Rank);

        array<size_type, Rank> outStride_k, inStride_k;
        outStride_k.assign(std::begin(outStride), std::end(outStride));
        inStride_k.assign(std::begin(inStride), std::end(inStride));

        array<index_type, Rank> outOffset_k;
        outOffset_k.assign(std::begin(outOffset), std::end(outOffset));

        auto kernel = raw::concat_with_offsets<T, Rank>;
        auto policy = make_policy(kernel, input.size(), 0, stream);
        launch_kernel(kernel, policy, output, outStride_k, outOffset_k, input, inStride_k);
    }

    GENERATE_KERNEL_DISPATCHER(concat_with_offsets_dispatcher, launch_concat_with_offsets);

    template <class T>
    void concat_with_offsets(
        const Stream& stream,
        TensorSpan<T> output, TensorView<T> input,
        std::vector<std::size_t> offsets)
    {
        CV_Assert(output.rank() == input.rank());
        CV_Assert(output.rank() == offsets.size());

        /* squeezable axes at the beginning of both tensors can be eliminated
         *
         * Reasoning:
         * ----------
         * Suppose an item's indices in the input tensor are [i1, i2, ...]. The indices in the output
         * tensor will be [i1 + off1, i2 + off2, ...]. The concat operation essentially copies items
         * from the input tensor to new locations in the output tensor.
         *
         * If the size of the first axis of the input and output tensor is unity, the input and output
         * indices for all the elements will be of the form [0, i2, ...] and [0, i2 + off2, ...]
         * respectively. The first index does not contribute to the element's address calculation and
         * hence does nothing apart from eating up a few cycles.
         */
        while (input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) {
            CV_Assert(offsets[0] == 0);

            input.squeeze(0);
            output.squeeze(0);
            offsets.erase(std::begin(offsets));

            CV_Assert(output.rank() == input.rank());
            CV_Assert(output.rank() == offsets.size());
        }

        auto inShape = input.shape_as_vector();
        auto outShape = output.shape_as_vector();

        /* contiguous axes that undergo full copy can be combined into one axis
         *
         * Reasoning:
         * ----------
         * Suppose an item's indices in the input tensor are [i1, i2, i3, ...]. Let the first two axes not undergo any
         * concatenation. The indices in the output tensor will be [i1, i2, i3 + off3, ...].
         *
         * Each axis in the contiguous axes sequence will add an offset of iN * strideN. In the above example,
         * the two axes add a total offset of `i1 * stride1 + i2 * stride2`. We can merge the two axes into one axis with
         * a size of `size1 * size2`. The new offset added will be `i12 * stride2` as the kernel iterates through `i12`.
         * Note that `i12` is actually `(i1 * size2 + i2)` in the original tensor.
         */
        for (int i = 0; i < inShape.size(); i++) {
            /* check if axis `i` requires any slicing */
            if (offsets[i] == 0 && inShape[i] == outShape[i]) {
                /* loop invariant: `i` is the first axis in the contiguous unsliced axis sequence */

                int j = i + 1; /* `j` is the axis which we will attempt to merge */
                while (j < inShape.size() && offsets[j] == 0 && inShape[j] == outShape[j]) {
                    /* `j` axis is also copied fully; merge `i` and `j` */
                    auto new_size = inShape[i] * inShape[j];
                    inShape[i] = new_size;
                    outShape[i] = new_size;
                    offsets[i] = 0; /* redundant */

                    /* delete axis `j` */
                    inShape.erase(std::begin(inShape) + j);
                    outShape.erase(std::begin(outShape) + j);
                    offsets.erase(std::begin(offsets) + j);

                    /* optimizations should not break the invariants */
                    CV_Assert(inShape.size() == outShape.size());
                    CV_Assert(inShape.size() == offsets.size());
                    CV_Assert(inShape[i] == outShape[i]);
                    CV_Assert(offsets[i] == 0);
                }
            }
        }

        auto rank = inShape.size();

        std::vector<std::size_t> inStride(rank), outStride(rank);
        inStride.back() = 1;
        outStride.back() = 1;
        /* garbage, ..., garbage, 1 */

        std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
        std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
        /* dim[1], dim[2], ..., dim[-1], 1 */

        std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<int>());
        std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<int>());
        /* stride[0], stride[1], ..., stride[-2], 1 */

        CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK);
        concat_with_offsets_dispatcher<T, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, offsets, input, inStride);
    }

    template void concat_with_offsets(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
    template void concat_with_offsets(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */
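The stride computation at the end of concat_with_offsets (shifted copy of the shape, then a reverse partial product) is a compact way to derive row-major strides. The standalone sketch below walks the same idiom on a concrete shape, independent of the CSL tensor types:

    // Standalone illustration of the stride computation used above:
    // for shape {2, 3, 4, 5} the row-major strides are {60, 20, 5, 1}.
    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <functional>
    #include <numeric>
    #include <vector>

    int main()
    {
        std::vector<std::size_t> shape{ 2, 3, 4, 5 };

        std::vector<std::size_t> stride(shape.size());
        stride.back() = 1;                                        /* garbage, ..., garbage, 1 */

        std::copy(std::begin(shape) + 1, std::end(shape), std::begin(stride));
        /* stride is now { 3, 4, 5, 1 } */

        std::partial_sum(stride.rbegin(), stride.rend(), stride.rbegin(), std::multiplies<std::size_t>());
        /* running product from the right: stride is now { 60, 20, 5, 1 } */

        assert((stride == std::vector<std::size_t>{ 60, 20, 5, 1 }));
        return 0;
    }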
@@ -0,0 +1,224 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "math.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "vector_traits.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"

#include <opencv2/core.hpp>

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    namespace raw {
        template <class T, std::size_t N>
        __global__ void eltwise_max_2_vec(Span<T> output, View<T> x, View<T> y) {
            using vector_type = get_vector_type_t<T, N>;

            auto output_vPtr = vector_type::get_pointer(output.data());
            auto x_vPtr = vector_type::get_pointer(x.data());
            auto y_vPtr = vector_type::get_pointer(y.data());

            for (auto i : grid_stride_range(output.size() / vector_type::size())) {
                vector_type vec_x, vec_y;
                v_load(vec_x, x_vPtr[i]);
                v_load(vec_y, y_vPtr[i]);

                for (int j = 0; j < vector_type::size(); j++) {
                    using device::max;
                    vec_x.data[j] = max(vec_x.data[j], vec_y.data[j]);
                }

                v_store(output_vPtr[i], vec_x);
            }
        }

        template <class T, std::size_t N>
        __global__ void eltwise_sum_2_vec(Span<T> output, View<T> x, View<T> y) {
            using vector_type = get_vector_type_t<T, N>;

            auto output_vPtr = vector_type::get_pointer(output.data());
            auto x_vPtr = vector_type::get_pointer(x.data());
            auto y_vPtr = vector_type::get_pointer(y.data());

            for (auto i : grid_stride_range(output.size() / vector_type::size())) {
                vector_type vec_x, vec_y;
                v_load(vec_x, x_vPtr[i]);
                v_load(vec_y, y_vPtr[i]);

                for (int j = 0; j < vector_type::size(); j++)
                    vec_x.data[j] = vec_x.data[j] + vec_y.data[j];

                v_store(output_vPtr[i], vec_x);
            }
        }

        template <class T, std::size_t N>
        __global__ void eltwise_sum_coeff_2_vec(Span<T> output, T coeff_x, View<T> x, T coeff_y, View<T> y) {
            using vector_type = get_vector_type_t<T, N>;

            auto output_vPtr = vector_type::get_pointer(output.data());
            auto x_vPtr = vector_type::get_pointer(x.data());
            auto y_vPtr = vector_type::get_pointer(y.data());

            for (auto i : grid_stride_range(output.size() / vector_type::size())) {
                vector_type vec_x, vec_y;
                v_load(vec_x, x_vPtr[i]);
                v_load(vec_y, y_vPtr[i]);

                for (int j = 0; j < vector_type::size(); j++)
                    vec_x.data[j] = coeff_x * vec_x.data[j] + coeff_y * vec_y.data[j];

                v_store(output_vPtr[i], vec_x);
            }
        }

        template <class T, std::size_t N>
        __global__ void eltwise_prod_2_vec(Span<T> output, View<T> x, View<T> y) {
            using vector_type = get_vector_type_t<T, N>;

            auto output_vPtr = vector_type::get_pointer(output.data());
            auto x_vPtr = vector_type::get_pointer(x.data());
            auto y_vPtr = vector_type::get_pointer(y.data());

            for (auto i : grid_stride_range(output.size() / vector_type::size())) {
                vector_type vec_x, vec_y;
                v_load(vec_x, x_vPtr[i]);
                v_load(vec_y, y_vPtr[i]);

                for (int j = 0; j < vector_type::size(); j++)
                    vec_x.data[j] = vec_x.data[j] * vec_y.data[j];

                v_store(output_vPtr[i], vec_x);
            }
        }
    }

    template <class T, std::size_t N>
    void launch_vectorized_eltwise_max_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
        CV_Assert(is_fully_aligned<T>(output, N));
        CV_Assert(is_fully_aligned<T>(x, N));
        CV_Assert(is_fully_aligned<T>(y, N));

        auto kernel = raw::eltwise_max_2_vec<T, N>;
        auto policy = make_policy(kernel, output.size() / N, 0, stream);
        launch_kernel(kernel, policy, output, x, y);
    }

    template <class T>
    void eltwise_max_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
        CV_Assert(x.size() == y.size());
        CV_Assert(x.size() == output.size());

        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(x, 4) && is_fully_aligned<T>(y, 4)) {
            launch_vectorized_eltwise_max_2<T, 4>(stream, output, x, y);
        } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(x, 2) && is_fully_aligned<T>(y, 2)) {
            launch_vectorized_eltwise_max_2<T, 2>(stream, output, x, y);
        } else {
            launch_vectorized_eltwise_max_2<T, 1>(stream, output, x, y);
        }
    }

    template void eltwise_max_2(const Stream& stream, Span<__half> output, View<__half> x, View<__half> y);
    template void eltwise_max_2(const Stream& stream, Span<float> output, View<float> x, View<float> y);

    template <class T, std::size_t N>
    void launch_vectorized_eltwise_sum_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
        CV_Assert(is_fully_aligned<T>(output, N));
        CV_Assert(is_fully_aligned<T>(x, N));
        CV_Assert(is_fully_aligned<T>(y, N));

        auto kernel = raw::eltwise_sum_2_vec<T, N>;
        auto policy = make_policy(kernel, output.size() / N, 0, stream);
        launch_kernel(kernel, policy, output, x, y);
    }

    template <class T>
    void eltwise_sum_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
        CV_Assert(x.size() == y.size());
        CV_Assert(x.size() == output.size());

        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(x, 4) && is_fully_aligned<T>(y, 4)) {
            launch_vectorized_eltwise_sum_2<T, 4>(stream, output, x, y);
        } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(x, 2) && is_fully_aligned<T>(y, 2)) {
            launch_vectorized_eltwise_sum_2<T, 2>(stream, output, x, y);
        } else {
            launch_vectorized_eltwise_sum_2<T, 1>(stream, output, x, y);
        }
    }

    template void eltwise_sum_2(const Stream& stream, Span<__half> output, View<__half> x, View<__half> y);
    template void eltwise_sum_2(const Stream& stream, Span<float> output, View<float> x, View<float> y);

    template <class T, std::size_t N>
    void launch_vectorized_eltwise_sum_coeff_2(const Stream& stream, Span<T> output, T coeff_x, View<T> x, T coeff_y, View<T> y) {
        CV_Assert(is_fully_aligned<T>(output, N));
        CV_Assert(is_fully_aligned<T>(x, N));
        CV_Assert(is_fully_aligned<T>(y, N));

        auto kernel = raw::eltwise_sum_coeff_2_vec<T, N>;
        auto policy = make_policy(kernel, output.size() / N, 0, stream);
        launch_kernel(kernel, policy, output, coeff_x, x, coeff_y, y);
    }

    template <class T>
    void eltwise_sum_coeff_2(const Stream& stream, Span<T> output, T coeff_x, View<T> x, T coeff_y, View<T> y) {
        CV_Assert(x.size() == y.size());
        CV_Assert(x.size() == output.size());

        if (static_cast<float>(coeff_x) == 1.0f && static_cast<float>(coeff_y) == 1.0f) {
            eltwise_sum_2(stream, output, x, y);
            return;
        }

        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(x, 4) && is_fully_aligned<T>(y, 4)) {
            launch_vectorized_eltwise_sum_coeff_2<T, 4>(stream, output, coeff_x, x, coeff_y, y);
        } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(x, 2) && is_fully_aligned<T>(y, 2)) {
            launch_vectorized_eltwise_sum_coeff_2<T, 2>(stream, output, coeff_x, x, coeff_y, y);
        } else {
            launch_vectorized_eltwise_sum_coeff_2<T, 1>(stream, output, coeff_x, x, coeff_y, y);
        }
    }

    template void eltwise_sum_coeff_2(const Stream&, Span<__half>, __half, View<__half>, __half, View<__half>);
    template void eltwise_sum_coeff_2(const Stream&, Span<float>, float, View<float>, float, View<float>);

    template <class T, std::size_t N>
    void launch_vectorized_eltwise_prod_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
        CV_Assert(is_fully_aligned<T>(output, N));
        CV_Assert(is_fully_aligned<T>(x, N));
        CV_Assert(is_fully_aligned<T>(y, N));

        auto kernel = raw::eltwise_prod_2_vec<T, N>;
        auto policy = make_policy(kernel, output.size() / N, 0, stream);
        launch_kernel(kernel, policy, output, x, y);
    }

    template <class T>
    void eltwise_prod_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
        CV_Assert(x.size() == y.size());
        CV_Assert(x.size() == output.size());

        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(x, 4) && is_fully_aligned<T>(y, 4)) {
            launch_vectorized_eltwise_prod_2<T, 4>(stream, output, x, y);
        } else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(x, 2) && is_fully_aligned<T>(y, 2)) {
            launch_vectorized_eltwise_prod_2<T, 2>(stream, output, x, y);
        } else {
            launch_vectorized_eltwise_prod_2<T, 1>(stream, output, x, y);
        }
    }

    template void eltwise_prod_2(const Stream& stream, Span<__half> output, View<__half> x, View<__half> y);
    template void eltwise_prod_2(const Stream& stream, Span<float> output, View<float> x, View<float> y);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */
@@ -0,0 +1,81 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA_EXECUTION_HPP
#define OPENCV_DNN_SRC_CUDA_EXECUTION_HPP

#include "../cuda4dnn/csl/error.hpp"
#include "../cuda4dnn/csl/stream.hpp"

#include <opencv2/core.hpp>

#include <cuda_runtime_api.h>

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {

    struct execution_policy {
        execution_policy(dim3 grid_size, dim3 block_size)
            : grid{ grid_size }, block{ block_size }, sharedMem{ 0 }, stream{ 0 } { }

        execution_policy(dim3 grid_size, dim3 block_size, std::size_t shared_mem)
            : grid{ grid_size }, block{ block_size }, sharedMem{ shared_mem }, stream{ nullptr } { }

        execution_policy(dim3 grid_size, dim3 block_size, const Stream& strm)
            : grid{ grid_size }, block{ block_size }, sharedMem{ 0 }, stream{ strm.get() } { }

        execution_policy(dim3 grid_size, dim3 block_size, std::size_t shared_mem, const Stream& strm)
            : grid{ grid_size }, block{ block_size }, sharedMem{ shared_mem }, stream{ strm.get() } { }

        dim3 grid;
        dim3 block;
        std::size_t sharedMem;
        cudaStream_t stream;
    };

    /* this overload shouldn't be necessary; we should always provide a bound on the number of threads */
    /*
    template <class Kernel> inline
    execution_policy make_policy(Kernel kernel, std::size_t sharedMem = 0, const Stream& stream = 0) {
        int grid_size, block_size;
        CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, sharedMem));
        return execution_policy(grid_size, block_size, sharedMem, stream);
    }*/

    template <class Kernel> inline
    execution_policy make_policy(Kernel kernel, std::size_t max_threads, std::size_t sharedMem = 0, const Stream& stream = 0) {
        CV_Assert(max_threads > 0);

        int grid_size = 0, block_size = 0;
        CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, sharedMem));
        if (grid_size * block_size > max_threads) {
            grid_size = (max_threads + block_size - 1) / block_size;
            if (block_size > max_threads)
                block_size = max_threads;
        }

        CV_Assert(grid_size >= 1 && block_size >= 1);
        return execution_policy(grid_size, block_size, sharedMem, stream);
    }

    template <class Kernel, typename ...Args> inline
    void launch_kernel(Kernel kernel, Args ...args) {
        auto policy = make_policy(kernel);
        kernel <<<policy.grid, policy.block>>> (std::forward<Args>(args)...);
    }

    template <class Kernel, typename ...Args> inline
    void launch_kernel(Kernel kernel, dim3 grid, dim3 block, Args ...args) {
        kernel <<<grid, block>>> (std::forward<Args>(args)...);
    }

    template <class Kernel, typename ...Args> inline
    void launch_kernel(Kernel kernel, execution_policy policy, Args ...args) {
        kernel <<<policy.grid, policy.block, policy.sharedMem, policy.stream>>> (std::forward<Args>(args)...);
    }

}}}} /* namespace cv::dnn::cuda4dnn::csl */

#endif /* OPENCV_DNN_SRC_CUDA_EXECUTION_HPP */
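The max_threads bound in make_policy keeps small workloads from being spread over the full occupancy-derived grid. A worked illustration of the capping arithmetic follows; the occupancy numbers are hypothetical and only serve to trace the branch above:

    #include <cstddef>

    // Illustrative numbers only; the real values come from cudaOccupancyMaxPotentialBlockSize
    // and from the callers (which pass e.g. output.size() / N as max_threads).
    void make_policy_capping_example()
    {
        std::size_t max_threads = 1000;         // number of work items
        int grid_size = 68, block_size = 256;   // hypothetical occupancy suggestion

        if (grid_size * block_size > max_threads) {                     // 17408 > 1000
            grid_size = (max_threads + block_size - 1) / block_size;    // ceil(1000 / 256) = 4
            if (block_size > max_threads)                               // false here; block size stays 256
                block_size = max_threads;
        }
        // The launch then uses 4 blocks of 256 threads; the grid-stride loops in the
        // kernels see at most one iteration per thread instead of many idle blocks.
    }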
@@ -0,0 +1,58 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "vector_traits.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    namespace raw {
        template <class T, std::size_t N>
        __global__ void fill_vec(Span<T> output, T value) {
            using vector_type = get_vector_type_t<T, N>;

            auto output_vPtr = vector_type::get_pointer(output.data());
            for (auto i : grid_stride_range(output.size() / vector_type::size())) {
                vector_type vec;
                for (int j = 0; j < vector_type::size(); j++)
                    vec.data[j] = value;
                v_store(output_vPtr[i], vec);
            }
        }
    }

    template <class T, std::size_t N>
    void launch_vectorized_fill(const Stream& stream, Span<T> output, T value) {
        CV_Assert(is_fully_aligned<T>(output, N));

        auto kernel = raw::fill_vec<T, N>;
        auto policy = make_policy(kernel, output.size() / N, 0, stream);
        launch_kernel(kernel, policy, output, value);
    }

    template <class T>
    void fill(const Stream& stream, Span<T> output, T value) {
        if (is_fully_aligned<T>(output, 4)) {
            launch_vectorized_fill<T, 4>(stream, output, value);
        } else if (is_fully_aligned<T>(output, 2)) {
            launch_vectorized_fill<T, 2>(stream, output, value);
        } else {
            launch_vectorized_fill<T, 1>(stream, output, value);
        }
    }

    template void fill(const Stream&, Span<__half>, __half);
    template void fill(const Stream&, Span<float>, float);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */
@@ -0,0 +1,92 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA_GRID_STRIDE_RANGE_HPP
#define OPENCV_DNN_SRC_CUDA_GRID_STRIDE_RANGE_HPP

#include "types.hpp"

#include <cuda_runtime.h>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {

    namespace detail {
        template <int> __device__ auto getGridDim()->decltype(dim3::x);
        template <> inline __device__ auto getGridDim<0>()->decltype(dim3::x) { return gridDim.x; }
        template <> inline __device__ auto getGridDim<1>()->decltype(dim3::x) { return gridDim.y; }
        template <> inline __device__ auto getGridDim<2>()->decltype(dim3::x) { return gridDim.z; }

        template <int> __device__ auto getBlockDim()->decltype(dim3::x);
        template <> inline __device__ auto getBlockDim<0>()->decltype(dim3::x) { return blockDim.x; }
        template <> inline __device__ auto getBlockDim<1>()->decltype(dim3::x) { return blockDim.y; }
        template <> inline __device__ auto getBlockDim<2>()->decltype(dim3::x) { return blockDim.z; }

        template <int> __device__ auto getBlockIdx()->decltype(uint3::x);
        template <> inline __device__ auto getBlockIdx<0>()->decltype(uint3::x) { return blockIdx.x; }
        template <> inline __device__ auto getBlockIdx<1>()->decltype(uint3::x) { return blockIdx.y; }
        template <> inline __device__ auto getBlockIdx<2>()->decltype(uint3::x) { return blockIdx.z; }

        template <int> __device__ auto getThreadIdx()->decltype(uint3::x);
        template <> inline __device__ auto getThreadIdx<0>()->decltype(uint3::x) { return threadIdx.x; }
        template <> inline __device__ auto getThreadIdx<1>()->decltype(uint3::x) { return threadIdx.y; }
        template <> inline __device__ auto getThreadIdx<2>()->decltype(uint3::x) { return threadIdx.z; }
    }

    template <int dim, class index_type = device::index_type, class size_type = device::size_type>
    class grid_stride_range_generic {
    public:
        __device__ grid_stride_range_generic(index_type to_) : from(0), to(to_) { }
        __device__ grid_stride_range_generic(index_type from_, index_type to_) : from(from_), to(to_) { }

        class iterator
        {
        public:
            __device__ iterator(index_type pos_) : pos(pos_) {}

            /* these iterators return the index when dereferenced; this allows us to loop
             * through the indices using a range based for loop
             */
            __device__ index_type operator*() const { return pos; }

            __device__ iterator& operator++() {
                pos += detail::getGridDim<dim>() * static_cast<index_type>(detail::getBlockDim<dim>());
                return *this;
            }

            __device__ bool operator!=(const iterator& other) const {
                /* NOTE HACK
                ** 'pos' can move in large steps (see operator++)
                ** expansion of range for loop uses != as the loop condition
                ** => operator!= must return false if 'pos' crosses the end
                */
                return pos < other.pos;
            }

        private:
            index_type pos;
        };

        __device__ iterator begin() const {
            using detail::getBlockDim;
            using detail::getBlockIdx;
            using detail::getThreadIdx;
            return iterator(from + getBlockDim<dim>() * getBlockIdx<dim>() + getThreadIdx<dim>());
        }

        __device__ iterator end() const {
            return iterator(to);
        }

    private:
        index_type from, to;
    };

    using grid_stride_range_x = grid_stride_range_generic<0>;
    using grid_stride_range_y = grid_stride_range_generic<1>;
    using grid_stride_range_z = grid_stride_range_generic<2>;
    using grid_stride_range = grid_stride_range_x;

}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */

#endif /* OPENCV_DNN_SRC_CUDA_GRID_STRIDE_RANGE_HPP */
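/* Usage sketch, not from the patch: grid_stride_range lets a kernel cover `count` elements no matter
 * how many threads were launched; each thread starts at its global id and strides by the grid size.
 * The kernel and buffer names below are hypothetical.
 *
 *     __global__ void scale_by_two(float* data, std::size_t count) {
 *         for (auto i : cv::dnn::cuda4dnn::csl::device::grid_stride_range(count))
 *             data[i] *= 2.0f;
 *     }
 */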
@@ -0,0 +1,76 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA_KERNEL_DISPATCHER_HPP
#define OPENCV_DNN_SRC_CUDA_KERNEL_DISPATCHER_HPP

#include <cstddef>
#include <type_traits>

/* The performance of many kernels is highly dependent on the tensor rank. Instead of having
 * one kernel which can work with the maximally ranked tensors, we make one kernel for each supported
 * tensor rank. This is to ensure that the requirements of the maximally ranked tensors do not take a
 * toll on the performance of the operation for low ranked tensors. Hence, many kernels take the tensor
 * rank as a template parameter.
 *
 * The kernel is a template and we have different instantiations for each rank. This causes the following pattern
 * to arise frequently:
 *
 * if(rank == 3)
 *     kernel<T, 3>();
 * else if(rank == 2)
 *     kernel<T, 2>();
 * else
 *     kernel<T, 1>();
 *
 * The rank is a runtime variable. To facilitate creation of such structures, we use GENERATE_KERNEL_DISPATCHER.
 * This macro creates a function which selects the correct kernel instantiation at runtime.
 *
 * Example:
 *
 * // function which sets up the kernel and launches it
 * template <class T, std::size_t Rank>
 * void launch_some_kernel(...);
 *
 * // creates the dispatcher named "some_dispatcher" which invokes the correct instantiation of "launch_some_kernel"
 * GENERATE_KERNEL_DISPATCHER(some_dispatcher, launch_some_kernel);
 *
 * // internal API function
 * template <class T>
 * void some(...) {
 *     // ...
 *     auto rank = input.rank();
 *     some_dispatcher<T, MIN_RANK, MAX_RANK>(rank, ...);
 * }
 */

/*
 * name    name of the dispatcher function that is generated
 * func    template function that requires runtime selection
 *
 * T       first template parameter to `func`
 * start   starting rank
 * end     ending rank (inclusive)
 *
 * Executes func<T, selector> based on the runtime `selector` argument, provided `selector` lies
 * within the range [start, end]. If outside the range, no instantiation of `func` is executed.
 */
#define GENERATE_KERNEL_DISPATCHER(name,func); \
    template <class T, std::size_t start, std::size_t end, class... Args> static \
    typename std::enable_if<start == end, void> \
    ::type name(int selector, Args&& ...args) { \
        if(selector == start) \
            func<T, start>(std::forward<Args>(args)...); \
    } \
    \
    template <class T, std::size_t start, std::size_t end, class... Args> static \
    typename std::enable_if<start != end, void> \
    ::type name(int selector, Args&& ...args) { \
        if(selector == start) \
            func<T, start>(std::forward<Args>(args)...); \
        else \
            name<T, start + 1, end, Args...>(selector, std::forward<Args>(args)...); \
    }

#endif /* OPENCV_DNN_SRC_CUDA_KERNEL_DISPATCHER_HPP */
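/* Expansion sketch, not from the patch: with assumed bounds start = 1 and end = 3, the recursive
 * dispatcher generated above behaves like the hand-written chain it replaces. The names below are
 * hypothetical.
 *
 *     GENERATE_KERNEL_DISPATCHER(example_dispatcher, launch_example_kernel);
 *
 *     // example_dispatcher<float, 1, 3>(rank, args...) is equivalent to:
 *     //     if (rank == 1)      launch_example_kernel<float, 1>(args...);
 *     //     else if (rank == 2) launch_example_kernel<float, 2>(args...);
 *     //     else if (rank == 3) launch_example_kernel<float, 3>(args...);
 *     //     (any other rank launches nothing)
 */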
@@ -0,0 +1,34 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA_LIMITS_HPP
#define OPENCV_DNN_SRC_CUDA_LIMITS_HPP

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include <cfloat>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {

    template <class T>
    struct numeric_limits;

    template <>
    struct numeric_limits<__half> {
        __device__ static __half min() { return 0.0000610; }
        __device__ static __half max() { return 65504.0; }
        __device__ static __half lowest() { return -65504.0; }
    };

    template <>
    struct numeric_limits<float> {
        __device__ static float min() { return FLT_MIN; }
        __device__ static float max() { return FLT_MAX; }
        __device__ static float lowest() { return -FLT_MAX; }
    };

}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */

#endif /* OPENCV_DNN_SRC_CUDA_LIMITS_HPP */
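/* Reference note, not from the patch: the __half constants above mirror the IEEE 754 binary16 limits;
 * the smallest positive normal value is 2^-14 (about 6.10e-5) and the largest finite value is 65504,
 * so they play the same role for half precision that FLT_MIN/FLT_MAX play for float.
 */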
@@ -0,0 +1,125 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA_MATH_HPP
#define OPENCV_DNN_SRC_CUDA_MATH_HPP

#include <cuda_runtime.h>
#include <cuda_fp16.h>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {

    template <class T> __device__ T abs(T val) { return (val < T(0) ? -val : val); }
    template <> inline __device__ __half2 abs(__half2 val) {
        val.x = abs(val.x);
        val.y = abs(val.y);
        return val;
    }
    template <> inline __device__ float abs(float val) { return fabsf(val); }
    template <> inline __device__ double abs(double val) { return fabs(val); }

    template <class T> __device__ T exp(T val);
    template <> inline __device__ __half exp(__half val) { return hexp(val); }
    template <> inline __device__ __half2 exp(__half2 val) { return h2exp(val); }
    template <> inline __device__ float exp(float val) { return expf(val); }
    template <> inline __device__ double exp(double val) { return ::exp(val); }

    template <class T> __device__ T expm1(T val);
    template <> inline __device__ __half expm1(__half val) { return hexp(val) - __half(1); }
    template <> inline __device__ __half2 expm1(__half2 val) { return h2exp(val) - __half2(1, 1); }
    template <> inline __device__ float expm1(float val) { return expm1f(val); }
    template <> inline __device__ double expm1(double val) { return ::expm1(val); }

    template <class T> __device__ T max(T x, T y) { return (x > y ? x : y); }
    template <> inline __device__ __half2 max(__half2 a, __half2 b) {
        a.x = max(a.x, b.x);
        a.y = max(a.y, b.y);
        return a;
    }
    template <> inline __device__ float max(float x, float y) { return fmaxf(x, y); }
    template <> inline __device__ double max(double x, double y) { return fmax(x, y); }

    template <class T> __device__ T min(T x, T y) { return (x > y ? y : x); }
    template <> inline __device__ __half2 min(__half2 a, __half2 b) {
        a.x = min(a.x, b.x);
        a.y = min(a.y, b.y);
        return a;
    }
    template <> inline __device__ float min(float x, float y) { return fminf(x, y); }
    template <> inline __device__ double min(double x, double y) { return fmin(x, y); }

    template <class T> __device__ T log1p(T val);
    template <> inline __device__ __half log1p(__half val) { return hlog(__half(1) + val); }
    template <> inline __device__ __half2 log1p(__half2 val) { return h2log(__half2(1, 1) + val); }
    template <> inline __device__ float log1p(float val) { return log1pf(val); }
    template <> inline __device__ double log1p(double val) { return ::log1p(val); }

    template <class T> __device__ T log1pexp(T val);
    template <> inline __device__ __half log1pexp(__half val) {
        if (val <= __half(-4.0))
            return exp(val);
        else if (val <= __half(8.0))
            return log1p(exp(val));
        else if (val <= __half(8.7))
            return val + exp(-val);
        else
            return val;
    }
    template <> inline __device__ __half2 log1pexp(__half2 val) {
        val.x = log1pexp(val.x);
        val.y = log1pexp(val.y);
        return val;
    }
    template <> inline __device__ float log1pexp(float val) {
        if (val <= -20)
            return expf(val);
        else if (val <= 9.0)
            return log1pf(expf(val));
        else if (val <= 14.6)
            return val + exp(-val);
        else
            return val;
    }
    template <> inline __device__ double log1pexp(double val) {
        if (val <= -37)
            return exp(val);
        else if (val <= 18)
            return log1p(exp(val));
        else if (val <= 33.3)
            return val + exp(-val);
        else
            return val;
    }

    template <class T> __device__ T tanh(T val);
    template <> inline __device__ __half tanh(__half val) { return tanhf(val); }
    template <> inline __device__ __half2 tanh(__half2 val) { return __half2(tanh(val.x), tanh(val.y)); }
    template <> inline __device__ float tanh(float val) { return tanhf(val); }
    template <> inline __device__ double tanh(double val) { return ::tanh(val); }

    template <class T> __device__ T pow(T val, T exp);
    template <> inline __device__ __half pow(__half val, __half exp) { return powf(val, exp); }
    template <> inline __device__ __half2 pow(__half2 val, __half2 exp) { return __half2(pow(val.x, exp.x), pow(val.y, exp.y)); }
    template <> inline __device__ float pow(float val, float exp) { return powf(val, exp); }
    template <> inline __device__ double pow(double val, double exp) { return ::pow(val, exp); }

    template <class T> __device__ T sqrt(T val);
    template <> inline __device__ __half sqrt(__half val) { return hsqrt(val); }
    template <> inline __device__ __half2 sqrt(__half2 val) { return h2sqrt(val); }
    template <> inline __device__ float sqrt(float val) { return sqrtf(val); }
    template <> inline __device__ double sqrt(double val) { return ::sqrt(val); }

    template <class T> __device__ T rsqrt(T val);
    template <> inline __device__ __half rsqrt(__half val) { return hrsqrt(val); }
    template <> inline __device__ __half2 rsqrt(__half2 val) { return h2rsqrt(val); }
    template <> inline __device__ float rsqrt(float val) { return rsqrtf(val); }
    template <> inline __device__ double rsqrt(double val) { return ::rsqrt(val); }

    template <class T> __device__ T sigmoid(T val) { return T(1) / (T(1) + exp(-val)); }
    template <> inline __device__ __half2 sigmoid(__half2 val) { return __half2(1, 1) / (__half2(1, 1) + exp(__hneg2(val))); }

    template <class T> __device__ T clamp(T value, T lower, T upper) { return min(max(value, lower), upper); }

}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */

#endif /* OPENCV_DNN_SRC_CUDA_MATH_HPP */
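/* Rationale sketch for the piecewise log1pexp above, not from the patch: log1pexp(x) = log(1 + exp(x)).
 * For very negative x, exp(x) is tiny and log1p(exp(x)) ~= exp(x), so exp(x) alone is returned; in the
 * mid range, log1p(exp(x)) is evaluated directly since exp(x) cannot overflow there; for large x,
 * log(1 + exp(x)) = x + log1p(exp(-x)) ~= x + exp(-x); and once exp(-x) drops below one ulp of x,
 * returning x is already correct to working precision. The cutoffs differ per type because they track
 * each type's precision and dynamic range.
 */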
@@ -0,0 +1,307 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "math.hpp"
#include "array.hpp"
#include "limits.hpp"
#include "types.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include "../cuda4dnn/csl/span.hpp"

#include "../cuda4dnn/kernels/fill.hpp"

#include <opencv2/core.hpp>

#include <cstddef>
#include <vector>
#include <type_traits>

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    namespace raw {
        template <class T, std::size_t Order,
            typename std::enable_if<Order == 2 || Order == 3, bool>::type = true> /* Order has been hardcoded; see code */
        __global__ void max_pooling_with_indices(
            Span<T> output, Span<T> indices, View<T> input, size_type channels,
            array<size_type, Order> out_spatial_dims, array<size_type, Order> in_spatial_dims,
            array<size_type, Order> window_size, array<size_type, Order> strides, array<size_type, Order> padding_left)
        {
            /* every element in the output is mapped to a window in the input and each thread processes several windows */
            for (auto idx : grid_stride_range(output.size())) {
                size_type out_spatial_size = 1;
                array<index_type, Order> window_idx;
                for (int i = Order - 1; i >= 0; i--) {
                    window_idx[i] = (idx / out_spatial_size) % out_spatial_dims[i];
                    out_spatial_size *= out_spatial_dims[i];
                }

                const index_type n = idx / (out_spatial_size * channels);
                const index_type c = (idx / out_spatial_size) % channels;

                array<index_type, Order> start;
                for(int i = 0; i < Order; i++)
                    start[i] = window_idx[i] * strides[i] - padding_left[i];

                array<index_type, Order> end;
                for (int i = 0; i < Order; i++) {
                    using device::min;
                    end[i] = min<index_type>(start[i] + window_size[i], in_spatial_dims[i]);
                }

                for (int i = 0; i < Order; i++) {
                    using device::max;
                    start[i] = max(start[i], 0);
                }

                T max_value = numeric_limits<T>::lowest();
                index_type max_idx = -1;

                size_type in_spatial_size = 1;
                for (int i = 0; i < Order; i++)
                    in_spatial_size *= in_spatial_dims[i];

                const auto outer_offset = (n * channels + c) * in_spatial_size;
                if (Order == 2) {
                    array<index_type, Order> idx;
                    for (idx[0] = start[0]; idx[0] != end[0]; idx[0]++) {
                        for (idx[1] = start[1]; idx[1] != end[1]; idx[1]++) {
                            index_type offset = 0;
                            index_type stride = 1;
                            for (int i = Order - 1; i >= 0; i--) {
                                offset += stride * idx[i];
                                stride *= in_spatial_dims[i];
                            }

                            if (input[outer_offset + offset] > max_value) {
                                max_idx = offset;
                                max_value = input[outer_offset + offset];
                            }
                        }
                    }
                } else if(Order == 3) {
                    array<index_type, Order> idx;
                    for (idx[0] = start[0]; idx[0] != end[0]; idx[0]++) {
                        for (idx[1] = start[1]; idx[1] != end[1]; idx[1]++) {
                            for (idx[2] = start[2]; idx[2] != end[2]; idx[2]++) {
                                index_type offset = 0;
                                index_type stride = 1;
                                for (int i = Order - 1; i >= 0; i--) {
                                    offset += stride * idx[i];
                                    stride *= in_spatial_dims[i];
                                }

                                if (input[outer_offset + offset] > max_value) {
                                    max_idx = offset;
                                    max_value = input[outer_offset + offset];
                                }
                            }
                        }
                    }
                }

                output[idx] = max_value;
                indices[idx] = max_idx;
            }
        }

        template <class T, std::size_t Order>
        __global__ void max_unpooling(
            Span<T> output, View<T> input, View<T> indices, size_type channels,
            array<size_type, Order> out_spatial_dims, array<size_type, Order> in_spatial_dims,
            array<size_type, Order> window_size, array<size_type, Order> strides, array<size_type, Order> padding_left)
        {
            /* the output has already been zero filled */
            /* Every input value represents a window in the output. The max unpooling operation
             * copies the input value to exactly one location in the output window which is given
             * by the indices tensor.
             */
            for (auto idx : grid_stride_range(input.size())) {
                size_type in_spatial_size = 1;
                array<index_type, Order> window_idx;
                for (int i = Order - 1; i >= 0; i--) {
                    window_idx[i] = (idx / in_spatial_size) % in_spatial_dims[i];
                    in_spatial_size *= in_spatial_dims[i];
                }

                const index_type n = idx / (in_spatial_size * channels);
                const index_type c = (idx / in_spatial_size) % channels;

                array<index_type, Order> start;
                for (int i = 0; i < Order; i++) {
                    using device::min;
                    using device::max;
                    start[i] = max(0, min(window_idx[i] * strides[i] - padding_left[i], out_spatial_dims[i] - 1));
                }

                size_type out_spatial_size = 1;
                for (int i = 0; i < Order; i++)
                    out_spatial_size *= out_spatial_dims[i];

                index_type outer_offset = (n * channels + c) * out_spatial_size;
                output[outer_offset + static_cast<index_type>(indices[idx])] = input[idx];
            }
        }
    }

    template <class T, std::size_t Order> static
    void launch_max_pooling_kernel(
        const Stream& stream,
        Span<T> output, Span<T> indices, View<T> input, std::size_t channels,
        const std::vector<std::size_t>& out_spatial_dims, const std::vector<std::size_t>& in_spatial_dims,
        const std::vector<std::size_t>& window_size,
        const std::vector<std::size_t>& strides, const std::vector<std::size_t>& padding_left)
    {
        CV_Assert(indices.size() == output.size());
        CV_Assert(out_spatial_dims.size() == Order);
        CV_Assert(in_spatial_dims.size() == Order);
        CV_Assert(window_size.size() == Order);
        CV_Assert(strides.size() == Order);
        CV_Assert(padding_left.size() == Order);

        array<size_type, Order> out_spatial_dims_k, in_spatial_dims_k;
        out_spatial_dims_k.assign(std::begin(out_spatial_dims), std::end(out_spatial_dims));
        in_spatial_dims_k.assign(std::begin(in_spatial_dims), std::end(in_spatial_dims));

        array<size_type, Order> window_size_k, strides_k, padding_left_k;
        window_size_k.assign(std::begin(window_size), std::end(window_size));
        strides_k.assign(std::begin(strides), std::end(strides));
        padding_left_k.assign(std::begin(padding_left), std::end(padding_left));

        auto kernel = raw::max_pooling_with_indices<T, Order>;
        auto policy = make_policy(kernel, output.size(), 0, stream);
        launch_kernel(kernel, policy, output, indices, input, channels,
            out_spatial_dims_k, in_spatial_dims_k, window_size_k, strides_k, padding_left_k);
    }

    template <class T>
    void max_pooling_with_indices(
        const Stream& stream,
        TensorSpan<T> output, TensorSpan<T> indices, TensorView<T> input,
        const std::vector<std::size_t>& window_size, const std::vector<std::size_t>& strides,
        const std::vector<std::size_t>& padding_left)
    {
        CV_Assert(is_shape_same(output, indices));
        CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));

        auto order = window_size.size();
        CV_Assert(strides.size() == order);
        CV_Assert(padding_left.size() == order);
        CV_Assert(output.rank() == order + 2);
        CV_Assert(input.rank() == order + 2);

        std::vector<std::size_t> out_spatial_dims(order), in_spatial_dims(order);
        for (int i = 0; i < order; i++) {
            in_spatial_dims[i] = input.get_axis_size(2 + i);
            out_spatial_dims[i] = output.get_axis_size(2 + i);
        }

        /* only max_pooling2d and max_pooling3d are supported */
        CV_Assert(2 <= order && order <= 3);
        std::size_t channels = input.get_axis_size(1);
        if (order == 3) {
            launch_max_pooling_kernel<T, 3>(stream, output, indices, input, channels,
                out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
        } else if (order == 2) {
            launch_max_pooling_kernel<T, 2>(stream, output, indices, input, channels,
                out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
        }
    }

    template void max_pooling_with_indices(const Stream&,
        TensorSpan<__half>, TensorSpan<__half>, TensorView<__half>,
        const std::vector<std::size_t>&, const std::vector<std::size_t>&,
        const std::vector<std::size_t>&);

    template void max_pooling_with_indices(const Stream&,
        TensorSpan<float>, TensorSpan<float>, TensorView<float>,
        const std::vector<std::size_t>&, const std::vector<std::size_t>&,
        const std::vector<std::size_t>&);

    template <class T, std::size_t Order> static
    void launch_max_unpooling_kernel(
        const Stream& stream,
        Span<T> output, View<T> input, View<T> indices, std::size_t channels,
        const std::vector<std::size_t>& out_spatial_dims, const std::vector<std::size_t>& in_spatial_dims,
        const std::vector<std::size_t>& window_size,
        const std::vector<std::size_t>& strides, const std::vector<std::size_t>& padding_left)
    {
        CV_Assert(out_spatial_dims.size() == Order);
        CV_Assert(in_spatial_dims.size() == Order);
        CV_Assert(window_size.size() == Order);
        CV_Assert(strides.size() == Order);
        CV_Assert(padding_left.size() == Order);
        CV_Assert(indices.size() == input.size());

        array<size_type, Order> out_spatial_dims_k, in_spatial_dims_k;
        out_spatial_dims_k.assign(std::begin(out_spatial_dims), std::end(out_spatial_dims));
        in_spatial_dims_k.assign(std::begin(in_spatial_dims), std::end(in_spatial_dims));

        array<size_type, Order> window_size_k, strides_k, padding_left_k;
        window_size_k.assign(std::begin(window_size), std::end(window_size));
        strides_k.assign(std::begin(strides), std::end(strides));
        padding_left_k.assign(std::begin(padding_left), std::end(padding_left));

        auto kernel = raw::max_unpooling<T, Order>;
        auto policy = make_policy(kernel, input.size(), 0, stream);
        launch_kernel(kernel, policy, output, input, indices, channels,
            out_spatial_dims_k, in_spatial_dims_k, window_size_k, strides_k, padding_left_k);
    }

    template <class T>
    void max_unpooling(
        const Stream& stream,
        TensorSpan<T> output, TensorView<T> input, TensorView<T> indices,
        const std::vector<std::size_t>& window_size, const std::vector<std::size_t>& strides,
        const std::vector<std::size_t>& padding_left)
    {
        CV_Assert(is_shape_same(input, indices));
        CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));

        auto order = window_size.size();
        CV_Assert(strides.size() == order);
        CV_Assert(padding_left.size() == order);
        CV_Assert(output.rank() == order + 2);
        CV_Assert(input.rank() == order + 2);

        std::vector<std::size_t> out_spatial_dims(order), in_spatial_dims(order);
        for (int i = 0; i < order; i++) {
            in_spatial_dims[i] = input.get_axis_size(2 + i);
            out_spatial_dims[i] = output.get_axis_size(2 + i);
        }

        kernels::fill<T>(stream, output, 0.0);

        /* only max_unpooling2d and max_unpooling3d are supported */
        CV_Assert(2 <= order && order <= 3);
        std::size_t channels = input.get_axis_size(1);
        if (order == 3) {
            launch_max_unpooling_kernel<T, 3>(stream, output, input, indices, channels,
                out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
        } else if (order == 2) {
            launch_max_unpooling_kernel<T, 2>(stream, output, input, indices, channels,
                out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
        }
    }

    template void max_unpooling(const Stream&,
        TensorSpan<__half>, TensorView<__half>, TensorView<__half>,
        const std::vector<std::size_t>&, const std::vector<std::size_t>&,
        const std::vector<std::size_t>&);

    template void max_unpooling(const Stream&,
        TensorSpan<float>, TensorView<float>, TensorView<float>,
        const std::vector<std::size_t>&, const std::vector<std::size_t>&,
        const std::vector<std::size_t>&);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */
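/* Clarifying note, not from the patch: the `indices` tensor stores, for each pooled output element,
 * the argmax offset within its (n, c) spatial slice rather than a global element index. max_unpooling
 * therefore reconstructs the destination as (n * channels + c) * out_spatial_size + indices[idx],
 * which is why both kernels recompute the same outer offset from the batch and channel indices.
 */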
@@ -0,0 +1,121 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "array.hpp"
#include "math.hpp"
#include "types.hpp"
#include "atomics.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"

#include "../cuda4dnn/kernels/fill.hpp"
#include "../cuda4dnn/kernels/scale_shift.hpp"

#include <opencv2/core.hpp>

#include <cstddef>

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    namespace raw {
        template <class T>
        __global__ void reduce_sum_abs(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride) {
            for (auto idx : grid_stride_range(input.size())) {
                const index_type outer_idx = idx / outer_stride;
                const index_type inner_idx = idx % mid_stride;

                const index_type sum_idx = outer_idx * mid_stride + inner_idx;
                atomicAdd(&output[sum_idx], device::abs(input[idx]));
            }
        }

        template <class T>
        __global__ void reciprocal(Span<T> output, T epsilon) {
            for (auto idx : grid_stride_range(output.size()))
                output[idx] = T(1) / (output[idx] + epsilon);
        }

        template <class T>
        __global__ void reduce_sum_squared(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride) {
            for (auto idx : grid_stride_range(input.size())) {
                const index_type outer_idx = idx / outer_stride;
                const index_type inner_idx = idx % mid_stride;

                const index_type sum_idx = outer_idx * mid_stride + inner_idx;
                atomicAdd(&output[sum_idx], input[idx] * input[idx]);
            }
        }

        template <class T>
        __global__ void rsqrt(Span<T> output, T epsilon) {
            for (auto idx : grid_stride_range(output.size())) {
                using device::sqrt;
                output[idx] = T(1) / sqrt(output[idx] + epsilon);
            }
        }

        template <class T>
        __global__ void apply_norm(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride, View<T> sums) {
            for (auto idx : grid_stride_range(output.size())) {
                const index_type outer_idx = idx / outer_stride;
                const index_type inner_idx = idx % mid_stride;

                const index_type sum_idx = outer_idx * mid_stride + inner_idx;
                output[idx] = input[idx] * sums[sum_idx];
            }
        }
    }

    template <class T>
    void normalize(
        const Stream& stream,
        Span<T> output,
        View<T> input, std::size_t outer_size, std::size_t mid_size, std::size_t inner_size, std::size_t norm, T epsilon,
        Span<T> workspace)
    {
        CV_Assert(output.size() == input.size());
        CV_Assert(output.size() == outer_size * mid_size * inner_size);
        CV_Assert(norm == 1 || norm == 2);
        CV_Assert(workspace.size() >= outer_size * inner_size);

        auto sums = Span<T>(workspace.data(), outer_size * inner_size);

        fill<T>(stream, sums, 0.0);

        if (norm == 1) {
            auto reduce_kernel = raw::reduce_sum_abs<T>;
            auto policy = make_policy(reduce_kernel, input.size(), 0, stream);
            launch_kernel(reduce_kernel, policy, sums, input, mid_size * inner_size, inner_size);

            auto reciprocal_kernel = raw::reciprocal<T>;
            policy = make_policy(reciprocal_kernel, sums.size(), 0, stream);
            launch_kernel(reciprocal_kernel, policy, sums, epsilon);
        } else {
            auto reduce_kernel = raw::reduce_sum_squared<T>;
            auto policy = make_policy(reduce_kernel, input.size(), 0, stream);
            launch_kernel(reduce_kernel, policy, sums, input, mid_size * inner_size, inner_size);

            auto rsqrt_kernel = raw::rsqrt<T>;
            policy = make_policy(rsqrt_kernel, sums.size(), 0, stream);
            launch_kernel(rsqrt_kernel, policy, sums, epsilon);
        }

        auto scale_kernel = raw::apply_norm<T>;
        auto policy = make_policy(scale_kernel, output.size(), 0, stream);
        launch_kernel(scale_kernel, policy, output, input, mid_size * inner_size, inner_size, sums);
    }

    template void normalize(const Stream&, Span<__half>, View<__half>, std::size_t, std::size_t, std::size_t, std::size_t, __half, Span<__half>);
    template void normalize(const Stream&, Span<float>, View<float>, std::size_t, std::size_t, std::size_t, std::size_t, float, Span<float>);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */
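/* Summary sketch, not from the patch: normalize() treats the input as an outer_size x mid_size x
 * inner_size volume and reduces over the middle axis. In CPU-style pseudocode:
 *
 *     sums[o][i]      = sum over m of |input[o][m][i]|   (norm == 1)  or  input[o][m][i]^2  (norm == 2)
 *     sums[o][i]      = 1 / (sums[o][i] + eps)           (norm == 1)  or  1 / sqrt(sums[o][i] + eps)
 *     output[o][m][i] = input[o][m][i] * sums[o][i]
 *
 * The atomicAdd-based reduction trades determinism for simplicity; floating-point rounding may differ
 * slightly from run to run.
 */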
@@ -0,0 +1,199 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "array.hpp"
#include "math.hpp"
#include "types.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "kernel_dispatcher.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include "../cuda4dnn/csl/span.hpp"

#include <opencv2/core.hpp>

#include <cstddef>
#include <vector>
#include <utility>
#include <algorithm>
#include <numeric>
#include <functional>

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    namespace raw {
        template <class T, std::size_t Rank>
        __global__ void copy_with_reflection101(
            Span<T> output, array<size_type, Rank> out_strides, array<index_type, Rank> start, array<index_type, Rank> end,
            View<T> input, array<size_type, Rank> in_strides)
        {
            for (auto i : grid_stride_range(output.size())) {
                /* compute output axis indices corresponding to element 'i' */
                array<index_type, Rank> out_index;
                out_index[0] = i / out_strides[0];
                for (int j = 1; j < Rank; j++)
                    out_index[j] = (i % out_strides[j - 1]) / out_strides[j];

                /* compute input axis indices corresponding to output axis indices */
                array<index_type, Rank> in_index;
                for (int j = 0; j < Rank; j++) {
                    /* if out_index < start, the point is in the left reflection region
                     * the reflected value's index is the absolute value of the difference
                     *
                     * otherwise, if the value is in the copy region, out_index - start gives the input index
                     */
                    using device::abs;
                    in_index[j] = abs(out_index[j] - start[j]);

                    /* if out_index >= end, it's in the right reflection region */
                    if (out_index[j] >= end[j])
                        in_index[j] = (end[j] - start[j]) - (out_index[j] - end[j]) - 2;
                }

                /* compute input element number from input axis indices */
                index_type iidx = 0;
                for (int j = 0; j < Rank; j++)
                    iidx += in_index[j] * in_strides[j];

                output[i] = input[iidx];
            }
        }
    }

    template <class T, std::size_t Rank> static
    void launch_copy_with_reflection101(
        const Stream& stream,
        Span<T> output, const std::vector<std::size_t>& outStride,
        View<T> input, const std::vector<std::size_t>& inStride,
        const std::vector<std::pair<std::size_t, std::size_t>>& ranges)
    {
        CV_Assert(outStride.size() == Rank);
        CV_Assert(inStride.size() == Rank);
        CV_Assert(ranges.size() == Rank);

        array<size_type, Rank> outStride_k, inStride_k;
        outStride_k.assign(std::begin(outStride), std::end(outStride));
        inStride_k.assign(std::begin(inStride), std::end(inStride));

        array<index_type, Rank> start_k, end_k;
        for (int i = 0; i < Rank; i++) {
            start_k[i] = ranges[i].first;
            end_k[i] = ranges[i].second;
        }

        auto kernel = raw::copy_with_reflection101<T, Rank>;
        auto policy = make_policy(kernel, output.size(), 0, stream);
        launch_kernel(kernel, policy, output, outStride_k, start_k, end_k, input, inStride_k);
    }

    GENERATE_KERNEL_DISPATCHER(copy_with_reflection101_dispatcher, launch_copy_with_reflection101);

    template <class T>
    void copy_with_reflection101(
        const Stream& stream,
        TensorSpan<T> output, TensorView<T> input,
        std::vector<std::pair<std::size_t, std::size_t>> ranges)
    {
        CV_Assert(output.rank() == input.rank());
        CV_Assert(output.rank() == ranges.size());

        /* squeezable axes at the beginning of both tensors can be eliminated
         *
         * Reasoning:
         * ----------
         * Suppose an item's indices in the input tensor are [i1, i2, ...]. The indices in the
         * output tensor will be [i1 + off1, i2 + off2, ...]. The rest of the elements in the output are padding.
         * The padding operation essentially copies items from the input tensor to new locations in the output tensor
         * and pads the remaining.
         *
         * If the size of the first axis of the input and output tensor is unity, the input and output indices
         * for all the elements will be of the form [0, i2, ...] and [0, i2 + off2, ...] respectively. Note that
         * there cannot be extra padding since the axes have unit size. The first index does not contribute to the
         * element's address calculation and hence does nothing apart from eating up a few cycles.
         */
        while (input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) {
            CV_Assert(ranges[0].first == 0 && ranges[0].second == 1);

            input.squeeze(0);
            output.squeeze(0);
            ranges.erase(std::begin(ranges));

            CV_Assert(output.rank() == input.rank());
            CV_Assert(output.rank() == ranges.size());
        }

        auto inShape = input.shape_as_vector();
        auto outShape = output.shape_as_vector();

        /* contiguous axes which do not have any padding can be combined into one axis
         *
         * Reasoning:
         * ----------
         * Suppose an item's indices in the input tensor are [i1, i2, i3, ...]. Let the first two axes not have any
         * padding. The indices in the output tensor will be [i1, i2, i3 + off3, ...].
         *
         * Each axis in the contiguous unpadded axes sequence will add an offset of iN * strideN. In the above example,
         * the two axes add a total offset of `i1 * stride1 + i2 * stride2`. We can merge the two axes into one axis with
         * a size of `size1 * size2`. The new offset added will be `i12 * stride2` as the kernel iterates through `i12`.
         * Note that `i12` is actually `(i1 * size2 + i2)` in the original tensor.
         */
        for (int i = 0; i < inShape.size(); i++) {
            /* check if axis `i` requires any padding */
            if (ranges[i].first == 0 && ranges[i].second == inShape[i]) {
                /* loop invariant: `i` is the first axis in the contiguous unpadded axis sequence */
                CV_Assert(inShape[i] == outShape[i]);

                /* we now iterate through the axes which follow and try to merge */
                int j = i + 1; /* `j` is the axis which we will attempt to merge */
                while (j < inShape.size() && ranges[j].first == 0 && ranges[j].second == inShape[j]) {
                    CV_Assert(inShape[j] == outShape[j]);

                    /* `j` is also unpadded; merge `i` and `j` */
                    auto new_size = inShape[i] * inShape[j];
                    inShape[i] = new_size;
                    outShape[i] = new_size;
                    ranges[i].second = new_size;

                    /* delete axis `j` */
                    inShape.erase(std::begin(inShape) + j);
                    outShape.erase(std::begin(outShape) + j);
                    ranges.erase(std::begin(ranges) + j);

                    /* optimizations should not break the invariants */
                    CV_Assert(inShape.size() == outShape.size());
                    CV_Assert(inShape.size() == ranges.size());
                    CV_Assert(inShape[i] == outShape[i]);
                    CV_Assert(ranges[i].first == 0 && ranges[i].second == inShape[i]);
                }
            }
        }

        auto rank = inShape.size();

        std::vector<std::size_t> inStride(rank), outStride(rank);
        inStride.back() = 1;
        outStride.back() = 1;
        /* garbage, ..., garbage, 1 */

        std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
        std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
        /* dim[0], dim[1], ..., dim[-1], 1 */

        std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<std::size_t>());
        std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>());
        /* stride[0], stride[1], ..., stride[-2], 1 */

        CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK);
        copy_with_reflection101_dispatcher<T, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, input, inStride, ranges);
    }

    template void copy_with_reflection101(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::pair<std::size_t, std::size_t>> ranges);
    template void copy_with_reflection101(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::pair<std::size_t, std::size_t>> ranges);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */
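/* Worked example with made-up shapes, not from the patch: reflection-padding an NCHW tensor of shape
 * [1, 3, 32, 32] to [1, 3, 34, 34] uses ranges {{0,1}, {0,3}, {1,33}, {1,33}}. The leading unit axis is
 * squeezed away (shapes become [3, 32, 32] / [3, 34, 34]); the channel axis is unpadded but cannot merge
 * with the padded height axis, so the kernel runs with Rank = 3. If only the last axis were padded, the
 * first two axes would merge into a single axis of size 96 and the kernel would run with Rank = 2.
 */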
@@ -0,0 +1,143 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "array.hpp"
#include "types.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "kernel_dispatcher.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/tensor.hpp"
#include "../cuda4dnn/csl/span.hpp"

#include <opencv2/core.hpp>

#include <cstddef>
#include <vector>
#include <algorithm>
#include <numeric>
#include <functional>

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    namespace raw {
        template <class T, std::size_t Rank>
        __global__ void permute(
            array<index_type, Rank> axis_order,
            Span<T> output, array<size_type, Rank> outStrides,
            View<T> input, array<size_type, Rank> inStrides)
        {
            for (auto i : grid_stride_range(input.size())) {
                index_type oldPosition = 0;
                index_type newPosition = i;

                for (int j = 0; j < Rank; j++)
                {
                    auto order = axis_order[j];
                    oldPosition += (newPosition / outStrides[j]) * inStrides[order];
                    newPosition %= outStrides[j];
                }

                output[i] = input[oldPosition];
            }
        }
    }

    template <class T, std::size_t Rank> static
    void launch_permute_kernel(
        const Stream& stream,
        const std::vector<std::size_t>& order,
        Span<T> output, const std::vector<std::size_t>& outStride,
        View<T> input, const std::vector<std::size_t>& inStride)
    {
        CV_Assert(order.size() == Rank);
        CV_Assert(outStride.size() == Rank);
        CV_Assert(inStride.size() == Rank);

        array<index_type, Rank> order_k;
        order_k.assign(std::begin(order), std::end(order));

        array<size_type, Rank> outStride_k, inStride_k;
        outStride_k.assign(std::begin(outStride), std::end(outStride));
        inStride_k.assign(std::begin(inStride), std::end(inStride));

        auto kernel = raw::permute<T, Rank>;
        auto policy = make_policy(kernel, input.size(), 0, stream);
        launch_kernel(kernel, policy, order_k, output, outStride_k, input, inStride_k);
    }

    GENERATE_KERNEL_DISPATCHER(permute_dispatcher, launch_permute_kernel);

    template <class T>
    void permute(
        const Stream& stream,
        TensorSpan<T> output, TensorView<T> input,
        std::vector<std::size_t> order)
    {
        CV_Assert(output.rank() == input.rank());
        CV_Assert(input.rank() == order.size());
        CV_Assert(input.size() == output.size());

        /* squeezable axes at the beginning of both tensors which aren't permuted can be eliminated
         *
         * Reasoning:
         * ----------
         * Suppose an item's indices in the input tensor are [i1, i2, ...]. The indices in the
         * output tensor will be some permutation of the input tensor indices. Let the output
         * tensor indices be [o1, o2, ...]. The permutation operation essentially copies items
         * from the input tensor to new locations in the output tensor as dictated by the indices.
         *
         * If the size of the first axis of the input and output tensor is one and these axes are
         * not involved in any permutation, i.e. order[0] = 0, the input and output indices for
         * all the elements will be of the form [0, i2, ...] and [0, o2, ...] respectively.
         * The first index does not contribute to the element's address calculation and hence does
         * nothing apart from eating up a few cycles.
         */
        while (order[0] == 0 && input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) {
            /* remove the axes */
            input.squeeze(0);
            output.squeeze(0);

            /* when we remove axis zero, the axis index will be one less than the previous index
             * for the remaining axes
             */
            order.erase(order.begin());
            for (auto& axis : order)
                axis--;

            /* optimizations should not break the invariants */
            CV_Assert(output.rank() == input.rank());
            CV_Assert(input.rank() == order.size());
            CV_Assert(input.size() == output.size());
        }

        auto rank = output.rank();
        auto inShape = input.shape_as_vector();
        auto outShape = output.shape_as_vector();

        std::vector<std::size_t> inStride(rank), outStride(rank);
        inStride.back() = 1;
        outStride.back() = 1;
        /* garbage, ..., garbage, 1 */

        std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
        std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
        /* dim[0], dim[1], ..., dim[-1], 1 */

        std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<std::size_t>());
        std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>());
        /* stride[0], stride[1], ..., stride[-2], 1 */

        CV_Assert(2 <= rank && rank <= CSL_MAX_TENSOR_RANK);
        permute_dispatcher<T, 2, CSL_MAX_TENSOR_RANK>(rank, stream, order, output, outStride, input, inStride);
    }

    template void permute(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
    template void permute(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */
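/* Index-walk example with made-up shapes, not from the patch: permuting an input of shape [2, 3, 4]
 * with order = {1, 2, 0} produces an output of shape [3, 4, 2], giving outStrides = {8, 2, 1} and
 * inStrides = {12, 4, 1}. Output element i = 9 decomposes into output indices (1, 0, 1), which map to
 * input indices (1, 1, 0); the kernel accumulates oldPosition = 1 * inStrides[order[0]] +
 * 0 * inStrides[order[1]] + 1 * inStrides[order[2]] = 4 + 0 + 12 = 16.
 */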
@@ -0,0 +1,174 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "array.hpp"
#include "math.hpp"
#include "types.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"

#include <cstddef>

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    namespace raw {
        template <class T, bool Normalize>
        __global__ void prior_box(
            Span<T> output,
            View<float> boxWidth, View<float> boxHeight, View<float> offsetX, View<float> offsetY, float stepX, float stepY,
            size_type layerWidth, size_type layerHeight,
            size_type imageWidth, size_type imageHeight)
        {
            /* each box consists of two pairs of coordinates and hence 4 values in total */
            /* since the entire output (the first channel at least) consists of these boxes,
             * we are guaranteed that the output is aligned to a boundary of 4 values
             */
            using vector_type = get_vector_type_t<T, 4>;
            auto output_vPtr = vector_type::get_pointer(output.data());

            /* num_points contains the number of points in the feature map of interest
             * each iteration of the stride loop selects a point and generates prior boxes for it
             */
            size_type num_points = layerWidth * layerHeight;
            for (auto idx : grid_stride_range(num_points)) {
                const index_type x = idx % layerWidth,
                                 y = idx / layerWidth;

                index_type output_offset_v4 = idx * offsetX.size() * boxWidth.size();
                for (int i = 0; i < boxWidth.size(); i++) {
                    for (int j = 0; j < offsetX.size(); j++) {
                        float center_x = (x + offsetX[j]) * stepX;
                        float center_y = (y + offsetY[j]) * stepY;

                        vector_type vec;
                        if(Normalize) {
                            vec.data[0] = (center_x - boxWidth[i] * 0.5f) / imageWidth;
                            vec.data[1] = (center_y - boxHeight[i] * 0.5f) / imageHeight;
                            vec.data[2] = (center_x + boxWidth[i] * 0.5f) / imageWidth;
                            vec.data[3] = (center_y + boxHeight[i] * 0.5f) / imageHeight;
                        } else {
                            vec.data[0] = center_x - boxWidth[i] * 0.5f;
                            vec.data[1] = center_y - boxHeight[i] * 0.5f;
                            vec.data[2] = center_x + boxWidth[i] * 0.5f - 1.0f;
                            vec.data[3] = center_y + boxHeight[i] * 0.5f - 1.0f;
                        }

                        v_store(output_vPtr[output_offset_v4], vec);
                        output_offset_v4++;
                    }
                }
            }
        }

        template <class T>
        __global__ void prior_box_clip(Span<T> output) {
            for (auto i : grid_stride_range(output.size())) {
                using device::clamp;
                output[i] = clamp<T>(output[i], 0.0, 1.0);
            }
        }

        template <class T>
        __global__ void prior_box_set_variance1(Span<T> output, float variance) {
            using vector_type = get_vector_type_t<T, 4>;
            auto output_vPtr = vector_type::get_pointer(output.data());
            for (auto i : grid_stride_range(output.size() / 4)) {
                vector_type vec;
                for (int j = 0; j < 4; j++)
                    vec.data[j] = variance;
                v_store(output_vPtr[i], vec);
            }
        }

        template <class T>
        __global__ void prior_box_set_variance4(Span<T> output, array<float, 4> variance) {
            using vector_type = get_vector_type_t<T, 4>;
            auto output_vPtr = vector_type::get_pointer(output.data());
            for (auto i : grid_stride_range(output.size() / 4)) {
                vector_type vec;
                for(int j = 0; j < 4; j++)
                    vec.data[j] = variance[j];
                v_store(output_vPtr[i], vec);
            }
        }
    }

    template <class T, bool Normalize> static
    void launch_prior_box_kernel(
        const Stream& stream,
        Span<T> output, View<float> boxWidth, View<float> boxHeight, View<float> offsetX, View<float> offsetY, float stepX, float stepY,
        std::size_t layerWidth, std::size_t layerHeight, std::size_t imageWidth, std::size_t imageHeight)
    {
        auto num_points = layerWidth * layerHeight;
        auto kernel = raw::prior_box<T, Normalize>;
        auto policy = make_policy(kernel, num_points, 0, stream);
        launch_kernel(kernel, policy,
            output, boxWidth, boxHeight, offsetX, offsetY, stepX, stepY,
            layerWidth, layerHeight, imageWidth, imageHeight);
    }

    template <class T>
    void generate_prior_boxes(
        const Stream& stream,
        Span<T> output,
        View<float> boxWidth, View<float> boxHeight, View<float> offsetX, View<float> offsetY, float stepX, float stepY,
        std::vector<float> variance,
        std::size_t numPriors,
        std::size_t layerWidth, std::size_t layerHeight,
        std::size_t imageWidth, std::size_t imageHeight,
        bool normalize, bool clip)
    {
        if (normalize) {
            launch_prior_box_kernel<T, true>(
                stream, output, boxWidth, boxHeight, offsetX, offsetY, stepX, stepY,
                layerWidth, layerHeight, imageWidth, imageHeight
            );
        } else {
            launch_prior_box_kernel<T, false>(
                stream, output, boxWidth, boxHeight, offsetX, offsetY, stepX, stepY,
                layerWidth, layerHeight, imageWidth, imageHeight
            );
        }

        std::size_t channel_size = layerHeight * layerWidth * numPriors * 4;
        CV_Assert(channel_size * 2 == output.size());

        if (clip) {
            auto output_span_c1 = Span<T>(output.data(), channel_size);
            auto kernel = raw::prior_box_clip<T>;
            auto policy = make_policy(kernel, output_span_c1.size(), 0, stream);
            launch_kernel(kernel, policy, output_span_c1);
        }

        auto output_span_c2 = Span<T>(output.data() + channel_size, channel_size);
        if (variance.size() == 1) {
            auto kernel = raw::prior_box_set_variance1<T>;
            auto policy = make_policy(kernel, output_span_c2.size() / 4, 0, stream);
            launch_kernel(kernel, policy, output_span_c2, variance[0]);
        } else {
            array<float, 4> variance_k;
            variance_k.assign(std::begin(variance), std::end(variance));
            auto kernel = raw::prior_box_set_variance4<T>;
            auto policy = make_policy(kernel, output_span_c2.size() / 4, 0, stream);
            launch_kernel(kernel, policy, output_span_c2, variance_k);
        }
    }

    template void generate_prior_boxes(const Stream&, Span<__half>, View<float>, View<float>, View<float>, View<float>, float, float,
        std::vector<float>, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, bool, bool);

    template void generate_prior_boxes(const Stream&, Span<float>, View<float>, View<float>, View<float>, View<float>, float, float,
        std::vector<float>, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, bool, bool);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */
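/* Layout note, not from the patch: the output span holds two equally sized channels. The first stores
 * layerHeight * layerWidth * numPriors boxes as (xmin, ymin, xmax, ymax) quadruples, which is why
 * channel_size * 2 must equal output.size(); the second is filled with the variance values, either a
 * single value broadcast to all four slots or four distinct values per box.
 */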
@@ -0,0 +1,199 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "math.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"
#include "limits.hpp"
#include "vector_traits.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"

#include <opencv2/core.hpp>

#include <cstddef>

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    namespace raw {
        template <class T>
        __global__ void sigmoid_strided(Span<T> output, View<T> input, size_type n, size_type stride, size_type offset) {
            /* - the input is divided into equal blocks strided by `stride`
             * - we must apply sigmoid to a continuous range of `n` values starting from `offset` in every block
             */
            for (auto i : grid_stride_range(n * output.size() / stride)) {
                auto block_idx = i / n;
                auto index = block_idx * stride + offset + (i % n);

                using device::sigmoid;
                output[index] = sigmoid(input[index]);
            }
        }

        template <class T>
        __global__ void softmax_strided(Span<T> output, View<T> input, size_type n, size_type stride, size_type offset_) {
            for (auto idx : grid_stride_range(output.size() / stride)) {
                index_type offset = idx * stride + offset_;

                auto largest = numeric_limits<T>::lowest();
                for (int i = 0; i < n; i++) {
                    using device::max;
                    largest = max(largest, output[offset + i]);
                }

                auto sum = T(0);
                for (int i = 0; i < n; i++) {
                    using device::exp;
                    auto temp = exp(output[offset + i] - largest);
                    sum += temp;
                    output[offset + i] = temp;
                }

                for (int i = 0; i < n; i++) {
                    output[offset + i] /= sum;
                }
            }
        }

        template <class T>
        __global__ void region_finalize(Span<T> output, View<T> input, View<T> bias,
            T object_prob_cutoff, T class_prob_cutoff,
            size_type height_norm, size_type width_norm,
            size_type rows, size_type cols,
            size_type boxes_per_cell,
            size_type box_size,
            size_type classes)
        {
            for (auto box_index : grid_stride_range(output.size() / box_size)) {
                auto box_of_the_cell = box_index % boxes_per_cell; /* box number within a cell */
                auto box_offset = box_index * box_size;

                auto batch_inner_size = rows * cols * boxes_per_cell;
                auto row_inner_size = cols * boxes_per_cell;
                auto col_inner_size = boxes_per_cell;

                auto y = (box_index % batch_inner_size) / row_inner_size;
                auto x = (box_index % row_inner_size) / col_inner_size;

                using device::sigmoid;
                using device::exp;
                output[box_offset + 0] = (T(x) + sigmoid(input[box_offset + 0])) / T(cols);
                output[box_offset + 1] = (T(y) + sigmoid(input[box_offset + 1])) / T(rows);
                output[box_offset + 2] = exp(input[box_offset + 2]) * bias[2 * box_of_the_cell + 0] / T(width_norm);
                output[box_offset + 3] = exp(input[box_offset + 3]) * bias[2 * box_of_the_cell + 1] / T(height_norm);

                /* squash objectness score into a probability */
                T objectness_prob = sigmoid(output[box_offset + 4]);
                output[box_offset + 4] = objectness_prob;

                /* ignore prediction if the objectness probability is less than the cutoff */
                if (objectness_prob < object_prob_cutoff)
                    objectness_prob = 0;

                /* the class probabilities we have currently are conditional class probabilities
                 * given the object
                 *
                 * to obtain the actual class probability, we multiply the conditional probability
                 * with the object probability
                 */
                const index_type class_begin = box_offset + 5; /* 4 box coordinates, 1 obj prob, class probs... */
                const index_type class_end = class_begin + classes;
                index_type offset = class_begin;

                using vector_type = get_vector_type_t<T, 4>;

                /* process each class independently until the offset is aligned to an n-element boundary */
                while (offset % vector_type::size() != 0 && offset < class_end) {
                    T actual_class_prob = objectness_prob * output[offset];
                    if (actual_class_prob <= class_prob_cutoff)
                        actual_class_prob = T(0);
                    output[offset] = actual_class_prob;
                    offset++;
                }

                auto output_vPtr = vector_type::get_pointer(output.data() + offset);
                auto input_vPtr = vector_type::get_pointer(input.data() + offset);
                for (int i = 0; (offset + vector_type::size()) < class_end; i++) {
                    vector_type vec;
                    v_load(vec, output_vPtr[i]);
                    for (int j = 0; j < vector_type::size(); j++) {
                        T actual_class_prob = objectness_prob * vec.data[j];
                        if (actual_class_prob <= class_prob_cutoff)
                            actual_class_prob = T(0);
                        vec.data[j] = actual_class_prob;
                    }
                    v_store(output_vPtr[i], vec);
                    offset += vector_type::size();
                }

                /* process the remaining classes */
                while (offset < class_end) {
                    T actual_class_prob = objectness_prob * output[offset];
                    if (actual_class_prob <= class_prob_cutoff)
                        actual_class_prob = T(0);
                    output[offset] = actual_class_prob;
                    offset++;
                }
            }
        }
    }

    template <class T>
    void sigmoid_strided(const Stream& stream, Span<T> output, View<T> input, std::size_t n, std::size_t stride, std::size_t offset) {
        CV_Assert(output.size() % stride == 0);

        auto kernel = raw::sigmoid_strided<T>;
        auto policy = make_policy(kernel, n * output.size() / stride, 0, stream);
        launch_kernel(kernel, policy, output, input, n, stride, offset);
    }

    template void sigmoid_strided(const Stream&, Span<__half>, View<__half>, std::size_t, std::size_t, std::size_t);
    template void sigmoid_strided(const Stream&, Span<float>, View<float>, std::size_t, std::size_t, std::size_t);

    template <class T>
    void softmax_strided(const Stream& stream, Span<T> output, View<T> input, std::size_t n, std::size_t stride, std::size_t offset) {
        CV_Assert(output.size() % stride == 0);

        auto kernel = raw::softmax_strided<T>;
        auto policy = make_policy(kernel, output.size() / stride, 0, stream);
        launch_kernel(kernel, policy, output, input, n, stride, offset);
    }

    template void softmax_strided(const Stream&, Span<__half>, View<__half>, std::size_t, std::size_t, std::size_t);
    template void softmax_strided(const Stream&, Span<float>, View<float>, std::size_t, std::size_t, std::size_t);
||||
|
||||
template <class T> |
||||
void region_finalize(const Stream& stream, Span<T> output, View<T> input, View<T> bias, |
||||
T object_prob_cutoff, T class_prob_cutoff, |
||||
std::size_t height_norm, std::size_t width_norm, |
||||
std::size_t rows, std::size_t cols, |
||||
std::size_t boxes_per_cell, |
||||
std::size_t box_size, |
||||
std::size_t classes) |
||||
{ |
||||
CV_Assert(output.size() % box_size == 0); |
||||
|
||||
auto kernel = raw::region_finalize<T>; |
||||
auto policy = make_policy(kernel, output.size() / box_size, 0, stream); |
||||
launch_kernel(kernel, policy, output, input, bias, |
||||
object_prob_cutoff, class_prob_cutoff, |
||||
height_norm, width_norm, |
||||
rows, cols, boxes_per_cell, box_size, classes); |
||||
} |
||||
|
||||
template void region_finalize(const Stream&, Span<__half>, View<__half>, View<__half>, |
||||
__half, __half, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t); |
||||
|
||||
template void region_finalize(const Stream&, Span<float>, View<float>, View<float>, |
||||
float, float, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t); |
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */ |
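As a hedged illustration (not part of the patch), the following CPU reference captures the indexing scheme used by sigmoid_strided and softmax_strided above: the buffer is treated as blocks of `stride` elements and only the `n` consecutive values starting at `offset` within each block are transformed. All names are local to this sketch.

#include <vector>
#include <cmath>
#include <algorithm>
#include <cstddef>

// CPU reference for sigmoid_strided: apply sigmoid to `n` consecutive values
// starting at `offset` inside every block of `stride` elements
void sigmoid_strided_reference(std::vector<float>& data, std::size_t n,
                               std::size_t stride, std::size_t offset)
{
    for (std::size_t block = 0; block < data.size() / stride; block++)
        for (std::size_t i = 0; i < n; i++)
        {
            float& x = data[block * stride + offset + i];
            x = 1.f / (1.f + std::exp(-x));
        }
}

// CPU reference for softmax_strided: in-place softmax over the same ranges,
// subtracting the maximum for numerical stability as the kernel does
void softmax_strided_reference(std::vector<float>& data, std::size_t n,
                               std::size_t stride, std::size_t offset)
{
    for (std::size_t block = 0; block < data.size() / stride; block++)
    {
        float* x = data.data() + block * stride + offset;
        const float largest = *std::max_element(x, x + n);
        float sum = 0.f;
        for (std::size_t i = 0; i < n; i++) { x[i] = std::exp(x[i] - largest); sum += x[i]; }
        for (std::size_t i = 0; i < n; i++) x[i] /= sum;
    }
}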
@@ -0,0 +1,133 @@
||||
// This file is part of OpenCV project. |
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory |
||||
// of this distribution and at http://opencv.org/license.html. |
||||
|
||||
#include <cuda_runtime.h> |
||||
#include <cuda_fp16.h> |
||||
|
||||
#include "math.hpp" |
||||
#include "types.hpp" |
||||
#include "grid_stride_range.hpp" |
||||
#include "execution.hpp" |
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp" |
||||
#include "../cuda4dnn/csl/tensor.hpp" |
||||
#include "../cuda4dnn/csl/span.hpp" |
||||
|
||||
#include <cuda_runtime.h> |
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl; |
||||
using namespace cv::dnn::cuda4dnn::csl::device; |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { |
||||
|
||||
namespace raw { |
||||
template <class T> |
||||
__global__ void resize_nn( |
||||
Span<T> output, size_type out_height, size_type out_width, |
||||
View<T> input, size_type in_height, size_type in_width) |
||||
{ |
||||
auto in_image_size = in_height * in_width; |
||||
auto out_image_size = out_height * out_width; |
||||
|
||||
/* o2i = output to input */ |
||||
auto o2i_fx = static_cast<float>(in_width) / out_width; |
||||
auto o2i_fy = static_cast<float>(in_height) / out_height; |
||||
|
||||
/* think of the output and input as a collection of 2d images with the last axis
 * representing the width and the last but one axis representing the height
 *
 * the remaining axes together form a collection of these images
 */
||||
for (auto idx : grid_stride_range(output.size())) { |
||||
const index_type n = idx / out_image_size; |
||||
const index_type x = (idx % out_image_size) % out_width; |
||||
const index_type y = (idx % out_image_size) / out_width; |
||||
|
||||
auto in_x = static_cast<index_type>(x * o2i_fx); |
||||
auto in_y = static_cast<index_type>(y * o2i_fy); |
||||
|
||||
index_type in_idx = n * in_image_size + in_y * in_width + in_x; |
||||
output[idx] = input[in_idx]; |
||||
} |
||||
} |
||||
|
||||
template <class T> |
||||
__global__ void resize_bilinear( |
||||
Span<T> output, size_type out_height, size_type out_width, |
||||
View<T> input, size_type in_height, size_type in_width, |
||||
float o2i_fy, float o2i_fx) |
||||
{ |
||||
auto in_image_size = in_height * in_width; |
||||
auto out_image_size = out_height * out_width; |
||||
|
||||
/* think of the output and input as a collection of 2d images with the last axis
 * representing the width and the last but one axis representing the height
 *
 * the remaining axes together form a collection of these images
 */
||||
for (auto idx : grid_stride_range(output.size())) { |
||||
const index_type n = idx / out_image_size; |
||||
const index_type x = (idx % out_image_size) % out_width; |
||||
const index_type y = (idx % out_image_size) / out_width; |
||||
|
||||
auto in_x = x * o2i_fx; |
||||
auto in_y = y * o2i_fy; |
||||
|
||||
auto in_x0 = static_cast<index_type>(in_x); |
||||
auto in_y0 = static_cast<index_type>(in_y); |
||||
|
||||
using device::min; |
||||
auto in_x1 = min<index_type>(in_x0 + 1, in_width - 1); |
||||
auto in_y1 = min<index_type>(in_y0 + 1, in_height - 1); |
||||
|
||||
const index_type in_offset_r0 = n * in_image_size + in_y0 * in_width; |
||||
const index_type in_offset_r1 = n * in_image_size + in_y1 * in_width; |
||||
|
||||
auto v_00 = input[in_offset_r0 + in_x0], |
||||
v_01 = input[in_offset_r0 + in_x1], |
||||
v_10 = input[in_offset_r1 + in_x0], |
||||
v_11 = input[in_offset_r1 + in_x1]; |
||||
|
||||
output[idx] = |
||||
v_00 + |
||||
T(in_y - in_y0) * T(v_10 - v_00) + |
||||
T(in_x - in_x0) * T(v_01 - v_00) + |
||||
T(in_y - in_y0) * T(in_x - in_x0) * T(v_11 - v_01 - v_10 + v_00); |
||||
} |
||||
} |
||||
} |
||||
|
||||
template <class T> |
||||
void resize_nn(const Stream& stream, TensorSpan<T> output, TensorView<T> input) { |
||||
auto in_height = input.get_axis_size(-2); |
||||
auto in_width = input.get_axis_size(-1); |
||||
|
||||
auto out_height = output.get_axis_size(-2); |
||||
auto out_width = output.get_axis_size(-1); |
||||
|
||||
auto kernel = raw::resize_nn<T>; |
||||
auto policy = make_policy(kernel, output.size(), 0, stream); |
||||
launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width); |
||||
} |
||||
|
||||
template void resize_nn<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>); |
||||
template void resize_nn<float>(const Stream&, TensorSpan<float>, TensorView<float>); |
||||
|
||||
template <class T> |
||||
void resize_bilinear(const Stream& stream, TensorSpan<T> output, TensorView<T> input, float scale_y, float scale_x) { |
||||
auto in_height = input.get_axis_size(-2); |
||||
auto in_width = input.get_axis_size(-1); |
||||
|
||||
auto out_height = output.get_axis_size(-2); |
||||
auto out_width = output.get_axis_size(-1); |
||||
|
||||
auto kernel = raw::resize_bilinear<T>; |
||||
auto policy = make_policy(kernel, output.size(), 0, stream); |
||||
launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x); |
||||
} |
||||
|
||||
template void resize_bilinear<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, float, float); |
||||
template void resize_bilinear<float>(const Stream&, TensorSpan<float>, TensorView<float>, float, float); |
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */ |
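As a hedged illustration (not part of the patch), here is a CPU reference of the output-to-input mapping used by resize_nn for a single row-major image; resize_bilinear uses the same mapping followed by standard bilinear interpolation written in incremental form. All names are local to this sketch.

// CPU reference for resize_nn on one image: each output pixel maps back to an
// input pixel by truncating the scaled coordinates, exactly as the kernel does
void resize_nn_reference(const float* in, int in_height, int in_width,
                         float* out, int out_height, int out_width)
{
    const float o2i_fy = static_cast<float>(in_height) / out_height;
    const float o2i_fx = static_cast<float>(in_width) / out_width;
    for (int y = 0; y < out_height; y++)
        for (int x = 0; x < out_width; x++)
        {
            const int in_y = static_cast<int>(y * o2i_fy);
            const int in_x = static_cast<int>(x * o2i_fx);
            out[y * out_width + x] = in[in_y * in_width + in_x];
        }
}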
@@ -0,0 +1,311 @@
||||
// This file is part of OpenCV project. |
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory |
||||
// of this distribution and at http://opencv.org/license.html. |
||||
|
||||
#include <cuda_runtime.h> |
||||
#include <cuda_fp16.h> |
||||
|
||||
#include "types.hpp" |
||||
#include "vector_traits.hpp" |
||||
#include "grid_stride_range.hpp" |
||||
#include "execution.hpp" |
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp" |
||||
#include "../cuda4dnn/csl/tensor.hpp" |
||||
#include "../cuda4dnn/csl/span.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <cstddef> |
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl; |
||||
using namespace cv::dnn::cuda4dnn::csl::device; |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { |
||||
|
||||
namespace raw { |
||||
template <class T, std::size_t N> |
||||
__global__ void bias1_vec(Span<T> output, View<T> input, T beta) { |
||||
using vector_type = get_vector_type_t<T, N>; |
||||
|
||||
auto output_vPtr = vector_type::get_pointer(output.data()); |
||||
auto input_vPtr = vector_type::get_pointer(input.data()); |
||||
|
||||
for (auto i : grid_stride_range(output.size() / vector_type::size())) { |
||||
vector_type vec; |
||||
v_load(vec, input_vPtr[i]); |
||||
for (int j = 0; j < vec.size(); j++) |
||||
vec.data[j] = vec.data[j] + beta; |
||||
v_store(output_vPtr[i], vec); |
||||
} |
||||
} |
||||
|
||||
template <class T, std::size_t N> |
||||
__global__ void biasN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> bias) { |
||||
using vector_type = get_vector_type_t<T, N>; |
||||
|
||||
auto output_vPtr = vector_type::get_pointer(output.data()); |
||||
auto input_vPtr = vector_type::get_pointer(input.data()); |
||||
|
||||
inner_size /= vector_type::size(); |
||||
for (auto i : grid_stride_range(output.size() / vector_type::size())) { |
||||
const index_type bias_idx = (i / inner_size) % static_cast<size_type>(bias.size()); |
||||
|
||||
vector_type vec; |
||||
v_load(vec, input_vPtr[i]); |
||||
for(int j = 0; j < vec.size(); j++) |
||||
vec.data[j] = vec.data[j] + bias[bias_idx]; |
||||
v_store(output_vPtr[i], vec); |
||||
} |
||||
} |
||||
|
||||
template <class T, std::size_t N> |
||||
__global__ void scale1_vec(Span<T> output, View<T> input, T alpha) { |
||||
using vector_type = get_vector_type_t<T, N>; |
||||
|
||||
auto output_vPtr = vector_type::get_pointer(output.data()); |
||||
auto input_vPtr = vector_type::get_pointer(input.data()); |
||||
|
||||
for (auto i : grid_stride_range(output.size() / vector_type::size())) { |
||||
vector_type vec; |
||||
v_load(vec, input_vPtr[i]); |
||||
for (int j = 0; j < vec.size(); j++) |
||||
vec.data[j] = vec.data[j] * alpha; |
||||
v_store(output_vPtr[i], vec); |
||||
} |
||||
} |
||||
|
||||
template <class T, std::size_t N> |
||||
__global__ void scaleN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> weights) |
||||
{ |
||||
using vector_type = get_vector_type_t<T, N>; |
||||
|
||||
auto output_vPtr = vector_type::get_pointer(output.data()); |
||||
auto input_vPtr = vector_type::get_pointer(input.data()); |
||||
|
||||
inner_size /= vector_type::size(); |
||||
for (auto i : grid_stride_range(output.size() / vector_type::size())) { |
||||
const index_type scale_idx = (i / inner_size) % static_cast<size_type>(weights.size()); |
||||
|
||||
vector_type vec; |
||||
v_load(vec, input_vPtr[i]); |
||||
for (int j = 0; j < vec.size(); j++) |
||||
vec.data[j] = vec.data[j] * weights[scale_idx]; |
||||
v_store(output_vPtr[i], vec); |
||||
} |
||||
} |
||||
|
||||
template <class T, std::size_t N> |
||||
__global__ void scale1_with_bias1_vec(Span<T> output, View<T> input, T alpha, T beta) |
||||
{ |
||||
using vector_type = get_vector_type_t<T, N>; |
||||
|
||||
auto output_vPtr = vector_type::get_pointer(output.data()); |
||||
auto input_vPtr = vector_type::get_pointer(input.data()); |
||||
|
||||
for (auto i : grid_stride_range(output.size() / vector_type::size())) { |
||||
vector_type vec; |
||||
v_load(vec, input_vPtr[i]); |
||||
for (int j = 0; j < vec.size(); j++) |
||||
vec.data[j] = alpha * vec.data[j] + beta; |
||||
v_store(output_vPtr[i], vec); |
||||
} |
||||
} |
||||
|
||||
template <class T, std::size_t N> |
||||
__global__ void scaleN_with_biasN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> weights, View<T> bias) |
||||
{ |
||||
using vector_type = get_vector_type_t<T, N>; |
||||
|
||||
auto output_vPtr = vector_type::get_pointer(output.data()); |
||||
auto input_vPtr = vector_type::get_pointer(input.data()); |
||||
|
||||
inner_size /= vector_type::size(); |
||||
for (auto i : grid_stride_range(output.size() / vector_type::size())) { |
||||
const index_type scale_idx = (i / inner_size) % static_cast<size_type>(weights.size()); |
||||
|
||||
vector_type vec; |
||||
v_load(vec, input_vPtr[i]); |
||||
for (int j = 0; j < vec.size(); j++) |
||||
vec.data[j] = vec.data[j] * weights[scale_idx] + bias[scale_idx]; |
||||
v_store(output_vPtr[i], vec); |
||||
} |
||||
} |
||||
} |
||||
|
||||
template <class T, std::size_t N> static |
||||
void launch_bias1_vec_kernel(const Stream& stream, Span<T> output, View<T> input, T beta) { |
||||
CV_Assert(is_fully_aligned<T>(output, N)); |
||||
CV_Assert(is_fully_aligned<T>(input, N)); |
||||
|
||||
auto kernel = raw::bias1_vec<T, N>; |
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream); |
||||
launch_kernel(kernel, policy, output, input, beta); |
||||
} |
||||
|
||||
template <class T> |
||||
void bias1(const Stream& stream, TensorSpan<T> output, TensorView<T> input, T beta) { |
||||
CV_Assert(is_shape_same(input, output)); |
||||
|
||||
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) { |
||||
launch_bias1_vec_kernel<T, 4>(stream, output, input, beta); |
||||
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) { |
||||
launch_bias1_vec_kernel<T, 2>(stream, output, input, beta); |
||||
} else { |
||||
launch_bias1_vec_kernel<T, 1>(stream, output, input, beta); |
||||
} |
||||
} |
||||
|
||||
template void bias1<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, __half); |
||||
template void bias1<float>(const Stream&, TensorSpan<float>, TensorView<float>, float); |
||||
|
||||
template <class T, std::size_t N> static |
||||
void launch_biasN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> bias){ |
||||
CV_Assert(is_fully_aligned<T>(output, N)); |
||||
CV_Assert(is_fully_aligned<T>(input, N)); |
||||
CV_Assert(inner_size % N == 0); |
||||
|
||||
auto kernel = raw::biasN_vec<T, N>; |
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream); |
||||
launch_kernel(kernel, policy, output, input, inner_size, bias); |
||||
} |
||||
|
||||
template <class T> |
||||
void biasN( |
||||
const Stream& stream, |
||||
TensorSpan<T> output, |
||||
TensorView<T> input, std::size_t inner_size, |
||||
TensorView<T> bias) |
||||
{ |
||||
CV_Assert(is_shape_same(input, output)); |
||||
|
||||
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) { |
||||
launch_biasN_vec_kernel<T, 4>(stream, output, input, inner_size, bias); |
||||
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) { |
||||
launch_biasN_vec_kernel<T, 2>(stream, output, input, inner_size, bias); |
||||
} else { |
||||
launch_biasN_vec_kernel<T, 1>(stream, output, input, inner_size, bias); |
||||
} |
||||
} |
||||
|
||||
template void biasN<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, std::size_t, TensorView<__half>); |
||||
template void biasN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>); |
||||
|
||||
template <class T, std::size_t N> static |
||||
void launch_scale1_vec_kernel(const Stream& stream, Span<T> output, View<T> input, T alpha) { |
||||
CV_Assert(is_fully_aligned<T>(output, N)); |
||||
CV_Assert(is_fully_aligned<T>(input, N)); |
||||
|
||||
auto kernel = raw::scale1_vec<T, N>; |
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream); |
||||
launch_kernel(kernel, policy, output, input, alpha); |
||||
} |
||||
|
||||
template <class T> |
||||
void scale1(const Stream& stream, TensorSpan<T> output, TensorView<T> input, T alpha) { |
||||
CV_Assert(is_shape_same(input, output)); |
||||
|
||||
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) { |
||||
launch_scale1_vec_kernel<T, 4>(stream, output, input, alpha); |
||||
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) { |
||||
launch_scale1_vec_kernel<T, 2>(stream, output, input, alpha); |
||||
} else { |
||||
launch_scale1_vec_kernel<T, 1>(stream, output, input, alpha); |
||||
} |
||||
} |
||||
|
||||
template void scale1<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, __half); |
||||
template void scale1<float>(const Stream&, TensorSpan<float>, TensorView<float>, float); |
||||
|
||||
template <class T, std::size_t N> static |
||||
void launch_scaleN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> weights) { |
||||
CV_Assert(is_fully_aligned<T>(output, N)); |
||||
CV_Assert(is_fully_aligned<T>(input, N)); |
||||
CV_Assert(inner_size % N == 0); |
||||
|
||||
auto kernel = raw::scaleN_vec<T, N>; |
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream); |
||||
launch_kernel(kernel, policy, output, input, inner_size, weights); |
||||
} |
||||
|
||||
template <class T> |
||||
void scaleN( |
||||
const Stream& stream, |
||||
TensorSpan<T> output, |
||||
TensorView<T> input, std::size_t inner_size, |
||||
TensorView<T> weights) |
||||
{ |
||||
CV_Assert(is_shape_same(input, output)); |
||||
|
||||
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) { |
||||
launch_scaleN_vec_kernel<T, 4>(stream, output, input, inner_size, weights); |
||||
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) { |
||||
launch_scaleN_vec_kernel<T, 2>(stream, output, input, inner_size, weights); |
||||
} else { |
||||
launch_scaleN_vec_kernel<T, 1>(stream, output, input, inner_size, weights); |
||||
} |
||||
} |
||||
|
||||
template void scaleN<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, std::size_t, TensorView<__half>); |
||||
template void scaleN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>); |
||||
|
||||
template <class T, std::size_t N> static |
||||
void launch_scale1_with_bias1_vec_kernel(const Stream& stream, Span<T> output, View<T> input, T alpha, T beta) { |
||||
CV_Assert(is_fully_aligned<T>(output, N)); |
||||
CV_Assert(is_fully_aligned<T>(input, N)); |
||||
|
||||
auto kernel = raw::scale1_with_bias1_vec<T, N>; |
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream); |
||||
launch_kernel(kernel, policy, output, input, alpha, beta); |
||||
} |
||||
|
||||
template <class T> |
||||
void scale1_with_bias1(const Stream& stream, Span<T> output, View<T> input, T alpha, T beta) { |
||||
CV_Assert(output.size() == input.size()); |
||||
|
||||
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) { |
||||
launch_scale1_with_bias1_vec_kernel<T, 4>(stream, output, input, alpha, beta); |
||||
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) { |
||||
launch_scale1_with_bias1_vec_kernel<T, 2>(stream, output, input, alpha, beta); |
||||
} else { |
||||
launch_scale1_with_bias1_vec_kernel<T, 1>(stream, output, input, alpha, beta); |
||||
} |
||||
} |
||||
|
||||
template void scale1_with_bias1<__half>(const Stream&, Span<__half>, View<__half>, __half, __half); |
||||
template void scale1_with_bias1<float>(const Stream&, Span<float>, View<float>, float, float); |
||||
|
||||
template <class T, std::size_t N> static |
||||
void launch_scaleN_with_biasN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> weights, View<T> bias) { |
||||
CV_Assert(is_fully_aligned<T>(output, N)); |
||||
CV_Assert(is_fully_aligned<T>(input, N)); |
||||
CV_Assert(inner_size % N == 0); |
||||
|
||||
auto kernel = raw::scaleN_with_biasN_vec<T, N>; |
||||
auto policy = make_policy(kernel, output.size() / N, 0, stream); |
||||
launch_kernel(kernel, policy, output, input, inner_size, weights, bias); |
||||
} |
||||
|
||||
template <class T> |
||||
void scaleN_with_biasN( |
||||
const Stream& stream, |
||||
TensorSpan<T> output, |
||||
TensorView<T> input, std::size_t inner_size, |
||||
TensorView<T> weights, TensorView<T> bias) |
||||
{ |
||||
CV_Assert(is_shape_same(input, output)); |
||||
CV_Assert(weights.size() == bias.size()); |
||||
|
||||
if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) { |
||||
launch_scaleN_with_biasN_vec_kernel<T, 4>(stream, output, input, inner_size, weights, bias); |
||||
} else if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) { |
||||
launch_scaleN_with_biasN_vec_kernel<T, 2>(stream, output, input, inner_size, weights, bias); |
||||
} else { |
||||
launch_scaleN_with_biasN_vec_kernel<T, 1>(stream, output, input, inner_size, weights, bias); |
||||
} |
||||
} |
||||
|
||||
template void scaleN_with_biasN<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, std::size_t, TensorView<__half>, TensorView<__half>); |
||||
template void scaleN_with_biasN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>, TensorView<float>); |
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */ |
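As a hedged illustration (not part of the patch), the broadcasting rule shared by biasN, scaleN and scaleN_with_biasN above: element i uses channel (i / inner_size) % channels, so each weight/bias value covers `inner_size` contiguous elements and the channel pattern repeats across the outer axes. All names are local to this sketch.

#include <vector>
#include <cstddef>

// CPU reference for scaleN_with_biasN: per-channel affine transform using the same
// channel-selection rule as the vectorized kernels above
void scaleN_with_biasN_reference(std::vector<float>& data, std::size_t inner_size,
                                 const std::vector<float>& weights,
                                 const std::vector<float>& bias)
{
    for (std::size_t i = 0; i < data.size(); i++)
    {
        const std::size_t c = (i / inner_size) % weights.size();
        data[i] = data[i] * weights[c] + bias[c];
    }
}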
@@ -0,0 +1,169 @@
||||
// This file is part of OpenCV project. |
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory |
||||
// of this distribution and at http://opencv.org/license.html. |
||||
|
||||
#include <cuda_runtime.h> |
||||
#include <cuda_fp16.h> |
||||
|
||||
#include "array.hpp" |
||||
#include "types.hpp" |
||||
#include "grid_stride_range.hpp" |
||||
#include "execution.hpp" |
||||
#include "kernel_dispatcher.hpp" |
||||
|
||||
#include "../cuda4dnn/csl/stream.hpp" |
||||
#include "../cuda4dnn/csl/tensor.hpp" |
||||
#include "../cuda4dnn/csl/span.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <cstddef> |
||||
#include <vector> |
||||
#include <iostream> |
||||
|
||||
using namespace cv::dnn::cuda4dnn::csl; |
||||
using namespace cv::dnn::cuda4dnn::csl::device; |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { |
||||
|
||||
namespace raw { |
||||
template <class T, std::size_t Rank> |
||||
__global__ void slice( |
||||
Span<T> output, array<size_type, Rank> out_strides, |
||||
View<T> input, array<size_type, Rank> in_strides, array<index_type, Rank> in_offset) |
||||
{ |
||||
for (auto i : grid_stride_range(output.size())) { |
||||
index_type out_index = i / out_strides[0]; |
||||
index_type in_index = in_offset[0] + out_index; |
||||
index_type iidx = in_index * in_strides[0]; |
||||
for (int j = 1; j < Rank; j++) { |
||||
out_index = (i % out_strides[j - 1]) / out_strides[j]; |
||||
in_index = in_offset[j] + out_index; |
||||
iidx += in_index * in_strides[j]; |
||||
} |
||||
|
||||
output[i] = input[iidx]; |
||||
} |
||||
} |
||||
} |
||||
|
||||
template <class T, std::size_t Rank> static |
||||
void launch_slice( |
||||
const Stream& stream, |
||||
Span<T> output, const std::vector<std::size_t>& outStride, |
||||
View<T> input, const std::vector<std::size_t>& inStride, const std::vector<std::size_t>& inOffset) |
||||
{ |
||||
CV_Assert(outStride.size() == Rank); |
||||
CV_Assert(inStride.size() == Rank); |
||||
CV_Assert(inOffset.size() == Rank); |
||||
|
||||
array<size_type, Rank> outStride_k, inStride_k; |
||||
outStride_k.assign(std::begin(outStride), std::end(outStride)); |
||||
inStride_k.assign(std::begin(inStride), std::end(inStride)); |
||||
|
||||
array<index_type, Rank> inOffset_k; |
||||
inOffset_k.assign(std::begin(inOffset), std::end(inOffset)); |
||||
|
||||
auto kernel = raw::slice<T, Rank>; |
||||
auto policy = make_policy(kernel, output.size(), 0, stream); |
||||
launch_kernel(kernel, policy, output, outStride_k, input, inStride_k, inOffset_k); |
||||
} |
||||
|
||||
GENERATE_KERNEL_DISPATCHER(slice_dispatcher, launch_slice); |
||||
|
||||
template <class T> |
||||
void slice(const Stream& stream, |
||||
TensorSpan<T> output, TensorView<T> input, |
||||
std::vector<std::size_t> offsets) |
||||
{ |
||||
CV_Assert(output.rank() == input.rank()); |
||||
CV_Assert(output.rank() == offsets.size()); |
||||
|
||||
/* squeezable axes at the beginning of both tensors can be eliminated
 *
 * Reasoning:
 * ----------
 * Suppose an item's indices in the output tensor are [o1, o2, ...]. The indices in the input
 * tensor will be [o1 + off1, o2 + off2, ...]. The rest of the elements in the input are ignored.
 *
 * If the size of the first axis of the input and output tensor is unity, the input and output indices
 * for all the elements will be of the form [0, o2 + off2, ...] and [0, o2, ...] respectively. Note that
 * there cannot be any ignored items since the axes have unit size. The first index does not contribute to the
 * element's address calculation and hence does nothing apart from eating up a few cycles.
 */
||||
while (input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) { |
||||
CV_Assert(offsets[0] == 0); |
||||
|
||||
input.squeeze(0); |
||||
output.squeeze(0); |
||||
offsets.erase(std::begin(offsets)); |
||||
|
||||
CV_Assert(output.rank() == input.rank()); |
||||
CV_Assert(output.rank() == offsets.size()); |
||||
} |
||||
|
||||
auto inShape = input.shape_as_vector(); |
||||
auto outShape = output.shape_as_vector(); |
||||
|
||||
/* contiguous axes which do not undergo slicing can be combined into one axis |
||||
* |
||||
* Reasoning: |
||||
* ---------- |
||||
 * Suppose an item's indices in the output tensor are [o1, o2, o3, ...]. Let the first two axes not undergo any
 * slicing. The indices in the input tensor will be [o1, o2, o3 + off3, ...].
 *
 * Each axis in the contiguous unsliced axes sequence will add an offset of iN * strideN. In the above example,
 * the two axes add a total offset of `o1 * stride1 + o2 * stride2`. We can merge the two axes into one axis with
 * a size of `size1 * size2`. The new offset added will be `o12 * stride2` as the kernel iterates through `o12`.
 * Note that `o12` is actually `(o1 * size2 + o2)` in the original tensor.
 */
||||
for (int i = 0; i < inShape.size(); i++) { |
||||
/* check if axis `i` requires any slicing */ |
||||
if (offsets[i] == 0 && inShape[i] == outShape[i]) { |
||||
/* loop invariant: `i` is the first axis in the contiguous unsliced axis sequence */ |
||||
|
||||
int j = i + 1; /* `j` is the axis which we will attempt to merge */ |
||||
while (j < inShape.size() && offsets[j] == 0 && inShape[j] == outShape[j]) { |
||||
/* `j` axis is also unsliced; merge `i` and `j` */ |
||||
auto new_size = inShape[i] * inShape[j]; |
||||
inShape[i] = new_size; |
||||
outShape[i] = new_size; |
||||
offsets[i] = 0; /* redundant */ |
||||
|
||||
/* delete axis `j` */ |
||||
inShape.erase(std::begin(inShape) + j); |
||||
outShape.erase(std::begin(outShape) + j); |
||||
offsets.erase(std::begin(offsets) + j); |
||||
|
||||
/* optimizations should not break the invariants */ |
||||
CV_Assert(inShape.size() == outShape.size()); |
||||
CV_Assert(inShape.size() == offsets.size()); |
||||
CV_Assert(inShape[i] == outShape[i]); |
||||
CV_Assert(offsets[i] == 0); |
||||
} |
||||
} |
||||
} |
||||
|
||||
auto rank = inShape.size(); |
||||
|
||||
std::vector<std::size_t> inStride(rank), outStride(rank); |
||||
inStride.back() = 1; |
||||
outStride.back() = 1; |
||||
/* garbage, ..., garbage, 1 */ |
||||
|
||||
std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride)); |
||||
std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride)); |
||||
/* dim[0], dim[1], ..., dim[-1], 1 */ |
||||
|
||||
std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<std::size_t>()); |
||||
std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>()); |
||||
/* stride[0], stride[1], ..., stride[-2], 1 */ |
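/* worked example (illustration only, not part of the patch): for a shape of [2, 3, 4],
 * the copy above produces [3, 4, 1] and the reverse partial product turns it into the
 * strides [12, 4, 1] */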
||||
|
||||
CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK); |
||||
slice_dispatcher<T, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, input, inStride, offsets); |
||||
} |
||||
|
||||
template void slice(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>); |
||||
template void slice(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>); |
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */ |
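A hedged worked example (not part of the patch) of the two shape optimizations performed by slice() above, for illustrative shapes:

/* input shape [1, 4, 8, 16], output shape [1, 2, 8, 16], offsets [0, 1, 0, 0]
 *
 *  1. the leading unit axis is squeezed:
 *        in [4, 8, 16], out [2, 8, 16], offsets [1, 0, 0]
 *  2. axes 1 and 2 are contiguous and unsliced, so they are merged into a single axis:
 *        in [4, 128], out [2, 128], offsets [1, 0]
 *
 * the strides become in = {128, 1} and out = {128, 1}, and the rank-2 specialization of
 * the kernel is dispatched instead of a rank-4 one
 */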
@@ -1,18 +0,0 @@
||||
// This file is part of OpenCV project. |
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory |
||||
// of this distribution and at http://opencv.org/license.html. |
||||
|
||||
// this file is a stub and will be removed once actual code is added |
||||
|
||||
#include "../precomp.hpp" |
||||
|
||||
#include <cuda_runtime.h> |
||||
|
||||
#ifndef HAVE_CUDA |
||||
# error "CUDA files should not be compiled if CUDA was not enabled" |
||||
#endif |
||||
|
||||
__global__ void cuda4dnn_build_test_kernel(float* addr) { |
||||
int idx = threadIdx.x; |
||||
addr[idx] = 0.0; |
||||
} |
@@ -0,0 +1,27 @@
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA_TYPES_HPP |
||||
#define OPENCV_DNN_SRC_CUDA_TYPES_HPP |
||||
|
||||
#include <cstdint> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device { |
||||
|
||||
/* For indices, we can use 32-bit or 64-bit variables. The GPU registers are 32 bits in size.
 * Hence, a 64-bit variable requires two registers and is significantly slower than the 32-bit version.
 *
 * If we do not need to handle huge tensors, we can use 32-bit indices and get better performance.
 */
||||
#ifdef __CUDACC__ |
||||
using size_type = int; |
||||
using index_type = int; |
||||
#else |
||||
using size_type = std::int32_t; |
||||
using index_type = std::int32_t; |
||||
#endif |
||||
|
||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA_TYPES_HPP */ |
@@ -0,0 +1,109 @@
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA_VECTOR_TRAITS_HPP |
||||
#define OPENCV_DNN_SRC_CUDA_VECTOR_TRAITS_HPP |
||||
|
||||
#include <cuda_runtime.h> |
||||
|
||||
#include "types.hpp" |
||||
|
||||
#include "../cuda4dnn/csl/pointer.hpp" |
||||
|
||||
#include <type_traits> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device { |
||||
|
||||
/** \file vector_traits.hpp
|
||||
* \brief utility classes and functions for vectorized memory loads/stores |
||||
* |
||||
* Example: |
||||
* using vector_type = get_vector_type_t<float, 4>; |
||||
* |
||||
* auto input_vPtr = type::get_pointer(iptr); // iptr is of type DevicePtr<const float>
|
||||
* auto output_vPtr = type::get_pointer(optr); // optr is of type DevicePtr<float>
|
||||
* |
||||
* vector_type vec; |
||||
* v_load(vec, input_vPtr); |
||||
* |
||||
* for(int i = 0; i < vector_type::size(); i++) |
||||
* vec[i] = do_something(vec[i]); |
||||
* |
||||
* v_store(output_vPtr, vec); |
||||
*/ |
||||
|
||||
namespace detail { |
||||
template <size_type N> struct raw_type_ { }; |
||||
template <> struct raw_type_<256> { typedef ulonglong4 type; }; |
||||
template <> struct raw_type_<128> { typedef uint4 type; }; |
||||
template <> struct raw_type_<64> { typedef uint2 type; }; |
||||
template <> struct raw_type_<32> { typedef uint1 type; }; |
||||
template <> struct raw_type_<16> { typedef uchar2 type; }; |
||||
template <> struct raw_type_<8> { typedef uchar1 type; }; |
||||
|
||||
template <size_type N> struct raw_type { |
||||
using type = typename raw_type_<N>::type; |
||||
static_assert(sizeof(type) * 8 == N, ""); |
||||
}; |
||||
} |
||||
|
||||
/* \tparam T type of element in the vector
|
||||
* \tparam N "number of elements" of type T in the vector |
||||
*/ |
||||
template <class T, size_type N> |
||||
union vector_type { |
||||
using value_type = T; |
||||
using raw_type = typename detail::raw_type<N * sizeof(T) * 8>::type; |
||||
|
||||
__device__ vector_type() { } |
||||
|
||||
__device__ static constexpr size_type size() { return N; } |
||||
|
||||
raw_type raw; |
||||
T data[N]; |
||||
|
||||
template <class U> static __device__ |
||||
typename std::enable_if<std::is_const<U>::value, const vector_type*> |
||||
::type get_pointer(csl::DevicePtr<U> ptr) { |
||||
return reinterpret_cast<const vector_type*>(ptr.get()); |
||||
} |
||||
|
||||
template <class U> static __device__ |
||||
typename std::enable_if<!std::is_const<U>::value, vector_type*> |
||||
::type get_pointer(csl::DevicePtr<U> ptr) { |
||||
return reinterpret_cast<vector_type*>(ptr.get()); |
||||
} |
||||
}; |
||||
|
||||
template <class V> |
||||
__device__ void v_load(V& dest, const V& src) { |
||||
dest.raw = src.raw; |
||||
} |
||||
|
||||
template <class V> |
||||
__device__ void v_load(V& dest, const V* src) { |
||||
dest.raw = src->raw; |
||||
} |
||||
|
||||
template <class V> |
||||
__device__ void v_store(V* dest, const V& src) { |
||||
dest->raw = src.raw; |
||||
} |
||||
|
||||
template <class V> |
||||
__device__ void v_store(V& dest, const V& src) { |
||||
dest.raw = src.raw; |
||||
} |
||||
|
||||
template <class T, size_type N> |
||||
struct get_vector_type { |
||||
typedef vector_type<T, N> type; |
||||
}; |
||||
|
||||
template <class T, size_type N> |
||||
using get_vector_type_t = typename get_vector_type<T, N>::type; |
||||
|
||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA_VECTOR_TRAITS_HPP */ |
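A hedged sketch (not part of the patch) of a complete kernel written against these traits; the kernel name and the squaring operation are hypothetical, while Span/View and grid_stride_range are the utilities used throughout the other kernels in this PR.

template <class T, std::size_t N>
__global__ void square_vec(Span<T> output, View<T> input) {
    using vector_type = get_vector_type_t<T, N>;

    auto output_vPtr = vector_type::get_pointer(output.data());
    auto input_vPtr = vector_type::get_pointer(input.data());

    /* each iteration moves vector_type::size() elements with a single load and store */
    for (auto i : grid_stride_range(output.size() / vector_type::size())) {
        vector_type vec;
        v_load(vec, input_vPtr[i]);
        for (int j = 0; j < vector_type::size(); j++)
            vec.data[j] = vec.data[j] * vec.data[j];
        v_store(output_vPtr[i], vec);
    }
}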
@@ -0,0 +1,230 @@
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP |
||||
|
||||
#include "error.hpp" |
||||
#include "stream.hpp" |
||||
#include "pointer.hpp" |
||||
#include "fp16.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <cublas_v2.h> |
||||
|
||||
#include <cstddef> |
||||
#include <memory> |
||||
#include <utility> |
||||
|
||||
#define CUDA4DNN_CHECK_CUBLAS(call) \ |
||||
::cv::dnn::cuda4dnn::csl::cublas::detail::check((call), CV_Func, __FILE__, __LINE__) |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cublas { |
||||
|
||||
/** @brief exception class for errors thrown by the cuBLAS API */ |
||||
class cuBLASException : public CUDAException { |
||||
public: |
||||
using CUDAException::CUDAException; |
||||
}; |
||||
|
||||
namespace detail { |
||||
static void check(cublasStatus_t status, const char* func, const char* file, int line) { |
||||
auto cublasGetErrorString = [](cublasStatus_t err) { |
||||
switch (err) { |
||||
case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; |
||||
case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; |
||||
case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; |
||||
case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; |
||||
case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; |
||||
case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; |
||||
case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; |
||||
case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; |
||||
case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; |
||||
case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR"; |
||||
} |
||||
return "UNKNOWN_CUBLAS_ERROR"; |
||||
}; |
||||
|
||||
if (status != CUBLAS_STATUS_SUCCESS) |
||||
throw cuBLASException(Error::GpuApiCallError, cublasGetErrorString(status), func, file, line); |
||||
} |
||||
} |
||||
|
||||
/** noncopyable cuBLAS smart handle
|
||||
* |
||||
* UniqueHandle is a smart non-sharable wrapper for cuBLAS handle which ensures that the handle |
||||
* is destroyed after use. The handle can be associated with a CUDA stream by specifying the |
||||
* stream during construction. By default, the handle is associated with the default stream. |
||||
*/ |
||||
class UniqueHandle { |
||||
public: |
||||
UniqueHandle() { CUDA4DNN_CHECK_CUBLAS(cublasCreate(&handle)); } |
||||
UniqueHandle(UniqueHandle&) = delete; |
||||
UniqueHandle(UniqueHandle&& other) noexcept |
||||
: stream(std::move(other.stream)), handle{ other.handle } { |
||||
other.handle = nullptr; |
||||
} |
||||
|
||||
UniqueHandle(Stream strm) : stream(std::move(strm)) { |
||||
CUDA4DNN_CHECK_CUBLAS(cublasCreate(&handle)); |
||||
try { |
||||
CUDA4DNN_CHECK_CUBLAS(cublasSetStream(handle, stream.get())); |
||||
} catch (...) { |
||||
/* cublasDestroy won't throw if a valid handle is passed */ |
||||
CUDA4DNN_CHECK_CUBLAS(cublasDestroy(handle)); |
||||
throw; |
||||
} |
||||
} |
||||
|
||||
~UniqueHandle() noexcept { |
||||
if (handle != nullptr) { |
||||
/* cublasDestroy won't throw if a valid handle is passed */ |
||||
CUDA4DNN_CHECK_CUBLAS(cublasDestroy(handle)); |
||||
} |
||||
} |
||||
|
||||
UniqueHandle& operator=(const UniqueHandle&) = delete; |
||||
UniqueHandle& operator=(UniqueHandle&& other) noexcept { |
||||
stream = std::move(other.stream); |
||||
handle = other.handle; |
||||
other.handle = nullptr; |
||||
return *this; |
||||
} |
||||
|
||||
/** @brief returns the raw cuBLAS handle */ |
||||
cublasHandle_t get() const noexcept { return handle; } |
||||
|
||||
private: |
||||
Stream stream; |
||||
cublasHandle_t handle; |
||||
}; |
||||
|
||||
/** @brief sharable cuBLAS smart handle
|
||||
* |
||||
* Handle is a smart sharable wrapper for cuBLAS handle which ensures that the handle |
||||
* is destroyed after all references to the handle are destroyed. The handle can be |
||||
* associated with a CUDA stream by specifying the stream during construction. By default, |
||||
* the handle is associated with the default stream. |
||||
* |
||||
* @note Moving a Handle object to another invalidates the former |
||||
*/ |
||||
class Handle { |
||||
public: |
||||
Handle() : handle(std::make_shared<UniqueHandle>()) { } |
||||
Handle(const Handle&) = default; |
||||
Handle(Handle&&) = default; |
||||
Handle(Stream strm) : handle(std::make_shared<UniqueHandle>(std::move(strm))) { } |
||||
|
||||
Handle& operator=(const Handle&) = default; |
||||
Handle& operator=(Handle&&) = default; |
||||
|
||||
/** returns true if the handle is valid */ |
||||
explicit operator bool() const noexcept { return static_cast<bool>(handle); } |
||||
|
||||
cublasHandle_t get() const noexcept { |
||||
CV_Assert(handle); |
||||
return handle->get(); |
||||
} |
||||
|
||||
private: |
||||
std::shared_ptr<UniqueHandle> handle; |
||||
}; |
||||
|
||||
/** @brief GEMM for column-major matrices
||||
* |
||||
* \f$ C = \alpha AB + \beta C \f$ |
||||
* |
||||
* @tparam T matrix element type (must be `half` or `float`) |
||||
* |
||||
* @param handle valid cuBLAS Handle |
||||
* @param transa use transposed matrix of A for computation |
||||
* @param transb use transposed matrix of B for computation |
||||
* @param rows_c number of rows in C |
||||
* @param cols_c number of columns in C |
||||
* @param common_dim common dimension of A (or trans A) and B (or trans B) |
||||
* @param alpha scale factor for AB |
||||
* @param[in] A pointer to column-major matrix A in device memory |
||||
* @param lda leading dimension of matrix A |
||||
* @param[in] B pointer to column-major matrix B in device memory |
||||
* @param ldb leading dimension of matrix B |
||||
* @param beta scale factor for C |
||||
* @param[in,out] C pointer to column-major matrix C in device memory |
||||
* @param ldc leading dimension of matrix C |
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
template <class T> |
||||
void gemm(const Handle& handle, |
||||
bool transa, bool transb, |
||||
std::size_t rows_c, std::size_t cols_c, std::size_t common_dim, |
||||
T alpha, const DevicePtr<const T> A, std::size_t lda, |
||||
const DevicePtr<const T> B, std::size_t ldb, |
||||
T beta, const DevicePtr<T> C, std::size_t ldc); |
||||
|
||||
template <> inline |
||||
void gemm<half>(const Handle& handle, |
||||
bool transa, bool transb, |
||||
std::size_t rows_c, std::size_t cols_c, std::size_t common_dim, |
||||
half alpha, const DevicePtr<const half> A, std::size_t lda, |
||||
const DevicePtr<const half> B, std::size_t ldb, |
||||
half beta, const DevicePtr<half> C, std::size_t ldc) |
||||
{ |
||||
CV_Assert(handle); |
||||
|
||||
auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N, |
||||
opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N; |
||||
int irows_c = static_cast<int>(rows_c), |
||||
icols_c = static_cast<int>(cols_c), |
||||
icommon_dim = static_cast<int>(common_dim), |
||||
ilda = static_cast<int>(lda), |
||||
ildb = static_cast<int>(ldb), |
||||
ildc = static_cast<int>(ldc); |
||||
|
||||
CUDA4DNN_CHECK_CUBLAS( |
||||
cublasHgemm( |
||||
handle.get(), |
||||
opa, opb, |
||||
irows_c, icols_c, icommon_dim, |
||||
&alpha, A.get(), ilda, |
||||
B.get(), ildb, |
||||
&beta, C.get(), ildc |
||||
) |
||||
); |
||||
} |
||||
|
||||
template <> inline |
||||
void gemm<float>(const Handle& handle, |
||||
bool transa, bool transb, |
||||
std::size_t rows_c, std::size_t cols_c, std::size_t common_dim, |
||||
float alpha, const DevicePtr<const float> A, std::size_t lda, |
||||
const DevicePtr<const float> B, std::size_t ldb, |
||||
float beta, const DevicePtr<float> C, std::size_t ldc) |
||||
{ |
||||
CV_Assert(handle); |
||||
|
||||
auto opa = transa ? CUBLAS_OP_T : CUBLAS_OP_N, |
||||
opb = transb ? CUBLAS_OP_T : CUBLAS_OP_N; |
||||
int irows_c = static_cast<int>(rows_c), |
||||
icols_c = static_cast<int>(cols_c), |
||||
icommon_dim = static_cast<int>(common_dim), |
||||
ilda = static_cast<int>(lda), |
||||
ildb = static_cast<int>(ldb), |
||||
ildc = static_cast<int>(ldc); |
||||
|
||||
CUDA4DNN_CHECK_CUBLAS( |
||||
cublasSgemm( |
||||
handle.get(), |
||||
opa, opb, |
||||
irows_c, icols_c, icommon_dim, |
||||
&alpha, A.get(), ilda, |
||||
B.get(), ildb, |
||||
&beta, C.get(), ildc |
||||
) |
||||
); |
||||
} |
||||
|
||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::cublas */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP */ |
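A hedged sketch (not part of the patch) of how the column-major gemm() above can be used to multiply row-major matrices by computing C^T = B^T * A^T; the helper name is hypothetical and the DevicePtr arguments are assumed to be supplied by the caller.

// computes the row-major product C (M x N) = A (M x K) * B (K x N) with the
// column-major gemm(): the row-major buffers are reinterpreted as the transposed
// column-major matrices and multiplied in reverse order
void gemm_row_major_sketch(std::size_t M, std::size_t N, std::size_t K,
                           const cv::dnn::cuda4dnn::csl::cublas::Handle& handle,
                           cv::dnn::cuda4dnn::csl::DevicePtr<const float> A, /* leading dimension K */
                           cv::dnn::cuda4dnn::csl::DevicePtr<const float> B, /* leading dimension N */
                           cv::dnn::cuda4dnn::csl::DevicePtr<float> C)       /* leading dimension N */
{
    cv::dnn::cuda4dnn::csl::cublas::gemm<float>(handle,
        false, false,
        N, M, K,      /* rows, cols and common dimension of the column-major result C^T */
        1.0f, B, N,   /* first operand: B^T (N x K) */
        A, K,         /* second operand: A^T (K x M) */
        0.0f, C, N);
}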
@@ -0,0 +1,10 @@
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_CUDNN_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_CUDNN_HPP |
||||
|
||||
#include "cudnn/cudnn.hpp" |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_CUDNN_HPP */ |
@@ -0,0 +1,408 @@
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP |
||||
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP |
||||
|
||||
#include "cudnn.hpp" |
||||
|
||||
#include "../pointer.hpp" |
||||
#include "../workspace.hpp" |
||||
|
||||
#include <cudnn.h> |
||||
|
||||
#include <cstddef> |
||||
#include <array> |
||||
#include <algorithm> |
||||
#include <vector> |
||||
#include <type_traits> |
||||
#include <iterator> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn { |
||||
|
||||
/** describe convolution filters
|
||||
* |
||||
* @tparam T type of elements in the kernels |
||||
*/ |
||||
template <class T> |
||||
class FilterDescriptor { |
||||
public: |
||||
FilterDescriptor() noexcept : descriptor{ nullptr } { } |
||||
FilterDescriptor(const FilterDescriptor&) = delete; |
||||
FilterDescriptor(FilterDescriptor&& other) noexcept |
||||
: descriptor{ other.descriptor } { |
||||
other.descriptor = nullptr; |
||||
} |
||||
|
||||
/** constructs a filter descriptor from the filter dimensions provided in \p shape
|
||||
* |
||||
* Shape dimensions: |
||||
* 0: number of filters |
||||
* 1: number of input feature maps |
||||
* 2..n: kernel dimensions |
||||
* |
||||
* Exception Guarantee: Strong |
||||
*/ |
||||
template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))> |
||||
FilterDescriptor(const SequenceContainer& shape) { |
||||
constructor(shape.begin(), shape.end()); |
||||
} |
||||
|
||||
/** constructs a filter descriptor from the filter dimensions provided in [begin, end)
|
||||
* |
||||
* Shape dimensions: |
||||
* 0: number of filters |
||||
* 1: number of input feature maps |
||||
* 2..n: kernel dimensions |
||||
* |
||||
* Exception Guarantee: Strong |
||||
*/ |
||||
template <class ForwardItr, typename = typename std::enable_if<!std::is_integral<ForwardItr>::value, void>::type> // TODO is_iterator
|
||||
FilterDescriptor(ForwardItr begin, ForwardItr end) { |
||||
constructor(begin, end); |
||||
} |
||||
|
||||
/** constructs a filter descriptor from the filter dimensions provided as arguments
|
||||
* |
||||
* Shape dimensions: |
||||
* 0: number of filters |
||||
* 1: number of input feature maps |
||||
* 2..n: kernel dimensions |
||||
* |
||||
* Exception Guarantee: Strong |
||||
*/ |
||||
template <class ...Sizes> |
||||
FilterDescriptor(Sizes ...sizes) { |
||||
static_assert(sizeof...(Sizes) >= 3, "filter descriptors must have at least three dimensions"); |
||||
static_assert(sizeof...(Sizes) <= CUDNN_DIM_MAX, "required rank exceeds maximum supported rank"); |
||||
std::array<int, sizeof...(Sizes)> dims = { static_cast<int>(sizes)... }; |
||||
constructor(std::begin(dims), std::end(dims)); |
||||
} |
||||
|
||||
~FilterDescriptor() noexcept { |
||||
if (descriptor != nullptr) { |
||||
/* cudnnDestroyFilterDescriptor will not fail for a valid descriptor object */ |
||||
CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor)); |
||||
} |
||||
} |
||||
|
||||
FilterDescriptor& operator=(const FilterDescriptor&) = delete; |
||||
FilterDescriptor& operator=(FilterDescriptor&& other) noexcept { |
||||
descriptor = other.descriptor; |
||||
other.descriptor = nullptr; |
||||
return *this; |
||||
}; |
||||
|
||||
cudnnFilterDescriptor_t get() const noexcept { return descriptor; } |
||||
|
||||
private: |
||||
template <class ForwardItr> |
||||
void constructor(ForwardItr start, ForwardItr end) { |
||||
CV_Assert(start != end); |
||||
CV_Assert(std::distance(start, end) >= 3); |
||||
CV_Assert(std::distance(start, end) <= CUDNN_DIM_MAX); |
||||
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnCreateFilterDescriptor(&descriptor)); |
||||
try { |
||||
const auto rank = std::distance(start, end); |
||||
if (rank == 4) { |
||||
std::array<int, 4> dims; |
||||
std::copy(start, end, std::begin(dims)); |
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnSetFilter4dDescriptor( |
||||
descriptor, |
||||
detail::get_data_type<T>(), CUDNN_TENSOR_NCHW, |
||||
dims[0], dims[1], dims[2], dims[3] |
||||
) |
||||
); |
||||
} else { |
||||
std::vector<int> dims(start, end); |
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnSetFilterNdDescriptor( |
||||
descriptor, |
||||
detail::get_data_type<T>(), CUDNN_TENSOR_NCHW, |
||||
dims.size(), dims.data() |
||||
) |
||||
); |
||||
} |
||||
} catch (...) { |
||||
/* cudnnDestroyFilterDescriptor will not fail for a valid descriptor object */ |
||||
CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor)); |
||||
throw; |
||||
} |
||||
} |
||||
|
||||
cudnnFilterDescriptor_t descriptor; |
||||
}; |
||||
|
||||
/** describes a convolution operation
|
||||
* |
||||
* @tparam T type of element participating in convolution |
||||
*/ |
||||
template <class T> |
||||
class ConvolutionDescriptor { |
||||
public: |
||||
ConvolutionDescriptor() noexcept : descriptor{ nullptr } { } |
||||
ConvolutionDescriptor(const ConvolutionDescriptor&) = delete; |
||||
ConvolutionDescriptor(ConvolutionDescriptor&& other) noexcept |
||||
: descriptor{ other.descriptor } { |
||||
other.descriptor = nullptr; |
||||
} |
||||
|
||||
/** constructs a convolution descriptor
|
||||
* |
||||
* Pre-conditions: |
||||
* - \p zero_padding, \p stride and \p dilation must have the same size |
||||
* |
||||
* The length of the containers is interpreted as the order of the convolution. |
||||
* |
||||
* Exception Guarantee: Strong |
||||
*/ |
||||
template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))> |
||||
ConvolutionDescriptor( |
||||
const SequenceContainer& zero_padding, |
||||
const SequenceContainer& stride, |
||||
const SequenceContainer& dilation, |
||||
std::size_t group_count) |
||||
{ |
||||
constructor(zero_padding, stride, dilation, group_count); |
||||
} |
||||
|
||||
~ConvolutionDescriptor() noexcept { |
||||
if (descriptor != nullptr) { |
||||
/* cudnnDestroyConvolutionDescriptor will not fail for a valid descriptor object */ |
||||
CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor)); |
||||
} |
||||
} |
||||
|
||||
ConvolutionDescriptor& operator=(const ConvolutionDescriptor&) = delete; |
||||
ConvolutionDescriptor& operator=(ConvolutionDescriptor&& other) noexcept { |
||||
descriptor = other.descriptor; |
||||
other.descriptor = nullptr; |
||||
return *this; |
||||
}; |
||||
|
||||
cudnnConvolutionDescriptor_t get() const noexcept { return descriptor; } |
||||
|
||||
private: |
||||
template <class SequenceContainer> |
||||
void constructor( |
||||
const SequenceContainer& zero_padding, |
||||
const SequenceContainer& stride, |
||||
const SequenceContainer& dilation, |
||||
std::size_t group_count) |
||||
{ |
||||
CV_Assert(zero_padding.size() == stride.size()); |
||||
CV_Assert(zero_padding.size() == dilation.size()); |
||||
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&descriptor)); |
||||
try { |
||||
const auto rank = zero_padding.size(); |
||||
if (rank == 2) { |
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnSetConvolution2dDescriptor( |
||||
descriptor, |
||||
zero_padding[0], zero_padding[1], |
||||
stride[0], stride[1], |
||||
dilation[0], dilation[1], |
||||
CUDNN_CROSS_CORRELATION, |
||||
detail::get_data_type<T>() |
||||
) |
||||
); |
||||
} else { |
||||
std::vector<int> ipadding(std::begin(zero_padding), std::end(zero_padding)); |
||||
std::vector<int> istride(std::begin(stride), std::end(stride)); |
||||
std::vector<int> idilation(std::begin(dilation), std::end(dilation)); |
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnSetConvolutionNdDescriptor( |
||||
descriptor, |
||||
rank, ipadding.data(), istride.data(), idilation.data(), |
||||
CUDNN_CROSS_CORRELATION, |
||||
detail::get_data_type<T>() |
||||
) |
||||
); |
||||
} |
||||
CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionGroupCount(descriptor, group_count)); |
||||
} catch (...) { |
||||
/* cudnnDestroyConvolutionDescriptor will not fail for a valid descriptor object */
||||
CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor)); |
||||
throw; |
||||
} |
||||
} |
||||
|
||||
cudnnConvolutionDescriptor_t descriptor; |
||||
}; |
||||
|
||||
/** wrapper around a convolution algorithm
|
||||
* |
||||
* @tparam T type of elements being convolved |
||||
*/ |
||||
template <class T> |
||||
class ConvolutionAlgorithm { |
||||
public: |
||||
ConvolutionAlgorithm() noexcept : workspace_size{ 0 } { } |
||||
ConvolutionAlgorithm(ConvolutionAlgorithm&) = default; |
||||
ConvolutionAlgorithm(ConvolutionAlgorithm&&) = default; |
||||
|
||||
/** selects a good algorithm for convolution for given configuration
|
||||
* |
||||
* Exception Guarantee: Strong |
||||
*/ |
||||
ConvolutionAlgorithm( |
||||
const Handle& handle, |
||||
const ConvolutionDescriptor<T>& conv, |
||||
const FilterDescriptor<T>& filter, |
||||
const TensorDescriptor<T>& input, |
||||
const TensorDescriptor<T>& output) |
||||
{ |
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnGetConvolutionForwardAlgorithm( |
||||
handle.get(), |
||||
input.get(), filter.get(), conv.get(), output.get(), |
||||
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, |
||||
0, /* no memory limit */ |
||||
&algo |
||||
) |
||||
); |
||||
|
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnGetConvolutionForwardWorkspaceSize( |
||||
handle.get(), |
||||
input.get(), filter.get(), conv.get(), output.get(), |
||||
algo, &workspace_size |
||||
) |
||||
); |
||||
} |
||||
|
||||
ConvolutionAlgorithm& operator=(const ConvolutionAlgorithm&) = default; |
||||
ConvolutionAlgorithm& operator=(ConvolutionAlgorithm&& other) = default; |
||||
|
||||
cudnnConvolutionFwdAlgo_t get() const noexcept { return algo; } |
||||
|
||||
/** number of bytes of workspace memory required by the algorithm */ |
||||
std::size_t get_workspace_size() const noexcept { return workspace_size; } |
||||
|
||||
private: |
||||
cudnnConvolutionFwdAlgo_t algo; |
||||
std::size_t workspace_size; |
||||
}; |
||||
|
||||
/** gives the shape of the output tensor of convolution
|
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
template <class T> |
||||
void getConvolutionForwardOutputDim( |
||||
const ConvolutionDescriptor<T>& convDesc, |
||||
const FilterDescriptor<T>& filterDesc, |
||||
const TensorDescriptor<T>& inputDesc, |
||||
std::vector<int>& output) |
||||
{ |
||||
output.clear(); |
||||
output.resize(CUDNN_DIM_MAX); /* we use `output` to hold temporaries */ |
||||
|
||||
std::vector<int> temp(CUDNN_DIM_MAX); |
||||
cudnnDataType_t tempDataType; |
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnGetTensorNdDescriptor( |
||||
inputDesc.get(), |
||||
CUDNN_DIM_MAX + 1, /* according to docs, this is what we do to get the rank */ |
||||
&tempDataType, |
||||
output.data(), |
||||
temp.data(), |
||||
temp.data() |
||||
) |
||||
); |
||||
|
||||
const auto rank = output[0]; |
||||
output.resize(rank); |
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnGetConvolutionNdForwardOutputDim( |
||||
convDesc.get(), inputDesc.get(), filterDesc.get(), rank, output.data() |
||||
) |
||||
); |
||||
} |
||||
|
||||
/** @brief performs convolution
|
||||
* |
||||
* dstValue = alpha * result + beta * priorDstValue |
||||
* |
||||
* @tparam T convolution element type (must be `half` or `float`) |
||||
* |
||||
* @param handle valid cuDNN Handle |
||||
* @param convDesc convolution description |
||||
* @param convAlgo algorithm to use for convolution |
||||
* @param workspace workspace memory which meets the requirements of \p convAlgo |
||||
* @param filterDesc filter descriptor |
||||
* @param[in] filterPtr pointer to device memory containing the filters |
||||
* @param inputDesc tensor descriptor describing the input |
||||
* @param[in] inputPtr pointer to input tensor in device memory |
||||
* @param alpha result scale factor |
||||
* @param beta previous value scale factor |
||||
* @param outputDesc tensor descriptor describing the output |
||||
* @param[out] outputPtr pointer to output tensor in device memory |
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
template <class T> |
||||
void convolve( |
||||
const Handle& handle, |
||||
const ConvolutionDescriptor<T>& convDesc, |
||||
const ConvolutionAlgorithm<T>& convAlgo, |
||||
WorkspaceInstance workspace, |
||||
const FilterDescriptor<T>& filterDesc, |
||||
DevicePtr<const T> filterPtr, |
||||
const TensorDescriptor<T>& inputDesc, |
||||
DevicePtr<const T> inputPtr, |
||||
T alpha, T beta, |
||||
const TensorDescriptor<T>& outputDesc, |
||||
DevicePtr<T> outputPtr) |
||||
{ |
||||
CV_Assert(handle); |
||||
|
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnConvolutionForward( |
||||
handle.get(), |
||||
&alpha, inputDesc.get(), inputPtr.get(), |
||||
filterDesc.get(), filterPtr.get(), |
||||
convDesc.get(), convAlgo.get(), |
||||
static_cast<void*>(workspace.get()), workspace.size_in_bytes(), |
||||
&beta, outputDesc.get(), outputPtr.get() |
||||
) |
||||
); |
||||
} |
||||
|
||||
template <> inline |
||||
void convolve( |
||||
const Handle& handle, |
||||
const ConvolutionDescriptor<half>& convDesc, |
||||
const ConvolutionAlgorithm<half>& convAlgo, |
||||
WorkspaceInstance workspace, |
||||
const FilterDescriptor<half>& filterDesc, |
||||
DevicePtr<const half> filterPtr, |
||||
const TensorDescriptor<half>& inputDesc, |
||||
DevicePtr<const half> inputPtr, |
||||
half alpha, half beta, |
||||
const TensorDescriptor<half>& outputDesc, |
||||
DevicePtr<half> outputPtr) |
||||
{ |
||||
CV_Assert(handle); |
||||
|
||||
/* we specialize for fp16 as the scaling factors must be provided as `float` */
||||
float alpha_ = alpha, beta_ = beta; |
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnConvolutionForward( |
||||
handle.get(), |
||||
&alpha_, inputDesc.get(), inputPtr.get(), |
||||
filterDesc.get(), filterPtr.get(), |
||||
convDesc.get(), convAlgo.get(), |
||||
static_cast<void*>(workspace.get()), workspace.size_in_bytes(), |
||||
&beta_, outputDesc.get(), outputPtr.get() |
||||
) |
||||
); |
||||
} |
||||
|
||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */ |
||||
|
||||
#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP */ |
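For orientation, a minimal usage sketch of the wrappers in this header follows. It is illustrative only and not part of the patch; the constructor argument orders for ConvolutionDescriptor and FilterDescriptor are assumptions inferred from the surrounding code, and the shapes are made up.

// illustrative sketch (not part of the patch); assumes the CSL cuDNN headers introduced above
#include "convolution.hpp"

using namespace cv::dnn::cuda4dnn::csl;

void convolve_sketch(cudnn::Handle handle, WorkspaceInstance workspace,
                     DevicePtr<const float> filters, DevicePtr<const float> input,
                     DevicePtr<float> output)
{
    /* hypothetical 2d convolution: 1x3x224x224 input, 16 filters of 3x3, padding 1, stride 1 */
    cudnn::TensorDescriptor<float> inputDesc(1, 3, 224, 224);
    cudnn::FilterDescriptor<float> filterDesc(16, 3, 3, 3);            /* constructor assumed */
    std::vector<int> padding{1, 1}, stride{1, 1}, dilation{1, 1};
    /* argument order assumed: zero padding, stride, dilation, group count */
    cudnn::ConvolutionDescriptor<float> convDesc(padding, stride, dilation, 1);

    /* query the output shape and let cuDNN pick an algorithm */
    std::vector<int> output_dims;
    cudnn::getConvolutionForwardOutputDim(convDesc, filterDesc, inputDesc, output_dims);
    cudnn::TensorDescriptor<float> outputDesc(output_dims);
    cudnn::ConvolutionAlgorithm<float> algo(handle, convDesc, filterDesc, inputDesc, outputDesc);

    /* the caller must ensure `workspace` provides at least algo.get_workspace_size() bytes */
    cudnn::convolve<float>(handle, convDesc, algo, workspace,
                           filterDesc, filters, inputDesc, input,
                           1.0f, 0.0f, outputDesc, output);
}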
@ -0,0 +1,280 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CUDNN_HPP |
||||
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CUDNN_HPP |
||||
|
||||
#include "../fp16.hpp" |
||||
#include "../pointer.hpp" |
||||
|
||||
#include <cudnn.h> |
||||
|
||||
#include <cstddef> |
||||
#include <array> |
||||
#include <algorithm> |
||||
#include <functional> |
||||
#include <numeric> |
||||
#include <vector> |
||||
#include <type_traits> |
||||
#include <iterator> |
||||
|
||||
#define CUDA4DNN_CHECK_CUDNN(call) \ |
||||
::cv::dnn::cuda4dnn::csl::cudnn::detail::check((call), CV_Func, __FILE__, __LINE__) |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn { |
||||
|
||||
/** @brief exception class for errors thrown by the cuDNN API */ |
||||
class cuDNNException : public CUDAException { |
||||
public: |
||||
using CUDAException::CUDAException; |
||||
}; |
||||
|
||||
namespace detail { |
||||
inline void check(cudnnStatus_t status, const char* func, const char* file, int line) { |
||||
if (status != CUDNN_STATUS_SUCCESS) |
||||
throw cuDNNException(Error::GpuApiCallError, cudnnGetErrorString(status), func, file, line); |
||||
} |
||||
|
||||
/** get_data_type<T> returns the equivalent cudnn enumeration constant for type T */ |
||||
template <class> auto get_data_type()->decltype(CUDNN_DATA_FLOAT); |
||||
template <> inline auto get_data_type<half>()->decltype(CUDNN_DATA_HALF) { return CUDNN_DATA_HALF; } |
||||
template <> inline auto get_data_type<float>()->decltype(CUDNN_DATA_FLOAT) { return CUDNN_DATA_FLOAT; } |
||||
} |
||||
|
||||
/** @brief noncopyable cuDNN smart handle
|
||||
* |
||||
* UniqueHandle is a smart non-sharable wrapper for cuDNN handle which ensures that the handle |
||||
* is destroyed after use. |
||||
*/ |
||||
class UniqueHandle { |
||||
public: |
||||
/** creates a cuDNN handle which executes in the default stream
|
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
UniqueHandle() { CUDA4DNN_CHECK_CUDNN(cudnnCreate(&handle)); } |
||||
|
||||
UniqueHandle(UniqueHandle&) = delete; |
||||
UniqueHandle(UniqueHandle&& other) noexcept |
||||
: stream(std::move(other.stream)), handle{ other.handle } { |
||||
other.handle = nullptr; |
||||
} |
||||
|
||||
/** creates a cuDNN handle and associates it with the stream specified
|
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
UniqueHandle(Stream strm) : stream(std::move(strm)) { |
||||
CUDA4DNN_CHECK_CUDNN(cudnnCreate(&handle)); |
||||
try { |
||||
CUDA4DNN_CHECK_CUDNN(cudnnSetStream(handle, stream.get())); |
||||
} catch (...) { |
||||
/* cudnnDestroy won't throw if a valid handle is passed */ |
||||
CUDA4DNN_CHECK_CUDNN(cudnnDestroy(handle)); |
||||
throw; |
||||
} |
||||
} |
||||
|
||||
~UniqueHandle() noexcept { |
||||
if (handle != nullptr) { |
||||
/* cudnnDestroy won't throw if a valid handle is passed */ |
||||
CUDA4DNN_CHECK_CUDNN(cudnnDestroy(handle)); |
||||
} |
||||
} |
||||
|
||||
UniqueHandle& operator=(const UniqueHandle&) = delete; |
||||
UniqueHandle& operator=(UniqueHandle&& other) noexcept { |
||||
stream = std::move(other.stream); |
||||
handle = other.handle; |
||||
other.handle = nullptr; |
||||
return *this; |
||||
} |
||||
|
||||
/** returns the raw cuDNN handle */ |
||||
cudnnHandle_t get() const noexcept { return handle; } |
||||
|
||||
private: |
||||
Stream stream; |
||||
cudnnHandle_t handle; |
||||
}; |
||||
|
||||
/** @brief sharable cuDNN smart handle
|
||||
* |
||||
* Handle is a smart sharable wrapper for cuDNN handle which ensures that the handle |
||||
* is destroyed after all references to the handle are destroyed. |
||||
* |
||||
* @note Moving a Handle object to another invalidates the former |
||||
*/ |
||||
class Handle { |
||||
public: |
||||
/** creates a cuDNN handle which executes in the default stream
|
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
Handle() : handle(std::make_shared<UniqueHandle>()) { } |
||||
|
||||
Handle(const Handle&) = default; |
||||
Handle(Handle&&) = default; |
||||
|
||||
/** creates a cuDNN handle and associates it with the stream specified
|
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
Handle(Stream strm) : handle(std::make_shared<UniqueHandle>(std::move(strm))) { } |
||||
|
||||
Handle& operator=(const Handle&) = default; |
||||
Handle& operator=(Handle&&) = default; |
||||
|
||||
/** returns true if the handle is valid */ |
||||
explicit operator bool() const noexcept { return static_cast<bool>(handle); } |
||||
|
||||
cudnnHandle_t get() const noexcept { |
||||
CV_Assert(handle); |
||||
return handle->get(); |
||||
} |
||||
|
||||
private: |
||||
std::shared_ptr<UniqueHandle> handle; |
||||
}; |
||||
|
||||
/** describe a tensor
|
||||
* |
||||
* @tparam T type of elements in the tensor |
||||
*/ |
||||
template <class T> |
||||
class TensorDescriptor { |
||||
public: |
||||
TensorDescriptor() noexcept : descriptor{ nullptr } { } |
||||
TensorDescriptor(const TensorDescriptor&) = delete; |
||||
TensorDescriptor(TensorDescriptor&& other) noexcept |
||||
: descriptor{ other.descriptor } { |
||||
other.descriptor = nullptr; |
||||
} |
||||
|
||||
/** constructs a tensor descriptor from the axis lengths provided in \p shape
|
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))> |
||||
TensorDescriptor(const SequenceContainer& shape) { |
||||
constructor(shape.begin(), shape.end()); |
||||
} |
||||
|
||||
/** constructs a tensor descriptor from the axis lengths provided in [begin, end)
|
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
template <class ForwardItr, typename = typename std::enable_if<!std::is_integral<ForwardItr>::value, void>::type> // TODO is_iterator
|
||||
TensorDescriptor(ForwardItr begin, ForwardItr end) { |
||||
constructor(begin, end); |
||||
} |
||||
|
||||
/** constructs a tensor descriptor from the axis lengths provided as arguments
|
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
template <class ...Sizes> |
||||
TensorDescriptor(Sizes ...sizes) { |
||||
static_assert(sizeof...(Sizes) <= CUDNN_DIM_MAX, "required rank exceeds maximum supported rank"); |
||||
std::array<int, sizeof...(Sizes)> dims = { static_cast<int>(sizes)... }; |
||||
constructor(std::begin(dims), std::end(dims)); |
||||
} |
||||
|
||||
~TensorDescriptor() noexcept { |
||||
if (descriptor != nullptr) { |
||||
/* cudnnDestroyTensorDescriptor will not fail */ |
||||
CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorDescriptor(descriptor)); |
||||
} |
||||
} |
||||
|
||||
TensorDescriptor& operator=(const TensorDescriptor&) = delete; |
||||
TensorDescriptor& operator=(TensorDescriptor&& other) noexcept { |
||||
descriptor = other.descriptor; |
||||
other.descriptor = nullptr; |
||||
return *this; |
||||
}; |
||||
|
||||
cudnnTensorDescriptor_t get() const noexcept { return descriptor; } |
||||
|
||||
private: |
||||
template <class ForwardItr> |
||||
void constructor(ForwardItr start, ForwardItr end) { |
||||
CV_Assert(start != end); |
||||
CV_Assert(std::distance(start, end) <= CUDNN_DIM_MAX); |
||||
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnCreateTensorDescriptor(&descriptor)); |
||||
try { |
||||
/* cuDNN documentation recommends using the 4d tensor API whenever possible;
 * hence, we create a 4d tensor descriptor for tensors of rank four or lower
 */
||||
const auto rank = std::distance(start, end); |
||||
if (rank <= 4) { |
||||
std::array<int, 4> dims; |
||||
std::fill(std::begin(dims), std::end(dims), 1); |
||||
|
||||
/* suppose we have a 3d tensor, the first axis is the batch axis and
|
||||
* the second axis is the channel axis (generally) |
||||
* |
||||
* cuDNN frequently assumes that the first axis is the batch axis and the |
||||
* second axis is the channel axis; hence, we copy the shape of a lower rank |
||||
* tensor to the beginning of `dims`
||||
*/ |
||||
std::copy(start, end, std::begin(dims)); |
||||
|
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnSetTensor4dDescriptor(descriptor, |
||||
CUDNN_TENSOR_NCHW, detail::get_data_type<T>(), |
||||
dims[0], dims[1], dims[2], dims[3] |
||||
) |
||||
); |
||||
} else { |
||||
std::vector<int> stride(rank); |
||||
stride.back() = 1; |
||||
/* WHAT WE HAVE NOW:
|
||||
* stride[-1] = 1 |
||||
* stride[-2] = garbage |
||||
* stride[-3] = garbage |
||||
* stride[-4] = garbage |
||||
* ... |
||||
*/ |
||||
|
||||
std::copy(start + 1, end, stride.begin()); |
||||
/* WHAT WE HAVE NOW:
|
||||
* stride[-1] = 1 |
||||
* stride[-2] = dim[-1] |
||||
* stride[-3] = dim[-2] |
||||
* stride[-4] = dim[-3] |
||||
* ... |
||||
*/ |
||||
|
||||
std::partial_sum(stride.rbegin(), stride.rend(), stride.rbegin(), std::multiplies<int>()); |
||||
/* WHAT WE HAVE NOW:
|
||||
* stride[-1] = 1 |
||||
* stride[-2] = stride[-1] * dim[-1] |
||||
* stride[-3] = stride[-2] * dim[-2] |
||||
* stride[-4] = stride[-3] * dim[-3] |
||||
* ... |
||||
*/ |
||||
|
||||
std::vector<int> dims(start, end); |
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnSetTensorNdDescriptor(descriptor, |
||||
detail::get_data_type<T>(), rank, |
||||
dims.data(), stride.data() |
||||
) |
||||
); |
||||
} |
||||
} catch (...) { |
||||
/* cudnnDestroyTensorDescriptor will not fail */ |
||||
CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorDescriptor(descriptor)); |
||||
throw; |
||||
} |
||||
} |
||||
|
||||
cudnnTensorDescriptor_t descriptor; |
||||
}; |
||||
|
||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */ |
||||
|
||||
#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_HPP */ |
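A brief sketch of how the Handle and TensorDescriptor wrappers above might be used together. It is illustrative only; the Stream object is assumed to come from the stream API added elsewhere in this patch.

// illustrative sketch (not part of the patch)
#include "cudnn.hpp"

#include <opencv2/core.hpp>

using namespace cv::dnn::cuda4dnn::csl;

void handle_sketch(Stream stream)    /* assumes the caller created a non-default Stream */
{
    cudnn::Handle handle(stream);    /* cuDNN work submitted through `handle` runs on `stream` */

    /* rank is deduced from the argument count; ranks below four are promoted to a 4d descriptor */
    cudnn::TensorDescriptor<float> desc(1, 32, 56, 56);

    CV_Assert(static_cast<bool>(handle));
    CV_Assert(desc.get() != nullptr);
}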
@ -0,0 +1,205 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_LRN_HPP |
||||
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_LRN_HPP |
||||
|
||||
#include "cudnn.hpp" |
||||
|
||||
#include "../pointer.hpp" |
||||
#include "../workspace.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <cudnn.h> |
||||
|
||||
#include <cstddef> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn { |
||||
|
||||
class LRNDescriptor { |
||||
public: |
||||
enum class LRNType { |
||||
ACROSS_CHANNELS, |
||||
WITHIN_CHANNEL |
||||
}; |
||||
|
||||
LRNDescriptor() noexcept : descriptor{ nullptr } { } |
||||
LRNDescriptor(const LRNDescriptor&) = delete; |
||||
LRNDescriptor(LRNDescriptor&& other) noexcept |
||||
: descriptor{ other.descriptor }, type{ other.type } { |
||||
other.descriptor = nullptr; |
||||
} |
||||
|
||||
/** sets up a LRN descriptor
|
||||
* |
||||
* @param local_size size of the normalization window |
||||
* @param alpha variance scaling parameter |
||||
* @param beta power parameter |
||||
* @param k bias parameter |
||||
* |
||||
* @note \p alpha is divided by the window width in across channels mode |
||||
* @note \p alpha is divided by the (window width)^spatialDimensions in within channel mode |
||||
* |
||||
* @note \p alpha, \p beta and \p k are cast to the tensor data type during the operation
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
LRNDescriptor(std::size_t local_size, double alpha, double beta, double k, LRNType type_) { |
||||
constructor(local_size, alpha, beta, k, type_); |
||||
} |
||||
|
||||
~LRNDescriptor() noexcept { |
||||
if (descriptor != nullptr) { |
||||
/* cudnnDestroyLRNDescriptor will not fail for a valid descriptor */ |
||||
CUDA4DNN_CHECK_CUDNN(cudnnDestroyLRNDescriptor(descriptor)); |
||||
} |
||||
} |
||||
|
||||
LRNDescriptor& operator=(const LRNDescriptor&) = delete; |
||||
LRNDescriptor& operator=(LRNDescriptor&& other) noexcept { |
||||
descriptor = other.descriptor; |
||||
type = other.type; |
||||
other.descriptor = nullptr; |
||||
return *this; |
||||
}; |
||||
|
||||
cudnnLRNDescriptor_t get() const noexcept { return descriptor; } |
||||
LRNType getType() const noexcept { return type; } |
||||
|
||||
private: |
||||
void constructor(std::size_t local_size, double alpha, double beta, double k, LRNType type_) { |
||||
CV_Assert(CUDNN_LRN_MIN_N <= local_size && local_size <= CUDNN_LRN_MAX_N); |
||||
|
||||
type = type_; |
||||
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnCreateLRNDescriptor(&descriptor)); |
||||
try { |
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnSetLRNDescriptor( |
||||
descriptor, |
||||
local_size, |
||||
alpha, |
||||
beta, |
||||
k |
||||
) |
||||
); |
||||
} catch (...) { |
||||
/* cudnnDestroyLRNDescriptor will not fail for a valid descriptor */ |
||||
CUDA4DNN_CHECK_CUDNN(cudnnDestroyLRNDescriptor(descriptor)); |
||||
throw; |
||||
} |
||||
} |
||||
|
||||
cudnnLRNDescriptor_t descriptor; |
||||
LRNType type; |
||||
}; |
||||
|
||||
/** @brief performs local response normalization
|
||||
* |
||||
* dstValue = alpha * result + beta * priorDstValue |
||||
* |
||||
* @tparam T element type (must be `half` or `float`) |
||||
* |
||||
* @param handle valid cuDNN Handle |
||||
* @param lrnDesc LRN description |
||||
* @param inputDesc tensor descriptor describing the input |
||||
* @param[in] inputPtr pointer to input tensor in device memory |
||||
* @param alpha result scale factor |
||||
* @param beta previous value scale factor |
||||
* @param outputDesc tensor descriptor describing the output |
||||
* @param[out] outputPtr pointer to output tensor in device memory |
||||
* @param workspace workspace memory for the operation; used only in within-channel mode
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
template <class T> |
||||
void LRNForward( |
||||
const Handle& handle, |
||||
const LRNDescriptor& lrnDesc, |
||||
const TensorDescriptor<T>& inputDesc, |
||||
DevicePtr<const T> inputPtr, |
||||
T alpha, T beta, |
||||
const TensorDescriptor<T>& outputDesc, |
||||
DevicePtr<T> outputPtr, |
||||
WorkspaceInstance workspace) |
||||
{ |
||||
CV_Assert(handle); |
||||
|
||||
if (lrnDesc.getType() == LRNDescriptor::LRNType::ACROSS_CHANNELS) { |
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnLRNCrossChannelForward( |
||||
handle.get(), |
||||
lrnDesc.get(), CUDNN_LRN_CROSS_CHANNEL_DIM1, |
||||
&alpha, inputDesc.get(), inputPtr.get(), |
||||
&beta, outputDesc.get(), outputPtr.get() |
||||
) |
||||
); |
||||
} else if (lrnDesc.getType() == LRNDescriptor::LRNType::WITHIN_CHANNEL) { |
||||
std::size_t size; |
||||
CUDA4DNN_CHECK_CUDNN(cudnnGetTensorSizeInBytes(inputDesc.get(), &size)); |
||||
|
||||
DevicePtr<void> temp1 = workspace.get_span<half>(size).data(); |
||||
DevicePtr<void> temp2 = workspace.get_span<half>(size).data(); |
||||
|
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnDivisiveNormalizationForward( |
||||
handle.get(), |
||||
lrnDesc.get(), CUDNN_DIVNORM_PRECOMPUTED_MEANS, |
||||
&alpha, inputDesc.get(), inputPtr.get(), |
||||
NULL, |
||||
static_cast<void*>(temp1), static_cast<void*>(temp2), |
||||
&beta, outputDesc.get(), outputPtr.get() |
||||
) |
||||
); |
||||
} |
||||
} |
||||
|
||||
template <> inline |
||||
void LRNForward( |
||||
const Handle& handle, |
||||
const LRNDescriptor& lrnDesc, |
||||
const TensorDescriptor<half>& inputDesc, |
||||
DevicePtr<const half> inputPtr, |
||||
half alpha, half beta, |
||||
const TensorDescriptor<half>& outputDesc, |
||||
DevicePtr<half> outputPtr, |
||||
WorkspaceInstance workspace) |
||||
{ |
||||
CV_Assert(handle); |
||||
|
||||
/* we specialize for fp16 as the scaling factors must be provided as `float` */
||||
float alpha_ = alpha, beta_ = beta; |
||||
if (lrnDesc.getType() == LRNDescriptor::LRNType::ACROSS_CHANNELS) { |
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnLRNCrossChannelForward( |
||||
handle.get(), |
||||
lrnDesc.get(), CUDNN_LRN_CROSS_CHANNEL_DIM1, |
||||
&alpha_, inputDesc.get(), inputPtr.get(), |
||||
&beta_, outputDesc.get(), outputPtr.get() |
||||
) |
||||
); |
||||
} else if (lrnDesc.getType() == LRNDescriptor::LRNType::WITHIN_CHANNEL) { |
||||
std::size_t size; |
||||
CUDA4DNN_CHECK_CUDNN(cudnnGetTensorSizeInBytes(inputDesc.get(), &size)); |
||||
|
||||
DevicePtr<void> temp1 = workspace.get_span<half>(size).data(); |
||||
DevicePtr<void> temp2 = workspace.get_span<half>(size).data(); |
||||
|
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnDivisiveNormalizationForward( |
||||
handle.get(), |
||||
lrnDesc.get(), CUDNN_DIVNORM_PRECOMPUTED_MEANS, |
||||
&alpha_, inputDesc.get(), inputPtr.get(), |
||||
NULL, |
||||
static_cast<void*>(temp1), static_cast<void*>(temp2), |
||||
&beta_, outputDesc.get(), outputPtr.get() |
||||
) |
||||
); |
||||
} |
||||
} |
||||
|
||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */ |
||||
|
||||
#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_LRN_HPP */ |
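A minimal usage sketch of the LRN wrapper above, for illustration only; the shapes and LRN parameters are made up.

// illustrative sketch (not part of the patch)
#include "lrn.hpp"

using namespace cv::dnn::cuda4dnn::csl;

void lrn_sketch(cudnn::Handle handle, WorkspaceInstance workspace,
                DevicePtr<const float> input, DevicePtr<float> output)
{
    cudnn::TensorDescriptor<float> desc(1, 64, 28, 28); /* LRN preserves the shape */
    cudnn::LRNDescriptor lrnDesc(
        5,      /* local_size */
        1e-4,   /* alpha */
        0.75,   /* beta */
        2.0,    /* k */
        cudnn::LRNDescriptor::LRNType::ACROSS_CHANNELS);

    /* `workspace` is consulted only in WITHIN_CHANNEL mode */
    cudnn::LRNForward<float>(handle, lrnDesc, desc, input, 1.0f, 0.0f, desc, output, workspace);
}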
@ -0,0 +1,236 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_POOLING_HPP |
||||
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_POOLING_HPP |
||||
|
||||
#include "cudnn.hpp" |
||||
|
||||
#include "../pointer.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <cudnn.h> |
||||
|
||||
#include <cstddef> |
||||
#include <array> |
||||
#include <algorithm> |
||||
#include <vector> |
||||
#include <type_traits> |
||||
#include <iterator> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn { |
||||
|
||||
class PoolingDescriptor { |
||||
public: |
||||
enum class PoolingType { |
||||
MAX, |
||||
MAX_DETERMINISTIC, |
||||
AVERAGE_EXCLUDE_PADDING, |
||||
AVERAGE_INCLUDE_PADDING |
||||
}; |
||||
|
||||
PoolingDescriptor() noexcept : descriptor{ nullptr } { } |
||||
PoolingDescriptor(const PoolingDescriptor&) = delete; |
||||
PoolingDescriptor(PoolingDescriptor&& other) noexcept |
||||
: descriptor{ other.descriptor } { |
||||
other.descriptor = nullptr; |
||||
} |
||||
|
||||
/** constructs a pooling descriptor
|
||||
* |
||||
* Pre-conditions: |
||||
* - \p window_size, \p padding and \p stride must have the same size |
||||
* |
||||
* The length of the containers is interpreted as the order of the pooling operation. |
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))> |
||||
PoolingDescriptor( |
||||
const SequenceContainer& window_size, |
||||
const SequenceContainer& padding, |
||||
const SequenceContainer& stride, |
||||
PoolingType type) |
||||
{ |
||||
constructor(window_size, padding, stride, type); |
||||
} |
||||
|
||||
~PoolingDescriptor() noexcept { |
||||
if (descriptor != nullptr) { |
||||
/* cudnnDestroyPoolingDescriptor will not fail for a valid descriptor */ |
||||
CUDA4DNN_CHECK_CUDNN(cudnnDestroyPoolingDescriptor(descriptor)); |
||||
} |
||||
} |
||||
|
||||
PoolingDescriptor& operator=(const PoolingDescriptor&) = delete; |
||||
PoolingDescriptor& operator=(PoolingDescriptor&& other) noexcept { |
||||
descriptor = other.descriptor; |
||||
other.descriptor = nullptr; |
||||
return *this; |
||||
}; |
||||
|
||||
cudnnPoolingDescriptor_t get() const noexcept { return descriptor; } |
||||
|
||||
private: |
||||
template <class SequenceContainer> |
||||
void constructor( |
||||
const SequenceContainer& window_size, |
||||
const SequenceContainer& padding, |
||||
const SequenceContainer& stride, |
||||
PoolingType type) |
||||
{ |
||||
CV_Assert(window_size.size() == padding.size()); |
||||
CV_Assert(window_size.size() == stride.size()); |
||||
|
||||
auto get_pooling_type = [] (PoolingType type) { |
||||
switch (type) { |
||||
case PoolingType::MAX: |
||||
return CUDNN_POOLING_MAX; |
||||
case PoolingType::MAX_DETERMINISTIC: |
||||
return CUDNN_POOLING_MAX_DETERMINISTIC; |
||||
case PoolingType::AVERAGE_EXCLUDE_PADDING: |
||||
return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; |
||||
case PoolingType::AVERAGE_INCLUDE_PADDING: |
||||
return CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; |
||||
} |
||||
CV_Error(Error::StsBadArg, "unknown pooling type"); |
||||
}; |
||||
|
||||
CUDA4DNN_CHECK_CUDNN(cudnnCreatePoolingDescriptor(&descriptor)); |
||||
try { |
||||
const auto rank = window_size.size(); |
||||
if (rank == 2) { |
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnSetPooling2dDescriptor( |
||||
descriptor, |
||||
get_pooling_type(type), CUDNN_PROPAGATE_NAN, |
||||
window_size[0], window_size[1], |
||||
padding[0], padding[1], |
||||
stride[0], stride[1] |
||||
) |
||||
); |
||||
} else { |
||||
std::vector<int> iwindow_size(std::begin(window_size), std::end(window_size)); |
||||
std::vector<int> ipadding(std::begin(padding), std::end(padding)); |
||||
std::vector<int> istride(std::begin(stride), std::end(stride)); |
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnSetPoolingNdDescriptor( |
||||
descriptor, |
||||
get_pooling_type(type), CUDNN_PROPAGATE_NAN, |
||||
rank, iwindow_size.data(), ipadding.data(), istride.data() |
||||
) |
||||
); |
||||
} |
||||
} catch (...) { |
||||
/* cudnnDestroyPoolingDescriptor will not fail for a valid descriptor */ |
||||
CUDA4DNN_CHECK_CUDNN(cudnnDestroyPoolingDescriptor(descriptor)); |
||||
throw; |
||||
} |
||||
} |
||||
|
||||
cudnnPoolingDescriptor_t descriptor; |
||||
}; |
||||
|
||||
/** gives the shape of the output tensor after pooling
|
||||
* |
||||
* @note it is not required to enforce this shape on the output tensor; slightly different shapes will work
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
template <class T> inline |
||||
void getPoolingForwardOutputDim( |
||||
const PoolingDescriptor& poolingDesc, |
||||
const TensorDescriptor<T>& inputDesc, |
||||
std::vector<int>& output_dim) |
||||
{ |
||||
output_dim.clear(); |
||||
output_dim.resize(CUDNN_DIM_MAX); /* we use `output_dim` to hold temporaries */ |
||||
|
||||
std::vector<int> temp(CUDNN_DIM_MAX); |
||||
cudnnDataType_t tempDataType; |
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnGetTensorNdDescriptor( |
||||
inputDesc.get(), |
||||
CUDNN_DIM_MAX + 1, /* according to docs, this is what we do to get the rank */ |
||||
&tempDataType, |
||||
output_dim.data(), |
||||
temp.data(), |
||||
temp.data() |
||||
) |
||||
); |
||||
|
||||
const auto rank = output_dim[0]; |
||||
output_dim.resize(rank); |
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnGetPoolingNdForwardOutputDim(poolingDesc.get(), inputDesc.get(), rank, output_dim.data()) |
||||
); |
||||
} |
||||
|
||||
/** @brief performs pooling operation
|
||||
* |
||||
* dstValue = alpha * result + beta * priorDstValue |
||||
* |
||||
* @tparam T pooling element type (must be `half` or `float`) |
||||
* |
||||
* @param handle valid cuDNN Handle |
||||
* @param poolingDesc pooling description |
||||
* @param inputDesc tensor descriptor describing the input |
||||
* @param[in] inputPtr pointer to input tensor in device memory |
||||
* @param alpha result scale factor |
||||
* @param beta previous value scale factor |
||||
* @param outputDesc tensor descriptor describing the output |
||||
* @param[out] outputPtr pointer to output tensor in device memory |
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
template <class T> |
||||
void pool( |
||||
const Handle& handle, |
||||
const PoolingDescriptor& poolingDesc, |
||||
const TensorDescriptor<T>& inputDesc, |
||||
const DevicePtr<const T> inputPtr, |
||||
T alpha, T beta, |
||||
const TensorDescriptor<T>& outputDesc, |
||||
DevicePtr<T> outputPtr) |
||||
{ |
||||
CV_Assert(handle); |
||||
|
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnPoolingForward( |
||||
handle.get(), |
||||
poolingDesc.get(), |
||||
&alpha, inputDesc.get(), inputPtr.get(), |
||||
&beta, outputDesc.get(), outputPtr.get() |
||||
) |
||||
); |
||||
} |
||||
|
||||
template <> inline |
||||
void pool( |
||||
const Handle& handle, |
||||
const PoolingDescriptor& poolingDesc, |
||||
const TensorDescriptor<half>& inputDesc, |
||||
const DevicePtr<const half> inputPtr, |
||||
half alpha, half beta, |
||||
const TensorDescriptor<half>& outputDesc, |
||||
DevicePtr<half> outputPtr) |
||||
{ |
||||
CV_Assert(handle); |
||||
|
||||
/* we specialize for fp16 as the scaling factors must be provided as `float` */
||||
float alpha_ = alpha, beta_ = beta; |
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnPoolingForward( |
||||
handle.get(), |
||||
poolingDesc.get(), |
||||
&alpha_, inputDesc.get(), inputPtr.get(), |
||||
&beta_, outputDesc.get(), outputPtr.get() |
||||
) |
||||
); |
||||
} |
||||
|
||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */ |
||||
|
||||
#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_POOLING_HPP */ |
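A minimal usage sketch of the pooling wrapper above, for illustration only; the shapes are made up.

// illustrative sketch (not part of the patch)
#include "pooling.hpp"

using namespace cv::dnn::cuda4dnn::csl;

void pooling_sketch(cudnn::Handle handle, DevicePtr<const float> input, DevicePtr<float> output)
{
    cudnn::TensorDescriptor<float> inputDesc(1, 64, 56, 56);

    /* 2x2 max pooling with stride 2 and no padding */
    std::vector<int> window{2, 2}, padding{0, 0}, stride{2, 2};
    cudnn::PoolingDescriptor poolingDesc(window, padding, stride,
                                         cudnn::PoolingDescriptor::PoolingType::MAX);

    std::vector<int> out_dims;
    cudnn::getPoolingForwardOutputDim(poolingDesc, inputDesc, out_dims); /* expected: 1x64x28x28 */
    cudnn::TensorDescriptor<float> outputDesc(out_dims);

    cudnn::pool<float>(handle, poolingDesc, inputDesc, input, 1.0f, 0.0f, outputDesc, output);
}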
@ -0,0 +1,68 @@ |
||||
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_SOFTMAX_HPP
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_SOFTMAX_HPP

#include "cudnn.hpp"

#include "../pointer.hpp"

#include <cudnn.h>

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {

    /** @brief computes softmax (or log softmax)
     *
     * @tparam T element type (must be `half` or `float`)
     *
     * @param handle valid cuDNN handle
     * @param outputDesc tensor descriptor describing the output
     * @param[out] output pointer to output tensor in device memory
     * @param inputDesc tensor descriptor describing the input
     * @param[in] input pointer to input tensor in device memory
     * @param log apply log on probabilities
     *
     * Exception Guarantee: Basic
     */
    template <class T>
    void softmax(const cudnn::Handle& handle,
        const TensorDescriptor<T>& outputDesc, DevicePtr<T> output,
        const TensorDescriptor<T>& inputDesc, DevicePtr<const T> input,
        bool log)
    {
        T alpha = 1.0, beta = 0.0;
        cudnnSoftmaxAlgorithm_t algo = log ? CUDNN_SOFTMAX_LOG : CUDNN_SOFTMAX_ACCURATE;
        CUDA4DNN_CHECK_CUDNN(
            cudnnSoftmaxForward(
                handle.get(),
                algo, CUDNN_SOFTMAX_MODE_CHANNEL,
                &alpha, inputDesc.get(), input.get(),
                &beta, outputDesc.get(), output.get()
            )
        );
    }

    template <> inline
    void softmax(const cudnn::Handle& handle,
        const TensorDescriptor<half>& outputDesc, DevicePtr<half> output,
        const TensorDescriptor<half>& inputDesc, DevicePtr<const half> input,
        bool log)
    {
        /* we specialize for fp16 as the scaling factors must be provided as `float` */
        float alpha = 1.0, beta = 0.0;
        cudnnSoftmaxAlgorithm_t algo = log ? CUDNN_SOFTMAX_LOG : CUDNN_SOFTMAX_ACCURATE;
        CUDA4DNN_CHECK_CUDNN(
            cudnnSoftmaxForward(
                handle.get(),
                algo, CUDNN_SOFTMAX_MODE_CHANNEL,
                &alpha, inputDesc.get(), input.get(),
                &beta, outputDesc.get(), output.get()
            )
        );
    }

}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */

#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_SOFTMAX_HPP */
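A minimal usage sketch of the softmax wrapper above, for illustration only; the shapes are made up.

// illustrative sketch (not part of the patch)
#include "softmax.hpp"

using namespace cv::dnn::cuda4dnn::csl;

void softmax_sketch(cudnn::Handle handle, DevicePtr<const float> scores, DevicePtr<float> probs)
{
    /* softmax is computed per channel (CUDNN_SOFTMAX_MODE_CHANNEL); here: batch of 8, 1000 classes */
    cudnn::TensorDescriptor<float> desc(8, 1000, 1, 1);
    cudnn::softmax<float>(handle, desc, probs, desc, scores, /* log */ false);
}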
@ -0,0 +1,142 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSFORM_HPP |
||||
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSFORM_HPP |
||||
|
||||
#include "../pointer.hpp" |
||||
|
||||
#include "cudnn.hpp" |
||||
|
||||
#include <cudnn.h> |
||||
#include <vector> |
||||
#include <type_traits> |
||||
#include <iterator> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn { |
||||
|
||||
/** describes a tensor transform operation
|
||||
* |
||||
* Supported transformations: |
||||
* - add or remove asymmetric padding |
||||
*/ |
||||
class TensorTransformDescriptor { |
||||
public: |
||||
TensorTransformDescriptor() noexcept : descriptor{ nullptr } { } |
||||
TensorTransformDescriptor(const TensorTransformDescriptor&) = delete; |
||||
TensorTransformDescriptor(TensorTransformDescriptor&& other) noexcept |
||||
: descriptor{ other.descriptor } { |
||||
other.descriptor = nullptr; |
||||
} |
||||
|
||||
/** constructs a tensor transform descriptor
||||
* |
||||
* Pre-conditions: |
||||
* - \p padding_left and \p padding_right must have the same size |
||||
* |
||||
* The length of the containers is interpreted as the rank of the tensors which will be given. |
||||
* |
||||
* @note \p padding_left and \p padding_right may have negative values to remove padding |
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))> |
||||
TensorTransformDescriptor( |
||||
const SequenceContainer& padding_left, |
||||
const SequenceContainer& padding_right) |
||||
{ |
||||
constructor(padding_left, padding_right); |
||||
} |
||||
|
||||
~TensorTransformDescriptor() noexcept { |
||||
if (descriptor != nullptr) { |
||||
/* cudnnDestroyTensorTransformDescriptor will not fail for a valid descriptor */ |
||||
CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorTransformDescriptor(descriptor)); |
||||
} |
||||
} |
||||
|
||||
TensorTransformDescriptor& operator=(const TensorTransformDescriptor&) = delete; |
||||
TensorTransformDescriptor& operator=(TensorTransformDescriptor&& other) noexcept { |
||||
descriptor = other.descriptor; |
||||
other.descriptor = nullptr; |
||||
return *this; |
||||
}; |
||||
|
||||
cudnnTensorTransformDescriptor_t get() const noexcept { return descriptor; } |
||||
|
||||
private: |
||||
template <class SequenceContainer> |
||||
void constructor( |
||||
const SequenceContainer& padding_left, |
||||
const SequenceContainer& padding_right |
||||
) |
||||
{ |
||||
CV_Assert(padding_left.size() == padding_right.size()); |
||||
|
||||
auto ipadding_left = std::vector<int32_t>(std::begin(padding_left), std::end(padding_left)); |
||||
auto ipadding_right = std::vector<int32_t>(std::begin(padding_right), std::end(padding_right)); |
||||
CUDA4DNN_CHECK_CUDNN(cudnnCreateTensorTransformDescriptor(&descriptor)); |
||||
try { |
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnSetTensorTransformDescriptor( |
||||
descriptor, |
||||
ipadding_left.size(), CUDNN_TENSOR_NCHW, |
||||
ipadding_left.data(), ipadding_right.data(), |
||||
NULL, CUDNN_TRANSFORM_FOLD |
||||
) |
||||
); |
||||
} catch (...) { |
||||
/* cudnnDestroyTensorTransformDescriptor will not fail for a valid descriptor */ |
||||
CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorTransformDescriptor(descriptor)); |
||||
throw; |
||||
} |
||||
} |
||||
|
||||
cudnnTensorTransformDescriptor_t descriptor; |
||||
}; |
||||
|
||||
template <class T> |
||||
void transform( |
||||
const Handle& handle, |
||||
const TensorTransformDescriptor& transDesc, |
||||
const TensorDescriptor<T>& inputDesc, |
||||
DevicePtr<const T> inputPtr, |
||||
const TensorDescriptor<T>& outputDesc, |
||||
DevicePtr<T> outputPtr) |
||||
{ |
||||
T alpha = 1.0, beta = 0.0; |
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnTransformTensorEx( |
||||
handle.get(), |
||||
transDesc.get(), |
||||
&alpha, inputDesc.get(), inputPtr.get(), |
||||
&beta, outputDesc.get(), outputPtr.get() |
||||
) |
||||
); |
||||
} |
||||
|
||||
template <> inline |
||||
void transform( |
||||
const Handle& handle, |
||||
const TensorTransformDescriptor& transDesc, |
||||
const TensorDescriptor<half>& inputDesc, |
||||
DevicePtr<const half> inputPtr, |
||||
const TensorDescriptor<half>& outputDesc, |
||||
DevicePtr<half> outputPtr) |
||||
{ |
||||
/* we specialize for fp16 as the scaling factors must be provided as `float` */
||||
float alpha = 1.0, beta = 0.0; |
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnTransformTensorEx( |
||||
handle.get(), |
||||
transDesc.get(), |
||||
&alpha, inputDesc.get(), inputPtr.get(), |
||||
&beta, outputDesc.get(), outputPtr.get() |
||||
) |
||||
); |
||||
} |
||||
|
||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */ |
||||
|
||||
#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSFORM_HPP */ |
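A minimal usage sketch of the tensor transform wrapper above, for illustration only: adding one unit of spatial padding on each side of an NCHW tensor. The shapes are made up.

// illustrative sketch (not part of the patch)
#include "transform.hpp"

using namespace cv::dnn::cuda4dnn::csl;

void padding_sketch(cudnn::Handle handle, DevicePtr<const float> input, DevicePtr<float> output)
{
    cudnn::TensorDescriptor<float> inputDesc(1, 3, 224, 224);
    cudnn::TensorDescriptor<float> outputDesc(1, 3, 226, 226);

    /* one unit of padding before and after each spatial axis; negative values would crop instead */
    std::vector<int> pad_left{0, 0, 1, 1}, pad_right{0, 0, 1, 1};
    cudnn::TensorTransformDescriptor transDesc(pad_left, pad_right);

    cudnn::transform<float>(handle, transDesc, inputDesc, input, outputDesc, output);
}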
@ -0,0 +1,148 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSPOSE_CONVOLUTION_HPP |
||||
#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSPOSE_CONVOLUTION_HPP |
||||
|
||||
#include "cudnn.hpp" |
||||
#include "convolution.hpp" |
||||
|
||||
#include "../pointer.hpp" |
||||
#include "../workspace.hpp" |
||||
|
||||
#include <cudnn.h> |
||||
|
||||
#include <cstddef> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn { |
||||
|
||||
/** wrapper around a transpose convolution algorithm
|
||||
* |
||||
* @tparam T type of elements being transpose-convolved |
||||
*/ |
||||
template <class T> |
||||
class TransposeConvolutionAlgorithm { |
||||
public: |
||||
TransposeConvolutionAlgorithm() noexcept : workspace_size{ 0 } { } |
||||
TransposeConvolutionAlgorithm(TransposeConvolutionAlgorithm&) = default; |
||||
TransposeConvolutionAlgorithm(TransposeConvolutionAlgorithm&&) = default; |
||||
|
||||
TransposeConvolutionAlgorithm( |
||||
const Handle& handle, |
||||
const ConvolutionDescriptor<T>& conv, |
||||
const FilterDescriptor<T>& filter, |
||||
const TensorDescriptor<T>& input, |
||||
const TensorDescriptor<T>& output) |
||||
{ |
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnGetConvolutionBackwardDataAlgorithm( |
||||
handle.get(), |
||||
filter.get(), input.get(), conv.get(), output.get(), |
||||
CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST, |
||||
0, /* no memory limit */ |
||||
&dalgo |
||||
) |
||||
); |
||||
|
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnGetConvolutionBackwardDataWorkspaceSize( |
||||
handle.get(), |
||||
filter.get(), input.get(), conv.get(), output.get(), |
||||
dalgo, &workspace_size |
||||
) |
||||
); |
||||
} |
||||
|
||||
TransposeConvolutionAlgorithm& operator=(const TransposeConvolutionAlgorithm&) = default; |
||||
TransposeConvolutionAlgorithm& operator=(TransposeConvolutionAlgorithm&& other) = default; |
||||
|
||||
cudnnConvolutionBwdDataAlgo_t get() const noexcept { return dalgo; } |
||||
|
||||
std::size_t get_workspace_size() const noexcept { return workspace_size; } |
||||
|
||||
private: |
||||
cudnnConvolutionBwdDataAlgo_t dalgo; |
||||
std::size_t workspace_size; |
||||
}; |
||||
|
||||
/** @brief performs transpose convolution
|
||||
* |
||||
* dstValue = alpha * result + beta * priorDstValue |
||||
* |
||||
* @tparam T transpose convolution element type (must be `half` or `float`) |
||||
* |
||||
* @param handle valid cuDNN Handle |
||||
* @param convDesc convolution description |
||||
* @param transConvAlgo algorithm to use for convolution |
||||
* @param workspace workspace memory which meets the requirements of \p convAlgo |
||||
* @param filterDesc filter descriptor |
||||
* @param[in] filterPtr pointer to device memory containing the filters |
||||
* @param inputDesc tensor descriptor describing the input |
||||
* @param[in] inputPtr pointer to input tensor in device memory |
||||
* @param alpha result scale factor |
||||
* @param beta previous value scale factor |
||||
* @param outputDesc tensor descriptor describing the output |
||||
* @param[out] outputPtr pointer to output tensor in device memory |
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
template <class T> |
||||
void transpose_convolve( |
||||
const Handle& handle, |
||||
const ConvolutionDescriptor<T>& convDesc, |
||||
const TransposeConvolutionAlgorithm<T>& transConvAlgo, |
||||
WorkspaceInstance workspace, |
||||
const FilterDescriptor<T>& filterDesc, |
||||
DevicePtr<const T> filterPtr, |
||||
const TensorDescriptor<T>& inputDesc, |
||||
DevicePtr<const T> inputPtr, |
||||
T alpha, T beta, |
||||
const TensorDescriptor<T>& outputDesc, |
||||
DevicePtr<T> outputPtr) |
||||
{ |
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnConvolutionBackwardData( |
||||
handle.get(), |
||||
&alpha, |
||||
filterDesc.get(), filterPtr.get(), |
||||
inputDesc.get(), inputPtr.get(), |
||||
convDesc.get(), transConvAlgo.get(), |
||||
static_cast<void*>(workspace.get()), workspace.size_in_bytes(), |
||||
&beta, outputDesc.get(), outputPtr.get() |
||||
) |
||||
); |
||||
} |
||||
|
||||
template <> inline |
||||
void transpose_convolve( |
||||
const Handle& handle, |
||||
const ConvolutionDescriptor<half>& convDesc, |
||||
const TransposeConvolutionAlgorithm<half>& convAlgo, |
||||
WorkspaceInstance workspace, |
||||
const FilterDescriptor<half>& filterDesc, |
||||
DevicePtr<const half> filterPtr, |
||||
const TensorDescriptor<half>& inputDesc, |
||||
DevicePtr<const half> inputPtr, |
||||
half alpha, half beta, |
||||
const TensorDescriptor<half>& outputDesc, |
||||
DevicePtr<half> outputPtr) |
||||
{ |
||||
/* we specialize for fp16 as the scaling factors must be provided as `float` */
||||
float alpha_ = alpha, beta_ = beta; |
||||
CUDA4DNN_CHECK_CUDNN( |
||||
cudnnConvolutionBackwardData( |
||||
handle.get(), |
||||
&alpha_, |
||||
filterDesc.get(), filterPtr.get(), |
||||
inputDesc.get(), inputPtr.get(), |
||||
convDesc.get(), convAlgo.get(), |
||||
static_cast<void*>(workspace.get()), workspace.size_in_bytes(), |
||||
&beta_, outputDesc.get(), outputPtr.get() |
||||
) |
||||
); |
||||
} |
||||
|
||||
}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */ |
||||
|
||||
#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSPOSE_CONVOLUTION_HPP */ |
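A minimal usage sketch of the transpose convolution wrapper above, for illustration only; the descriptors are assumed to have been set up by the caller as in the convolution example earlier.

// illustrative sketch (not part of the patch)
#include "transpose_convolution.hpp"

using namespace cv::dnn::cuda4dnn::csl;

void deconv_sketch(cudnn::Handle handle, WorkspaceInstance workspace,
                   const cudnn::ConvolutionDescriptor<float>& convDesc,
                   const cudnn::FilterDescriptor<float>& filterDesc, DevicePtr<const float> filters,
                   const cudnn::TensorDescriptor<float>& inputDesc, DevicePtr<const float> input,
                   const cudnn::TensorDescriptor<float>& outputDesc, DevicePtr<float> output)
{
    cudnn::TransposeConvolutionAlgorithm<float> algo(handle, convDesc, filterDesc, inputDesc, outputDesc);

    /* the caller must ensure `workspace` provides at least algo.get_workspace_size() bytes */
    cudnn::transpose_convolve<float>(handle, convDesc, algo, workspace,
                                     filterDesc, filters, inputDesc, input,
                                     1.0f, 0.0f, outputDesc, output);
}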
@ -0,0 +1,30 @@ |
||||
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_ERROR_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_ERROR_HPP

#include <opencv2/core.hpp>

#include <cuda_runtime_api.h>

#define CUDA4DNN_CHECK_CUDA(call) \
    ::cv::dnn::cuda4dnn::csl::detail::check((call), CV_Func, __FILE__, __LINE__)

namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
    /** @brief exception class for errors thrown by the CUDA APIs */
    class CUDAException : public cv::Exception {
    public:
        using cv::Exception::Exception;
    };

    namespace detail {
        inline void check(cudaError_t err, const char* func, const char* file, int line) {
            if (err != cudaSuccess)
                throw CUDAException(Error::GpuApiCallError, cudaGetErrorString(err), func, file, line);
        }
    }
}}}} /* namespace cv::dnn::cuda4dnn::csl */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_ERROR_HPP */
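A minimal usage sketch of the error-checking macro above, for illustration only: the macro turns CUDA runtime error codes into exceptions.

// illustrative sketch (not part of the patch)
#include "error.hpp"

#include <cuda_runtime_api.h>

int device_count_sketch()
{
    int count = 0;
    /* throws cv::dnn::cuda4dnn::csl::CUDAException on failure instead of returning an error code */
    CUDA4DNN_CHECK_CUDA(cudaGetDeviceCount(&count));
    return count;
}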
@ -0,0 +1,101 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_EVENT_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_EVENT_HPP |
||||
|
||||
#include "error.hpp" |
||||
#include "stream.hpp" |
||||
|
||||
#include <opencv2/core/utils/logger.hpp> |
||||
|
||||
#include <cuda_runtime_api.h> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { |
||||
|
||||
/** @brief sharable CUDA event
|
||||
* |
||||
* Event is a smart sharable wrapper for CUDA event handle which ensures that |
||||
* the handle is destroyed after use. |
||||
* |
||||
* @note Moving an Event object to another invalidates the former |
||||
*/ |
||||
class Event { |
||||
public: |
||||
Event() noexcept : event{ nullptr } { } |
||||
Event(const Event&) = delete; |
||||
Event(Event&& other) noexcept |
||||
: event{ other.event } { |
||||
other.event = nullptr; |
||||
} |
||||
|
||||
/** if \p create is `true`, a new event will be created; otherwise, an empty event object is created */ |
||||
Event(bool create, bool timing_event = false) : event{nullptr} { |
||||
if (create) { |
||||
unsigned int flags = cudaEventBlockingSync | (timing_event ? 0 : cudaEventDisableTiming); |
||||
CUDA4DNN_CHECK_CUDA(cudaEventCreateWithFlags(&event, flags)); |
||||
} |
||||
} |
||||
|
||||
~Event() { |
||||
try { |
||||
if (event != nullptr) |
||||
CUDA4DNN_CHECK_CUDA(cudaEventDestroy(event)); |
||||
} catch (const CUDAException& ex) { |
||||
std::ostringstream os; |
||||
os << "Asynchronous exception caught during CUDA event destruction.\n"; |
||||
os << ex.what(); |
||||
os << "Exception will be ignored.\n"; |
||||
CV_LOG_WARNING(0, os.str().c_str()); |
||||
} |
||||
} |
||||
|
||||
Event& operator=(const Event&) noexcept = delete; |
||||
Event& operator=(Event&& other) noexcept { |
||||
event = other.event; |
||||
other.event = nullptr; |
||||
return *this; |
||||
} |
||||
|
||||
/** mark a point in \p stream */ |
||||
void record(const Stream& stream) { |
||||
CUDA4DNN_CHECK_CUDA(cudaEventRecord(event, stream.get())); |
||||
} |
||||
|
||||
/** blocks the caller thread until all operations before the event finish */ |
||||
void synchronize() const { CUDA4DNN_CHECK_CUDA(cudaEventSynchronize(event)); } |
||||
|
||||
/** returns true if there are operations pending before the event completes */ |
||||
bool busy() const { |
||||
auto status = cudaEventQuery(event); |
||||
if (status == cudaErrorNotReady) |
||||
return true; |
||||
CUDA4DNN_CHECK_CUDA(status); |
||||
return false; |
||||
} |
||||
|
||||
cudaEvent_t get() const noexcept { return event; } |
||||
|
||||
/** returns true if the event is valid */ |
||||
explicit operator bool() const noexcept { return event; } |
||||
|
||||
private: |
||||
cudaEvent_t event; |
||||
}; |
||||
|
||||
/** makes a stream wait on an event */ |
||||
inline void StreamWaitOnEvent(const Stream& stream, const Event& event) {
||||
CUDA4DNN_CHECK_CUDA(cudaStreamWaitEvent(stream.get(), event.get(), 0)); |
||||
} |
||||
|
||||
/** returns the time elapsed between two events in milliseconds */ |
||||
inline float TimeElapsedBetweenEvents(const Event& start, const Event& end) {
||||
float temp; |
||||
CUDA4DNN_CHECK_CUDA(cudaEventElapsedTime(&temp, start.get(), end.get())); |
||||
return temp; |
||||
} |
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::csl */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_EVENT_HPP */ |
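A minimal usage sketch of the Event wrapper above, for illustration only: timing work submitted to a stream.

// illustrative sketch (not part of the patch)
#include "event.hpp"
#include "stream.hpp"

using namespace cv::dnn::cuda4dnn::csl;

float timing_sketch(const Stream& stream)
{
    Event start(true, true), end(true, true);  /* create events with timing enabled */
    start.record(stream);
    /* ... enqueue kernels on `stream` here ... */
    end.record(stream);
    end.synchronize();
    return TimeElapsedBetweenEvents(start, end); /* milliseconds */
}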
@ -0,0 +1,84 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_FP16_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_FP16_HPP |
||||
|
||||
#include "nvcc_defs.hpp" |
||||
|
||||
#include <cuda_fp16.h> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { |
||||
|
||||
namespace detail { |
||||
template <class T, class = void> |
||||
struct is_half_convertible : std::false_type { }; |
||||
|
||||
template <class T> |
||||
struct is_half_convertible<T, typename std::enable_if<std::is_integral<T>::value, void>::type> : std::true_type { }; |
||||
|
||||
template <class T> |
||||
struct is_half_convertible<T, typename std::enable_if<std::is_floating_point<T>::value, void>::type> : std::true_type { }; |
||||
} |
||||
|
||||
/* Note: nvcc has a broken overload resolution; it considers host overloads inside device code
|
||||
CUDA4DNN_HOST bool operator==(half lhs, half rhs) noexcept { return static_cast<float>(lhs) == static_cast<float>(rhs); } |
||||
CUDA4DNN_HOST bool operator!=(half lhs, half rhs) noexcept { return static_cast<float>(lhs) != static_cast<float>(rhs); } |
||||
CUDA4DNN_HOST bool operator<(half lhs, half rhs) noexcept { return static_cast<float>(lhs) < static_cast<float>(rhs); } |
||||
CUDA4DNN_HOST bool operator>(half lhs, half rhs) noexcept { return static_cast<float>(lhs) > static_cast<float>(rhs); } |
||||
CUDA4DNN_HOST bool operator<=(half lhs, half rhs) noexcept { return static_cast<float>(lhs) <= static_cast<float>(rhs); } |
||||
CUDA4DNN_HOST bool operator>=(half lhs, half rhs) noexcept { return static_cast<float>(lhs) >= static_cast<float>(rhs); } |
||||
*/ |
||||
|
||||
template <class T> CUDA4DNN_HOST |
||||
typename std::enable_if<detail::is_half_convertible<T>::value, bool> |
||||
::type operator==(half lhs, T rhs) noexcept { return static_cast<float>(lhs) == static_cast<float>(rhs); } |
||||
|
||||
template <class T> CUDA4DNN_HOST |
||||
typename std::enable_if<detail::is_half_convertible<T>::value, bool> |
||||
::type operator!=(half lhs, T rhs) noexcept { return static_cast<float>(lhs) != static_cast<float>(rhs); } |
||||
|
||||
template <class T> CUDA4DNN_HOST |
||||
typename std::enable_if<detail::is_half_convertible<T>::value, bool> |
||||
::type operator<(half lhs, T rhs) noexcept { return static_cast<float>(lhs) < static_cast<float>(rhs); } |
||||
|
||||
template <class T> CUDA4DNN_HOST |
||||
typename std::enable_if<detail::is_half_convertible<T>::value, bool> |
||||
::type operator>(half lhs, T rhs) noexcept { return static_cast<float>(lhs) > static_cast<float>(rhs); } |
||||
|
||||
template <class T> CUDA4DNN_HOST |
||||
typename std::enable_if<detail::is_half_convertible<T>::value, bool> |
||||
::type operator<=(half lhs, T rhs) noexcept { return static_cast<float>(lhs) <= static_cast<float>(rhs); } |
||||
|
||||
template <class T> CUDA4DNN_HOST |
||||
typename std::enable_if<detail::is_half_convertible<T>::value, bool> |
||||
::type operator>=(half lhs, T rhs) noexcept { return static_cast<float>(lhs) >= static_cast<float>(rhs); } |
||||
|
||||
template <class T> CUDA4DNN_HOST |
||||
typename std::enable_if<detail::is_half_convertible<T>::value, bool> |
||||
::type operator==(T lhs, half rhs) noexcept { return static_cast<float>(lhs) == static_cast<float>(rhs); } |
||||
|
||||
template <class T> CUDA4DNN_HOST |
||||
typename std::enable_if<detail::is_half_convertible<T>::value, bool> |
||||
::type operator!=(T lhs, half rhs) noexcept { return static_cast<float>(lhs) != static_cast<float>(rhs); } |
||||
|
||||
template <class T> CUDA4DNN_HOST |
||||
typename std::enable_if<detail::is_half_convertible<T>::value, bool> |
||||
::type operator<(T lhs, half rhs) noexcept { return static_cast<float>(lhs) < static_cast<float>(rhs); } |
||||
|
||||
template <class T> CUDA4DNN_HOST |
||||
typename std::enable_if<detail::is_half_convertible<T>::value, bool> |
||||
::type operator>(T lhs, half rhs) noexcept { return static_cast<float>(lhs) > static_cast<float>(rhs); } |
||||
|
||||
template <class T> CUDA4DNN_HOST |
||||
typename std::enable_if<detail::is_half_convertible<T>::value, bool> |
||||
::type operator<=(T lhs, half rhs) noexcept { return static_cast<float>(lhs) <= static_cast<float>(rhs); } |
||||
|
||||
template <class T> CUDA4DNN_HOST |
||||
typename std::enable_if<detail::is_half_convertible<T>::value, bool> |
||||
::type operator>=(T lhs, half rhs) noexcept { return static_cast<float>(lhs) >= static_cast<float>(rhs); } |
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::csl */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_FP16_HPP */ |
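A minimal sketch of the mixed half/arithmetic comparisons enabled by the overloads above, for illustration only; it assumes host-side construction of `half` from `float` as provided by cuda_fp16.

// illustrative sketch (not part of the patch)
#include "fp16.hpp"

using namespace cv::dnn::cuda4dnn::csl;

bool fp16_compare_sketch()
{
    half h = 1.5f;        /* float -> half conversion from cuda_fp16 */
    bool a = (h > 1);     /* half vs int, via the operator> overload above */
    bool b = (2.0 >= h);  /* double vs half, via the operator>= overload above */
    return a && b;
}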
@ -0,0 +1,295 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP |
||||
|
||||
#include "error.hpp" |
||||
#include "pointer.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <cuda_runtime_api.h> |
||||
|
||||
#include <cstddef> |
||||
#include <type_traits> |
||||
#include <memory> |
||||
#include <utility> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { |
||||
|
||||
/* @brief smart device pointer with allocation/deallocation methods
|
||||
* |
||||
* ManagedPtr is a smart shared device pointer which also handles memory allocation. |
||||
*/ |
||||
template <class T> |
||||
class ManagedPtr { |
||||
static_assert(!std::is_const<T>::value && !std::is_volatile<T>::value, "T cannot be cv-qualified"); |
||||
static_assert(std::is_standard_layout<T>::value, "T must satisfy StandardLayoutType"); |
||||
|
||||
public: |
||||
using element_type = T; |
||||
|
||||
using pointer = DevicePtr<element_type>; |
||||
using const_pointer = DevicePtr<typename std::add_const<element_type>::type>; |
||||
|
||||
using size_type = std::size_t; |
||||
|
||||
ManagedPtr() noexcept : wrapped{ nullptr }, n{ 0 }, capacity{ 0 } { } |
||||
ManagedPtr(const ManagedPtr&) noexcept = default; |
||||
ManagedPtr(ManagedPtr&& other) noexcept |
||||
: wrapped{ std::move(other.wrapped) }, n{ other.n }, capacity { other.capacity } |
||||
{ |
||||
other.reset(); |
||||
} |
||||
|
||||
/** allocates device memory for \p count number of element */ |
||||
ManagedPtr(size_type count) { |
||||
if (count <= 0) { |
||||
CV_Error(Error::StsBadArg, "number of elements is zero or negative"); |
||||
} |
||||
|
||||
void* temp = nullptr; |
||||
CUDA4DNN_CHECK_CUDA(cudaMalloc(&temp, count * sizeof(element_type))); |
||||
|
||||
auto ptr = typename pointer::pointer(static_cast<element_type*>(temp)); |
||||
wrapped.reset(ptr, [](element_type* ptr) { |
||||
if (ptr != nullptr) { |
||||
/* contract violation for std::shared_ptr if cudaFree throws */ |
||||
try { |
||||
CUDA4DNN_CHECK_CUDA(cudaFree(ptr)); |
||||
} catch (const CUDAException& ex) { |
||||
std::ostringstream os; |
||||
os << "Device memory deallocation failed in deleter.\n"; |
||||
os << ex.what(); |
||||
os << "Exception will be ignored.\n"; |
||||
CV_LOG_WARNING(0, os.str().c_str()); |
||||
} |
||||
} |
||||
}); |
||||
/* std::shared_ptr<T>::reset invokes the deleter if an exception occurs; hence, we don't
 * need to have a try-catch block to free the allocated device memory
 */
||||
|
||||
n = capacity = count; |
||||
} |
||||
|
||||
ManagedPtr& operator=(ManagedPtr&& other) noexcept { |
||||
wrapped = std::move(other.wrapped); |
||||
n = other.n; |
||||
capacity = other.capacity; |
||||
|
||||
other.reset(); |
||||
return *this; |
||||
} |
||||
|
||||
size_type size() const noexcept { return n; } |
||||
|
||||
void reset() noexcept { wrapped.reset(); n = capacity = 0; } |
||||
|
||||
/**
|
||||
* deallocates any previously allocated memory and allocates device memory |
||||
* for \p count number of elements |
||||
* |
||||
* @note no reallocation if this instance is the sole owner of the previously allocated memory and the requested size fits in it
||||
* @note use move constructor to guarantee a deallocation of the previously allocated memory |
||||
* |
||||
* Exception Guarantee: Strong |
||||
*/ |
||||
void reset(size_type count) { |
||||
/* we need to fully own the memory to perform optimizations */ |
||||
if (wrapped.use_count() == 1) { |
||||
/* avoid reallocation if the existing capacity is sufficient */ |
||||
if (count <= capacity) { |
||||
n = count; |
||||
return; |
||||
} |
||||
} |
||||
|
||||
/* no optimization performed; allocate memory */ |
||||
ManagedPtr tmp(count); |
||||
swap(tmp, *this); |
||||
} |
||||
|
||||
pointer get() const noexcept { return pointer(wrapped.get()); } |
||||
|
||||
explicit operator bool() const noexcept { return wrapped; } |
||||
|
||||
friend bool operator==(const ManagedPtr& lhs, const ManagedPtr& rhs) noexcept { return lhs.wrapped == rhs.wrapped; } |
||||
friend bool operator!=(const ManagedPtr& lhs, const ManagedPtr& rhs) noexcept { return lhs.wrapped != rhs.wrapped; } |
||||
|
||||
friend void swap(ManagedPtr& lhs, ManagedPtr& rhs) noexcept { |
||||
using std::swap; |
||||
swap(lhs.wrapped, rhs.wrapped); |
||||
swap(lhs.n, rhs.n); |
||||
swap(lhs.capacity, rhs.capacity); |
||||
} |
||||
|
||||
private: |
||||
std::shared_ptr<element_type> wrapped; |
||||
size_type n, capacity; |
||||
}; |
||||
|
||||
/** copies entire memory block pointed by \p src to \p dest
|
||||
* |
||||
* \param[in] src device pointer |
||||
* \param[out] dest host pointer |
||||
* |
||||
* Pre-conditions: |
||||
* - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src |
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
template <class T> |
||||
void memcpy(T *dest, const ManagedPtr<T>& src) { |
||||
memcpy<T>(dest, src.get(), src.size()); |
||||
} |
||||
|
||||
/** copies data from memory pointed by \p src to fully fill \p dest
|
||||
* |
||||
* \param[in] src host pointer |
||||
* \param[out] dest device pointer |
||||
* |
||||
* Pre-conditions: |
||||
* - memory pointed by \p src must be at least as big as the memory block held by \p dest |
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
template <class T> |
||||
void memcpy(const ManagedPtr<T>& dest, const T* src) { |
||||
memcpy<T>(dest.get(), src, dest.size()); |
||||
} |
||||
|
||||
/** copies data from memory pointed by \p src to \p dest
|
||||
* |
||||
* if the two \p src and \p dest have different sizes, the number of elements copied is |
||||
* equal to the size of the smaller memory block |
||||
* |
||||
* \param[in] src device pointer |
||||
* \param[out] dest device pointer |
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
template <class T> |
||||
void memcpy(const ManagedPtr<T>& dest, const ManagedPtr<T>& src) { |
||||
memcpy<T>(dest.get(), src.get(), std::min(dest.size(), src.size())); |
||||
} |
||||
|
||||
/** sets device memory block to a specific 8-bit value
 *
 * \param[out] dest device pointer
 * \param[in] ch 8-bit value to fill the device memory with
 *
 * Exception Guarantee: Basic
 */
||||
template <class T> |
||||
void memset(const ManagedPtr<T>& dest, std::int8_t ch) { |
||||
memset<T>(dest.get(), ch, dest.size()); |
||||
} |
||||
|
||||
/** copies entire memory block pointed by \p src to \p dest asynchronously
|
||||
* |
||||
* \param[in] src device pointer |
||||
* \param[out] dest host pointer |
||||
* \param stream CUDA stream that has to be used for the memory transfer |
||||
* |
||||
* Pre-conditions: |
||||
* - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src |
||||
* - \p dest points to page-locked memory |
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
template <class T> |
||||
void memcpy(T *dest, const ManagedPtr<T>& src, const Stream& stream) { |
||||
CV_Assert(stream); |
||||
memcpy<T>(dest, src.get(), src.size(), stream); |
||||
} |
||||
|
||||
/** copies data from memory pointed by \p src to \p dest asynchronously
|
||||
* |
||||
* \param[in] src host pointer |
||||
* \param[out] dest device pointer |
||||
* \param stream CUDA stream that has to be used for the memory transfer |
||||
* |
||||
* Pre-conditions: |
||||
* - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src |
||||
* - \p src points to page-locked memory |
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
template <class T> |
||||
void memcpy(const ManagedPtr<T>& dest, const T* src, const Stream& stream) { |
||||
CV_Assert(stream); |
||||
memcpy<T>(dest.get(), src, dest.size(), stream); |
||||
} |
||||
|
||||
/** copies data from memory pointed by \p src to \p dest asynchronously
|
||||
* |
||||
* \param[in] src device pointer |
||||
* \param[out] dest device pointer |
||||
* \param stream CUDA stream that has to be used for the memory transfer |
||||
* |
||||
* if the two \p src and \p dest have different sizes, the number of elements copied is |
||||
* equal to the size of the smaller memory block |
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
template <class T> |
||||
void memcpy(ManagedPtr<T>& dest, const ManagedPtr<T>& src, const Stream& stream) { |
||||
CV_Assert(stream); |
||||
memcpy<T>(dest.get(), src.get(), std::min(dest.size(), src.size()), stream); |
||||
} |
||||
|
||||
/** sets device memory block to a specific 8-bit value asynchronously
 *
 * \param[out] dest device pointer
 * \param[in] ch 8-bit value to fill the device memory with
 * \param stream CUDA stream that has to be used for the memory operation
 *
 * Exception Guarantee: Basic
 */
||||
template <class T> |
||||
void memset(const ManagedPtr<T>& dest, int ch, const Stream& stream) { |
||||
CV_Assert(stream); |
||||
memset<T>(dest.get(), ch, dest.size(), stream); |
||||
} |
||||
|
||||
/** @brief registers host memory as page-locked and unregisters on destruction */ |
||||
class MemoryLockGuard { |
||||
public: |
||||
MemoryLockGuard() noexcept : ptr { nullptr } { } |
||||
MemoryLockGuard(const MemoryLockGuard&) = delete; |
||||
MemoryLockGuard(MemoryLockGuard&& other) noexcept : ptr{ other.ptr } { |
||||
other.ptr = nullptr; |
||||
} |
||||
|
||||
/** page-locks \p size_in_bytes bytes of memory starting from \p ptr_
|
||||
* |
||||
* Pre-conditions:
||||
* - host memory should be unregistered |
||||
*/ |
||||
MemoryLockGuard(void* ptr_, std::size_t size_in_bytes) { |
||||
CUDA4DNN_CHECK_CUDA(cudaHostRegister(ptr_, size_in_bytes, cudaHostRegisterPortable)); |
||||
ptr = ptr_; |
||||
} |
||||
|
||||
MemoryLockGuard& operator=(const MemoryLockGuard&) = delete; |
||||
MemoryLockGuard& operator=(MemoryLockGuard&& other) noexcept { |
||||
ptr = other.ptr; |
||||
other.ptr = nullptr; |
||||
return *this; |
||||
} |
||||
|
||||
~MemoryLockGuard() { |
||||
if(ptr != nullptr) |
||||
CUDA4DNN_CHECK_CUDA(cudaHostUnregister(ptr)); |
||||
} |
||||
|
||||
private: |
||||
void *ptr; |
||||
}; |
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::csl */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP */ |
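The pieces above are easiest to understand together. The following is an illustrative sketch (not part of the patch) of how a host buffer could be uploaded and downloaded using ManagedPtr, MemoryLockGuard and a non-default Stream; the include paths and the helper function name are assumptions.

#include <vector>
#include "memory.hpp"   /* assumed relative include path */
#include "stream.hpp"

void upload_download_sketch() {
    using namespace cv::dnn::cuda4dnn::csl;

    std::vector<float> host(1024, 1.0f);

    ManagedPtr<float> device(host.size());    /* allocates device memory for 1024 floats */
    Stream stream(true);                      /* non-default, non-blocking stream */

    /* page-lock the host buffer so that the asynchronous copies below do not fall back
     * to synchronous behaviour; the memory is unregistered when the guard is destroyed */
    MemoryLockGuard lock(host.data(), host.size() * sizeof(float));

    memcpy(device, host.data(), stream);      /* host -> device (asynchronous) */
    memcpy(host.data(), device, stream);      /* device -> host (asynchronous) */
    stream.synchronize();                     /* wait for both transfers to complete */
}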
@@ -0,0 +1,20 @@
||||
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_NVCC_DEFS_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_NVCC_DEFS_HPP

#include <cuda_runtime_api.h>

#ifdef __CUDACC__
# define CUDA4DNN_HOST __host__
# define CUDA4DNN_DEVICE __device__
# define CUDA4DNN_HOST_DEVICE CUDA4DNN_HOST CUDA4DNN_DEVICE
#else
# define CUDA4DNN_HOST
# define CUDA4DNN_DEVICE
# define CUDA4DNN_HOST_DEVICE
#endif

#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_NVCC_DEFS_HPP */
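For illustration only (a hypothetical helper, not part of the patch), a function that must compile both as device code under nvcc and as ordinary host code elsewhere would be annotated with these macros:

#include "nvcc_defs.hpp"

/* expands to __host__ __device__ when compiled by nvcc; expands to nothing otherwise */
template <class T>
CUDA4DNN_HOST_DEVICE T clamp_value(T value, T lo, T hi) {
    return value < lo ? lo : (hi < value ? hi : value);
}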
@@ -0,0 +1,411 @@
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_POINTER_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_POINTER_HPP |
||||
|
||||
#include "nvcc_defs.hpp" |
||||
#include "error.hpp" |
||||
#include "stream.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <cuda_runtime_api.h> |
||||
|
||||
#include <cstddef>
#include <cstdint>
#include <type_traits>
#include <ostream>
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { |
||||
|
||||
/** @brief provides a type-safe device pointer
|
||||
* |
||||
* DevicePtr wraps a raw pointer and mimics its behaviour. It does not implicitly convert |
||||
* to a raw pointer. This ensures that accidental mixing of host and device pointers does not happen.
||||
* |
||||
* It is meant to point to locations in device memory. Hence, it provides dereferencing or |
||||
* array subscript capability for device code only. |
||||
* |
||||
* A `const DevicePtr<T>` represents an immutable pointer to mutable memory.
* A `DevicePtr<const T>` represents a mutable pointer to immutable memory.
* A `const DevicePtr<const T>` represents an immutable pointer to immutable memory.
||||
* |
||||
* A `DevicePtr<T>` can implicitly convert to `DevicePtr<const T>`. |
||||
* |
||||
* Specializations:
||||
* - DevicePtr<void>/DevicePtr<const void> do not support pointer arithmetic (but relational operators are provided) |
||||
* - any device pointer pointing to mutable memory is implicitly convertible to DevicePtr<void> |
||||
* - any device pointer is implicitly convertible to DevicePtr<const void> |
||||
* - DevicePtr<void> can be explicitly converted to any device pointer |
||||
* - DevicePtr<const void> can be explicitly converted to any device pointer pointing to immutable memory |
||||
*/ |
||||
template <class T> |
||||
class DevicePtr { |
||||
static_assert(std::is_standard_layout<T>::value, "T must satisfy StandardLayoutType"); |
||||
|
||||
public: |
||||
using element_type = T; |
||||
using difference_type = std::ptrdiff_t; |
||||
using pointer = typename std::add_pointer<element_type>::type; |
||||
using reference = typename std::add_lvalue_reference<element_type>::type; |
||||
|
||||
DevicePtr() = default; |
||||
CUDA4DNN_HOST_DEVICE explicit DevicePtr(pointer ptr_) noexcept : ptr{ ptr_ } { } |
||||
|
||||
CUDA4DNN_HOST_DEVICE DevicePtr operator=(pointer ptr_) noexcept { ptr = ptr_; return *this; } |
||||
|
||||
CUDA4DNN_HOST_DEVICE pointer get() const noexcept { return ptr; }; |
||||
|
||||
CUDA4DNN_DEVICE reference operator[](difference_type idx) const noexcept { return get()[idx]; } |
||||
CUDA4DNN_DEVICE reference operator*() const noexcept { return *get(); } |
||||
CUDA4DNN_DEVICE pointer operator->() const noexcept { return get(); } |
||||
|
||||
template<class U = T, typename std::enable_if<!std::is_const<U>::value, bool>::type = true> |
||||
CUDA4DNN_HOST_DEVICE operator DevicePtr<typename std::add_const<U>::type>() const noexcept { |
||||
return DevicePtr<typename std::add_const<U>::type>{ptr}; |
||||
} |
||||
|
||||
CUDA4DNN_HOST_DEVICE explicit operator bool() const noexcept { return ptr; } |
||||
|
||||
CUDA4DNN_HOST_DEVICE DevicePtr operator++() noexcept { |
||||
++ptr; |
||||
return *this; |
||||
} |
||||
|
||||
CUDA4DNN_HOST_DEVICE DevicePtr operator++(int) noexcept { |
||||
auto tmp = DevicePtr(*this); |
||||
ptr++; |
||||
return tmp; |
||||
} |
||||
|
||||
CUDA4DNN_HOST_DEVICE DevicePtr operator--() noexcept { |
||||
--ptr; |
||||
return *this; |
||||
} |
||||
|
||||
CUDA4DNN_HOST_DEVICE DevicePtr operator--(int) noexcept { |
||||
auto tmp = DevicePtr(*this); |
||||
ptr--; |
||||
return tmp; |
||||
} |
||||
|
||||
CUDA4DNN_HOST_DEVICE DevicePtr operator+=(std::ptrdiff_t offset) noexcept { |
||||
ptr += offset; |
||||
return *this; |
||||
} |
||||
|
||||
CUDA4DNN_HOST_DEVICE DevicePtr operator-=(std::ptrdiff_t offset) noexcept { |
||||
ptr -= offset; |
||||
return *this; |
||||
} |
||||
|
||||
CUDA4DNN_HOST_DEVICE friend DevicePtr operator+(DevicePtr lhs, std::ptrdiff_t offset) noexcept { |
||||
return lhs += offset; |
||||
} |
||||
|
||||
CUDA4DNN_HOST_DEVICE friend DevicePtr operator-(DevicePtr lhs, std::ptrdiff_t offset) noexcept { |
||||
return lhs -= offset; |
||||
} |
||||
|
||||
CUDA4DNN_HOST_DEVICE friend difference_type operator-(DevicePtr lhs, DevicePtr rhs) noexcept { |
||||
return lhs.ptr - rhs.ptr; |
||||
} |
||||
|
||||
CUDA4DNN_HOST_DEVICE friend bool operator==(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr == rhs.ptr; } |
||||
CUDA4DNN_HOST_DEVICE friend bool operator!=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs == rhs); } |
||||
CUDA4DNN_HOST_DEVICE friend bool operator<(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr < rhs.ptr; } |
||||
CUDA4DNN_HOST_DEVICE friend bool operator>(DevicePtr lhs, DevicePtr rhs) noexcept { return rhs < lhs; } |
||||
CUDA4DNN_HOST_DEVICE friend bool operator<=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(rhs < lhs); } |
||||
CUDA4DNN_HOST_DEVICE friend bool operator>=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs < rhs); } |
||||
|
||||
CUDA4DNN_HOST_DEVICE explicit operator pointer() const noexcept { return ptr; } |
||||
|
||||
CUDA4DNN_HOST friend void swap(DevicePtr& lhs, DevicePtr& rhs) noexcept { |
||||
using std::swap; |
||||
swap(lhs.ptr, rhs.ptr); |
||||
} |
||||
|
||||
template <class U, class V> |
||||
CUDA4DNN_HOST friend std::basic_ostream<U, V>& operator<<(std::basic_ostream<U, V>& os, DevicePtr other) { |
||||
os << other.get() << " (device)"; |
||||
return os; |
||||
} |
||||
|
||||
private: |
||||
pointer ptr; |
||||
}; |
||||
|
||||
template <> |
||||
class DevicePtr<const void> { |
||||
public: |
||||
using element_type = const void; |
||||
using pointer = typename std::add_pointer<element_type>::type; |
||||
|
||||
DevicePtr() = default; |
||||
|
||||
/* host const void pointer to const void device pointer */ |
||||
CUDA4DNN_HOST_DEVICE explicit DevicePtr(pointer ptr_) noexcept : ptr{ ptr_ } { } |
||||
|
||||
/* allow any device pointer to be implicitly converted to a const void device pointer */
||||
template <class T> |
||||
CUDA4DNN_HOST_DEVICE DevicePtr(DevicePtr<T> ptr_) noexcept : ptr{ ptr_.get() } { } |
||||
|
||||
CUDA4DNN_HOST_DEVICE DevicePtr operator=(pointer ptr_) noexcept { ptr = ptr_; return *this; } |
||||
|
||||
CUDA4DNN_HOST_DEVICE pointer get() const noexcept { return ptr; }; |
||||
|
||||
CUDA4DNN_HOST_DEVICE explicit operator bool() const noexcept { return ptr; } |
||||
|
||||
CUDA4DNN_HOST_DEVICE friend bool operator==(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr == rhs.ptr; } |
||||
CUDA4DNN_HOST_DEVICE friend bool operator!=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs == rhs); } |
||||
CUDA4DNN_HOST_DEVICE friend bool operator<(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr < rhs.ptr; } |
||||
CUDA4DNN_HOST_DEVICE friend bool operator>(DevicePtr lhs, DevicePtr rhs) noexcept { return rhs < lhs; } |
||||
CUDA4DNN_HOST_DEVICE friend bool operator<=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(rhs < lhs); } |
||||
CUDA4DNN_HOST_DEVICE friend bool operator>=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs < rhs); } |
||||
|
||||
/* explicit conversion into host void pointer */ |
||||
CUDA4DNN_HOST_DEVICE explicit operator pointer() const noexcept { return ptr; } |
||||
|
||||
/* const void device pointer can be explicitly casted into any const device pointer type */ |
||||
template <class T, typename std::enable_if<std::is_const<T>::value, bool>::type = true> |
||||
CUDA4DNN_HOST_DEVICE explicit operator DevicePtr<T>() const noexcept { |
||||
return DevicePtr<T>(static_cast<T*>(ptr));
||||
} |
||||
|
||||
CUDA4DNN_HOST friend void swap(DevicePtr& lhs, DevicePtr& rhs) noexcept { |
||||
using std::swap; |
||||
swap(lhs.ptr, rhs.ptr); |
||||
} |
||||
|
||||
template <class U, class V> |
||||
CUDA4DNN_HOST friend std::basic_ostream<U, V>& operator<<(std::basic_ostream<U, V>& os, DevicePtr other) { |
||||
os << other.get() << " (device)"; |
||||
return os; |
||||
} |
||||
|
||||
private: |
||||
pointer ptr; |
||||
}; |
||||
|
||||
template <> |
||||
class DevicePtr<void> { |
||||
public: |
||||
using element_type = void; |
||||
using pointer = typename std::add_pointer<element_type>::type; |
||||
|
||||
DevicePtr() = default; |
||||
|
||||
/* host pointer to device pointer */ |
||||
CUDA4DNN_HOST_DEVICE explicit DevicePtr(pointer ptr_) noexcept : ptr{ ptr_ } { } |
||||
|
||||
/* allow any device pointer to mutable memory to be implicitly converted to void device pointer */
||||
template <class T, typename std::enable_if<!std::is_const<T>::value, bool>::type = false> |
||||
CUDA4DNN_HOST_DEVICE DevicePtr(DevicePtr<T> ptr_) noexcept : ptr { ptr_.get() } { } |
||||
|
||||
CUDA4DNN_HOST_DEVICE DevicePtr operator=(pointer ptr_) noexcept { ptr = ptr_; return *this; } |
||||
|
||||
CUDA4DNN_HOST_DEVICE pointer get() const noexcept { return ptr; }; |
||||
|
||||
CUDA4DNN_HOST_DEVICE operator DevicePtr<const void>() const noexcept { return DevicePtr<const void>{ptr}; } |
||||
|
||||
CUDA4DNN_HOST_DEVICE explicit operator bool() const noexcept { return ptr; } |
||||
|
||||
CUDA4DNN_HOST_DEVICE friend bool operator==(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr == rhs.ptr; } |
||||
CUDA4DNN_HOST_DEVICE friend bool operator!=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs == rhs); } |
||||
CUDA4DNN_HOST_DEVICE friend bool operator<(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr < rhs.ptr; } |
||||
CUDA4DNN_HOST_DEVICE friend bool operator>(DevicePtr lhs, DevicePtr rhs) noexcept { return rhs < lhs; } |
||||
CUDA4DNN_HOST_DEVICE friend bool operator<=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(rhs < lhs); } |
||||
CUDA4DNN_HOST_DEVICE friend bool operator>=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs < rhs); } |
||||
|
||||
/* explicit conversion into host void pointer */ |
||||
CUDA4DNN_HOST_DEVICE explicit operator pointer() const noexcept { return ptr; } |
||||
|
||||
/* void device pointer can be explicitly casted into any device pointer type */ |
||||
template <class T> |
||||
CUDA4DNN_HOST_DEVICE explicit operator DevicePtr<T>() const noexcept { |
||||
return DevicePtr<T>(static_cast<T*>(ptr)); |
||||
} |
||||
|
||||
CUDA4DNN_HOST friend void swap(DevicePtr& lhs, DevicePtr& rhs) noexcept { |
||||
using std::swap; |
||||
swap(lhs.ptr, rhs.ptr); |
||||
} |
||||
|
||||
template <class U, class V> |
||||
CUDA4DNN_HOST friend std::basic_ostream<U, V>& operator<<(std::basic_ostream<U, V>& os, DevicePtr other) { |
||||
os << other.get() << " (device)"; |
||||
return os; |
||||
} |
||||
|
||||
private: |
||||
pointer ptr; |
||||
}; |
||||
|
||||
template <class T> |
||||
bool is_aligned(DevicePtr<const T> ptr, std::size_t alignment) { |
||||
auto addr = reinterpret_cast<std::intptr_t>(ptr.get()); |
||||
return addr % alignment == 0; |
||||
} |
||||
|
||||
/** copies \p n elements from \p src to \p dest
|
||||
* |
||||
* \param[in] src device pointer |
||||
* \param[out] dest host pointer |
||||
* |
||||
* Pre-conditions: |
||||
* - memory pointed by \p dest and \p src must be large enough to hold \p n elements |
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
template <class T> |
||||
void memcpy(T *dest, DevicePtr<const T> src, std::size_t n) { |
||||
if (n <= 0) { |
||||
CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
||||
} |
||||
|
||||
CUDA4DNN_CHECK_CUDA(cudaMemcpy(dest, src.get(), n * sizeof(T), cudaMemcpyDefault)); |
||||
} |
||||
|
||||
/** copies \p n elements from \p src to \p dest
|
||||
* |
||||
* \param[in] src host pointer |
||||
* \param[out] dest device pointer |
||||
* |
||||
* Pre-conditions: |
||||
* - memory pointed by \p dest and \p src must be large enough to hold \p n elements |
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
template <class T> |
||||
void memcpy(DevicePtr<T> dest, const T* src, std::size_t n) { |
||||
if (n <= 0) { |
||||
CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
||||
} |
||||
|
||||
CUDA4DNN_CHECK_CUDA(cudaMemcpy(dest.get(), src, n * sizeof(T), cudaMemcpyDefault)); |
||||
} |
||||
|
||||
/** copies \p n elements from \p src to \p dest
|
||||
* |
||||
* \param[in] src device pointer |
||||
* \param[out] dest device pointer |
||||
* |
||||
* Pre-conditions: |
||||
* - memory pointed by \p dest and \p src must be large enough to hold \p n elements |
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
template <class T> |
||||
void memcpy(DevicePtr<T> dest, DevicePtr<const T> src, std::size_t n) { |
||||
if (n <= 0) { |
||||
CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
||||
} |
||||
|
||||
CUDA4DNN_CHECK_CUDA(cudaMemcpy(dest.get(), src.get(), n * sizeof(T), cudaMemcpyDefault)); |
||||
} |
||||
|
||||
/** sets \p n elements to \p ch in \p dest
 *
 * \param[out] dest device pointer
 * \param[in] ch 8-bit value to fill the device memory with
 *
 * Pre-conditions:
 * - memory pointed by \p dest must be large enough to hold \p n elements
 *
 * Exception Guarantee: Basic
 */
||||
template <class T> |
||||
void memset(DevicePtr<T> dest, std::int8_t ch, std::size_t n) { |
||||
if (n <= 0) { |
||||
CV_Error(Error::StsBadArg, "number of elements to set is zero or negative");
||||
} |
||||
|
||||
CUDA4DNN_CHECK_CUDA(cudaMemset(dest.get(), ch, n * sizeof(T))); |
||||
} |
||||
|
||||
/** copies \p n elements from \p src to \p dest asynchronously
|
||||
* |
||||
* \param[in] src device pointer |
||||
* \param[out] dest host pointer |
||||
* \param stream CUDA stream that has to be used for the memory transfer |
||||
* |
||||
* Pre-conditions: |
||||
* - memory pointed by \p dest and \p src must be large enough to hold \p n elements |
||||
* - \p dest points to page-locked memory |
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
template <class T> |
||||
void memcpy(T *dest, DevicePtr<const T> src, std::size_t n, const Stream& stream) { |
||||
if (n <= 0) { |
||||
CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
||||
} |
||||
|
||||
CUDA4DNN_CHECK_CUDA(cudaMemcpyAsync(dest, src.get(), n * sizeof(T), cudaMemcpyDefault, stream.get())); |
||||
} |
||||
|
||||
/** copies data from memory pointed by \p src to \p dest asynchronously
|
||||
* |
||||
* \param[in] src host pointer |
||||
* \param[out] dest device pointer |
||||
* \param stream CUDA stream that has to be used for the memory transfer |
||||
* |
||||
* Pre-conditions: |
||||
* - memory pointed by \p dest and \p src must be large enough to hold \p n elements |
||||
* - \p src points to page-locked memory |
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
template <class T> |
||||
void memcpy(DevicePtr<T> dest, const T *src, std::size_t n, const Stream& stream) { |
||||
if (n <= 0) { |
||||
CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
||||
} |
||||
|
||||
CUDA4DNN_CHECK_CUDA(cudaMemcpyAsync(dest.get(), src, n * sizeof(T), cudaMemcpyDefault, stream.get())); |
||||
} |
||||
|
||||
/** copies \p n elements from \p src to \p dest asynchronously
|
||||
* |
||||
* \param[in] src device pointer |
||||
* \param[out] dest device pointer |
||||
* \param stream CUDA stream that has to be used for the memory transfer |
||||
* |
||||
* Pre-conditions: |
||||
* - memory pointed by \p dest and \p src must be large enough to hold \p n elements |
||||
* |
||||
* Exception Guarantee: Basic |
||||
*/ |
||||
template <class T> |
||||
void memcpy(DevicePtr<T> dest, DevicePtr<const T> src, std::size_t n, const Stream& stream) { |
||||
if (n <= 0) { |
||||
CV_Error(Error::StsBadArg, "number of elements to copy is zero or negative");
||||
} |
||||
|
||||
CUDA4DNN_CHECK_CUDA(cudaMemcpyAsync(dest.get(), src.get(), n * sizeof(T), cudaMemcpyDefault, stream.get())); |
||||
} |
||||
|
||||
/** sets \p n elements to \p ch in \p dest asynchronously
 *
 * \param[out] dest device pointer
 * \param[in] ch 8-bit value to fill the device memory with
 * \param stream CUDA stream that has to be used for the memory operation
 *
 * Pre-conditions:
 * - memory pointed by \p dest must be large enough to hold \p n elements
 *
 * Exception Guarantee: Basic
 */
||||
template <class T> |
||||
void memset(DevicePtr<T> dest, std::int8_t ch, std::size_t n, const Stream& stream) { |
||||
if (n <= 0) { |
||||
CV_Error(Error::StsBadArg, "number of elements to set is zero or negative");
||||
} |
||||
|
||||
CUDA4DNN_CHECK_CUDA(cudaMemsetAsync(dest.get(), ch, n * sizeof(T), stream.get())); |
||||
} |
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::csl */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_POINTER_HPP */ |
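A short sketch (not part of the patch) of the intended usage: raw device memory obtained from the CUDA runtime is wrapped immediately, after which only type-safe operations are possible. The explicit cudaMalloc/cudaFree calls and the function name are illustrative; the rest of the PR allocates through ManagedPtr instead.

#include <cuda_runtime_api.h>
#include <vector>
#include "pointer.hpp"   /* assumed relative include path */

void device_pointer_sketch() {
    using namespace cv::dnn::cuda4dnn::csl;

    float* raw = nullptr;
    cudaMalloc(&raw, 256 * sizeof(float));

    DevicePtr<float> dptr(raw);               /* explicit construction from a raw pointer */

    std::vector<float> host(256, 1.0f);
    memcpy(dptr, host.data(), host.size());   /* host -> device, 256 elements */
    memset(dptr, 0, host.size());             /* fill the block with zero bytes */

    DevicePtr<void> erased = dptr;            /* implicit: any mutable device pointer -> void */
    /* float* p = dptr;                          does not compile: no implicit conversion
     *                                           back to a raw (host-usable) pointer      */
    (void)erased;
    cudaFree(raw);
}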
@@ -0,0 +1,83 @@
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_SPAN_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_SPAN_HPP |
||||
|
||||
#include "pointer.hpp" |
||||
#include "nvcc_defs.hpp" |
||||
|
||||
#include <cstddef> |
||||
#include <type_traits> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { |
||||
|
||||
/** @brief provides non-owning mutable access for device arrays
|
||||
* |
||||
* const Span<T>/Span<T> provides mutable access to the elements unless T is const qualified |
||||
* const Span<T> makes the span immutable but not the elements |
||||
*/ |
||||
template <class T> |
||||
class Span { |
||||
static_assert(std::is_standard_layout<T>::value, "T must satisfy StandardLayoutType"); |
||||
|
||||
public: |
||||
using value_type = T; |
||||
using size_type = std::size_t; |
||||
using difference_type = std::ptrdiff_t; |
||||
|
||||
using pointer = DevicePtr<value_type>; |
||||
using const_pointer = DevicePtr<typename std::add_const<value_type>::type>; |
||||
using reference = typename std::add_lvalue_reference<value_type>::type; |
||||
using const_reference = typename std::add_lvalue_reference<typename std::add_const<value_type>::type>::type;
||||
|
||||
using iterator = pointer; |
||||
using const_iterator = const_pointer; |
||||
|
||||
Span() noexcept : ptr{ nullptr }, sz{ 0 } { } |
||||
CUDA4DNN_HOST_DEVICE Span(pointer first, pointer last) noexcept : ptr{ first }, sz{ last - first } { } |
||||
CUDA4DNN_HOST_DEVICE Span(pointer first, size_type count) noexcept : ptr{ first }, sz{ count } { } |
||||
|
||||
CUDA4DNN_HOST_DEVICE size_type size() const noexcept { return sz; } |
||||
CUDA4DNN_HOST_DEVICE bool empty() const noexcept { return size() == 0; } |
||||
|
||||
CUDA4DNN_DEVICE reference operator[](difference_type index) const { return ptr[index]; } |
||||
CUDA4DNN_HOST_DEVICE pointer data() const noexcept { return ptr; } |
||||
|
||||
template<class U = T, class V = typename std::add_const<U>::type, |
||||
typename std::enable_if<!std::is_const<U>::value, bool>::type = true> |
||||
CUDA4DNN_HOST_DEVICE operator Span<V>() const noexcept { return Span<V>{ptr, sz}; } |
||||
|
||||
private: |
||||
pointer ptr; |
||||
size_type sz; |
||||
}; |
||||
|
||||
/** @brief provides non-owning immutable view for device arrays */ |
||||
template <class T> |
||||
using View = Span<const T>; |
||||
|
||||
/** returns true if the address of a span/view is aligned to \p alignment number of elements (not bytes) */ |
||||
template <class T> |
||||
bool is_address_aligned(View<T> v, std::size_t alignment) { |
||||
return is_aligned(v.data(), alignment * sizeof(T)); |
||||
} |
||||
|
||||
/** returns true if the size of a span/view is a multiple of \p alignment */ |
||||
template <class T> |
||||
bool is_size_aligned(View<T> v, std::size_t alignment) { |
||||
return v.size() % alignment == 0; |
||||
} |
||||
|
||||
/** @brief returns true if the address and the size of the span/view are aligned
|
||||
* \p alignment refers to the number of elements (not bytes) |
||||
*/ |
||||
template <class T> |
||||
bool is_fully_aligned(View<T> v, std::size_t alignment) { |
||||
return is_address_aligned(v, alignment) && is_size_aligned(v, alignment); |
||||
} |
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::csl */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_SPAN_HPP */ |
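The alignment helpers are what the vectorized kernels elsewhere in this PR use to decide whether a wider vector type can be used. A hypothetical host-side check (the namespace and function name are illustrative) might look like this:

#include "span.hpp"

namespace example {
    using namespace cv::dnn::cuda4dnn::csl;

    /* true if both buffers can be processed with 4-element vector loads/stores */
    template <class T>
    bool can_use_vector4(View<T> input, Span<T> output) {
        return is_fully_aligned<T>(input, 4) && is_fully_aligned<T>(output, 4);
    }
}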
@@ -0,0 +1,118 @@
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_STREAM_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_STREAM_HPP |
||||
|
||||
#include "error.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
#include <opencv2/core/utils/logger.hpp> |
||||
|
||||
#include <cuda_runtime_api.h> |
||||
|
||||
#include <memory> |
||||
#include <sstream> |
||||
#include <utility> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { |
||||
|
||||
/** @brief noncopyable smart CUDA stream
|
||||
* |
||||
* UniqueStream is a smart non-sharable wrapper for a CUDA stream handle which ensures that
||||
* the handle is destroyed after use. Unless explicitly specified by a constructor argument, |
||||
* the stream object represents the default stream. |
||||
*/ |
||||
class UniqueStream { |
||||
public: |
||||
UniqueStream() noexcept : stream{ 0 } { } |
||||
UniqueStream(UniqueStream&) = delete; |
||||
UniqueStream(UniqueStream&& other) noexcept { |
||||
stream = other.stream; |
||||
other.stream = 0; |
||||
} |
||||
|
||||
UniqueStream(bool create) : stream{ 0 } { |
||||
if (create) { |
||||
CUDA4DNN_CHECK_CUDA(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); |
||||
} |
||||
} |
||||
|
||||
~UniqueStream() { |
||||
try { |
||||
if (stream != 0) |
||||
CUDA4DNN_CHECK_CUDA(cudaStreamDestroy(stream)); |
||||
} catch (const CUDAException& ex) { |
||||
std::ostringstream os; |
||||
os << "Asynchronous exception caught during CUDA stream destruction.\n"; |
||||
os << ex.what(); |
||||
os << "Exception will be ignored.\n"; |
||||
CV_LOG_WARNING(0, os.str().c_str()); |
||||
} |
||||
} |
||||
|
||||
UniqueStream& operator=(const UniqueStream&) = delete; |
||||
UniqueStream& operator=(UniqueStream&& other) noexcept { |
||||
stream = other.stream; |
||||
other.stream = 0; |
||||
return *this; |
||||
} |
||||
|
||||
/** returns the raw CUDA stream handle */ |
||||
cudaStream_t get() const noexcept { return stream; } |
||||
|
||||
void synchronize() const { CUDA4DNN_CHECK_CUDA(cudaStreamSynchronize(stream)); } |
||||
bool busy() const { |
||||
auto status = cudaStreamQuery(stream); |
||||
if (status == cudaErrorNotReady) |
||||
return true; |
||||
CUDA4DNN_CHECK_CUDA(status); |
||||
return false; |
||||
} |
||||
|
||||
private: |
||||
cudaStream_t stream; |
||||
}; |
||||
|
||||
/** @brief sharable smart CUDA stream
|
||||
* |
||||
* Stream is a smart sharable wrapper for a CUDA stream handle which ensures that
||||
* the handle is destroyed after use. Unless explicitly specified by a constructor argument, |
||||
* the stream object represents the default stream. |
||||
* |
||||
* @note Moving a Stream object to another invalidates the former |
||||
*/ |
||||
class Stream { |
||||
public: |
||||
Stream() : stream(std::make_shared<UniqueStream>()) { } |
||||
Stream(const Stream&) = default; |
||||
Stream(Stream&&) = default; |
||||
|
||||
/** if \p create is `true`, a new stream will be created instead of the otherwise default stream */ |
||||
Stream(bool create) : stream(std::make_shared<UniqueStream>(create)) { } |
||||
|
||||
Stream& operator=(const Stream&) = default; |
||||
Stream& operator=(Stream&&) = default; |
||||
|
||||
/** blocks the caller thread until all operations in the stream are complete */ |
||||
void synchronize() const { stream->synchronize(); } |
||||
|
||||
/** returns true if there are operations pending in the stream */ |
||||
bool busy() const { return stream->busy(); } |
||||
|
||||
/** returns true if the stream is valid */ |
||||
explicit operator bool() const noexcept { return static_cast<bool>(stream); } |
||||
|
||||
cudaStream_t get() const noexcept { |
||||
CV_Assert(stream); |
||||
return stream->get(); |
||||
} |
||||
|
||||
private: |
||||
std::shared_ptr<UniqueStream> stream; |
||||
}; |
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::csl */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_STREAM_HPP */ |
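A brief usage sketch (not part of the patch): Stream objects are cheap to copy and share ownership of the underlying handle, while UniqueStream stays an implementation detail. The function name is an assumption.

#include "stream.hpp"

void stream_sketch() {
    using namespace cv::dnn::cuda4dnn::csl;

    Stream default_stream;      /* wraps the default stream */
    Stream worker(true);        /* creates a dedicated non-blocking stream */

    Stream alias = worker;      /* shared ownership: same cudaStream_t handle */
    CV_Assert(alias.get() == worker.get());

    if (worker.busy())          /* poll for pending work ... */
        worker.synchronize();   /* ... or block until the stream drains */
}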
File diff suppressed because it is too large
@@ -0,0 +1,384 @@
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP |
||||
|
||||
#include "stream.hpp" |
||||
#include "tensor.hpp" |
||||
#include "pointer.hpp" |
||||
#include "cublas.hpp" |
||||
#include "cudnn.hpp" |
||||
#include "workspace.hpp" |
||||
|
||||
#include "cudnn/convolution.hpp" |
||||
#include "cudnn/pooling.hpp" |
||||
#include "cudnn/lrn.hpp" |
||||
#include "cudnn/softmax.hpp" |
||||
#include "cudnn/transform.hpp" |
||||
#include "cudnn/transpose_convolution.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <cstddef> |
||||
#include <array> |
||||
#include <vector> |
||||
#include <algorithm> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { |
||||
|
||||
namespace tensor_ops { |
||||
|
||||
/** @brief copies data between tensors
|
||||
* |
||||
* Pre-conditions: |
||||
* - \p dest and \p src must have the same shape |
||||
* |
||||
* Exception Guarantee: Basic
||||
*/ |
||||
template <class T> inline |
||||
void copy(const Stream& stream, TensorSpan<T> dest, TensorView<T> src) { |
||||
CV_Assert(is_shape_same(dest, src)); |
||||
if (dest.get() != src.get()) |
||||
memcpy(dest.get(), src.get(), dest.size(), stream); |
||||
} |
||||
|
||||
/** @brief performs generalized matrix-multiplication
|
||||
* |
||||
* Pre-conditions: |
||||
* - \p A and \p B must meet the mathematical requirements for matrix multiplication |
||||
* - \p result must be large enough to hold the result |
||||
* |
||||
* Exception Guarantee: Basic
||||
*/ |
||||
template <class T> inline |
||||
void gemm(const cublas::Handle& handle, T beta, TensorSpan<T> result, T alpha, bool transa, TensorView<T> A, bool transb, TensorView<T> B) { |
||||
/* matrix operations can be performed only on rank two or less tensors */ |
||||
CV_Assert(get_effective_rank(A) <= 2 && |
||||
get_effective_rank(B) <= 2 && |
||||
get_effective_rank(result) <= 2); |
||||
|
||||
/* check dimension requirements for matrix multiplication */ |
||||
if (!transa && !transb) { |
||||
CV_Assert(A.get_axis_size(-2) == result.get_axis_size(-2)); |
||||
CV_Assert(A.get_axis_size(-1) == B.get_axis_size(-2)); |
||||
CV_Assert(B.get_axis_size(-1) == result.get_axis_size(-1)); |
||||
} else if (!transa && transb) { |
||||
CV_Assert(A.get_axis_size(-2) == result.get_axis_size(-2)); |
||||
CV_Assert(A.get_axis_size(-1) == B.get_axis_size(-1)); |
||||
CV_Assert(B.get_axis_size(-2) == result.get_axis_size(-1)); |
||||
} else if (transa && !transb) { |
||||
CV_Assert(A.get_axis_size(-1) == result.get_axis_size(-2)); |
||||
CV_Assert(A.get_axis_size(-2) == B.get_axis_size(-2)); |
||||
CV_Assert(B.get_axis_size(-1) == result.get_axis_size(-1)); |
||||
} else { |
||||
CV_Assert(A.get_axis_size(-1) == result.get_axis_size(-2)); |
||||
CV_Assert(A.get_axis_size(-2) == B.get_axis_size(-1)); |
||||
CV_Assert(B.get_axis_size(-2) == result.get_axis_size(-1)); |
||||
} |
||||
|
||||
const auto result_nr = result.get_axis_size(-2); |
||||
const auto result_nc = result.get_axis_size(-1); |
||||
const auto common_dim = A.get_axis_size(transa ? -2 : -1); |
||||
const auto A_nc = A.get_axis_size(-1); |
||||
const auto B_nc = B.get_axis_size(-1); |
||||
|
||||
/* tensors are stored in row-major but cublas::gemm operates on column-major matrices
|
||||
* a row-major matrix when read as column-major matrix gives the transpose of the intended matrix |
||||
* |
||||
* Required: C = AB |
||||
* what cuBLAS sees: C^T = A^TB^T = (BA)^T |
||||
* |
||||
* By reversing operands, we effectively perform: |
||||
* C^T = B^TA^T = (AB)^T |
||||
* |
||||
* which gives C = AB |
||||
*/ |
||||
cublas::gemm<T>(handle, |
||||
transb, transa, |
||||
result_nc, result_nr, common_dim, |
||||
alpha, B.get(), B_nc, |
||||
A.get(), A_nc, |
||||
beta, result.get(), result_nc); |
||||
} |
||||
|
||||
/** @brief applies softmax (or log softmax) along the channel axis
 *
 * Pre-conditions:
 * - \p output and \p input must have the same shape
 *
 * Exception Guarantee: Basic
 */
||||
template <class T> inline |
||||
void softmax(const cudnn::Handle& handle, TensorSpan<T> output, TensorView<T> input, int channel_axis, bool log) { |
||||
CV_Assert(is_shape_same(output, input)); |
||||
|
||||
channel_axis = clamp_axis(channel_axis, input.rank()); |
||||
|
||||
std::size_t outer_size = input.size_range(0, channel_axis); |
||||
auto channel_size = input.get_axis_size(channel_axis); |
||||
std::size_t inner_size = input.size_range(channel_axis + 1, input.rank()); |
||||
|
||||
std::array<std::size_t, 4> shape = { outer_size, channel_size, 1, inner_size }; |
||||
|
||||
using cudnn::TensorDescriptor; |
||||
auto inputDesc = TensorDescriptor<T>(shape); |
||||
auto outputDesc = TensorDescriptor<T>(shape); |
||||
cudnn::softmax(handle, outputDesc, output.get(), inputDesc, input.get(), log); |
||||
} |
||||
} |
||||
|
||||
template <class T> |
||||
class Convolution { |
||||
using TensorDescriptor = cudnn::TensorDescriptor<T>; |
||||
using FilterDescriptor = cudnn::FilterDescriptor<T>; |
||||
using ConvolutionDescriptor = cudnn::ConvolutionDescriptor<T>; |
||||
using ConvolutionAlgorithm = cudnn::ConvolutionAlgorithm<T>; |
||||
|
||||
public: |
||||
struct params_type { |
||||
std::vector<std::size_t> input_shape; |
||||
std::vector<std::size_t> filter_shape; |
||||
|
||||
std::vector<std::size_t> padding; |
||||
std::vector<std::size_t> stride; |
||||
std::vector<std::size_t> dilation; |
||||
|
||||
std::size_t groups; |
||||
}; |
||||
|
||||
Convolution() = default; |
||||
Convolution(const Convolution&) = delete; |
||||
Convolution(Convolution&&) = default; |
||||
Convolution(cudnn::Handle handle, const params_type& params) { |
||||
cudnnHandle = std::move(handle); |
||||
|
||||
inputTensorDesc = TensorDescriptor(params.input_shape); |
||||
filterDesc = FilterDescriptor(params.filter_shape); |
||||
convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dilation, params.groups); |
||||
|
||||
std::vector<int> output_dims; |
||||
getConvolutionForwardOutputDim(convDesc, filterDesc, inputTensorDesc, output_dims); |
||||
outputTensorDesc = TensorDescriptor(output_dims); |
||||
|
||||
algo = ConvolutionAlgorithm(cudnnHandle, convDesc, filterDesc, inputTensorDesc, outputTensorDesc); |
||||
} |
||||
|
||||
Convolution& operator=(const Convolution&) = delete; |
||||
Convolution& operator=(Convolution&&) = default; |
||||
|
||||
std::size_t get_workspace_size() const noexcept { |
||||
return algo.get_workspace_size(); |
||||
} |
||||
|
||||
void convolve(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, WorkspaceInstance scratchpad) { |
||||
cudnn::convolve<T>( |
||||
cudnnHandle, |
||||
convDesc, algo, scratchpad, |
||||
filterDesc, filters.get(), |
||||
inputTensorDesc, input.get(), |
||||
1.0, 0.0, outputTensorDesc, output.get() |
||||
); |
||||
} |
||||
|
||||
private: |
||||
cudnn::Handle cudnnHandle; |
||||
TensorDescriptor inputTensorDesc, outputTensorDesc; |
||||
FilterDescriptor filterDesc; |
||||
ConvolutionDescriptor convDesc; |
||||
ConvolutionAlgorithm algo; |
||||
}; |
||||
|
||||
template <class T> |
||||
class TransposeConvolution { |
||||
using TensorDescriptor = cudnn::TensorDescriptor<T>; |
||||
using FilterDescriptor = cudnn::FilterDescriptor<T>; |
||||
using ConvolutionDescriptor = cudnn::ConvolutionDescriptor<T>; |
||||
using TransposeConvolutionAlgorithm = cudnn::TransposeConvolutionAlgorithm<T>; |
||||
|
||||
public: |
||||
struct params_type { |
||||
std::vector<std::size_t> input_shape; |
||||
std::vector<std::size_t> output_shape; |
||||
|
||||
std::vector<std::size_t> filter_shape; |
||||
|
||||
std::vector<std::size_t> padding; |
||||
std::vector<std::size_t> stride; |
||||
std::vector<std::size_t> dilation; |
||||
|
||||
std::size_t groups; |
||||
}; |
||||
|
||||
TransposeConvolution() = default; |
||||
TransposeConvolution(const TransposeConvolution&) = delete; |
||||
TransposeConvolution(TransposeConvolution&&) = default; |
||||
TransposeConvolution(cudnn::Handle handle, const params_type& params) { |
||||
cudnnHandle = std::move(handle); |
||||
|
||||
filterDesc = FilterDescriptor(params.filter_shape); |
||||
convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dilation, params.groups); |
||||
|
||||
/* input_shape is the output shape for convolution
|
||||
* output_shape is the input shape for convolution |
||||
*/ |
||||
convInputTensorDesc = TensorDescriptor(params.output_shape); |
||||
|
||||
std::vector<int> conv_output_dims; |
||||
getConvolutionForwardOutputDim(convDesc, filterDesc, convInputTensorDesc, conv_output_dims); |
||||
|
||||
/* the convolution output must be identical to what cuDNN expects */ |
||||
CV_Assert(std::equal(std::begin(conv_output_dims), std::end(conv_output_dims), std::begin(params.input_shape))); |
||||
|
||||
convOutputTensorDesc = TensorDescriptor(params.input_shape); |
||||
|
||||
algo = TransposeConvolutionAlgorithm(cudnnHandle, convDesc, filterDesc, convOutputTensorDesc, convInputTensorDesc); |
||||
} |
||||
|
||||
TransposeConvolution& operator=(const TransposeConvolution&) = delete; |
||||
TransposeConvolution& operator=(TransposeConvolution&&) = default; |
||||
|
||||
std::size_t get_workspace_size() const noexcept { |
||||
return algo.get_workspace_size(); |
||||
} |
||||
|
||||
void transpose_convolve(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, WorkspaceInstance scratchpad) { |
||||
cudnn::transpose_convolve<T>( |
||||
cudnnHandle, |
||||
convDesc, algo, scratchpad, |
||||
filterDesc, filters.get(), |
||||
convOutputTensorDesc, input.get(), |
||||
1.0, 0.0, convInputTensorDesc, output.get() |
||||
); |
||||
} |
||||
|
||||
private: |
||||
cudnn::Handle cudnnHandle; |
||||
TensorDescriptor convInputTensorDesc, convOutputTensorDesc; |
||||
FilterDescriptor filterDesc; |
||||
ConvolutionDescriptor convDesc; |
||||
TransposeConvolutionAlgorithm algo; |
||||
}; |
||||
|
||||
template <class T> |
||||
class Pooling { |
||||
using TensorDescriptor = cudnn::TensorDescriptor<T>; |
||||
using PoolingDescriptor = cudnn::PoolingDescriptor; |
||||
|
||||
public: |
||||
using PoolingType = PoolingDescriptor::PoolingType; |
||||
|
||||
struct params_type { |
||||
std::vector<std::size_t> input_shape; |
||||
std::vector<std::size_t> output_shape; |
||||
|
||||
std::vector<std::size_t> window_size; |
||||
std::vector<std::size_t> padding; |
||||
std::vector<std::size_t> stride; |
||||
|
||||
PoolingType type; |
||||
}; |
||||
|
||||
Pooling() = default; |
||||
Pooling(const Pooling&) = delete; |
||||
Pooling(Pooling&&) = default; |
||||
Pooling(cudnn::Handle handle, const params_type& params) { |
||||
cudnnHandle = std::move(handle); |
||||
|
||||
inputTensorDesc = TensorDescriptor(params.input_shape); |
||||
poolingDesc = PoolingDescriptor(params.window_size, params.padding, params.stride, params.type); |
||||
|
||||
//std::vector<int> output_dim;
|
||||
//getPoolingForwardOutputDim(poolingDesc, inputTensorDesc, output_dim);
|
||||
outputTensorDesc = TensorDescriptor(params.output_shape); |
||||
} |
||||
|
||||
Pooling& operator=(const Pooling&) = delete; |
||||
Pooling& operator=(Pooling&&) = default; |
||||
|
||||
void pool(TensorView<T> input, TensorSpan<T> output) { |
||||
cudnn::pool<T>( |
||||
cudnnHandle, |
||||
poolingDesc, |
||||
inputTensorDesc, input.get(), |
||||
1.0, 0.0, outputTensorDesc, output.get() |
||||
); |
||||
} |
||||
|
||||
private: |
||||
cudnn::Handle cudnnHandle; |
||||
TensorDescriptor inputTensorDesc, outputTensorDesc; |
||||
PoolingDescriptor poolingDesc; |
||||
}; |
||||
|
||||
template <class T> |
||||
class LRN { |
||||
using LRNDescriptor = cudnn::LRNDescriptor; |
||||
using TensorDescriptor = cudnn::TensorDescriptor<T>; |
||||
|
||||
public: |
||||
using LRNType = LRNDescriptor::LRNType; |
||||
|
||||
LRN() = default; |
||||
LRN(const LRN&) = delete; |
||||
LRN(LRN&&) = default; |
||||
LRN(cudnn::Handle handle, std::size_t local_size, T alpha, T beta, T k, LRNType type) { |
||||
cudnnHandle = std::move(handle); |
||||
lrnDesc = LRNDescriptor(local_size, alpha, beta, k, type); |
||||
} |
||||
|
||||
LRN& operator=(const LRN&) = delete; |
||||
LRN& operator=(LRN&&) = default; |
||||
|
||||
void normalize(TensorView<T> input, TensorSpan<T> output, WorkspaceInstance workspace) { |
||||
cudnn::LRNForward<T>( |
||||
cudnnHandle, |
||||
lrnDesc, |
||||
TensorDescriptor(input.shape_as_vector()), input.get(), |
||||
1.0, 0.0, TensorDescriptor(output.shape_as_vector()), output.get(), |
||||
workspace |
||||
); |
||||
} |
||||
|
||||
private: |
||||
cudnn::Handle cudnnHandle; |
||||
LRNDescriptor lrnDesc; |
||||
}; |
||||
|
||||
template <class T> |
||||
class TensorTransform { |
||||
using TensorTransformDescriptor = cudnn::TensorTransformDescriptor; |
||||
using TensorDescriptor = cudnn::TensorDescriptor<T>; |
||||
|
||||
public: |
||||
TensorTransform() = default; |
||||
TensorTransform(const TensorTransform&) = delete; |
||||
TensorTransform(TensorTransform&&) = default; |
||||
|
||||
template <class SequenceContainer> |
||||
TensorTransform(cudnn::Handle handle, const SequenceContainer& paddingLeft, const SequenceContainer& paddingRight) { |
||||
cudnnHandle = std::move(handle); |
||||
transDesc = TensorTransformDescriptor(paddingLeft, paddingRight); |
||||
} |
||||
|
||||
TensorTransform& operator=(const TensorTransform&) = delete; |
||||
TensorTransform& operator=(TensorTransform&&) = default; |
||||
|
||||
void transform(TensorView<T> input, TensorSpan<T> output) { |
||||
cudnn::transform<T>( |
||||
cudnnHandle, |
||||
transDesc, |
||||
TensorDescriptor(input.shape_as_vector()), input.get(), |
||||
TensorDescriptor(output.shape_as_vector()), output.get() |
||||
); |
||||
} |
||||
|
||||
private: |
||||
cudnn::Handle cudnnHandle; |
||||
TensorTransformDescriptor transDesc; |
||||
}; |
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::csl */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP */ |
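As an illustration of how the cuDNN wrappers above are meant to be driven (a sketch under assumed NCHW shapes, not part of the patch; the function name is hypothetical and the handle is assumed to come from the backend context), a convolution layer would build its descriptors once and only ask the shared workspace to grow:

#include "tensor_ops.hpp"
#include "workspace.hpp"

void configure_convolution_sketch(cv::dnn::cuda4dnn::csl::cudnn::Handle handle,
                                  cv::dnn::cuda4dnn::csl::Workspace& scratch) {
    using namespace cv::dnn::cuda4dnn::csl;

    Convolution<float>::params_type params;
    params.input_shape  = { 1, 3, 224, 224 };  /* N, C, H, W (illustrative) */
    params.filter_shape = { 64, 3, 7, 7 };     /* K, C, kH, kW */
    params.padding      = { 3, 3 };
    params.stride       = { 2, 2 };
    params.dilation     = { 1, 1 };
    params.groups       = 1;

    Convolution<float> conv(handle, params);   /* picks a cuDNN algorithm up front */

    /* the workspace keeps a single block large enough for the most demanding consumer */
    scratch.require(conv.get_workspace_size());
}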
@@ -0,0 +1,166 @@
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_WORKSPACE_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_CSL_WORKSPACE_HPP |
||||
|
||||
#include "pointer.hpp" |
||||
#include "span.hpp" |
||||
#include "tensor.hpp" |
||||
|
||||
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <numeric>
#include <functional>
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { |
||||
|
||||
/** @brief maintains a single block of reusable device memory
|
||||
* |
||||
* Each Workspace object is intended to be used by a single entity at a time but by |
||||
* different entities at different times. It maintains a single reusable block of memory which |
||||
* is sufficient for the largest consumer. |
||||
*/ |
||||
class Workspace { |
||||
public: |
||||
|
||||
/** @brief reserve \p bytes of memory */ |
||||
void require(std::size_t bytes) { |
||||
if (bytes > ptr.size()) |
||||
ptr.reset(bytes); |
||||
} |
||||
|
||||
/** @brief number of bytes reserved by the largest consumer */ |
||||
std::size_t size() const noexcept { |
||||
return ptr.size(); |
||||
} |
||||
|
||||
/** @brief returns the pointer to the workspace memory */ |
||||
DevicePtr<unsigned char> get() { |
||||
return ptr.get(); |
||||
} |
||||
|
||||
private: |
||||
ManagedPtr<unsigned char> ptr; |
||||
}; |
||||
|
||||
/** used to compute total workspace size from several workspace requests */ |
||||
class WorkspaceBuilder { |
||||
public: |
||||
WorkspaceBuilder() noexcept : max_size_in_bytes{ 0 } { } |
||||
|
||||
/** request memory for \p count number of elements of the type \tparam T */ |
||||
template <class T = std::int8_t> |
||||
void require(std::size_t count) noexcept { |
||||
auto blocks256 = (count * sizeof(T) + 255) / 256; |
||||
max_size_in_bytes += blocks256 * 256; |
||||
} |
||||
|
||||
/** returns the total workspace memory that is required */ |
||||
std::size_t required_workspace_size() const noexcept { return max_size_in_bytes; } |
||||
|
||||
private: |
||||
std::size_t max_size_in_bytes; |
||||
}; |
||||
|
||||
/** general memory block from a workspace which can be passed on to the requester */ |
||||
class WorkspaceInstance { |
||||
public: |
||||
|
||||
/** returns a device pointer to the workspace memory */ |
||||
template <class T = void> |
||||
DevicePtr<T> get() const noexcept { |
||||
return static_cast<DevicePtr<T>>(ptr); |
||||
} |
||||
|
||||
/** returns the size of the workspace memory in bytes */
||||
std::size_t size_in_bytes() const noexcept { |
||||
return size_in_bytes_; |
||||
} |
||||
|
||||
/** creates a Span<T> of \p count elements from the workspace memory */ |
||||
template <class T> |
||||
Span<T> get_span(std::size_t count = 0) const { |
||||
if (count == 0) |
||||
count = size_in_bytes_ / sizeof(T); |
||||
|
||||
if (count * sizeof(T) > size_in_bytes_) |
||||
CV_Error(Error::StsNoMem, "memory not sufficient"); |
||||
|
||||
return Span<T>(static_cast<DevicePtr<T>>(ptr), count); |
||||
} |
||||
|
||||
/** creates a TensorSpan<T> of the given shape from the workspace memory */ |
||||
template <class T, class ForwardItr> |
||||
TensorSpan<T> get_tensor_span(ForwardItr shape_begin, ForwardItr shape_end) const { |
||||
using ItrValueType = typename std::iterator_traits<ForwardItr>::value_type; |
||||
auto required_size = std::accumulate(shape_begin, shape_end, 1, std::multiplies<ItrValueType>()); |
||||
if (required_size * sizeof(T) > size_in_bytes_) |
||||
CV_Error(Error::StsNoMem, "memory not sufficient"); |
||||
return TensorSpan<T>(static_cast<DevicePtr<T>>(ptr), shape_begin, shape_end); |
||||
} |
||||
|
||||
private: |
||||
DevicePtr<void> ptr; |
||||
std::size_t size_in_bytes_; |
||||
|
||||
friend class WorkspaceAllocator; |
||||
WorkspaceInstance(DevicePtr<void> ptr_, std::size_t size_in_bytes__) |
||||
: ptr{ ptr_ }, size_in_bytes_{ size_in_bytes__ } { } |
||||
}; |
||||
|
||||
/** used to split a single workspace into constituents */ |
||||
class WorkspaceAllocator { |
||||
public: |
||||
WorkspaceAllocator() = default; |
||||
WorkspaceAllocator(Workspace& workspace) noexcept |
||||
: current{ workspace.get() }, bytes_remaining { workspace.size() } |
||||
{ |
||||
CV_Assert(is_aligned<void>(current, 256)); |
||||
CV_Assert(bytes_remaining % 256 == 0); |
||||
} |
||||
|
||||
/** allocates a Span<T> of \p count elements from the workspace memory */ |
||||
template <class T> |
||||
Span<T> get_span(std::size_t count = 0) { |
||||
return accquire<T>(count); |
||||
} |
||||
|
||||
/** allocates a TensorSpan<T> of the given shape from the workspace memory */ |
||||
template <class T, class ForwardItr> |
||||
TensorSpan<T> get_tensor_span(ForwardItr start, ForwardItr end) { |
||||
using ItrValueType = typename std::iterator_traits<ForwardItr>::value_type; |
||||
auto required_size = std::accumulate(start, end, 1, std::multiplies<ItrValueType>()); |
||||
return TensorSpan<T>(accquire<T>(required_size).data(), start, end); |
||||
} |
||||
|
||||
/** allocates a WorkspaceInstance of size \p bytes from the workspace memory */ |
||||
WorkspaceInstance get_instance(std::size_t bytes = 0) { |
||||
auto span = accquire(bytes); |
||||
return WorkspaceInstance(DevicePtr<void>(span.data()), span.size()); |
||||
} |
||||
|
||||
private: |
||||
template <class T = std::int8_t> |
||||
Span<T> accquire(std::size_t count = 0) { |
||||
auto ptr = current; |
||||
|
||||
if (count == 0) |
||||
count = bytes_remaining / sizeof(T); |
||||
|
||||
auto blocks256 = (count * sizeof(T) + 255) / 256; |
||||
if (bytes_remaining < blocks256 * 256) |
||||
CV_Error(Error::StsNoMem, "out of workspace memory"); |
||||
|
||||
bytes_remaining -= blocks256 * 256; |
||||
current = static_cast<DevicePtr<std::int8_t>>(current) + blocks256 * 256; |
||||
return Span<T>(static_cast<DevicePtr<T>>(ptr), count); |
||||
} |
||||
|
||||
DevicePtr<void> current; |
||||
std::size_t bytes_remaining; |
||||
}; |
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::csl */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_WORKSPACE_HPP */ |
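A usage sketch (not part of the patch) of the intended two-phase protocol: during initialization, each consumer registers its requirement through WorkspaceBuilder; during the forward pass, WorkspaceAllocator slices the single shared block in the same order. The function name and sizes are illustrative.

#include "workspace.hpp"

void workspace_sketch(cv::dnn::cuda4dnn::csl::Workspace& shared) {
    using namespace cv::dnn::cuda4dnn::csl;

    /* phase 1: tally the requirements (each request is rounded up to a 256-byte block) */
    WorkspaceBuilder builder;
    builder.require<float>(1024);   /* e.g. a scratch buffer of 1024 floats */
    builder.require(4096);          /* raw bytes (element type defaults to std::int8_t) */
    shared.require(builder.required_workspace_size());

    /* phase 2: carve the block up among the consumers */
    WorkspaceAllocator allocator(shared);
    auto scratch_floats = allocator.get_span<float>(1024);
    WorkspaceInstance cudnn_scratch = allocator.get_instance(4096);

    (void)scratch_floats; (void)cudnn_scratch;
}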
@@ -0,0 +1,31 @@
||||
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_IS_ITERATOR_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_IS_ITERATOR_HPP

#include <iterator>
#include <type_traits>

namespace cv { namespace dnn { namespace cuda4dnn { namespace cxx_utils {

    namespace detail {
        template <class T, class Tag, class = void>
        struct is_iterator_helper : std::false_type {};

        template <class T, class Tag>
        struct is_iterator_helper<T, Tag,
            typename std::enable_if<std::is_base_of<Tag, typename std::iterator_traits<T>::iterator_category>::value, void>::type
        > : std::true_type {};
    }

    template <class T>
    using is_iterator = typename detail::is_iterator_helper<T, std::input_iterator_tag>;

    template <class T>
    using is_forward_iterator = typename detail::is_iterator_helper<T, std::forward_iterator_tag>;

}}}} /* namespace cv::dnn::cuda4dnn::cxx_utils */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_IS_ITERATOR_HPP */
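These traits exist so that the shape-taking templates in the tensor code can reject non-iterator arguments at compile time; a minimal illustration (not part of the patch):

#include <vector>
#include <list>
#include "is_iterator.hpp"

using cv::dnn::cuda4dnn::cxx_utils::is_iterator;
using cv::dnn::cuda4dnn::cxx_utils::is_forward_iterator;

static_assert(is_forward_iterator<std::vector<int>::const_iterator>::value,
              "shape ranges must be traversable more than once");
static_assert(is_iterator<std::list<int>::iterator>::value,
              "any input iterator is acceptable for single-pass use");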
@@ -0,0 +1,110 @@
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_RESIZABLE_STATIC_ARRAY_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_RESIZABLE_STATIC_ARRAY_HPP |
||||
|
||||
#include <cstddef> |
||||
#include <array> |
||||
#include <cassert> |
||||
#include <algorithm> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace cxx_utils { |
||||
|
||||
template <class T, std::size_t maxN> |
||||
class resizable_static_array { |
||||
using container_type = std::array<T, maxN>; |
||||
|
||||
public: |
||||
using value_type = typename container_type::value_type; |
||||
using size_type = typename container_type::size_type; |
||||
using difference_type = typename container_type::difference_type; |
||||
using reference = typename container_type::reference; |
||||
using const_reference = typename container_type::const_reference; |
||||
using pointer = typename container_type::pointer; |
||||
using const_pointer = typename container_type::const_pointer; |
||||
using iterator = typename container_type::iterator; |
||||
using const_iterator = typename container_type::const_iterator; |
||||
using reverse_iterator = typename container_type::reverse_iterator; |
||||
using const_reverse_iterator = typename container_type::const_reverse_iterator; |
||||
|
||||
resizable_static_array() noexcept : size_{ 0 } { } |
||||
explicit resizable_static_array(size_type sz) noexcept : size_{ sz } { } |
||||
|
||||
bool empty() const noexcept { return size_ == 0; }
||||
size_type size() const noexcept { return size_; } |
||||
size_type capacity() const noexcept { return maxN; } |
||||
|
||||
void resize(size_type sz) noexcept { |
||||
assert(sz <= capacity()); |
||||
size_ = sz; |
||||
} |
||||
|
||||
void clear() noexcept { size_ = 0; } |
||||
|
||||
template <class ForwardItr> |
||||
void assign(ForwardItr first, ForwardItr last) { |
||||
resize(std::distance(first, last)); |
||||
std::copy(first, last, begin()); |
||||
} |
||||
|
||||
iterator begin() noexcept { return std::begin(arr); } |
||||
iterator end() noexcept { return std::begin(arr) + size(); } |
||||
|
||||
const_iterator begin() const noexcept { return arr.cbegin(); } |
||||
const_iterator end() const noexcept { return arr.cbegin() + size(); } |
||||
|
||||
const_iterator cbegin() const noexcept { return arr.cbegin(); } |
||||
const_iterator cend() const noexcept { return arr.cbegin() + size(); } |
||||
|
||||
reverse_iterator rbegin() noexcept { return reverse_iterator(end()); }
reverse_iterator rend() noexcept { return reverse_iterator(begin()); }

const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(end()); }
const_reverse_iterator rend() const noexcept { return const_reverse_iterator(begin()); }

const_reverse_iterator crbegin() const noexcept { return const_reverse_iterator(cend()); }
const_reverse_iterator crend() const noexcept { return const_reverse_iterator(cbegin()); }
||||
|
||||
reference operator[](size_type pos) { |
||||
assert(pos < size()); |
||||
return arr[pos]; |
||||
} |
||||
|
||||
const_reference operator[](size_type pos) const { |
||||
assert(pos < size()); |
||||
return arr[pos]; |
||||
} |
||||
|
||||
iterator insert(iterator pos, const T& value) { |
||||
resize(size() + 1); |
||||
std::move_backward(pos, end() - 1, end()); |
||||
*pos = value; |
||||
return pos; |
||||
} |
||||
|
||||
iterator insert(iterator pos, T&& value) { |
||||
resize(size() + 1); |
||||
std::move_backward(pos, end() - 1, end()); |
||||
*pos = std::move(value); |
||||
return pos; |
||||
} |
||||
|
||||
iterator erase(iterator pos) { |
||||
std::move(pos + 1, end(), pos); |
||||
resize(size() - 1); |
||||
return pos; |
||||
} |
||||
|
||||
pointer data() noexcept { return arr.data(); } |
||||
const_pointer data() const noexcept { return arr.data(); } |
||||
|
||||
private: |
||||
std::size_t size_; |
||||
container_type arr; |
||||
}; |
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::cxx_utils */
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_RESIZABLE_STATIC_ARRAY_HPP */ |
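A small sketch (not part of the patch) of the container in action; it behaves like a bounded, allocation-free std::vector, which is why the tensor classes use it to hold shapes whose rank is capped at compile time. The function name is illustrative.

#include <cstddef>
#include <iterator>
#include "resizable_static_array.hpp"

void shape_sketch() {
    using cv::dnn::cuda4dnn::cxx_utils::resizable_static_array;

    resizable_static_array<std::size_t, 6> shape;    /* capacity() == 6, size() == 0 */

    const std::size_t dims[] = { 3, 224, 224 };
    shape.assign(std::begin(dims), std::end(dims));  /* {3, 224, 224}, no heap allocation */

    shape.insert(shape.begin(), 1);                  /* prepend a batch axis: {1, 3, 224, 224} */
    shape.erase(shape.end() - 1);                    /* drop the last axis: {1, 3, 224} */
}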
@@ -0,0 +1,44 @@
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ACTIVATIONS_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ACTIVATIONS_HPP |
||||
|
||||
#include "../csl/stream.hpp" |
||||
#include "../csl/span.hpp" |
||||
|
||||
#include <cstddef> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { |
||||
|
||||
template <class T> |
||||
void abs(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input); |
||||
|
||||
template <class T> |
||||
void tanh(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input); |
||||
|
||||
template <class T> |
||||
void sigmoid(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input); |
||||
|
||||
template <class T> |
||||
void bnll(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input); |
||||
|
||||
template <class T> |
||||
void elu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input); |
||||
|
||||
template <class T> |
||||
void relu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T slope); |
||||
|
||||
template <class T> |
||||
void clipped_relu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T floor, T ceiling); |
||||
|
||||
template <class T> |
||||
void axiswise_relu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, std::size_t inner_size, csl::View<T> slope); |
||||
|
||||
template <class T> |
||||
void power(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T exp, T scale, T shift); |
||||
|
||||
}}}} /* namespace cv::dnn::cuda4dnn::kernels */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ACTIVATIONS_HPP */ |
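These are host-side launcher declarations; their definitions live in the corresponding .cu files. As a rough sketch of the element-wise pattern behind them (raw pointers instead of csl::Span/View, a plain grid-stride loop, and an arbitrary launch configuration; this is not the PR's actual vectorized kernel code):

#include <algorithm>
#include <cstddef>
#include <cuda_runtime.h>

/* grid-stride loop: every thread strides over the buffer, so a capped grid still covers large tensors */
template <class T>
__global__ void relu_kernel(T* output, const T* input, std::size_t n, T slope) {
    for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
        output[i] = input[i] >= T(0) ? input[i] : slope * input[i];
}

template <class T>
void relu_launch(cudaStream_t stream, T* output, const T* input, std::size_t n, T slope) {
    constexpr int block = 256;                                                       /* arbitrary example */
    const int grid = static_cast<int>(std::min<std::size_t>(1024, (n + block - 1) / block));
    relu_kernel<T><<<grid, block, 0, stream>>>(output, input, n, slope);
}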
@@ -0,0 +1,27 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CONCAT_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CONCAT_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include <cstddef>
#include <vector>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void concat(
    const csl::Stream& stream,
    csl::TensorSpan<T> output, std::size_t output_axis_offset,
    csl::TensorView<T> input, std::size_t axis);

template <class T>
void concat_with_offsets(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, std::vector<std::size_t> axis_offsets);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CONCAT_HPP */
@@ -0,0 +1,29 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_OPS_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_OPS_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void eltwise_max_2(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y);

template <class T>
void eltwise_sum_2(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y);

template <class T>
void eltwise_sum_coeff_2(const csl::Stream& stream, csl::Span<T> output, T coeff_x, csl::View<T> x, T coeff_y, csl::View<T> y);

template <class T>
void eltwise_prod_2(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_OPS_HPP */
@@ -0,0 +1,18 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void fill(const csl::Stream& stream, csl::Span<T> output, T value);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_HPP */
@@ -0,0 +1,32 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MAX_UNPOOLING_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MAX_UNPOOLING_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include <cstddef>
#include <vector>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void max_pooling_with_indices(
    const csl::Stream& stream,
    csl::TensorSpan<T> output, csl::TensorSpan<T> indices, csl::TensorView<T> input,
    const std::vector<std::size_t>& kernel_size, const std::vector<std::size_t>& strides,
    const std::vector<std::size_t>& padding_left);

template <class T>
void max_unpooling(
    const csl::Stream& stream,
    csl::TensorSpan<T> output, csl::TensorView<T> input, csl::TensorView<T> indices,
    const std::vector<std::size_t>& window_size, const std::vector<std::size_t>& strides,
    const std::vector<std::size_t>& padding_left);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MAX_UNPOOLING_HPP */
@@ -0,0 +1,24 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_NORMALIZE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_NORMALIZE_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void normalize(
    const csl::Stream& stream,
    csl::Span<T> output, csl::View<T> input,
    std::size_t outer_size, std::size_t mid_size, std::size_t inner_size, std::size_t norm, T epsilon,
    csl::Span<T> workspace);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_NORMALIZE_HPP */
@@ -0,0 +1,25 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PADDING_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PADDING_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include <cstddef>
#include <vector>
#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void copy_with_reflection101(
    const csl::Stream& stream,
    csl::TensorSpan<T> output, csl::TensorView<T> input,
    std::vector<std::pair<std::size_t, std::size_t>> ranges);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PADDING_HPP */
@@ -0,0 +1,21 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PERMUTE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PERMUTE_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include <cstddef>
#include <vector>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void permute(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, std::vector<std::size_t> order);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PERMUTE_HPP */
@@ -0,0 +1,29 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PRIOR_BOX_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PRIOR_BOX_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

#include <cstddef>
#include <vector>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void generate_prior_boxes(
    const csl::Stream& stream,
    csl::Span<T> output,
    csl::View<float> boxWidth, csl::View<float> boxHeight, csl::View<float> offsetX, csl::View<float> offsetY, float stepX, float stepY,
    std::vector<float> variance,
    std::size_t numPriors,
    std::size_t layerWidth, std::size_t layerHeight,
    std::size_t imageWidth, std::size_t imageHeight,
    bool normalize, bool clip);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PRIOR_BOX_HPP */
@@ -0,0 +1,32 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_REGION_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_REGION_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void sigmoid_strided(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, std::size_t n, std::size_t stride, std::size_t offset);

template <class T>
void softmax_strided(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, std::size_t n, std::size_t stride, std::size_t offset);

template <class T>
void region_finalize(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, csl::View<T> bias,
    T object_prob_cutoff, T class_prob_cutoff,
    std::size_t height_norm, std::size_t width_norm,
    std::size_t rows, std::size_t cols,
    std::size_t boxes_per_cell,
    std::size_t box_size,
    std::size_t classes);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_REGION_HPP */
@@ -0,0 +1,23 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_RESIZE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_RESIZE_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void resize_nn(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input);

template <class T>
void resize_bilinear(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, float scale_y, float scale_x);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_RESIZE_HPP */
@@ -0,0 +1,45 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SCALE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SCALE_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void bias1(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, T alpha);

template <class T>
void biasN(const csl::Stream& stream,
    csl::TensorSpan<T> output,
    csl::TensorView<T> input, std::size_t inner_size,
    csl::TensorView<T> bias);

template <class T>
void scale1(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, T alpha);

template <class T>
void scaleN(const csl::Stream& stream,
    csl::TensorSpan<T> output,
    csl::TensorView<T> input, std::size_t inner_size,
    csl::TensorView<T> weights);

template <class T>
void scale1_with_bias1(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T alpha, T beta);

template <class T>
void scaleN_with_biasN(
    const csl::Stream& stream,
    csl::TensorSpan<T> output,
    csl::TensorView<T> input, std::size_t inner_size,
    csl::TensorView<T> weights, csl::TensorView<T> bias);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SCALE_HPP */
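The `inner_size` parameter tells the channel-wise variants how the flattened tensor maps onto channels: for a row-major NCHW tensor, inner_size = H * W and element i belongs to channel (i / inner_size) % C. A plain C++ reference of what biasN computes under that interpretation (the function name and use of std::vector are hypothetical, for illustration only):

#include <cstddef>
#include <vector>

/* element-wise reference: output[i] = input[i] + bias[channel of i] */
void biasN_reference(std::vector<float>& output, const std::vector<float>& input,
                     std::size_t inner_size, const std::vector<float>& bias) {
    const std::size_t C = bias.size();
    for (std::size_t i = 0; i < input.size(); i++) {
        const std::size_t c = (i / inner_size) % C;   /* channel index of element i */
        output[i] = input[i] + bias[c];
    }
}
/* e.g. N = 2, C = 3, H = W = 4  =>  inner_size = 16: elements 0..15 use bias[0], 16..31 use bias[1], ... */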
@@ -0,0 +1,23 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SLICE_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SLICE_HPP

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include <cstddef>
#include <vector>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

template <class T>
void slice(const csl::Stream& stream,
    csl::TensorSpan<T> output, csl::TensorView<T> input,
    std::vector<std::size_t> offsets);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SLICE_HPP */
@ -0,0 +1,290 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ACTIVATION_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ACTIVATION_HPP |
||||
|
||||
#include "../../op_cuda.hpp" |
||||
|
||||
#include "../csl/stream.hpp" |
||||
#include "../csl/tensor.hpp" |
||||
|
||||
#include "../kernels/activations.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <utility> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { |
||||
|
||||
template <class T> |
||||
class ReLUOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
ReLUOp(csl::Stream stream_, T slope_) |
||||
: stream(std::move(stream_)), slope{ slope_ } { } |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
for (int i = 0; i < inputs.size(); i++) |
||||
{ |
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
kernels::relu<T>(stream, output, input, slope); |
||||
} |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
const T slope; |
||||
}; |
||||
|
||||
template <class T> |
||||
class ClippedReLUOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
ClippedReLUOp(csl::Stream stream_, T min_, T max_) |
||||
: stream(std::move(stream_)), min{ min_ }, max{ max_ } { } |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
for (int i = 0; i < inputs.size(); i++) |
||||
{ |
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
kernels::clipped_relu<T>(stream, output, input, min, max); |
||||
} |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
const T min, max; |
||||
}; |
||||
|
||||
template <class T> |
||||
class ChannelwiseReLUOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
ChannelwiseReLUOp(csl::Stream stream_, const Mat& slope) |
||||
: stream(std::move(stream_)) |
||||
{ |
||||
CV_Assert(!slope.empty()); |
||||
slopeTensor = csl::makeTensorHeader<T>(slope); |
||||
csl::copyMatToTensor<T>(slope, slopeTensor, stream); |
||||
} |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
for (int i = 0; i < inputs.size(); i++) |
||||
{ |
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
CV_Assert(input.get_axis_size(1) == slopeTensor.size()); |
||||
std::size_t inner_size = input.size_range(2, input.rank()); |
||||
kernels::axiswise_relu<T>(stream, output, input, inner_size, slopeTensor); |
||||
} |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
csl::Tensor<T> slopeTensor; |
||||
}; |
||||
|
||||
template <class T> |
||||
class TanHOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
TanHOp(csl::Stream stream_) : stream(std::move(stream_)) { } |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
for (int i = 0; i < inputs.size(); i++) |
||||
{ |
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
kernels::tanh<T>(stream, output, input); |
||||
} |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
}; |
||||
|
||||
template <class T> |
||||
class SigmoidOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
SigmoidOp(csl::Stream stream_) : stream(std::move(stream_)) { } |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
for (int i = 0; i < inputs.size(); i++) |
||||
{ |
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
kernels::sigmoid<T>(stream, output, input); |
||||
} |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
}; |
||||
|
||||
template <class T> |
||||
class ELUOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
ELUOp(csl::Stream stream_) : stream(std::move(stream_)) { } |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
for (int i = 0; i < inputs.size(); i++) |
||||
{ |
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
kernels::elu<T>(stream, output, input); |
||||
} |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
}; |
||||
|
||||
template <class T> |
||||
class AbsValOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
AbsValOp(csl::Stream stream_) : stream(std::move(stream_)) { } |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
for (int i = 0; i < inputs.size(); i++) |
||||
{ |
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
kernels::abs<T>(stream, output, input); |
||||
} |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
}; |
||||
|
||||
template <class T> |
||||
class BNLLOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
BNLLOp(csl::Stream stream_) : stream(std::move(stream_)) { } |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
for (int i = 0; i < inputs.size(); i++) |
||||
{ |
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
kernels::bnll<T>(stream, output, input); |
||||
} |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
}; |
||||
|
||||
template <class T> |
||||
class PowerOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
PowerOp(csl::Stream stream_, T exp_, T scale_, T shift_) |
||||
: stream(std::move(stream_)), exp{ exp_ }, scale{ scale_ }, shift{ shift_ } { } |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
for (int i = 0; i < inputs.size(); i++) |
||||
{ |
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
kernels::power<T>(stream, output, input, exp, scale, shift); |
||||
} |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
const T exp, scale, shift; |
||||
}; |
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ACTIVATION_HPP */ |
@ -0,0 +1,58 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_BATCH_NORM_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_BATCH_NORM_HPP |
||||
|
||||
#include "../../op_cuda.hpp" |
||||
|
||||
#include "../csl/stream.hpp" |
||||
#include "../csl/tensor.hpp" |
||||
|
||||
#include "../kernels/scale_shift.hpp" |
||||
|
||||
#include <utility> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { |
||||
|
||||
template <class T> |
||||
class BatchNormOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
BatchNormOp(csl::Stream stream_, const cv::Mat& weights, const cv::Mat& bias) |
||||
: stream(std::move(stream_)) |
||||
{ |
||||
biasTensor = csl::makeTensorHeader<T>(bias); |
||||
csl::copyMatToTensor<T>(bias, biasTensor, stream); |
||||
|
||||
weightsTensor = csl::makeTensorHeader<T>(weights); |
||||
csl::copyMatToTensor<T>(weights, weightsTensor, stream); |
||||
} |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
CV_Assert(inputs.size() == 1 && outputs.size() == 1); |
||||
|
||||
auto input_wrapper = inputs[0].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
auto output_wrapper = outputs[0].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
std::size_t inner_size = input.size_range(2, input.rank()); |
||||
kernels::scaleN_with_biasN<T>(stream, output, input, inner_size, weightsTensor, biasTensor); |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
csl::Tensor<T> weightsTensor, biasTensor; |
||||
}; |
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_BATCH_NORM_HPP */ |
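The primitive assumes the affine parameters are already folded, so inference is just the channel-wise scaleN_with_biasN call above. The usual folding of batch-norm statistics into (weights, bias) is standard algebra; whether the calling layer prepares its blobs exactly this way is not shown in this file, so the sketch below is illustrative only:

#include <cmath>
#include <cstddef>
#include <vector>

/* y = gamma*(x - mean)/sqrt(var + eps) + beta  ==  w*x + b, computed per channel */
void fold_batch_norm(const std::vector<float>& gamma, const std::vector<float>& beta,
                     const std::vector<float>& mean, const std::vector<float>& var,
                     float eps, std::vector<float>& w, std::vector<float>& b) {
    w.resize(gamma.size());
    b.resize(gamma.size());
    for (std::size_t c = 0; c < gamma.size(); c++) {
        w[c] = gamma[c] / std::sqrt(var[c] + eps);
        b[c] = beta[c] - mean[c] * w[c];
    }
}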
@ -0,0 +1,90 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONCAT_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONCAT_HPP |
||||
|
||||
#include "../../op_cuda.hpp" |
||||
|
||||
#include "../csl/stream.hpp" |
||||
#include "../csl/pointer.hpp" |
||||
|
||||
#include "../kernels/fill.hpp" |
||||
#include "../kernels/concat.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <cstddef> |
||||
#include <vector> |
||||
#include <utility> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { |
||||
|
||||
template <class T> |
||||
class ConcatOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
ConcatOp(csl::Stream stream_, std::size_t concat_axis, bool zero_padding) |
||||
: stream(std::move(stream_)), concat_axis{ concat_axis }, zero_padding{ zero_padding } |
||||
{ |
||||
} |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
CV_Assert(outputs.size() == 1); |
||||
|
||||
auto output_wrapper = outputs[0].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
if(zero_padding) |
||||
{ |
||||
auto output_shape = output_wrapper->getShape(); |
||||
|
||||
kernels::fill<T>(stream, output, 0.0); |
||||
|
||||
std::size_t output_concat_axis_offset = 0; |
||||
for (int i = 0; i < inputs.size(); i++) |
||||
{ |
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
auto input_shape = input_wrapper->getShape(); |
||||
|
||||
std::vector<std::size_t> offsets(input_shape.size()); |
||||
for (int j = 0; j < offsets.size(); j++) |
||||
offsets[j] = (output_shape[j] - input_shape[j]) / 2; |
||||
offsets[concat_axis] = output_concat_axis_offset; |
||||
|
||||
kernels::concat_with_offsets(stream, output, input, offsets); |
||||
|
||||
output_concat_axis_offset += input.get_axis_size(concat_axis); |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
std::size_t output_axis_offset = 0; |
||||
for (int i = 0; i < inputs.size(); i++) |
||||
{ |
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
kernels::concat(stream, output, output_axis_offset, input, concat_axis); |
||||
|
||||
output_axis_offset += input.get_axis_size(concat_axis); |
||||
} |
||||
} |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
std::size_t concat_axis; |
||||
bool zero_padding; |
||||
}; |
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONCAT_HPP */ |
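A worked instance of the zero-padding branch above, with hypothetical shapes, showing how the centering offsets and the running concat-axis offset combine:

/* output 1x8x6x6, concat_axis = 1 (channels), output pre-filled with zeros:
 *   input0 1x3x4x4 -> offsets = { 0, (8-3)/2, (6-4)/2, (6-4)/2 } = { 0, 2, 1, 1 },
 *                     then offsets[1] = 0  -> copied to channels [0, 3), centered spatially
 *   input1 1x5x6x6 -> offsets = { 0, 1, 0, 0 }, then offsets[1] = 3
 *                     -> copied to channels [3, 8) with no spatial shift
 * everything not covered by a copy keeps the zero fill written beforehand
 */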
@ -0,0 +1,51 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONST_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONST_HPP |
||||
|
||||
#include "../../op_cuda.hpp" |
||||
|
||||
#include "../csl/stream.hpp" |
||||
#include "../csl/tensor.hpp" |
||||
#include "../csl/tensor_ops.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <utility> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { |
||||
|
||||
template <class T> |
||||
class ConstOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
ConstOp(csl::Stream stream_, const cv::Mat& data) |
||||
: stream(std::move(stream_)) |
||||
{ |
||||
constTensor = csl::makeTensorHeader<T>(data); |
||||
csl::copyMatToTensor<T>(data, constTensor, stream); |
||||
} |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
CV_Assert(outputs.size() == 1 && inputs.size() == 0); |
||||
|
||||
auto output_wrapper = outputs[0].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
csl::tensor_ops::copy<T>(stream, output, constTensor); |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
csl::Tensor<T> constTensor; |
||||
}; |
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONST_HPP */ |
@ -0,0 +1,250 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP |
||||
|
||||
#include "../../op_cuda.hpp" |
||||
|
||||
#include "../csl/cudnn.hpp" |
||||
#include "../csl/stream.hpp" |
||||
#include "../csl/tensor.hpp" |
||||
#include "../csl/tensor_ops.hpp" |
||||
#include "../kernels/scale_shift.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <cstddef> |
||||
#include <cstdint> |
||||
#include <vector> |
||||
#include <utility> |
||||
#include <algorithm>
#include <functional>
#include <numeric>
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { |
||||
|
||||
struct ConvolutionConfiguration { |
||||
/* the size of the following vectors must be equal to the kernel size */ |
||||
std::vector<std::size_t> kernel_size; |
||||
std::vector<std::size_t> dilations, strides; |
||||
|
||||
enum class PaddingMode { |
||||
MANUAL, /* uses explicit padding values provided in `pads_begin` and `pads_end` */ |
||||
VALID, /* no padding is added */ |
||||
SAME /* TensorFlow logic is used for same padding */ |
||||
}; |
||||
|
||||
/* explicit paddings are used if and only if padMode is set to manual */ |
||||
PaddingMode padMode; |
||||
std::vector<std::size_t> pads_begin, pads_end; |
||||
|
||||
/* full shape inclusive of channel and batch axis */ |
||||
std::vector<std::size_t> input_shape; |
||||
std::vector<std::size_t> output_shape; |
||||
|
||||
/* group count for grouped convolution */ |
||||
std::size_t groups; |
||||
}; |
||||
|
||||
template <class T> |
||||
class ConvolutionOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
ConvolutionOp(csl::Stream stream_, csl::cudnn::Handle handle, const ConvolutionConfiguration& config, const Mat& filters, const Mat& bias) |
||||
: stream(std::move(stream_)), cudnnHandle(std::move(handle)) |
||||
{ |
||||
const auto& kernel_size = config.kernel_size; |
||||
const auto& dilations = config.dilations; |
||||
const auto& strides = config.strides; |
||||
|
||||
const auto convolution_order = kernel_size.size(); |
||||
CV_Assert(convolution_order >= 1); |
||||
|
||||
CV_Assert(convolution_order == dilations.size()); |
||||
CV_Assert(convolution_order == strides.size()); |
||||
|
||||
const auto& input_shape = config.input_shape; |
||||
const auto& output_shape = config.output_shape; |
||||
CV_Assert(input_shape.size() == output_shape.size()); |
||||
CV_Assert(input_shape.size() == convolution_order + 2); |
||||
|
||||
const auto groups = config.groups; |
||||
|
||||
if (convolution_order > 3) |
||||
CV_Error(Error::StsNotImplemented, "Only 1D/2D/3D convolution is supported."); |
||||
|
||||
const auto rank = input_shape.size(); |
||||
const auto output_feature_maps = output_shape[1]; |
||||
const auto input_feature_maps = input_shape[1]; |
||||
const auto input_feature_maps_per_group = input_feature_maps / groups; |
||||
CV_Assert(input_feature_maps % groups == 0); |
||||
|
||||
filtersTensor = csl::makeTensorHeader<T>(filters); |
||||
csl::copyMatToTensor<T>(filters, filtersTensor, stream); |
||||
|
||||
if (!bias.empty()) |
||||
{ |
||||
biasTensor = csl::makeTensorHeader<T>(bias); |
||||
csl::copyMatToTensor<T>(bias, biasTensor, stream); |
||||
} |
||||
|
||||
/* left and right are misleading as the padding is applicable for any number of dimensions
|
||||
* but we use those identifiers to avoid confusion with `pads_begin` and `pads_end` |
||||
* |
||||
* `common_padding` contains the amount of padding that has to be added to both sides |
||||
* `padding_left` and `padding_right` contains the amount of padding that needs to be added |
||||
* to a particular side in addition to the common padding |
||||
*/ |
||||
std::vector<std::size_t> common_padding(rank, 0); |
||||
std::vector<std::size_t> padding_left(rank, 0), padding_right(rank, 0); |
||||
if (config.padMode == ConvolutionConfiguration::PaddingMode::MANUAL) |
||||
{ |
||||
const auto& pads_begin = config.pads_begin; |
||||
const auto& pads_end = config.pads_end; |
||||
|
||||
CV_Assert(convolution_order == pads_begin.size()); |
||||
CV_Assert(convolution_order == pads_end.size()); |
||||
|
||||
for (int i = 2; i < common_padding.size(); i++) |
||||
{ |
||||
common_padding[i] = std::min(pads_begin[i - 2], pads_end[i - 2]); |
||||
padding_left[i] = pads_begin[i - 2] - common_padding[i]; |
||||
padding_right[i] = pads_end[i - 2] - common_padding[i]; |
||||
} |
||||
} |
||||
else if (config.padMode == ConvolutionConfiguration::PaddingMode::VALID) |
||||
{ |
||||
/* nothing to do as the paddings are already preset to zero */ |
||||
} |
||||
else if (config.padMode == ConvolutionConfiguration::PaddingMode::SAME) |
||||
{ |
||||
/* TensorFlow Logic:
|
||||
* total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i] |
||||
* |
||||
* if total padding is odd, the extra is added towards the end |
||||
*/ |
||||
for (int i = 2; i < rank; i++) |
||||
{ |
||||
const auto j = i - 2; /* filter index */ |
||||
const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1; |
||||
const auto required_total_padding = |
||||
std::max<std::int64_t>(0, (output_shape[i] - 1) * strides[j] + effective_kernel_size - input_shape[i]); |
||||
|
||||
common_padding[i] = required_total_padding / 2; |
||||
padding_left[i] = 0; |
||||
padding_right[i] = required_total_padding % 2; |
||||
} |
||||
} |
||||
|
||||
/* in some scenarios, the extra padding at the end may not change the output at all */ |
||||
for (int i = 2; i < rank; i++) { |
||||
const auto j = i - 2; /* filter idx */ |
||||
const auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i]; |
||||
const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1; |
||||
std::int64_t rem = (input_shape[i] + total_padding - effective_kernel_size) % strides[j]; |
||||
|
||||
/* the output shape doesn't change if we decrease the total padding by at most `rem`
|
||||
* provided that we decrease from the right |
||||
*/ |
||||
if (rem && padding_right[i] > 0) |
||||
padding_right[i] = std::max<std::int64_t>(0, padding_right[i] - rem); |
||||
} |
||||
|
||||
auto is_not_zero = [](std::size_t i) { return i != 0; }; |
||||
if(std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero) || |
||||
std::any_of(std::begin(padding_right), std::end(padding_right), is_not_zero)) |
||||
{ |
||||
/* csl::Convolution supports symmetric padding only; hence, we deal with asymmetric padding by
|
||||
* copying the input to a bigger tensor and padding the ends manually |
||||
*/ |
||||
transformed_shape = input_shape; |
||||
for (int i = 0; i < rank; i++) |
||||
transformed_shape[i] += padding_left[i] + padding_right[i]; |
||||
|
||||
inputTransformer = csl::TensorTransform<T>(cudnnHandle, padding_left, padding_right); |
||||
} |
||||
|
||||
typename csl::Convolution<T>::params_type params; |
||||
if (transformed_shape.empty()) |
||||
{ |
||||
params.input_shape.assign(std::begin(input_shape), std::end(input_shape)); |
||||
} |
||||
else |
||||
{ |
||||
/* the convolution operation will be seeing the transformed input */ |
||||
params.input_shape.assign(std::begin(transformed_shape), std::end(transformed_shape)); |
||||
} |
||||
|
||||
auto& fshape = params.filter_shape; |
||||
fshape.resize(rank); |
||||
fshape[0] = output_feature_maps; |
||||
fshape[1] = input_feature_maps_per_group; |
||||
std::copy(std::begin(kernel_size), std::end(kernel_size), std::begin(fshape) + 2); |
||||
CV_Assert(fshape.size() == kernel_size.size() + 2); |
||||
|
||||
params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding)); |
||||
params.stride = strides; |
||||
params.dilation = dilations; |
||||
params.groups = config.groups; |
||||
|
||||
convoluter = csl::Convolution<T>(cudnnHandle, params); |
||||
|
||||
csl::WorkspaceBuilder builder; |
||||
if (!transformed_shape.empty()) { |
||||
auto& shape = transformed_shape; |
||||
auto sz = std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies<std::size_t>()); |
||||
builder.require<T>(sz); |
||||
} |
||||
builder.require(convoluter.get_workspace_size()); |
||||
scratch_mem_in_bytes = builder.required_workspace_size(); |
||||
} |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
CV_Assert(inputs.size() == 1 && outputs.size() == 1); |
||||
|
||||
csl::WorkspaceAllocator allocator(workspace); |
||||
|
||||
auto input_wrapper = inputs[0].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
if (!transformed_shape.empty()) |
||||
{ |
||||
auto& shape = transformed_shape; |
||||
auto transformed_input = allocator.get_tensor_span<T>(std::begin(shape), std::end(shape)); |
||||
inputTransformer.transform(input, transformed_input); |
||||
input = transformed_input; |
||||
} |
||||
|
||||
auto output_wrapper = outputs[0].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
convoluter.convolve(output, input, filtersTensor, allocator.get_instance()); |
||||
if (!biasTensor.empty()) |
||||
{ |
||||
std::size_t inner_size = output.size_range(2, output.rank()); |
||||
kernels::biasN<T>(stream, output, output, inner_size, biasTensor); |
||||
} |
||||
} |
||||
|
||||
std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; } |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
csl::cudnn::Handle cudnnHandle; |
||||
csl::Tensor<T> filtersTensor, biasTensor; |
||||
csl::Convolution<T> convoluter; |
||||
|
||||
std::vector<std::size_t> transformed_shape; |
||||
csl::TensorTransform<T> inputTransformer; |
||||
|
||||
std::size_t scratch_mem_in_bytes; |
||||
}; |
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP */ |
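A worked instance of the SAME-padding arithmetic above, for one spatial axis with hypothetical sizes:

/*   input i = 12, stride s = 2, kernel k = 3, dilation d = 2
 *   effective_k    = d*(k - 1) + 1                = 5
 *   output o       = ceil(i / s)                  = 6   (the SAME output size)
 *   total padding  = (o - 1)*s + effective_k - i  = 10 + 5 - 12 = 3
 *   common_padding = 3 / 2 = 1 (added to both sides), padding_right = 3 % 2 = 1
 * the leftover right padding makes the overall padding asymmetric, which is exactly the case
 * where the constructor copies the input into a larger tensor via TensorTransform and then
 * runs a symmetric cuDNN convolution on it
 */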
@ -0,0 +1,115 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ELTWISE_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ELTWISE_HPP |
||||
|
||||
#include "../../op_cuda.hpp" |
||||
|
||||
#include "../csl/stream.hpp" |
||||
#include "../csl/tensor.hpp" |
||||
#include "../csl/tensor_ops.hpp" |
||||
|
||||
#include "../kernels/eltwise_ops.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <cstddef> |
||||
#include <vector> |
||||
#include <utility> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { |
||||
|
||||
enum class EltwiseOpType { |
||||
MAX, |
||||
SUM, |
||||
PRODUCT |
||||
}; |
||||
|
||||
template <class T> |
||||
class EltwiseOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
template <class V> |
||||
EltwiseOp(csl::Stream stream_, EltwiseOpType op_, std::vector<V> coeffs_) |
||||
: stream(std::move(stream_)), op{ op_ }, coeffs(std::begin(coeffs_), std::end(coeffs_)) |
||||
{ |
||||
} |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
CV_Assert(inputs.size() >= 2); |
||||
CV_Assert(outputs.size() == 1); |
||||
|
||||
CV_Assert(coeffs.size() == 0 || op == EltwiseOpType::SUM); |
||||
CV_Assert(coeffs.size() == 0 || inputs.size() == coeffs.size()); |
||||
|
||||
auto output_wrapper = outputs[0].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
if (inputs.size() == 2) |
||||
{ |
||||
auto input_wrapper_x = inputs[0].dynamicCast<wrapper_type>(); |
||||
auto input_x = input_wrapper_x->getView(); |
||||
|
||||
auto input_wrapper_y = inputs[1].dynamicCast<wrapper_type>(); |
||||
auto input_y = input_wrapper_y->getView(); |
||||
|
||||
switch (op) |
||||
{ |
||||
case EltwiseOpType::MAX: kernels::eltwise_max_2<T>(stream, output, input_x, input_y); break; |
||||
case EltwiseOpType::PRODUCT: kernels::eltwise_prod_2<T>(stream, output, input_x, input_y); break; |
||||
case EltwiseOpType::SUM: |
||||
if (coeffs.empty() || (coeffs[0] == 1 && coeffs[1] == 1)) |
||||
kernels::eltwise_sum_2<T>(stream, output, input_x, input_y); |
||||
else |
||||
kernels::eltwise_sum_coeff_2<T>(stream, output, coeffs[0], input_x, coeffs[1], input_y); |
||||
break; |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
auto input_wrapper_0 = inputs[0].dynamicCast<wrapper_type>(); |
||||
auto input_0 = input_wrapper_0->getView(); |
||||
|
||||
/* we first make a copy and then apply EltwiseOp cumulatively */ |
||||
csl::tensor_ops::copy(stream, output, input_0); |
||||
|
||||
for (int i = 1; i < inputs.size(); i++) |
||||
{ |
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
switch (op) |
||||
{ |
||||
case EltwiseOpType::MAX: kernels::eltwise_max_2<T>(stream, output, output, input); break; |
||||
case EltwiseOpType::PRODUCT: kernels::eltwise_prod_2<T>(stream, output, output, input); break; |
||||
case EltwiseOpType::SUM: |
||||
if (coeffs.empty() || coeffs[i] == 1) |
||||
kernels::eltwise_sum_2<T>(stream, output, output, input); |
||||
else |
||||
{ |
||||
/* if this is the first op, we must scale output too */ |
||||
auto coeff_x = (i == 1) ? coeffs[0] : static_cast<T>(1.0); |
||||
kernels::eltwise_sum_coeff_2<T>(stream, output, coeff_x, output, coeffs[i], input); |
||||
} |
||||
break; |
||||
} |
||||
} |
||||
} |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
EltwiseOpType op; |
||||
std::vector<T> coeffs; |
||||
}; |
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ELTWISE_HPP */ |
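For more than two inputs the sum is accumulated in place, so the first input's coefficient has to be applied during the first accumulation step and the running sum is carried with coefficient 1 afterwards. With hypothetical coefficients {a, b, c} for inputs x0, x1, x2:

/*   output = x0                                   (plain copy)
 *   output = a*output + b*x1  =  a*x0 + b*x1      (i == 1: coeff_x is coeffs[0])
 *   output = 1*output + c*x2  =  a*x0 + b*x1 + c*x2
 */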
@ -0,0 +1,92 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_INNER_PRODUCT_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_INNER_PRODUCT_HPP |
||||
|
||||
#include "../../op_cuda.hpp" |
||||
|
||||
#include "../csl/stream.hpp" |
||||
#include "../csl/cublas.hpp" |
||||
#include "../csl/tensor.hpp" |
||||
#include "../csl/tensor_ops.hpp" |
||||
|
||||
#include "../kernels/scale_shift.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <cstddef> |
||||
#include <vector> |
||||
#include <utility> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { |
||||
|
||||
template <class T> |
||||
class InnerProductOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
InnerProductOp(csl::Stream stream_, csl::cublas::Handle handle, std::size_t axis, const Mat& weights, const Mat& bias) |
||||
: stream(std::move(stream_)), cublasHandle(std::move(handle)), axis{ axis } |
||||
{ |
||||
weightsTensor = csl::makeTensorHeader<T>(weights); |
||||
CV_Assert(get_effective_rank(weightsTensor) == 2); |
||||
csl::copyMatToTensor<T>(weights, weightsTensor, stream); |
||||
|
||||
if (!bias.empty()) |
||||
{ |
||||
biasTensor = csl::makeTensorHeader<T>(bias); |
||||
csl::copyMatToTensor<T>(bias, biasTensor, stream); |
||||
CV_Assert(weightsTensor.get_axis_size(-2) == biasTensor.size()); |
||||
} |
||||
} |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
for (int i = 0; i < inputs.size(); i++) |
||||
{ |
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
std::size_t batch_size = input.size_range(0, axis); |
||||
|
||||
auto input_size = input.size() / batch_size; |
||||
CV_Assert(input_size == weightsTensor.get_axis_size(-1)); |
||||
|
||||
auto output_size = output.size() / batch_size; |
||||
CV_Assert(output_size == weightsTensor.get_axis_size(-2)); |
||||
|
||||
/* we treat the input and output as a matrix with dimensions (batch_size, input_size)
|
||||
* and (batch_size, output_size) respectively |
||||
* |
||||
* weight matrix dimensions: (output_size, input_size) |
||||
* |
||||
* I(W^T) = O |
||||
* (batch_size, input_size) * (input_size, output_size) = (batch_size, output_size) |
||||
*/ |
||||
input.reshape(batch_size, input_size); |
||||
output.reshape(batch_size, output_size); |
||||
csl::tensor_ops::gemm<T>(cublasHandle, 0.0, output, 1.0, false, input, true, weightsTensor); |
||||
|
||||
if (!biasTensor.empty()) |
||||
kernels::biasN<T>(stream, output, output, 1, biasTensor); |
||||
} |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
csl::cublas::Handle cublasHandle; |
||||
csl::Tensor<T> weightsTensor, biasTensor; |
||||
std::size_t axis; |
||||
}; |
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_INNER_PRODUCT_HPP */ |
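Shape bookkeeping for the gemm call above, with hypothetical sizes (axis = 1, a 32 x 4096 input blob, a 1000 x 4096 weight matrix, a 1000-element bias):

/*   batch_size  = size_range(0, 1)     = 32
 *   input_size  = input.size() / 32    = 4096   == weights.get_axis_size(-1)
 *   output_size = output.size() / 32   = 1000   == weights.get_axis_size(-2)
 *   gemm: output = 1.0 * input * weights^T + 0.0 * output
 *         (32, 4096) x (4096, 1000) -> (32, 1000)
 *   biasN with inner_size = 1 then adds bias[j] to column j of every row
 */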
@ -0,0 +1,75 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_LRN_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_LRN_HPP |
||||
|
||||
#include "../../op_cuda.hpp" |
||||
|
||||
#include "../csl/cudnn.hpp" |
||||
#include "../csl/tensor_ops.hpp" |
||||
|
||||
#include <cstddef> |
||||
#include <utility> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { |
||||
|
||||
enum class LRNType { |
||||
ACROSS_CHANNELS, |
||||
WITHIN_CHANNEL |
||||
}; |
||||
|
||||
template <class T> |
||||
class LRNOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
LRNOp(csl::cudnn::Handle handle, LRNType type_, std::size_t local_size, T alpha, T beta, T bias, std::size_t largestInputSize) |
||||
: scratch_mem_in_bytes { 0 } |
||||
{ |
||||
typename csl::LRN<T>::LRNType type{}; |
||||
switch (type_) { |
||||
case LRNType::ACROSS_CHANNELS: type = csl::LRN<T>::LRNType::ACROSS_CHANNELS; break; |
||||
case LRNType::WITHIN_CHANNEL: type = csl::LRN<T>::LRNType::WITHIN_CHANNEL; break; |
||||
} |
||||
lrn = csl::LRN<T>(std::move(handle), local_size, alpha, beta, bias, type); |
||||
|
||||
csl::WorkspaceBuilder builder; |
||||
if (type_ == LRNType::WITHIN_CHANNEL) { |
||||
/* this is not a bug; we require two of these */ |
||||
builder.require<T>(largestInputSize); |
||||
builder.require<T>(largestInputSize); |
||||
} |
||||
|
||||
scratch_mem_in_bytes = builder.required_workspace_size(); |
||||
} |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
for (int i = 0; i < inputs.size(); i++) |
||||
{ |
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
csl::WorkspaceAllocator allocator(workspace); |
||||
lrn.normalize(input, output, allocator.get_instance()); |
||||
} |
||||
} |
||||
|
||||
std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; } |
||||
|
||||
private: |
||||
csl::LRN<T> lrn; |
||||
std::size_t scratch_mem_in_bytes; |
||||
}; |
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_LRN_HPP */ |
@ -0,0 +1,182 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MAX_UNPOOLING_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MAX_UNPOOLING_HPP |
||||
|
||||
#include "../../op_cuda.hpp" |
||||
|
||||
#include "../csl/stream.hpp" |
||||
|
||||
#include "../kernels/max_unpooling.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <cstddef>
#include <cstdint>
#include <vector>
#include <utility>
#include <algorithm>
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { |
||||
|
||||
struct MaxPoolingConfiguration { |
||||
/* the size of the following vectors must be equal to the pooling order */ |
||||
std::vector<std::size_t> window_size; |
||||
std::vector<std::size_t> strides; |
||||
|
||||
enum class PaddingMode { |
||||
MANUAL, /* uses explicit padding values provided in `pads_begin` and `pads_end` */ |
||||
VALID, /* no padding is added */ |
||||
SAME /* TensorFlow logic is used for same padding */ |
||||
}; |
||||
|
||||
PaddingMode padMode; |
||||
|
||||
/* explicit paddings are used if and only if padMode is set to manual */ |
||||
std::vector<std::size_t> pads_begin; |
||||
|
||||
/* full shape inclusive of channel and batch axis */ |
||||
std::vector<std::size_t> input_shape; |
||||
}; |
||||
|
||||
template <class T> |
||||
class MaxPoolingOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
MaxPoolingOp(csl::Stream stream_, const MaxPoolingConfiguration& config) |
||||
: stream(std::move(stream_)) |
||||
{ |
||||
window_size = config.window_size; |
||||
|
||||
const auto pooling_order = window_size.size(); |
||||
CV_Assert(pooling_order >= 1); |
||||
|
||||
strides = config.strides; |
||||
CV_Assert(pooling_order == strides.size()); |
||||
|
||||
if (pooling_order != 2 && pooling_order != 3) |
||||
CV_Error(Error::StsNotImplemented, "Only 2D/3D max-pooling are supported."); |
||||
|
||||
padding_left.resize(pooling_order); |
||||
if (config.padMode == MaxPoolingConfiguration::PaddingMode::MANUAL) |
||||
{ |
||||
const auto& pads_begin = config.pads_begin; |
||||
CV_Assert(pooling_order == pads_begin.size()); |
||||
|
||||
padding_left.assign(std::begin(pads_begin), std::end(pads_begin)); |
||||
} |
||||
else if (config.padMode == MaxPoolingConfiguration::PaddingMode::VALID) |
||||
{ |
||||
/* nothing to do as the paddings are already preset to zero */ |
||||
} |
||||
else if (config.padMode == MaxPoolingConfiguration::PaddingMode::SAME) |
||||
{ |
||||
/* TensorFlow Logic:
|
||||
* total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i] |
||||
* |
||||
* if total padding is odd, the extra is added towards the end |
||||
*/ |
||||
const auto& input_shape = config.input_shape; |
||||
CV_Assert(input_shape.size() == pooling_order + 2); |
||||
|
||||
for (int i = 0; i < pooling_order; i++) |
||||
{ |
||||
const auto output_dim = (input_shape[i + 2] - 1 + strides[i]) / strides[i]; |
||||
const auto required_total_padding = |
||||
std::max<std::int64_t>(0, (output_dim - 1) * strides[i] + window_size[i] - input_shape[i + 2]); |
||||
|
||||
padding_left[i] = required_total_padding / 2; |
||||
} |
||||
} |
||||
} |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
CV_Assert(inputs.size() == 1 && outputs.size() == 2); |
||||
|
||||
auto input_wrapper = inputs[0].dynamicCast<wrapper_type>(); |
||||
auto input_data = input_wrapper->getView(); |
||||
|
||||
auto output_wrapper = outputs[0].dynamicCast<wrapper_type>(); |
||||
auto output_data = output_wrapper->getSpan(); |
||||
|
||||
auto indices_wrapper = outputs[1].dynamicCast<wrapper_type>(); |
||||
auto output_indices = indices_wrapper->getSpan(); |
||||
|
||||
kernels::max_pooling_with_indices<T>( |
||||
stream, output_data, output_indices, input_data, window_size, strides, padding_left |
||||
); |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
|
||||
std::vector<std::size_t> window_size, strides, padding_left; |
||||
}; |
||||
|
||||
struct MaxUnpoolingConfiguration { |
||||
/* the size of the following vectors must be equal to the unpooling order */ |
||||
std::vector<std::size_t> window_size; |
||||
std::vector<std::size_t> strides; |
||||
std::vector<std::size_t> pads_begin; |
||||
}; |
||||
|
||||
template <class T> |
||||
class MaxUnpoolingOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
MaxUnpoolingOp(csl::Stream stream_, const MaxUnpoolingConfiguration& config) |
||||
: stream(std::move(stream_)) |
||||
{ |
||||
window_size = config.window_size; |
||||
|
||||
const auto pooling_order = window_size.size(); |
||||
CV_Assert(pooling_order >= 1); |
||||
|
||||
strides = config.strides; |
||||
padding_left = config.pads_begin; |
||||
CV_Assert(strides.size() == pooling_order); |
||||
CV_Assert(padding_left.size() == pooling_order); |
||||
|
||||
if (pooling_order != 2 && pooling_order != 3) |
||||
CV_Error(Error::StsNotImplemented, "Only 2D/3D max-unpooling are supported."); |
||||
} |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
/* sometimes a third input is passed to provide the output shape; we won't need it */ |
||||
CV_Assert(inputs.size() == 2 || inputs.size() == 3); |
||||
CV_Assert(outputs.size() >= 1); |
||||
|
||||
for(int i = 0; i < outputs.size(); i++) |
||||
{ |
||||
auto input_wrapper = inputs[0].dynamicCast<wrapper_type>(); |
||||
auto input_data = input_wrapper->getView(); |
||||
|
||||
auto indices_wrapper = inputs[1].dynamicCast<wrapper_type>(); |
||||
auto input_indices = indices_wrapper->getView(); |
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>(); |
||||
auto output_data = output_wrapper->getSpan(); |
||||
|
||||
kernels::max_unpooling<T>(stream, output_data, input_data, input_indices, window_size, strides, padding_left); |
||||
} |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
|
||||
std::vector<std::size_t> window_size, strides, padding_left; |
||||
}; |
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MAX_UNPOOLING_HPP */ |
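
A minimal standalone sketch (not part of this patch) of the TensorFlow-style SAME-padding arithmetic used in the constructor above; `same_padding_left` is a made-up helper name used only for illustration.

#include <algorithm>
#include <cstdint>
#include <iostream>

/* left padding for one spatial axis under the SAME rule:
 * total = (ceil(input / stride) - 1) * stride + window - input, clamped at zero;
 * if total is odd, the extra element goes towards the end
 */
std::int64_t same_padding_left(std::int64_t input, std::int64_t window, std::int64_t stride)
{
    const auto output = (input - 1 + stride) / stride;                     /* ceil(input / stride) */
    const auto total  = std::max<std::int64_t>(0, (output - 1) * stride + window - input);
    return total / 2;                                                      /* left gets the smaller half */
}

int main()
{
    /* e.g. a 13-wide axis, window 3, stride 2:
     * output = ceil(13 / 2) = 7, total = (7 - 1) * 2 + 3 - 13 = 2, so left = 1 and right = 1
     */
    std::cout << same_padding_left(13, 3, 2) << '\n';                      /* prints 1 */
}
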
@ -0,0 +1,142 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_NORMALIZE_BBOX_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_NORMALIZE_BBOX_HPP |
||||
|
||||
#include "../../op_cuda.hpp" |
||||
|
||||
#include "../csl/stream.hpp" |
||||
#include "../csl/span.hpp" |
||||
#include "../csl/tensor.hpp" |
||||
#include "../csl/workspace.hpp" |
||||
|
||||
#include "../kernels/scale_shift.hpp" |
||||
#include "../kernels/normalize.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <cstddef> |
||||
#include <vector> |
||||
#include <utility> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { |
||||
|
||||
template <class T> |
||||
struct NormalizeConfiguration { |
||||
std::vector<std::size_t> input_shape; |
||||
|
||||
/* axis range across which values are normalized
|
||||
* |
||||
* [0, axis_start) = outer range |
||||
* [axis_start, axis_end) = mid range |
||||
* [axis_end, rank) = inner range |
||||
* |
||||
* for each location in the outer and inner range, all the values in the mid range are |
||||
* normalized together |
||||
*/ |
||||
std::size_t axis_start, axis_end; |
||||
|
||||
/* 1 for L1 norm, 2 for L2 norm */ |
||||
std::size_t norm; |
||||
|
||||
/* epsilon to use to avoid division by zero */ |
||||
T eps; |
||||
}; |
||||
|
||||
template <class T> |
||||
class NormalizeOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
template <class V> |
||||
NormalizeOp(csl::Stream stream_, const Mat& weights, const NormalizeConfiguration<V>& config) |
||||
: stream(std::move(stream_)), weight{ 1.0 } |
||||
{ |
||||
norm_order = config.norm; |
||||
epsilon = config.eps; |
||||
axis_start = config.axis_start; |
||||
axis_end = config.axis_end; |
||||
|
||||
if (!weights.empty()) |
||||
{ |
||||
if (weights.total() == 1) |
||||
{ |
||||
CV_Assert(weights.type() == CV_32F); |
||||
weight = weights.at<float>(0, 0); |
||||
} |
||||
else |
||||
{ |
||||
weightsTensor = csl::makeTensorHeader<T>(weights); |
||||
csl::copyMatToTensor<T>(weights, weightsTensor, stream); |
||||
} |
||||
} |
||||
|
||||
std::size_t outer_size = 1; |
||||
for (int i = 0; i < axis_start; i++) |
||||
outer_size *= config.input_shape[i]; |
||||
|
||||
std::size_t inner_size = 1; |
||||
for (int i = axis_end; i < config.input_shape.size(); i++) |
||||
inner_size *= config.input_shape[i]; |
||||
|
||||
csl::WorkspaceBuilder builder; |
||||
builder.require<T>(outer_size * inner_size); |
||||
scratch_mem_in_bytes = builder.required_workspace_size(); |
||||
} |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
CV_Assert(inputs.size() == 1 && outputs.size() == 1); |
||||
|
||||
auto input_wrapper = inputs[0].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
auto output_wrapper = outputs[0].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
std::size_t outer_size = input.size_range(0, axis_start); |
||||
std::size_t mid_size = input.size_range(axis_start, axis_end); |
||||
std::size_t inner_size = input.size_range(axis_end, input.rank()); |
||||
|
||||
auto ws_allocator = csl::WorkspaceAllocator(workspace); |
||||
auto scratch = ws_allocator.get_span<T>(); |
||||
kernels::normalize<T>(stream, output, input, outer_size, mid_size, inner_size, norm_order, epsilon, scratch); |
||||
|
||||
/* there might be a single weight, in which case `weight` will not be equal to 1.0
|
||||
* or there might be several weights |
||||
* or we don't have to scale |
||||
*/ |
||||
if (weight != 1.0) |
||||
{ |
||||
kernels::scale1<T>(stream, output, input, weight); |
||||
} |
||||
else if (!weightsTensor.empty()) |
||||
{ |
||||
CV_Assert(weightsTensor.size() != 1); /* constructor should have set up to use `weight` */ |
||||
CV_Assert(weightsTensor.size() == mid_size); |
||||
kernels::scaleN<T>(stream, output, input, inner_size, weightsTensor); |
||||
} |
||||
} |
||||
|
||||
std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; } |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
csl::Tensor<T> weightsTensor; |
||||
T weight; /* if there is only one weight, we use this */ |
||||
|
||||
T epsilon; |
||||
std::size_t norm_order; |
||||
std::size_t axis_start, axis_end; |
||||
|
||||
std::size_t scratch_mem_in_bytes; |
||||
}; |
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_NORMALIZE_BBOX_HPP */ |
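
A minimal standalone sketch (not part of this patch) of how the outer/mid/inner sizes consumed by the normalize kernel follow from the input shape and the [axis_start, axis_end) range; the shape and axes below are arbitrary example values.

#include <cstddef>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
    const std::vector<std::size_t> shape { 2, 32, 19, 19 };                /* NCHW */
    const std::size_t axis_start = 1, axis_end = 2;                        /* normalize across channels */

    auto product = [&](std::size_t from, std::size_t to) {
        return std::accumulate(shape.begin() + from, shape.begin() + to,
                               std::size_t(1), std::multiplies<std::size_t>());
    };

    const auto outer = product(0, axis_start);                             /* 2 */
    const auto mid   = product(axis_start, axis_end);                      /* 32 values normalized together */
    const auto inner = product(axis_end, shape.size());                    /* 19 * 19 = 361 */

    std::cout << outer << ' ' << mid << ' ' << inner << '\n';              /* prints "2 32 361" */
}
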
@ -0,0 +1,113 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PADDING_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PADDING_HPP |
||||
|
||||
#include "../../op_cuda.hpp" |
||||
|
||||
#include "../csl/stream.hpp" |
||||
#include "../csl/tensor.hpp" |
||||
|
||||
#include "../kernels/fill.hpp" |
||||
#include "../kernels/concat.hpp" |
||||
#include "../kernels/padding.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <cstddef> |
||||
#include <vector> |
||||
#include <algorithm> |
||||
#include <utility> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { |
||||
|
||||
enum class PaddingType { |
||||
CONSTANT, |
||||
REFLECTION101 |
||||
}; |
||||
|
||||
template <class T> |
||||
class PaddingOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
/* `ranges` is indexed by axis and contains the range in the output to which the input is copied */ |
||||
PaddingOp(csl::Stream stream_, PaddingType type_, T value_, std::vector<cv::Range> ranges) |
||||
: stream(std::move(stream_)), type{ type_ }, value{ value_ }, dstRanges(std::move(ranges)) |
||||
{ |
||||
} |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
CV_Assert(inputs.size() == 1 && outputs.size() == 1); |
||||
|
||||
auto input_wrapper = inputs[0].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
auto output_wrapper = outputs[0].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
auto effective_rank = get_effective_rank(input); |
||||
CV_Assert(get_effective_rank(input) == get_effective_rank(output)); |
||||
|
||||
/* suppose we require padding for the first spatial axis (H in NCHW or D in NCDHW)
|
||||
* |
||||
* there could be a case where the batch axis, channel axis, and the first spatial axis are all one |
||||
* this would result in effective rank being less than the number of axes requiring padding |
||||
*/ |
||||
effective_rank = std::max(effective_rank, dstRanges.size()); |
||||
|
||||
for (int i = effective_rank - dstRanges.size(); i < effective_rank; i++) |
||||
{ |
||||
if (dstRanges[i] == Range::all()) |
||||
CV_Assert(input.get_axis_size(i) == output.get_axis_size(i)); |
||||
else |
||||
CV_Assert(input.get_axis_size(i) == dstRanges[i].size()); |
||||
} |
||||
|
||||
if (type == PaddingType::CONSTANT) |
||||
{ |
||||
kernels::fill<T>(stream, output, value); |
||||
|
||||
std::vector<std::size_t> offsets(effective_rank, 0); |
||||
for (int i = 0; i < dstRanges.size(); i++) |
||||
{ |
||||
const auto delta = effective_rank - dstRanges.size(); |
||||
if (dstRanges[i] != Range::all()) |
||||
offsets[delta + i] = dstRanges[i].start; |
||||
} |
||||
|
||||
kernels::concat_with_offsets<T>(stream, output, input, offsets); |
||||
} |
||||
else if (type == PaddingType::REFLECTION101) |
||||
{ |
||||
std::vector<std::pair<std::size_t, std::size_t>> ranges(effective_rank); |
||||
for (int i = 0; i < effective_rank; i++) |
||||
{ |
||||
const auto delta = effective_rank - dstRanges.size(); |
||||
if (i < delta || dstRanges[i - delta] == Range::all()) |
||||
ranges[i] = { 0, input.get_axis_size(i) }; |
||||
else |
||||
ranges[i] = { dstRanges[i - delta].start, dstRanges[i - delta].end }; |
||||
} |
||||
|
||||
kernels::copy_with_reflection101<T>(stream, output, input, ranges); |
||||
} |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
PaddingType type; |
||||
T value; |
||||
|
||||
std::vector<cv::Range> dstRanges; |
||||
}; |
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PADDING_HPP */ |
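
A minimal standalone sketch (not part of this patch) of how per-axis ranges translate into the copy offsets used for constant padding; the `{-1, -1}` pairs below stand in for `Range::all()` and the shapes are arbitrary example values.

#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

int main()
{
    /* output 1x3x8x8, input 1x3x6x6, one pixel of padding on every side of H and W */
    const std::vector<std::pair<int, int>> dstRanges {                     /* {start, end} per axis */
        { -1, -1 }, { -1, -1 }, { 1, 7 }, { 1, 7 }
    };

    std::vector<std::size_t> offsets(dstRanges.size(), 0);
    for (std::size_t i = 0; i < dstRanges.size(); i++)
        if (dstRanges[i].first >= 0)                                       /* a concrete range was given */
            offsets[i] = static_cast<std::size_t>(dstRanges[i].first);

    for (auto o : offsets) std::cout << o << ' ';                          /* prints "0 0 1 1" */
    std::cout << '\n';
}
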
@ -0,0 +1,70 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PERMUTE_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PERMUTE_HPP |
||||
|
||||
#include "../../op_cuda.hpp" |
||||
|
||||
#include "../csl/stream.hpp" |
||||
#include "../csl/tensor_ops.hpp" |
||||
|
||||
#include "../kernels/permute.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <cstddef> |
||||
#include <vector> |
||||
#include <utility> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { |
||||
|
||||
template <class T> |
||||
class PermuteOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
PermuteOp(csl::Stream stream_, std::vector<std::size_t> order_) |
||||
: stream(std::move(stream_)), order(std::move(order_)) { } |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
for (int i = 0; i < inputs.size(); i++) |
||||
{ |
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
auto needsPermute = [&] { |
||||
for (int i = 0; i < order.size(); i++) |
||||
if (order[i] != i) |
||||
return true; |
||||
return false; |
||||
}(); |
||||
|
||||
if (needsPermute) |
||||
{ |
||||
kernels::permute(stream, output, input, order); |
||||
} |
||||
else |
||||
{ |
||||
if (input.get() != output.get()) |
||||
csl::tensor_ops::copy(stream, output, input); |
||||
} |
||||
} |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
std::vector<std::size_t> order; |
||||
}; |
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PERMUTE_HPP */ |
@ -0,0 +1,258 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_POOLING_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_POOLING_HPP |
||||
|
||||
#include "../../op_cuda.hpp" |
||||
|
||||
#include "../csl/cudnn.hpp" |
||||
#include "../csl/tensor.hpp" |
||||
#include "../csl/tensor_ops.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <cstddef> |
||||
#include <cstdint> |
||||
#include <vector> |
||||
#include <utility> |
||||
#include <algorithm> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { |
||||
|
||||
struct PoolingConfiguration { |
||||
enum class PoolingMode { |
||||
MAX, |
||||
AVERAGE_INCLUDE_PADDING, /* include padding while calculating average */ |
||||
AVERAGE_EXCLUDE_PADDING /* exclude padding while calculating average */ |
||||
}; |
||||
|
||||
PoolingMode poolMode; |
||||
|
||||
/* the size of the following vectors must be equal to the pooling order */ |
||||
std::vector<std::size_t> window_size; |
||||
std::vector<std::size_t> strides; |
||||
|
||||
enum class PaddingMode { |
||||
MANUAL, /* uses explicit padding values provided in `pads_begin` and `pads_end` */ |
||||
VALID, /* no padding is added */ |
||||
SAME /* TensorFlow logic is used for same padding */ |
||||
}; |
||||
|
||||
PaddingMode padMode; |
||||
|
||||
/* explicit paddings are used if and only if padMode is set to manual */ |
||||
std::vector<std::size_t> pads_begin, pads_end; |
||||
|
||||
/* the output shape is calculated using the following formula:
|
||||
* output_dim = func[(input_dim + padding_left + padding_right - kernel_dim)/stride] + 1 |
||||
* |
||||
* rounding mode decides what is used as `func` |
||||
*/ |
||||
enum class RoundingMode { |
||||
CEIL, /* uses ceil */ |
||||
FLOOR |
||||
}; |
||||
|
||||
RoundingMode roundMode; |
||||
|
||||
/* full shape inclusive of channel and batch axis */ |
||||
std::vector<std::size_t> input_shape; |
||||
}; |
||||
|
||||
template <class T> |
||||
class PoolingOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
PoolingOp(csl::cudnn::Handle handle, const PoolingConfiguration& config) |
||||
: cudnnHandle(std::move(handle)) |
||||
{ |
||||
const auto& window_size = config.window_size; |
||||
|
||||
const auto pooling_order = window_size.size(); |
||||
CV_Assert(pooling_order >= 1); |
||||
|
||||
const auto& strides = config.strides; |
||||
CV_Assert(pooling_order == strides.size()); |
||||
|
||||
const auto& input_shape = config.input_shape; |
||||
CV_Assert(input_shape.size() == pooling_order + 2); |
||||
|
||||
if (pooling_order > 3) |
||||
CV_Error(Error::StsNotImplemented, "Only 1D/2D/3D pooling are supported."); |
||||
|
||||
const auto rank = input_shape.size(); |
||||
|
||||
/* left and right are misleading as the padding is applicable for any number of dimensions
|
||||
* but we use those identifiers to avoid confusion with `pads_begin` and `pads_end` |
||||
* |
||||
* `common_padding` contains the amount of padding that has to be added to both sides |
||||
* `padding_left` and `padding_right` contains the amount of padding that needs to be added |
||||
* to a particular side in addition to the common padding |
||||
*/ |
||||
std::vector<std::size_t> common_padding(rank, 0); |
||||
std::vector<std::size_t> padding_left(rank, 0), padding_right(rank, 0); |
||||
if (config.padMode == PoolingConfiguration::PaddingMode::MANUAL) |
||||
{ |
||||
const auto& pads_begin = config.pads_begin; |
||||
const auto& pads_end = config.pads_end; |
||||
|
||||
CV_Assert(pooling_order == pads_begin.size()); |
||||
CV_Assert(pooling_order == pads_end.size()); |
||||
|
||||
/* cuDNN rounds down by default; hence, if ceilMode is false, we do nothing
|
||||
* otherwise, we add extra padding towards the end so that the convolution arithmetic yields |
||||
* the correct output size without having to deal with fancy fractional sizes |
||||
*/ |
||||
auto pads_end_modified = pads_end; |
||||
if (config.roundMode == PoolingConfiguration::RoundingMode::CEIL) |
||||
{ |
||||
for (int i = 0; i < window_size.size(); i++) { |
||||
auto rem = (input_shape[i + 2] + pads_begin[i] + pads_end[i] - window_size[i]) % strides[i]; |
||||
if (rem) |
||||
pads_end_modified[i] += strides[i] - rem; |
||||
} |
||||
} |
||||
|
||||
for (int i = 2; i < common_padding.size(); i++) |
||||
{ |
||||
common_padding[i] = std::min(pads_begin[i - 2], pads_end_modified[i - 2]); |
||||
padding_left[i] = pads_begin[i - 2] - common_padding[i]; |
||||
padding_right[i] = pads_end_modified[i - 2] - common_padding[i]; |
||||
} |
||||
} |
||||
else if (config.padMode == PoolingConfiguration::PaddingMode::VALID) |
||||
{ |
||||
/* nothing to do as the paddings are already preset to zero */ |
||||
} |
||||
else if (config.padMode == PoolingConfiguration::PaddingMode::SAME) |
||||
{ |
||||
/* TensorFlow Logic:
|
||||
* total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i] |
||||
* |
||||
* if total padding is odd, the extra is added towards the end |
||||
*/ |
||||
for (int i = 2; i < rank; i++) |
||||
{ |
||||
const auto j = i - 2; /* filter index */ |
||||
const auto output_dim = (input_shape[i] - 1 + strides[j]) / strides[j]; |
||||
const auto required_total_padding = |
||||
std::max<std::int64_t>(0, (output_dim - 1) * strides[j] + window_size[j] - input_shape[i]); |
||||
|
||||
common_padding[i] = required_total_padding / 2; |
||||
padding_left[i] = 0; |
||||
padding_right[i] = required_total_padding % 2; |
||||
} |
||||
} |
||||
|
||||
/* in some scenarios, the extra padding at the end may not change the output at all */ |
||||
for (int i = 2; i < rank; i++) { |
||||
const auto j = i - 2; /* filter idx */ |
||||
const auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i]; |
||||
std::int64_t rem = (input_shape[i] + total_padding - window_size[j]) % strides[j]; |
||||
|
||||
/* the output shape doesn't change if we decrease the total padding by at most `rem`
|
||||
* provided that we decrease from the right |
||||
*/ |
||||
if (rem && padding_right[i] > 0) |
||||
padding_right[i] = std::max<std::int64_t>(0, padding_right[i] - rem); |
||||
} |
||||
|
||||
auto is_not_zero = [](std::size_t i) { return i != 0; }; |
||||
if (std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero) || |
||||
std::any_of(std::begin(padding_right), std::end(padding_right), is_not_zero)) |
||||
{ |
||||
/* csl::Pooling does not fully support asymmetric padding; hence, we deal with asymmetric padding by
|
||||
* copying the input to a bigger tensor and padding the ends manually |
||||
* |
||||
* But we first try to avoid the transformation using cuDNN's flexibility. cuDNN can accept a smaller or |
||||
* a bigger output shape. This effectively allows us to have arbitrary padding at the right. |
||||
*/ |
||||
if (std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero)) |
||||
{ |
||||
/* there is padding on the left and we are forced to transform */ |
||||
auto transformed_input_shape = input_shape; |
||||
for (int i = 0; i < rank; i++) |
||||
transformed_input_shape[i] += padding_left[i] + padding_right[i]; |
||||
|
||||
transformedInput.resize(std::begin(transformed_input_shape), std::end(transformed_input_shape)); |
||||
inputTransformer = csl::TensorTransform<T>(cudnnHandle, padding_left, padding_right); |
||||
} |
||||
} |
||||
|
||||
typename csl::Pooling<T>::params_type params; |
||||
if (transformedInput.empty()) |
||||
{ |
||||
/* no transform => use original input shape */ |
||||
params.input_shape.assign(std::begin(input_shape), std::end(input_shape)); |
||||
} |
||||
else |
||||
{ |
||||
/* the pooling operation will be seeing the transformed input */ |
||||
auto transformed_input_shape = transformedInput.shape_as_vector(); |
||||
params.input_shape.assign(std::begin(transformed_input_shape), std::end(transformed_input_shape)); |
||||
} |
||||
|
||||
auto output_shape = input_shape; |
||||
for (int i = 2; i < rank; i++) |
||||
{ |
||||
auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i]; |
||||
output_shape[i] = (params.input_shape[i] + total_padding - window_size[i - 2]) / strides[i - 2] + 1; |
||||
} |
||||
|
||||
params.output_shape.assign(std::begin(output_shape), std::end(output_shape)); |
||||
params.window_size = window_size; |
||||
params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding)); |
||||
params.stride = strides; |
||||
|
||||
if (config.poolMode == PoolingConfiguration::PoolingMode::MAX) |
||||
{ |
||||
params.type = csl::Pooling<T>::PoolingType::MAX; |
||||
} |
||||
else if (config.poolMode == PoolingConfiguration::PoolingMode::AVERAGE_INCLUDE_PADDING) |
||||
{ |
||||
params.type = csl::Pooling<T>::PoolingType::AVERAGE_INCLUDE_PADDING; |
||||
} |
||||
else if (config.poolMode == PoolingConfiguration::PoolingMode::AVERAGE_EXCLUDE_PADDING) |
||||
{ |
||||
params.type = csl::Pooling<T>::PoolingType::AVERAGE_EXCLUDE_PADDING; |
||||
} |
||||
|
||||
pooler = csl::Pooling<T>(cudnnHandle, params); |
||||
} |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
CV_Assert(inputs.size() == 1 && outputs.size() == 1); |
||||
|
||||
auto input_wrapper = inputs[0].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
if (!transformedInput.empty()) |
||||
{ |
||||
inputTransformer.transform(input, transformedInput); |
||||
input = csl::TensorView<T>(transformedInput); |
||||
} |
||||
|
||||
auto output_wrapper = outputs[0].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
pooler.pool(input, output); |
||||
} |
||||
|
||||
private: |
||||
csl::cudnn::Handle cudnnHandle; |
||||
csl::Pooling<T> pooler; |
||||
|
||||
csl::Tensor<T> transformedInput; |
||||
csl::TensorTransform<T> inputTransformer; |
||||
}; |
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_POOLING_HPP */ |
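
A minimal standalone sketch (not part of this patch) of the padding decomposition described above: explicit paddings are split into a symmetric part that cuDNN handles directly and a leftover asymmetric part; the variable names are made up for illustration.

#include <algorithm>
#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t pad_begin = 0, pad_end = 1;                          /* e.g. extra end padding from ceil mode */

    const auto common      = std::min(pad_begin, pad_end);                 /* given to cuDNN directly */
    const auto extra_left  = pad_begin - common;                           /* non-zero => manual transform needed */
    const auto extra_right = pad_end - common;                             /* can often be absorbed by cuDNN */

    std::cout << common << ' ' << extra_left << ' ' << extra_right << '\n';  /* prints "0 0 1" */
}
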
@ -0,0 +1,136 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PRIOR_BOX_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PRIOR_BOX_HPP |
||||
|
||||
#include "../../op_cuda.hpp" |
||||
|
||||
#include "../csl/stream.hpp" |
||||
#include "../csl/span.hpp" |
||||
#include "../csl/tensor.hpp" |
||||
|
||||
#include "../kernels/prior_box.hpp" |
||||
|
||||
#include <cstddef> |
||||
#include <vector> |
||||
#include <utility> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { |
||||
|
||||
struct PriorBoxConfiguration { |
||||
std::size_t feature_map_width, feature_map_height; |
||||
std::size_t image_width, image_height; |
||||
|
||||
/* parameters for prior boxes for each feature point */ |
||||
std::vector<float> box_widths, box_heights; |
||||
std::vector<float> offsets_x, offsets_y; |
||||
float stepX, stepY; |
||||
|
||||
std::vector<float> variance; |
||||
|
||||
/* number of priors per feature point */ |
||||
std::size_t num_priors; |
||||
|
||||
/* clamps the box coordinates to [0, 1] range */ |
||||
bool clip; |
||||
|
||||
/* normalizes the box coordinates using the image dimensions */ |
||||
bool normalize; |
||||
}; |
||||
|
||||
template <class T> |
||||
class PriorBoxOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
PriorBoxOp(csl::Stream stream_, const PriorBoxConfiguration& config) |
||||
: stream(std::move(stream_)) |
||||
{ |
||||
feature_map_width = config.feature_map_width; |
||||
feature_map_height = config.feature_map_height; |
||||
|
||||
image_width = config.image_width; |
||||
image_height = config.image_height; |
||||
|
||||
const auto& box_widths = config.box_widths; |
||||
const auto& box_heights = config.box_heights; |
||||
CV_Assert(box_widths.size() == box_heights.size()); |
||||
|
||||
box_size = box_widths.size(); |
||||
|
||||
const auto& offsets_x = config.offsets_x; |
||||
const auto& offsets_y = config.offsets_y; |
||||
CV_Assert(offsets_x.size() == offsets_y.size()); |
||||
|
||||
offset_size = offsets_x.size(); |
||||
|
||||
/* for better memory utilization and presumably better cache performance, we merge
|
||||
* the four vectors and put them in a single tensor |
||||
*/ |
||||
auto total = box_widths.size() * 2 + offsets_x.size() * 2; |
||||
std::vector<float> merged_params; |
||||
merged_params.insert(std::end(merged_params), std::begin(box_widths), std::end(box_widths)); |
||||
merged_params.insert(std::end(merged_params), std::begin(box_heights), std::end(box_heights)); |
||||
merged_params.insert(std::end(merged_params), std::begin(offsets_x), std::end(offsets_x)); |
||||
merged_params.insert(std::end(merged_params), std::begin(offsets_y), std::end(offsets_y)); |
||||
CV_Assert(merged_params.size() == total); |
||||
|
||||
paramsTensor.resize(total); |
||||
csl::memcpy(paramsTensor.get(), merged_params.data(), total, stream); /* synchronous copy */ |
||||
|
||||
const auto& variance_ = config.variance; |
||||
variance.assign(std::begin(variance_), std::end(variance_)); |
||||
|
||||
num_priors = config.num_priors; |
||||
stepX = config.stepX; |
||||
stepY = config.stepY; |
||||
clip = config.clip; |
||||
normalize = config.normalize; |
||||
} |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
CV_Assert(inputs.size() == 2); /* we don't need the inputs, but they are given anyway */ |
||||
CV_Assert(outputs.size() == 1); |
||||
|
||||
auto output_wrapper = outputs[0].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
/* we had stored all the parameters in a single tensor; now we create appropriate views
|
||||
* for each of the parameter arrays from the single tensor |
||||
*/ |
||||
auto boxWidths = csl::View<float>(paramsTensor.get(), box_size); |
||||
auto boxHeights = csl::View<float>(paramsTensor.get() + box_size, box_size); |
||||
auto offsetsX = csl::View<float>(paramsTensor.get() + 2 * box_size, offset_size); |
||||
auto offsetsY = csl::View<float>(paramsTensor.get() + 2 * box_size + offset_size, offset_size); |
||||
|
||||
kernels::generate_prior_boxes<T>(stream, output, |
||||
boxWidths, boxHeights, offsetsX, offsetsY, stepX, stepY, |
||||
variance, num_priors, feature_map_width, feature_map_height, image_width, image_height, normalize, clip); |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
csl::Tensor<float> paramsTensor; /* widths, heights, offsetsX, offsetsY */ |
||||
|
||||
std::size_t feature_map_width, feature_map_height; |
||||
std::size_t image_width, image_height; |
||||
|
||||
std::size_t box_size, offset_size; |
||||
float stepX, stepY; |
||||
|
||||
std::vector<float> variance; |
||||
|
||||
std::size_t num_priors; |
||||
bool clip, normalize; |
||||
}; |
||||
|
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PRIOR_BOX_HPP */ |
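
A minimal standalone sketch (not part of this patch) of the merged parameter buffer layout and the offsets at which the four views are taken; the values below are arbitrary example values.

#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    const std::vector<float> widths { 10.f, 20.f }, heights { 12.f, 24.f };
    const std::vector<float> offs_x { 0.5f },       offs_y  { 0.5f };

    std::vector<float> merged;
    merged.insert(merged.end(), widths.begin(),  widths.end());            /* [0, box_size) */
    merged.insert(merged.end(), heights.begin(), heights.end());           /* [box_size, 2 * box_size) */
    merged.insert(merged.end(), offs_x.begin(),  offs_x.end());            /* [2 * box_size, 2 * box_size + offset_size) */
    merged.insert(merged.end(), offs_y.begin(),  offs_y.end());            /* remaining offset_size entries */

    const std::size_t box_size = widths.size();
    std::cout << merged.size() << ' '                                      /* 6 floats in total */
              << merged[2 * box_size] << '\n';                             /* first offset_x, prints 0.5 */
}
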
@ -0,0 +1,181 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_REGION_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_REGION_HPP |
||||
|
||||
#include "../../op_cuda.hpp" |
||||
|
||||
#include "../csl/stream.hpp" |
||||
#include "../csl/cudnn.hpp" |
||||
#include "../csl/tensor_ops.hpp" |
||||
|
||||
#include "../kernels/region.hpp" |
||||
|
||||
#include "../../nms.inl.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <cstddef> |
||||
#include <utility> |
||||
#include <vector> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { |
||||
|
||||
enum class SquashMethod { |
||||
SOFTMAX, |
||||
SIGMOID |
||||
}; |
||||
|
||||
template <class T> |
||||
struct RegionConfiguration { |
||||
/* The image is divided into (H, W) cells.
|
||||
* |
||||
* Each cell is interested in exactly one object and predicts `boxes_per_cell` bounding boxes |
||||
* for that object. |
||||
* |
||||
* Each bounding box contains: |
||||
* - 4 box coordinates |
||||
* - objectness confidence score |
||||
* - `classes` number of class scores |
||||
* |
||||
* The object score is reduced to a probability using sigmoid and the class scores are reduced to |
||||
* probabilities by either applying sigmoid or softmax (which is a configuration option). |
||||
* |
||||
* object_prob = sigmoid(object_score) |
||||
* conditional_class_prob = sigmoid(class_score) or softmax over all class scores |
||||
* |
||||
* actual class probability = conditional_class_prob * object_prob |
||||
*/ |
||||
|
||||
/* method for reducing class scores to probabilities */ |
||||
SquashMethod squash_method; |
||||
|
||||
std::size_t classes, boxes_per_cell; |
||||
|
||||
std::size_t width_norm, height_norm; |
||||
|
||||
/* prob cutoffs below which the prediction is nulled */ |
||||
T object_prob_cutoff; |
||||
T class_prob_cutoff; |
||||
|
||||
T nms_iou_threshold; |
||||
}; |
||||
|
||||
template <class T> |
||||
class RegionOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
template <class V> |
||||
RegionOp(csl::Stream stream_, const cv::Mat& bias, const RegionConfiguration<V>& config) |
||||
: stream(std::move(stream_)) |
||||
{ |
||||
biasTensor = csl::makeTensorHeader<T>(bias); |
||||
csl::copyMatToTensor<T>(bias, biasTensor, stream); |
||||
|
||||
classes = config.classes; |
||||
boxes_per_cell = config.boxes_per_cell; |
||||
|
||||
width_norm = config.width_norm; |
||||
height_norm = config.height_norm; |
||||
|
||||
squash_type = config.squash_method; |
||||
|
||||
object_prob_cutoff = config.object_prob_cutoff; |
||||
class_prob_cutoff = config.class_prob_cutoff; |
||||
|
||||
nms_iou_threshold = config.nms_iou_threshold; |
||||
} |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
CV_Assert(outputs.size() == 1); |
||||
|
||||
auto input_wrapper = inputs[0].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
auto output_wrapper = outputs[0].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
csl::memcpy<T>(output.get(), input.get(), output.size(), stream); |
||||
|
||||
auto rows = input.get_axis_size(1); |
||||
auto cols = input.get_axis_size(2); |
||||
|
||||
auto cell_box_size = classes + 4 + 1; |
||||
|
||||
/* we squash class scores into probabilities using softmax or sigmoid */ |
||||
if (squash_type == SquashMethod::SOFTMAX) |
||||
kernels::softmax_strided<T>(stream, output, input, classes, cell_box_size, 5); |
||||
else if (squash_type == SquashMethod::SIGMOID) |
||||
kernels::sigmoid_strided<T>(stream, output, input, classes, cell_box_size, 5); |
||||
|
||||
kernels::region_finalize<T>(stream, output, input, biasTensor, object_prob_cutoff, class_prob_cutoff, |
||||
height_norm, width_norm, rows, cols, boxes_per_cell, cell_box_size, classes); |
||||
|
||||
if (nms_iou_threshold > 0) { |
||||
auto output_mat = output_wrapper->getMutableHostMat(); |
||||
CV_Assert(output_mat.type() == CV_32F); |
||||
for (int i = 0; i < input.get_axis_size(0); i++) { |
||||
auto sample_size = rows * cols * boxes_per_cell * cell_box_size; |
||||
do_nms_sort(reinterpret_cast<float*>(output_mat.data) + i * sample_size, rows * cols * boxes_per_cell, class_prob_cutoff, nms_iou_threshold); |
||||
} |
||||
} |
||||
} |
||||
|
||||
private: |
||||
void do_nms_sort(float *detections, int total, float score_thresh, float nms_thresh) |
||||
{ |
||||
std::vector<Rect2d> boxes(total); |
||||
std::vector<float> scores(total); |
||||
|
||||
for (int i = 0; i < total; ++i) |
||||
{ |
||||
Rect2d &b = boxes[i]; |
||||
int box_index = i * (classes + 4 + 1); |
||||
b.width = detections[box_index + 2]; |
||||
b.height = detections[box_index + 3]; |
||||
b.x = detections[box_index + 0] - b.width / 2; |
||||
b.y = detections[box_index + 1] - b.height / 2; |
||||
} |
||||
|
||||
std::vector<int> indices; |
||||
for (int k = 0; k < classes; ++k) |
||||
{ |
||||
for (int i = 0; i < total; ++i) |
||||
{ |
||||
int box_index = i * (classes + 4 + 1); |
||||
int class_index = box_index + 5; |
||||
scores[i] = detections[class_index + k]; |
||||
detections[class_index + k] = 0; |
||||
} |
||||
NMSBoxes(boxes, scores, score_thresh, nms_thresh, indices); |
||||
for (int i = 0, n = indices.size(); i < n; ++i) |
||||
{ |
||||
int box_index = indices[i] * (classes + 4 + 1); |
||||
int class_index = box_index + 5; |
||||
detections[class_index + k] = scores[indices[i]]; |
||||
} |
||||
} |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
|
||||
csl::Tensor<T> biasTensor; |
||||
std::size_t classes, boxes_per_cell; |
||||
std::size_t width_norm, height_norm; |
||||
SquashMethod squash_type; |
||||
|
||||
T object_prob_cutoff, class_prob_cutoff; |
||||
T nms_iou_threshold; |
||||
}; |
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_REGION_HPP */ |
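
A minimal standalone sketch (not part of this patch) of the flat detection-buffer indexing used by the NMS helper above: each box occupies (4 + 1 + classes) consecutive floats, with the class scores starting at offset 5.

#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t classes = 20;
    const std::size_t cell_box_size = 4 + 1 + classes;                     /* box coords + objectness + class scores */

    const std::size_t box = 3;                                             /* fourth box in the buffer */
    const std::size_t box_index   = box * cell_box_size;                   /* start of [x, y, w, h] */
    const std::size_t class_index = box_index + 5;                         /* first class score */

    std::cout << box_index << ' ' << class_index << '\n';                  /* prints "75 80" */
}
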
@ -0,0 +1,75 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_REORG_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_REORG_HPP |
||||
|
||||
#include "../../op_cuda.hpp" |
||||
|
||||
#include "../csl/stream.hpp" |
||||
#include "../kernels/permute.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <vector> |
||||
#include <utility> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { |
||||
|
||||
template <class T> |
||||
class ReorgOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
ReorgOp(csl::Stream stream_, std::size_t stride_) |
||||
: stream(std::move(stream_)), stride{ stride_ } { } |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
CV_Assert(inputs.size() == 1 && outputs.size() == 1); |
||||
|
||||
auto input_wrapper = inputs[0].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
auto output_wrapper = outputs[0].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
const std::size_t permute_input_shape[] = { |
||||
input.get_axis_size(0), |
||||
input.get_axis_size(1) * input.get_axis_size(2) / (stride * stride), |
||||
stride, |
||||
input.get_axis_size(3), |
||||
stride |
||||
}; |
||||
|
||||
constexpr std::size_t order[] = { 0, 2, 4, 1, 3 }; |
||||
|
||||
const std::size_t permute_output_shape[] = { |
||||
permute_input_shape[order[0]], |
||||
permute_input_shape[order[1]], |
||||
permute_input_shape[order[2]], |
||||
permute_input_shape[order[3]], |
||||
permute_input_shape[order[4]] |
||||
}; |
||||
|
||||
input.unsqueeze(); |
||||
input.reshape(std::begin(permute_input_shape), std::end(permute_input_shape)); |
||||
|
||||
output.unsqueeze(); |
||||
output.reshape(std::begin(permute_output_shape), std::end(permute_output_shape)); |
||||
|
||||
kernels::permute(stream, output, input, { std::begin(order), std::end(order) }); |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
std::size_t stride; |
||||
}; |
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_REORG_HPP */ |
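
A minimal standalone sketch (not part of this patch) of the shapes involved in expressing reorg as a reshape followed by a permute, for one concrete input; the dimensions below are arbitrary example values.

#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t n = 1, c = 64, h = 26, w = 26, stride = 2;

    /* reshape NCHW into the 5D shape fed to the permute kernel */
    const std::size_t in5d[5]  = { n, c * h / (stride * stride), stride, w, stride };
    const std::size_t order[5] = { 0, 2, 4, 1, 3 };

    std::size_t out5d[5];
    for (int i = 0; i < 5; i++)
        out5d[i] = in5d[order[i]];                                         /* { 1, 2, 2, 416, 26 } */

    std::size_t total = 1;
    for (auto d : out5d) { std::cout << d << ' '; total *= d; }
    std::cout << "\ntotal elements: " << total << '\n';                    /* 43264 = 1 * 64 * 26 * 26 */
}
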
@ -0,0 +1,61 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RESHAPE_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RESHAPE_HPP |
||||
|
||||
#include "../../op_cuda.hpp" |
||||
|
||||
#include "../csl/stream.hpp" |
||||
#include "../csl/tensor.hpp" |
||||
#include "../csl/tensor_ops.hpp" |
||||
|
||||
#include <utility> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { |
||||
|
||||
template <class T> |
||||
class ReshapeOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
ReshapeOp(csl::Stream stream_) : stream(std::move(stream_)) { } |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
/* sometimes the output shape is passed as extra inputs; hence, >= instead of == */ |
||||
CV_Assert(inputs.size() >= outputs.size()); |
||||
|
||||
for (int i = 0; i < outputs.size(); i++) |
||||
{ |
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
if (input.get() != output.get()) |
||||
{ |
||||
while (input.rank() < output.rank()) |
||||
input.unsqueeze(); |
||||
|
||||
while (output.rank() < input.rank()) |
||||
output.unsqueeze(); |
||||
|
||||
input.reshape_as(output); |
||||
csl::tensor_ops::copy(stream, output, input); |
||||
} |
||||
} |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
}; |
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RESHAPE_HPP */ |
@ -0,0 +1,60 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RESIZE_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RESIZE_HPP |
||||
|
||||
#include "../../op_cuda.hpp" |
||||
|
||||
#include "../csl/stream.hpp" |
||||
|
||||
#include "../kernels/resize.hpp" |
||||
|
||||
#include <utility> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { |
||||
|
||||
enum class InterpolationType { |
||||
NEAREST_NEIGHBOUR, |
||||
BILINEAR |
||||
}; |
||||
|
||||
template <class T> |
||||
class ResizeOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
ResizeOp(csl::Stream stream_, InterpolationType type_, float scaleHeight_, float scaleWidth_) |
||||
: stream(std::move(stream_)), type{ type_ }, scaleHeight{ scaleHeight_ }, scaleWidth{ scaleWidth_ } |
||||
{ |
||||
} |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
CV_Assert(inputs.size() == 1 && outputs.size() == 1); |
||||
|
||||
auto input_wrapper = inputs[0].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
auto output_wrapper = outputs[0].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
if (type == InterpolationType::NEAREST_NEIGHBOUR) |
||||
kernels::resize_nn<T>(stream, output, input); |
||||
else if (type == InterpolationType::BILINEAR) |
||||
kernels::resize_bilinear<T>(stream, output, input, scaleHeight, scaleWidth); |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
InterpolationType type; |
||||
float scaleHeight, scaleWidth; /* for bilinear interpolation */ |
||||
}; |
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RESIZE_HPP */ |
@ -0,0 +1,110 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SCALE_SHIFT_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SCALE_SHIFT_HPP |
||||
|
||||
#include "../../op_cuda.hpp" |
||||
|
||||
#include "../csl/stream.hpp" |
||||
#include "../csl/tensor.hpp" |
||||
|
||||
#include "../kernels/scale_shift.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <cstddef> |
||||
#include <utility> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { |
||||
|
||||
template <class T> |
||||
class ScaleShiftOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
ScaleShiftOp(csl::Stream stream_, std::size_t axis, const cv::Mat& weights, const cv::Mat& bias) |
||||
: stream(std::move(stream_)), axis{ axis } |
||||
{ |
||||
if (!weights.empty()) |
||||
{ |
||||
weightsTensor = csl::makeTensorHeader<T>(weights); |
||||
csl::copyMatToTensor<T>(weights, weightsTensor, stream); |
||||
} |
||||
|
||||
if (!bias.empty()) |
||||
{ |
||||
biasTensor = csl::makeTensorHeader<T>(bias); |
||||
csl::copyMatToTensor<T>(bias, biasTensor, stream); |
||||
} |
||||
} |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
CV_Assert(outputs.size() == 1); |
||||
|
||||
auto input_wrapper = inputs[0].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
auto output_wrapper = outputs[0].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
csl::TensorView<T> weights; |
||||
if (weightsTensor.empty() && biasTensor.empty()) |
||||
{ |
||||
CV_Assert(inputs.size() == 2); |
||||
|
||||
/* no explicit scale/shift values provided; use the second input as weights */ |
||||
auto wrapper = inputs[1].dynamicCast<wrapper_type>(); |
||||
weights = wrapper->getView(); |
||||
} |
||||
else if (!weightsTensor.empty()) |
||||
{ |
||||
weights = csl::TensorSpan<T>(weightsTensor); |
||||
} |
||||
|
||||
csl::TensorView<T> bias; |
||||
if (!biasTensor.empty()) |
||||
bias = csl::TensorSpan<T>(biasTensor); |
||||
|
||||
const auto numParams = !weights.empty() ? weights.size() : bias.size(); |
||||
CV_Assert(numParams != 0); |
||||
if (!weightsTensor.empty() && !biasTensor.empty()) |
||||
{ |
||||
CV_CheckEQ(weights.size(), bias.size(), "weights and bias size are not equal"); |
||||
} |
||||
|
||||
/* the weights/bias might require broadcasting to scale/shift */ |
||||
const int end_axis = [&] { |
||||
for (int endAxis = axis + 1; endAxis <= input.rank(); endAxis++) |
||||
{ |
||||
std::size_t size = input.size_range(axis, endAxis); |
||||
if (size == numParams) |
||||
return endAxis; |
||||
} |
||||
CV_Assert(0 /* invalid weights matrix */); |
||||
}(); |
||||
|
||||
std::size_t inner_size = input.size_range(end_axis, input.rank()); |
||||
|
||||
if (!weights.empty() && !bias.empty()) |
||||
kernels::scaleN_with_biasN<T>(stream, output, input, inner_size, weights, bias); |
||||
else if (!weights.empty()) |
||||
kernels::scaleN<T>(stream, output, input, inner_size, weights); |
||||
else |
||||
kernels::biasN<T>(stream, output, input, inner_size, bias); |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
csl::Tensor<T> weightsTensor, biasTensor; |
||||
std::size_t axis; |
||||
}; |
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SCALE_SHIFT_HPP */ |
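
A minimal standalone sketch (not part of this patch) of how the broadcast range [axis, end_axis) is found by growing the range until its element count matches the number of scale parameters; the shape below is an arbitrary example value.

#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    const std::vector<std::size_t> shape { 2, 32, 19, 19 };                /* NCHW */
    const std::size_t axis = 1, num_params = 32;                           /* per-channel weights */

    std::size_t end_axis = axis, size = 1;
    while (end_axis < shape.size() && size != num_params)
        size *= shape[end_axis++];                                         /* mirrors size_range(axis, end_axis) */

    std::size_t inner_size = 1;                                            /* mirrors size_range(end_axis, rank) */
    for (auto i = end_axis; i < shape.size(); i++)
        inner_size *= shape[i];

    std::cout << end_axis << ' ' << inner_size << '\n';                    /* prints "2 361" */
}
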
@ -0,0 +1,79 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SHUFFLE_CHANNEL_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SHUFFLE_CHANNEL_HPP |
||||
|
||||
#include "../../op_cuda.hpp" |
||||
|
||||
#include "../csl/stream.hpp" |
||||
#include "../csl/tensor_ops.hpp" |
||||
|
||||
#include "../kernels/permute.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <vector> |
||||
#include <utility> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { |
||||
|
||||
template <class T> |
||||
class ShuffleChannelOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
ShuffleChannelOp(csl::Stream stream_, std::size_t group_) |
||||
: stream(std::move(stream_)), group{ group_ } { } |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
CV_Assert(inputs.size() == 1 && outputs.size() == 1); |
||||
|
||||
auto input_wrapper = inputs[0].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
auto output_wrapper = outputs[0].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
if (group == 1) { |
||||
/* permute is redundant; check else branch to know why */ |
||||
if (input.get() != output.get()) { |
||||
input.reshape_as(output); |
||||
csl::tensor_ops::copy(stream, output, input); |
||||
} |
||||
} else { |
||||
const std::size_t permute_input_shape[] = { |
||||
input.get_axis_size(0), |
||||
group, |
||||
input.get_axis_size(1) / group, |
||||
input.get_axis_size(2) * input.get_axis_size(3) |
||||
}; |
||||
|
||||
constexpr std::size_t order[] = { 0, 2, 1, 3 }; |
||||
|
||||
const std::size_t permute_output_shape[] = { |
||||
permute_input_shape[order[0]], |
||||
permute_input_shape[order[1]], |
||||
permute_input_shape[order[2]], |
||||
permute_input_shape[order[3]], |
||||
}; |
||||
|
||||
input.reshape(std::begin(permute_input_shape), std::end(permute_input_shape)); |
||||
output.reshape(std::begin(permute_output_shape), std::end(permute_output_shape)); |
||||
kernels::permute(stream, output, input, { std::begin(order), std::end(order) }); |
||||
} |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
std::size_t group; |
||||
}; |
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SHUFFLE_CHANNEL_HPP */ |
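
A minimal standalone sketch (not part of this patch) of the channel-shuffle mapping expressed on plain indices; the CUDA path above realizes the same mapping with a reshape to (N, G, C/G, H*W) and a (0, 2, 1, 3) permute.

#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t channels = 6, group = 2;                             /* channels must be divisible by group */
    const std::size_t channels_per_group = channels / group;

    for (std::size_t c = 0; c < channels; c++)
    {
        /* source channel that ends up at output channel c after the shuffle */
        const std::size_t src = (c % group) * channels_per_group + c / group;
        std::cout << c << " <- " << src << '\n';                           /* 0<-0, 1<-3, 2<-1, 3<-4, 4<-2, 5<-5 */
    }
}
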
@ -0,0 +1,62 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SLICE_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SLICE_HPP |
||||
|
||||
#include "../../op_cuda.hpp" |
||||
|
||||
#include "../csl/stream.hpp" |
||||
|
||||
#include "../kernels/slice.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <cstddef> |
||||
#include <vector> |
||||
#include <utility> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { |
||||
|
||||
template <class T> |
||||
class SliceOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
/* offsets is indexed by output number and each subvector is indexed by axis number */ |
||||
SliceOp(csl::Stream stream_, std::vector<std::vector<std::size_t>> offsets) |
||||
: stream(std::move(stream_)), offsets(std::move(offsets)) |
||||
{ |
||||
} |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
/* sometimes the output shape is passed in the form of a second input tensor
|
||||
* it's only required for initialization and not here |
||||
*/ |
||||
CV_Assert(inputs.size() == 1 || inputs.size() == 2); |
||||
|
||||
auto input_wrapper = inputs[0].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
for (int i = 0; i < outputs.size(); ++i) |
||||
{ |
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
kernels::slice<T>(stream, output, input, offsets[i]); |
||||
} |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
std::vector<std::vector<std::size_t>> offsets; |
||||
}; |
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SLICE_HPP */ |
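
A minimal standalone sketch (not part of this patch) of the per-output offset vectors expected by the slice primitive: each output copies a window of its own shape starting at the given offset along every axis. The shapes below are arbitrary example values.

#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    /* split a 1x6x4x4 input into two 1x3x4x4 outputs along the channel axis */
    const std::vector<std::vector<std::size_t>> offsets {
        { 0, 0, 0, 0 },                                                    /* first output starts at channel 0 */
        { 0, 3, 0, 0 }                                                     /* second output starts at channel 3 */
    };

    for (std::size_t i = 0; i < offsets.size(); i++)
    {
        std::cout << "output " << i << ": ";
        for (auto o : offsets[i]) std::cout << o << ' ';
        std::cout << '\n';
    }
}
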
@ -0,0 +1,53 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SOFTMAX_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SOFTMAX_HPP |
||||
|
||||
#include "../../op_cuda.hpp" |
||||
|
||||
#include "../csl/cudnn.hpp" |
||||
#include "../csl/tensor_ops.hpp" |
||||
|
||||
#include <cstddef> |
||||
#include <utility> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { |
||||
|
||||
template <class T> |
||||
class SoftmaxOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
SoftmaxOp(csl::cudnn::Handle handle, std::size_t axis_, bool log_) |
||||
: cudnnHandle(std::move(handle)), channel_axis{ axis_ }, log{ log_ } |
||||
{ |
||||
} |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
for (int i = 0; i < inputs.size(); i++) |
||||
{ |
||||
auto input_wrapper = inputs[i].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
csl::tensor_ops::softmax<T>(cudnnHandle, output, input, channel_axis, log); |
||||
} |
||||
} |
||||
|
||||
private: |
||||
csl::cudnn::Handle cudnnHandle; |
||||
std::size_t channel_axis; |
||||
bool log; |
||||
}; |
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SOFTMAX_HPP */ |
@ -0,0 +1,54 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SPLIT_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SPLIT_HPP |
||||
|
||||
#include "../../op_cuda.hpp" |
||||
|
||||
#include "../csl/stream.hpp" |
||||
#include "../csl/tensor_ops.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <utility> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { |
||||
|
||||
template <class T> |
||||
class SplitOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
SplitOp(csl::Stream stream_) |
||||
: stream(std::move(stream_)) |
||||
{ |
||||
} |
||||
|
||||
void forward( |
||||
const std::vector<cv::Ptr<BackendWrapper>>& inputs, |
||||
const std::vector<cv::Ptr<BackendWrapper>>& outputs, |
||||
csl::Workspace& workspace) override |
||||
{ |
||||
CV_Assert(inputs.size() == 1); |
||||
|
||||
auto input_wrapper = inputs[0].dynamicCast<wrapper_type>(); |
||||
auto input = input_wrapper->getView(); |
||||
|
||||
for (int i = 0; i < outputs.size(); i++) |
||||
{ |
||||
auto output_wrapper = outputs[i].dynamicCast<wrapper_type>(); |
||||
auto output = output_wrapper->getSpan(); |
||||
|
||||
csl::tensor_ops::copy<T>(stream, output, input); |
||||
} |
||||
} |
||||
|
||||
private: |
||||
csl::Stream stream; |
||||
}; |
||||
|
||||
}}} /* namespace cv::dnn::cuda4dnn */ |
||||
|
||||
#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SPLIT_HPP */ |
@ -0,0 +1,230 @@ |
||||
// This file is part of OpenCV project.
|
||||
// It is subject to the license terms in the LICENSE file found in the top-level directory
|
||||
// of this distribution and at http://opencv.org/license.html.
|
||||
|
||||
#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_TRANSPOSE_CONVOLUTION_HPP |
||||
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_TRANSPOSE_CONVOLUTION_HPP |
||||
|
||||
#include "../../op_cuda.hpp" |
||||
|
||||
#include "../csl/cudnn.hpp" |
||||
#include "../csl/stream.hpp" |
||||
#include "../csl/tensor.hpp" |
||||
#include "../csl/tensor_ops.hpp" |
||||
|
||||
#include "../kernels/scale_shift.hpp" |
||||
|
||||
#include <opencv2/core.hpp> |
||||
|
||||
#include <cstddef> |
||||
#include <cstdint> |
||||
#include <vector> |
||||
#include <utility> |
||||
#include <algorithm> |
||||
|
||||
namespace cv { namespace dnn { namespace cuda4dnn { |
||||
|
||||
struct TransposeConvolutionConfiguration { |
||||
/* other than `input_shape` and `output_shape`, all the configuration values must be provided
|
||||
* for the corresponding convolution operation (not transpose convolution) |
||||
*/ |
||||
|
||||
/* the size of the following vectors must be equal to the convolution order */ |
||||
std::vector<std::size_t> kernel_size; |
||||
std::vector<std::size_t> dilations, strides; |
||||
|
||||
enum class PaddingMode { |
||||
MANUAL, /* uses explicit padding values provided in `pads_begin` and `pads_end` */ |
||||
VALID, /* no padding is added */ |
||||
SAME /* TensorFlow logic is used for same padding */ |
||||
}; |
||||
|
||||
/* explicit paddings are used if and only if padMode is set to manual */ |
||||
PaddingMode padMode; |
||||
std::vector<std::size_t> pads_begin, pads_end; |
||||
|
||||
/* full shape inclusive of channel and batch axis */ |
||||
std::vector<std::size_t> input_shape; |
||||
std::vector<std::size_t> output_shape; |
||||
|
||||
/* group count for grouped convolution */ |
||||
std::size_t groups; |
||||
}; |
||||
|
||||
template <class T> |
||||
class TransposeConvolutionOp final : public CUDABackendNode { |
||||
public: |
||||
using wrapper_type = GetCUDABackendWrapperType<T>; |
||||
|
||||
TransposeConvolutionOp(csl::Stream stream_, csl::cudnn::Handle handle, const TransposeConvolutionConfiguration& config, const Mat& filters, const Mat& bias) |
||||
: stream(std::move(stream_)), cudnnHandle(std::move(handle)) |
||||
{ |
||||
/* we make use of backward pass of convolution to perform forward pass of transpose convolution
|
||||
* hence, we must set up the configuration for the corresponding convolution operation and perform its backward pass |
||||
*/ |
            const auto& kernel_size = config.kernel_size;
            const auto& dilations = config.dilations;
            const auto& strides = config.strides;

            const auto convolution_order = kernel_size.size();
            CV_Assert(convolution_order >= 1);

            CV_Assert(convolution_order == dilations.size());
            CV_Assert(convolution_order == strides.size());

            const auto& input_shape = config.input_shape;
            const auto& output_shape = config.output_shape;
            CV_Assert(input_shape.size() == output_shape.size());
            CV_Assert(input_shape.size() == convolution_order + 2);

            const auto groups = config.groups;

            if (convolution_order > 3)
                CV_Error(Error::StsNotImplemented, "Only 1D/2D/3D transpose convolution is supported.");

            const auto rank = input_shape.size();
            const auto input_feature_maps = input_shape[1];
            const auto output_feature_maps = output_shape[1];
            const auto output_feature_maps_per_group = output_feature_maps / groups;
            CV_Assert(output_feature_maps % groups == 0);

            filtersTensor = csl::makeTensorHeader<T>(filters);
            csl::copyMatToTensor<T>(filters, filtersTensor, stream);

            if (!bias.empty())
            {
                CV_Assert(bias.total() == output_feature_maps);
                biasTensor = csl::makeTensorHeader<T>(bias);
                csl::copyMatToTensor<T>(bias, biasTensor, stream);
            }

            /* "left" and "right" are misleading, as the padding applies along any number of dimensions,
             * but we use those identifiers to avoid confusion with `pads_begin` and `pads_end`
             *
             * `common_padding` contains the amount of padding that has to be added to both sides
             * `padding_left` and `padding_right` contain the amount of padding that needs to be added
             * to a particular side in addition to the common padding
             *
             * note that we compute the padding for the corresponding convolution operation
             */
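            /* a small worked example (values chosen only for illustration): if `pads_begin[j]` is 1
             * and `pads_end[j]` is 3 for some spatial axis, then
             *     common_padding = min(1, 3) = 1
             *     padding_left   = 1 - 1 = 0
             *     padding_right  = 3 - 1 = 2
             * i.e. one unit of symmetric padding plus two extra units at the end
             * (such an asymmetric configuration is later rejected by this primitive)
             */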
            std::vector<std::size_t> common_padding(rank, 0);
            std::vector<std::size_t> padding_left(rank, 0), padding_right(rank, 0);
            if (config.padMode == TransposeConvolutionConfiguration::PaddingMode::MANUAL)
            {
                const auto& pads_begin = config.pads_begin;
                const auto& pads_end = config.pads_end;

                CV_Assert(convolution_order == pads_begin.size());
                CV_Assert(convolution_order == pads_end.size());

                for (int i = 2; i < common_padding.size(); i++)
                {
                    common_padding[i] = std::min(pads_begin[i - 2], pads_end[i - 2]);
                    padding_left[i] = pads_begin[i - 2] - common_padding[i];
                    padding_right[i] = pads_end[i - 2] - common_padding[i];
                }
            }
            else if (config.padMode == TransposeConvolutionConfiguration::PaddingMode::VALID)
            {
                /* nothing to do as the paddings are already preset to zero */
            }
            else if (config.padMode == TransposeConvolutionConfiguration::PaddingMode::SAME)
            {
                /* TensorFlow logic:
                 * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i]
                 *
                 * where `o` and `i` refer to the output and input of the corresponding convolution,
                 * i.e. this primitive's input and output respectively;
                 * if the total padding is odd, the extra unit is added towards the end
                 */
                for (int i = 2; i < rank; i++)
                {
                    const auto j = i - 2; /* filter index */
                    const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1;
                    const auto required_total_padding =
                        std::max<std::int64_t>(0, (input_shape[i] - 1) * strides[j] + effective_kernel_size - output_shape[i]);

                    common_padding[i] = required_total_padding / 2;
                    padding_left[i] = 0;
                    padding_right[i] = required_total_padding % 2;
                }
            }
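
            /* worked example for the SAME branch above (numbers are illustrative only): with a 3x3
             * kernel, dilation 1 and stride 2, mapping an 8-element axis of this primitive's input
             * to a 15-element axis of its output,
             *     effective_k            = 1 * (3 - 1) + 1 = 3
             *     required_total_padding = max(0, (8 - 1) * 2 + 3 - 15) = 2
             *     common_padding         = 2 / 2 = 1,  padding_right = 2 % 2 = 0
             * so both sides of that axis receive one unit of padding in the underlying convolution
             */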

            /* in some scenarios, the extra padding at the end may not change the output at all */
            for (int i = 2; i < rank; i++) {
                const auto j = i - 2; /* filter index */
                const auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i];
                const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1;
                std::int64_t rem = (input_shape[i] + total_padding - effective_kernel_size) % strides[j];

                /* the output shape doesn't change if we decrease the total padding by at most `rem`,
                 * provided that we decrease it from the right
                 */
                if (rem && padding_right[i] > 0)
                    padding_right[i] = std::max<std::int64_t>(0, padding_right[i] - rem);
            }

            auto is_not_zero = [](std::size_t i) { return i != 0; };
            if (std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero) ||
                std::any_of(std::begin(padding_right), std::end(padding_right), is_not_zero))
            {
                CV_Error(Error::StsNotImplemented, "Padding configuration requires asymmetric padding and hence is not supported.");
            }

            typename csl::TransposeConvolution<T>::params_type params;
            params.input_shape.assign(std::begin(input_shape), std::end(input_shape));
            params.output_shape.assign(std::begin(output_shape), std::end(output_shape));

            auto& fshape = params.filter_shape;
            fshape.resize(rank);
            fshape[0] = input_feature_maps;
            fshape[1] = output_feature_maps_per_group;
            std::copy(std::begin(kernel_size), std::end(kernel_size), std::begin(fshape) + 2);
            CV_Assert(fshape.size() == kernel_size.size() + 2);
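
            /* the filter layout is therefore [input_feature_maps, output_feature_maps / groups, k...];
             * with the illustrative numbers used earlier (16 input channels, 8 output channels,
             * groups = 1, 4x4 kernel) this would be {16, 8, 4, 4}
             */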

            params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding));
            params.stride = strides;
            params.dilation = dilations;
            params.groups = config.groups;

            convoluter = csl::TransposeConvolution<T>(cudnnHandle, params);

            csl::WorkspaceBuilder builder;
            builder.require(convoluter.get_workspace_size());
            scratch_mem_in_bytes = builder.required_workspace_size();
        }
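
        /* note: the scratch requirement computed in the constructor is exposed through
         * get_workspace_memory_in_bytes() below; the caller is expected to hand back at least
         * that much device memory via the `workspace` argument of forward()
         */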

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            CV_Assert(inputs.size() == 1 && outputs.size() == 1);

            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto input = input_wrapper->getView();

            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            csl::WorkspaceAllocator allocator(workspace);
            convoluter.transpose_convolve(output, input, filtersTensor, allocator.get_instance());
            if (!biasTensor.empty())
            {
                /* the bias is per output channel; `inner_size` is the number of elements per channel
                 * (the product of the spatial dimensions)
                 */
                std::size_t inner_size = total(output_wrapper->getShape(), 2, -1);
                kernels::biasN<T>(stream, output, output, inner_size, biasTensor);
            }
        }

        std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }

    private:
        csl::Stream stream;
        csl::cudnn::Handle cudnnHandle;
        csl::Tensor<T> filtersTensor, biasTensor;
        csl::TransposeConvolution<T> convoluter;

        std::size_t scratch_mem_in_bytes;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_TRANSPOSE_CONVOLUTION_HPP */
@@ -1,18 +0,0 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

// this file is a stub and will be removed once actual code is added

#include "../precomp.hpp"

#ifndef HAVE_CUDA
#   error "CUDA4DNN should be enabled iff CUDA and cuDNN were found"
#endif

#include <cudnn.h>

void cuda4dnn_build_test_func() {
    auto ver = cudnnGetVersion();
    CV_UNUSED(ver);
}