@@ -26,7 +26,8 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
         template <class T, std::size_t CHANNELS_PER_ITER>
         __global__ void resize_nn(
             Span<T> output, size_type out_height, size_type out_width,
-            View<T> input, size_type in_height, size_type in_width)
+            View<T> input, size_type in_height, size_type in_width,
+            float o2i_fy, float o2i_fx, bool round, bool half_pixel_centers)
         {
             auto in_image_size = in_height * in_width;
             auto out_image_size = out_height * out_width;
@@ -60,12 +61,16 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
                 const index_type y = (iter % out_image_size) / out_width;
                 const index_type x = iter % out_width;
 
-                /* o2i = output to input */
-                auto o2i_fy = static_cast<float>(in_height) / out_height;
-                auto o2i_fx = static_cast<float>(in_width) / out_width;
-
-                auto in_y = static_cast<index_type>(y * o2i_fy);
-                auto in_x = static_cast<index_type>(x * o2i_fx);
+                auto in_yf = half_pixel_centers ? (y + 0.5f) * o2i_fy : y * o2i_fy;
+                auto in_xf = half_pixel_centers ? (x + 0.5f) * o2i_fx : x * o2i_fx;
+
+                using device::lround;
+                index_type in_y = round ? lround(in_yf) : static_cast<index_type>(in_yf);
+                index_type in_x = round ? lround(in_xf) : static_cast<index_type>(in_xf);
+
+                using device::min;
+                in_y = min(in_y, in_height - 1);
+                in_x = min(in_x, in_width - 1);
 
                 index_type in_idx = c_start * in_image_size + in_y * in_width + in_x;
                 index_type out_idx = c_start * out_image_size + y * out_width + x;
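For reference, the per-axis index computation the updated resize_nn kernel now performs is equivalent to the host-side sketch below (the helper name map_nn is illustrative, and std::lround/std::min stand in for the device:: wrappers used in the kernel):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>

    // Mirror of the per-coordinate logic above: optionally sample at the pixel centre,
    // optionally round instead of truncating, then clamp to the valid input range.
    static std::size_t map_nn(std::size_t out_coord, float o2i_scale, std::size_t in_size,
                              bool round_coord, bool half_pixel_centers)
    {
        float in_f = half_pixel_centers ? (out_coord + 0.5f) * o2i_scale
                                        : out_coord * o2i_scale;
        std::size_t in_coord = round_coord ? static_cast<std::size_t>(std::lround(in_f))
                                           : static_cast<std::size_t>(in_f);
        return std::min(in_coord, in_size - 1);
    }

The kernel applies this once per axis, e.g. in_y = map_nn(y, o2i_fy, in_height, round, half_pixel_centers); note that o2i_fy and o2i_fx are now kernel parameters rather than being derived from the tensor shapes inside the kernel.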
@@ -83,7 +88,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
         __global__ void resize_bilinear(
             Span<T> output, size_type out_height, size_type out_width,
             View<T> input, size_type in_height, size_type in_width,
-            float o2i_fy, float o2i_fx)
+            float o2i_fy, float o2i_fx, bool half_pixel_centers)
         {
             auto in_image_size = in_height * in_width;
             auto out_image_size = out_height * out_width;
@@ -119,8 +124,9 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
                 const index_type y = (iter % out_image_size) / out_width;
                 const index_type x = iter % out_width;
 
-                auto in_x = x * o2i_fx;
-                auto in_y = y * o2i_fy;
+                using device::max;
+                auto in_x = half_pixel_centers ? max<float>((x + 0.5f) * o2i_fx - 0.5f, 0.0f) : x * o2i_fx;
+                auto in_y = half_pixel_centers ? max<float>((y + 0.5f) * o2i_fy - 0.5f, 0.0f) : y * o2i_fy;
 
                 auto in_x0 = static_cast<index_type>(in_x);
                 auto in_y0 = static_cast<index_type>(in_y);
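With half_pixel_centers enabled, resize_bilinear switches to the TensorFlow-style source mapping (x + 0.5) * scale - 0.5, clamped at zero so the fractional coordinate never goes negative. A minimal host-side sketch of that mapping (the helper name map_bilinear is illustrative, and std::max stands in for device::max):

    #include <algorithm>
    #include <cstddef>

    // Fractional source coordinate used by the bilinear kernel for one axis.
    static float map_bilinear(std::size_t out_coord, float o2i_scale, bool half_pixel_centers)
    {
        return half_pixel_centers
            ? std::max((out_coord + 0.5f) * o2i_scale - 0.5f, 0.0f)
            : out_coord * o2i_scale;
    }

The unchanged lines that follow truncate this coordinate to the integer cell (in_x0, in_y0); the fractional remainder drives the usual bilinear weighting.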
@@ -157,15 +163,16 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
     template <class T, std::size_t CHANNELS_PER_ITER> static
     void launch_multichannel_resize_nn(const Stream& stream,
         Span<T> output, size_type out_height, size_type out_width,
-        View<T> input, size_type in_height, size_type in_width)
+        View<T> input, size_type in_height, size_type in_width,
+        float scale_y, float scale_x, bool round, bool half_pixel_centers)
     {
         auto kernel = raw::resize_nn<T, CHANNELS_PER_ITER>;
         auto policy = make_policy(kernel, output.size() / CHANNELS_PER_ITER, 0, stream);
-        launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width);
+        launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
     }
 
     template <class T>
-    void resize_nn(const Stream& stream, TensorSpan<T> output, TensorView<T> input) {
+    void resize_nn(const Stream& stream, TensorSpan<T> output, TensorView<T> input, float scale_y, float scale_x, bool round, bool half_pixel_centers) {
         auto out_height = output.get_axis_size(-2);
         auto out_width = output.get_axis_size(-1);
 
@@ -176,38 +183,38 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
         auto num_iters = num_effective_channels * out_height * out_width;
 
         if (num_effective_channels % 32 == 0 && num_iters > 655360) {
-            launch_multichannel_resize_nn<T, 32>(stream, output, out_height, out_width, input, in_height, in_width);
+            launch_multichannel_resize_nn<T, 32>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
         } else if (num_effective_channels % 16 == 0 && num_iters > 327680) {
-            launch_multichannel_resize_nn<T, 16>(stream, output, out_height, out_width, input, in_height, in_width);
+            launch_multichannel_resize_nn<T, 16>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
         } else if (num_effective_channels % 8 == 0 && num_iters > 163840) {
-            launch_multichannel_resize_nn<T, 8>(stream, output, out_height, out_width, input, in_height, in_width);
+            launch_multichannel_resize_nn<T, 8>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
         } else if (num_effective_channels % 4 == 0 && num_iters > 81920) {
-            launch_multichannel_resize_nn<T, 4>(stream, output, out_height, out_width, input, in_height, in_width);
+            launch_multichannel_resize_nn<T, 4>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
         } else if (num_effective_channels % 2 == 0) {
-            launch_multichannel_resize_nn<T, 2>(stream, output, out_height, out_width, input, in_height, in_width);
+            launch_multichannel_resize_nn<T, 2>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
         } else {
-            launch_multichannel_resize_nn<T, 1>(stream, output, out_height, out_width, input, in_height, in_width);
+            launch_multichannel_resize_nn<T, 1>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, round, half_pixel_centers);
         }
     }
 
 #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
-    template void resize_nn<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>);
+    template void resize_nn<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, float, float, bool, bool);
 #endif
-    template void resize_nn<float>(const Stream&, TensorSpan<float>, TensorView<float>);
+    template void resize_nn<float>(const Stream&, TensorSpan<float>, TensorView<float>, float, float, bool, bool);
 
     template <class T, std::size_t CHANNELS_PER_ITER> static
     void launch_multichannel_resize_bilinear(const Stream& stream,
         Span<T> output, size_type out_height, size_type out_width,
         View<T> input, size_type in_height, size_type in_width,
-        float scale_y, float scale_x)
+        float scale_y, float scale_x, bool half_pixel_centers)
     {
         auto kernel = raw::resize_bilinear<T, CHANNELS_PER_ITER>;
         auto policy = make_policy(kernel, output.size() / CHANNELS_PER_ITER, 0, stream);
-        launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x);
+        launch_kernel(kernel, policy, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
     }
 
     template <class T>
-    void resize_bilinear(const Stream& stream, TensorSpan<T> output, TensorView<T> input, float scale_y, float scale_x) {
+    void resize_bilinear(const Stream& stream, TensorSpan<T> output, TensorView<T> input, float scale_y, float scale_x, bool half_pixel_centers) {
         auto out_height = output.get_axis_size(-2);
         auto out_width = output.get_axis_size(-1);
 
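The dispatch ladders above are unchanged apart from forwarding the new arguments. Restated as a standalone helper (name illustrative, thresholds taken verbatim from the resize_nn wrapper), the selection rule is: use the widest CHANNELS_PER_ITER that divides the effective channel count, but only when num_iters is large enough that the reduced iteration count presumably still saturates the grid:

    #include <cstddef>

    // Restatement of the CHANNELS_PER_ITER selection used by resize_nn's host wrapper.
    static std::size_t pick_channels_per_iter(std::size_t num_effective_channels, std::size_t num_iters)
    {
        if (num_effective_channels % 32 == 0 && num_iters > 655360) return 32;
        if (num_effective_channels % 16 == 0 && num_iters > 327680) return 16;
        if (num_effective_channels % 8 == 0  && num_iters > 163840) return 8;
        if (num_effective_channels % 4 == 0  && num_iters > 81920)  return 4;
        if (num_effective_channels % 2 == 0)                        return 2;
        return 1;
    }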
@@ -218,21 +225,21 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
         auto num_iters = num_effective_channels * out_height * out_width;
 
         if (num_effective_channels % 16 == 0 && num_iters > 163840) {
-            launch_multichannel_resize_bilinear<T, 16>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x);
+            launch_multichannel_resize_bilinear<T, 16>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
         } else if (num_effective_channels % 8 == 0 && num_iters > 81920) {
-            launch_multichannel_resize_bilinear<T, 8>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x);
+            launch_multichannel_resize_bilinear<T, 8>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
         } else if (num_effective_channels % 4 == 0 && num_iters > 40960) {
-            launch_multichannel_resize_bilinear<T, 4>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x);
+            launch_multichannel_resize_bilinear<T, 4>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
         } else if (num_effective_channels % 2 == 0) {
-            launch_multichannel_resize_bilinear<T, 2>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x);
+            launch_multichannel_resize_bilinear<T, 2>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
         } else {
-            launch_multichannel_resize_bilinear<T, 1>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x);
+            launch_multichannel_resize_bilinear<T, 1>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x, half_pixel_centers);
         }
     }
 
 #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
-    template void resize_bilinear<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, float, float);
+    template void resize_bilinear<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, float, float, bool);
 #endif
-    template void resize_bilinear<float>(const Stream&, TensorSpan<float>, TensorView<float>, float, float);
+    template void resize_bilinear<float>(const Stream&, TensorSpan<float>, TensorView<float>, float, float, bool);
 
 }}}} /* namespace cv::dnn::cuda4dnn::kernels */
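Since the kernels no longer compute the output-to-input factors themselves (the o2i_fy/o2i_fx lines removed in the second hunk), callers of resize_nn and resize_bilinear must now supply scale_y/scale_x together with the sampling flags. The call sites are outside this diff; as a rough sketch, they are expected to pass factors of the form the kernels previously derived:

    #include <cstddef>

    // Output-to-input scale factor, matching what the kernels used to compute internally
    // (static_cast<float>(in_size) / out_size). Actual call sites may adjust this, e.g.
    // for align_corners-style sampling, which is outside the scope of this diff.
    static float o2i_scale(std::size_t in_size, std::size_t out_size)
    {
        return static_cast<float>(in_size) / static_cast<float>(out_size);
    }

    // Hypothetical call, assuming the cuda4dnn headers are available:
    //   kernels::resize_bilinear(stream, output, input,
    //                            o2i_scale(in_height, out_height),
    //                            o2i_scale(in_width, out_width),
    //                            /*half_pixel_centers=*/true);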