|
|
|
@ -102,6 +102,26 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { |
|
|
|
|
v_store(output_vPtr[i], vec_x); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/* Element-wise division kernel: stores x / y into output.
 *
 * All three buffers are accessed through get_vector_type_t<T, N>; each loop iteration
 * loads one vector from x and one from y, divides them lane by lane and stores the
 * result into output. The indices to process are supplied by grid_stride_range.
 *
 * Only `output.size() / vector_type::size()` whole vectors are visited, and the pointers
 * are reinterpreted as vector pointers, so the host-side launcher in this file asserts
 * is_fully_aligned on every buffer before starting the kernel.
 *
 * No check is made for zero divisors; the result is whatever `T`'s operator/ yields.
 */
template <class T, std::size_t N>
__global__ void eltwise_div_2_vec(Span<T> output, View<T> x, View<T> y) {
    using vector_type = get_vector_type_t<T, N>;

    /* vector-typed views of the destination, the dividend and the divisor */
    auto quotient_vecs = vector_type::get_pointer(output.data());
    auto dividend_vecs = vector_type::get_pointer(x.data());
    auto divisor_vecs = vector_type::get_pointer(y.data());

    for (auto idx : grid_stride_range(output.size() / vector_type::size())) {
        vector_type dividend, divisor;
        v_load(dividend, dividend_vecs[idx]);
        v_load(divisor, divisor_vecs[idx]);

        /* divide in place: `dividend` ends up holding the quotient */
        for (int lane = 0; lane < vector_type::size(); lane++) {
            dividend.data[lane] = dividend.data[lane] / divisor.data[lane];
        }

        v_store(quotient_vecs[idx], dividend);
    }
}
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
template <class T, std::size_t N> |
|
|
|
@ -221,4 +241,32 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels { |
|
|
|
|
/* explicit instantiations of eltwise_prod_2 for the __half and float element types */
template void eltwise_prod_2(const Stream&, Span<__half>, View<__half>, View<__half>);
template void eltwise_prod_2(const Stream&, Span<float>, View<float>, View<float>);
|
|
|
|
|
|
|
|
|
/* Host-side launcher for raw::eltwise_div_2_vec<T, N>.
 *
 * Asserts that `output`, `x` and `y` each satisfy is_fully_aligned<T>(..., N), then
 * builds an execution policy for `output.size() / N` work items on `stream` and
 * launches the kernel with the three buffers.
 *
 * NOTE(review): the literal 0 handed to make_policy is presumably the dynamic shared
 * memory size; confirm against make_policy's declaration.
 */
template <class T, std::size_t N>
void launch_vectorized_eltwise_div_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
    /* kept as three separate assertions so a failure names the offending buffer */
    CV_Assert(is_fully_aligned<T>(output, N));
    CV_Assert(is_fully_aligned<T>(x, N));
    CV_Assert(is_fully_aligned<T>(y, N));

    auto div_kernel = raw::eltwise_div_2_vec<T, N>;
    auto launch_policy = make_policy(div_kernel, output.size() / N, 0, stream);

    launch_kernel(div_kernel, launch_policy, output, x, y);
}
|
|
|
|
|
|
|
|
|
/* Computes output = x / y element-wise on `stream`.
 *
 * `x`, `y` and `output` must all have the same size (asserted). The widest vector
 * width for which all three buffers pass is_fully_aligned is selected: 4 is tried
 * first, then 2, and width 1 is used when neither of those applies.
 */
template <class T>
void eltwise_div_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
    CV_Assert(x.size() == y.size());
    CV_Assert(x.size() == output.size());

    /* widest option: four elements per vector */
    const bool vec4_ok = is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(x, 4) && is_fully_aligned<T>(y, 4);
    if (vec4_ok) {
        launch_vectorized_eltwise_div_2<T, 4>(stream, output, x, y);
        return;
    }

    /* next option: two elements per vector */
    const bool vec2_ok = is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(x, 2) && is_fully_aligned<T>(y, 2);
    if (vec2_ok) {
        launch_vectorized_eltwise_div_2<T, 2>(stream, output, x, y);
        return;
    }

    /* fallback: one element per vector */
    launch_vectorized_eltwise_div_2<T, 1>(stream, output, x, y);
}
|
|
|
|
|
|
|
|
|
/* explicit instantiations of eltwise_div_2 for the __half and float element types */
template void eltwise_div_2<__half>(const Stream&, Span<__half>, View<__half>, View<__half>);
template void eltwise_div_2<float>(const Stream&, Span<float>, View<float>, View<float>);
|
|
|
|
|
|
|
|
|
}}}} /* namespace cv::dnn::cuda4dnn::kernels */ |
|
|
|
|