video(DIS): use OpenCL shared mem

- fix perf test iterations
pull/14403/head
Alexander Alekhin 6 years ago
parent 076ee65c85
commit 0e1dd63f5e
  1. 3
      modules/video/perf/opencl/perf_dis_optflow.cpp
  2. 11
      modules/video/src/dis_flow.cpp
  3. 58
      modules/video/src/opencl/dis_flow.cl

@ -37,10 +37,11 @@ OCL_PERF_TEST_P(DenseOpticalFlow_DIS, perf,
Ptr<DenseOpticalFlow> algo = DISOpticalFlow::create(preset);
OCL_TEST_CYCLE_N(10)
PERF_SAMPLE_BEGIN()
{
algo->calc(frame1, frame2, flow);
}
PERF_SAMPLE_END()
SANITY_CHECK_NOTHING();
}

@ -1055,11 +1055,16 @@ bool DISOpticalFlowImpl::ocl_PatchInverseSearch(UMat &src_Ux, UMat &src_Uy,
int idx;
int num_inner_iter = (int)floor(grad_descent_iter / (float)num_iter);
String subgroups_build_options;
if (ocl::Device::getDefault().isExtensionSupported("cl_khr_subgroups"))
subgroups_build_options = "-DCV_USE_SUBGROUPS=1";
for (int iter = 0; iter < num_iter; iter++)
{
if (iter == 0)
{
ocl::Kernel k1("dis_patch_inverse_search_fwd_1", ocl::video::dis_flow_oclsrc);
ocl::Kernel k1("dis_patch_inverse_search_fwd_1", ocl::video::dis_flow_oclsrc, subgroups_build_options);
size_t global_sz[] = {(size_t)hs * 8};
size_t local_sz[] = {8};
idx = 0;
@ -1111,7 +1116,7 @@ bool DISOpticalFlowImpl::ocl_PatchInverseSearch(UMat &src_Ux, UMat &src_Uy,
}
else
{
ocl::Kernel k3("dis_patch_inverse_search_bwd_1", ocl::video::dis_flow_oclsrc);
ocl::Kernel k3("dis_patch_inverse_search_bwd_1", ocl::video::dis_flow_oclsrc, subgroups_build_options);
size_t global_sz[] = {(size_t)hs * 8};
size_t local_sz[] = {8};
idx = 0;
@ -1368,7 +1373,7 @@ void DISOpticalFlowImpl::calc(InputArray I0, InputArray I1, InputOutputArray flo
CV_Assert(I0.isContinuous());
CV_Assert(I1.isContinuous());
CV_OCL_RUN(ocl::Device::getDefault().isIntel() && flow.isUMat() &&
CV_OCL_RUN(flow.isUMat() &&
(patch_size == 8) && (use_spatial_propagation == true),
ocl_calc(I0, I1, flow));

@ -2,6 +2,8 @@
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//#define CV_USE_SUBGROUPS
#define EPS 0.001f
#define INF 1E+10F
@ -193,7 +195,11 @@ __kernel void dis_densification(__global const float *sx, __global const float *
float computeSSDMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr,
int I0_stride, int I1_stride,
float w00, float w01, float w10, float w11, int patch_sz, int i)
float w00, float w01, float w10, float w11, int patch_sz, int i
#ifndef CV_USE_SUBGROUPS
, __local float2 *smem /*[8]*/
#endif
)
{
float sum_diff = 0.0f, sum_diff_sq = 0.0f;
int n = patch_sz * patch_sz;
@ -214,12 +220,31 @@ float computeSSDMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_
sum_diff = (dot(vec.lo, 1.0) + dot(vec.hi, 1.0));
sum_diff_sq = (dot(vec.lo, vec.lo) + dot(vec.hi, vec.hi));
#ifdef CV_USE_SUBGROUPS
sum_diff = sub_group_reduce_add(sum_diff);
sum_diff_sq = sub_group_reduce_add(sum_diff_sq);
#else
barrier(CLK_LOCAL_MEM_FENCE);
smem[i] = (float2)(sum_diff, sum_diff_sq);
barrier(CLK_LOCAL_MEM_FENCE);
if (i < 4)
smem[i] += smem[i + 4];
barrier(CLK_LOCAL_MEM_FENCE);
if (i < 2)
smem[i] += smem[i + 2];
barrier(CLK_LOCAL_MEM_FENCE);
if (i == 0)
smem[0] += smem[1];
barrier(CLK_LOCAL_MEM_FENCE);
float2 reduce_add_result = smem[0];
sum_diff = reduce_add_result.x;
sum_diff_sq = reduce_add_result.y;
#endif
return sum_diff_sq - sum_diff * sum_diff / n;
}
__attribute__((reqd_work_group_size(8, 1, 1)))
__kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __global const float *Uy_ptr,
__global const uchar *I0_ptr, __global const uchar *I1_ptr,
int border_size, int patch_size, int patch_stride,
@ -227,8 +252,7 @@ __kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __glo
__global float *Sx_ptr, __global float *Sy_ptr)
{
int id = get_global_id(0);
int is = id / 8;
if (id >= (hs * 8)) return;
int is = get_group_id(0);
int i = is * patch_stride;
int j = 0;
@ -249,7 +273,14 @@ __kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __glo
Sy_ptr[is * ws] = prev_Uy;
j += patch_stride;
#ifdef CV_USE_SUBGROUPS
int sid = get_sub_group_local_id();
#define EXTRA_ARGS_computeSSDMeanNorm sid
#else
__local float2 smem[8];
int sid = get_local_id(0);
#define EXTRA_ARGS_computeSSDMeanNorm sid, smem
#endif
for (int js = 1; js < ws; js++, j += patch_stride)
{
float min_SSD, cur_SSD;
@ -258,11 +289,11 @@ __kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __glo
INIT_BILINEAR_WEIGHTS(Ux, Uy);
min_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
w, w_ext, w00, w01, w10, w11, psz, sid);
w, w_ext, w00, w01, w10, w11, psz, EXTRA_ARGS_computeSSDMeanNorm);
INIT_BILINEAR_WEIGHTS(prev_Ux, prev_Uy);
cur_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
w, w_ext, w00, w01, w10, w11, psz, sid);
w, w_ext, w00, w01, w10, w11, psz, EXTRA_ARGS_computeSSDMeanNorm);
if (cur_SSD < min_SSD)
{
Ux = prev_Ux;
@ -274,6 +305,7 @@ __kernel void dis_patch_inverse_search_fwd_1(__global const float *Ux_ptr, __glo
Sx_ptr[is * ws + js] = Ux;
Sy_ptr[is * ws + js] = Uy;
}
#undef EXTRA_ARGS_computeSSDMeanNorm
}
float3 processPatchMeanNorm(const __global uchar *I0_ptr, const __global uchar *I1_ptr,
@ -396,14 +428,14 @@ __kernel void dis_patch_inverse_search_fwd_2(__global const float *Ux_ptr, __glo
}
}
__attribute__((reqd_work_group_size(8, 1, 1)))
__kernel void dis_patch_inverse_search_bwd_1(__global const uchar *I0_ptr, __global const uchar *I1_ptr,
int border_size, int patch_size, int patch_stride,
int w, int h, int ws, int hs, int pyr_level,
__global float *Sx_ptr, __global float *Sy_ptr)
{
int id = get_global_id(0);
int is = id / 8;
if (id >= (hs * 8)) return;
int is = get_group_id(0);
is = (hs - 1 - is);
int i = is * patch_stride;
@ -419,7 +451,14 @@ __kernel void dis_patch_inverse_search_bwd_1(__global const uchar *I0_ptr, __glo
float j_upper_limit = bsz + w - 1.0f;
float i_I1, j_I1, w00, w01, w10, w11;
#ifdef CV_USE_SUBGROUPS
int sid = get_sub_group_local_id();
#define EXTRA_ARGS_computeSSDMeanNorm sid
#else
__local float2 smem[8];
int sid = get_local_id(0);
#define EXTRA_ARGS_computeSSDMeanNorm sid, smem
#endif
for (int js = (ws - 2); js > -1; js--, j -= patch_stride)
{
float min_SSD, cur_SSD;
@ -428,17 +467,18 @@ __kernel void dis_patch_inverse_search_bwd_1(__global const uchar *I0_ptr, __glo
INIT_BILINEAR_WEIGHTS(Ux.x, Uy.x);
min_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
w, w_ext, w00, w01, w10, w11, psz, sid);
w, w_ext, w00, w01, w10, w11, psz, EXTRA_ARGS_computeSSDMeanNorm);
INIT_BILINEAR_WEIGHTS(Ux.y, Uy.y);
cur_SSD = computeSSDMeanNorm(I0_ptr + i * w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
w, w_ext, w00, w01, w10, w11, psz, sid);
w, w_ext, w00, w01, w10, w11, psz, EXTRA_ARGS_computeSSDMeanNorm);
if (cur_SSD < min_SSD)
{
Sx_ptr[is * ws + js] = Ux.y;
Sy_ptr[is * ws + js] = Uy.y;
}
}
#undef EXTRA_ARGS_computeSSDMeanNorm
}
__kernel void dis_patch_inverse_search_bwd_2(__global const uchar *I0_ptr, __global const uchar *I1_ptr,

Loading…
Cancel
Save