diff --git a/modules/stitching/src/opencl/warpers.cl b/modules/stitching/src/opencl/warpers.cl index 032ddf3cee..9b5619fcad 100644 --- a/modules/stitching/src/opencl/warpers.cl +++ b/modules/stitching/src/opencl/warpers.cl @@ -46,103 +46,127 @@ __kernel void buildWarpPlaneMaps(__global uchar * xmapptr, int xmap_step, int xmap_offset, __global uchar * ymapptr, int ymap_step, int ymap_offset, int rows, int cols, __constant float * ck_rinv, __constant float * ct, - int tl_u, int tl_v, float scale) + int tl_u, int tl_v, float scale, int rowsPerWI) { int du = get_global_id(0); - int dv = get_global_id(1); + int dv0 = get_global_id(1) * rowsPerWI; - if (du < cols && dv < rows) + if (du < cols) { - __global float * xmap = (__global float *)(xmapptr + mad24(dv, xmap_step, xmap_offset + du * (int)sizeof(float))); - __global float * ymap = (__global float *)(ymapptr + mad24(dv, ymap_step, ymap_offset + du * (int)sizeof(float))); + int xmap_index = mad24(dv0, xmap_step, mad24(du, (int)sizeof(float), xmap_offset)); + int ymap_index = mad24(dv0, ymap_step, mad24(du, (int)sizeof(float), ymap_offset)); - float u = tl_u + du; - float v = tl_v + dv; - float x, y; + for (int dv = dv0, dv1 = min(rows, dv0 + rowsPerWI); dv < dv1; ++dv, xmap_index += xmap_step, + ymap_index += ymap_step) + { + __global float * xmap = (__global float *)(xmapptr + xmap_index); + __global float * ymap = (__global float *)(ymapptr + ymap_index); - float x_ = u / scale - ct[0]; - float y_ = v / scale - ct[1]; + float u = tl_u + du; + float v = tl_v + dv; - float z; - x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * (1 - ct[2]); - y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * (1 - ct[2]); - z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * (1 - ct[2]); + float x_ = u / scale - ct[0]; + float y_ = v / scale - ct[1]; - x /= z; - y /= z; + float ct1 = 1 - ct[2]; + float x = fma(ck_rinv[0], x_, fma(ck_rinv[1], y_, ck_rinv[2] * ct1)); + float y = fma(ck_rinv[3], x_, fma(ck_rinv[4], y_, ck_rinv[5] * ct1)); + float z = fma(ck_rinv[6], x_, fma(ck_rinv[7], y_, ck_rinv[8] * ct1)); - xmap[0] = x; - ymap[0] = y; + x /= z; + y /= z; + + xmap[0] = x; + ymap[0] = y; + } } } __kernel void buildWarpCylindricalMaps(__global uchar * xmapptr, int xmap_step, int xmap_offset, __global uchar * ymapptr, int ymap_step, int ymap_offset, int rows, int cols, - __constant float * ck_rinv, int tl_u, int tl_v, float scale) + __constant float * ck_rinv, int tl_u, int tl_v, float scale, int rowsPerWI) { int du = get_global_id(0); - int dv = get_global_id(1); + int dv0 = get_global_id(1) * rowsPerWI; - if (du < cols && dv < rows) + if (du < cols) { - __global float * xmap = (__global float *)(xmapptr + mad24(dv, xmap_step, xmap_offset + du * (int)sizeof(float))); - __global float * ymap = (__global float *)(ymapptr + mad24(dv, ymap_step, ymap_offset + du * (int)sizeof(float))); - - float u = tl_u + du; - float v = tl_v + dv; - float x, y; - - u /= scale; - float x_ = sin(u); - float y_ = v / scale; - float z_ = cos(u); - - float z; - x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_; - y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_; - z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_; - - if (z > 0) x /= z, y /= z; - else x = y = -1; - - xmap[0] = x; - ymap[0] = y; + int xmap_index = mad24(dv0, xmap_step, mad24(du, (int)sizeof(float), xmap_offset)); + int ymap_index = mad24(dv0, ymap_step, mad24(du, (int)sizeof(float), ymap_offset)); + + for (int dv = dv0, dv1 = min(rows, dv0 + rowsPerWI); dv < dv1; ++dv, xmap_index += xmap_step, + ymap_index += ymap_step) + { + __global float * xmap = (__global float *)(xmapptr + xmap_index); + __global float * ymap = (__global float *)(ymapptr + ymap_index); + + float u = tl_u + du; + float v = tl_v + dv; + float x, y; + + u /= scale; + float x_, y_, z_; + x_ = sincos(u, &z_); + y_ = v / scale; + + float z; + x = fma(ck_rinv[0], x_, fma(ck_rinv[1], y_, ck_rinv[2] * z_)); + y = fma(ck_rinv[3], x_, fma(ck_rinv[4], y_, ck_rinv[5] * z_)); + z = fma(ck_rinv[6], x_, fma(ck_rinv[7], y_, ck_rinv[8] * z_)); + + if (z > 0) + x /= z, y /= z; + else + x = y = -1; + + xmap[0] = x; + ymap[0] = y; + } } } __kernel void buildWarpSphericalMaps(__global uchar * xmapptr, int xmap_step, int xmap_offset, __global uchar * ymapptr, int ymap_step, int ymap_offset, int rows, int cols, - __constant float * ck_rinv, int tl_u, int tl_v, float scale) + __constant float * ck_rinv, int tl_u, int tl_v, float scale, int rowsPerWI) { int du = get_global_id(0); - int dv = get_global_id(1); + int dv0 = get_global_id(1) * rowsPerWI; - if (du < cols && dv < rows) + if (du < cols) { - __global float * xmap = (__global float *)(xmapptr + mad24(dv, xmap_step, xmap_offset + du * (int)sizeof(float))); - __global float * ymap = (__global float *)(ymapptr + mad24(dv, ymap_step, ymap_offset + du * (int)sizeof(float))); - - float u = tl_u + du; - float v = tl_v + dv; - float x, y; - - v /= scale; - u /= scale; - - float sinv = sin(v); - float x_ = sinv * sin(u); - float y_ = -cos(v); - float z_ = sinv * cos(u); - - float z; - x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_; - y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_; - z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_; - - if (z > 0) x /= z, y /= z; - else x = y = -1; - - xmap[0] = x; - ymap[0] = y; + int xmap_index = mad24(dv0, xmap_step, mad24(du, (int)sizeof(float), xmap_offset)); + int ymap_index = mad24(dv0, ymap_step, mad24(du, (int)sizeof(float), ymap_offset)); + + for (int dv = dv0, dv1 = min(rows, dv0 + rowsPerWI); dv < dv1; ++dv, xmap_index += xmap_step, + ymap_index += ymap_step) + { + __global float * xmap = (__global float *)(xmapptr + xmap_index); + __global float * ymap = (__global float *)(ymapptr + ymap_index); + + float u = tl_u + du; + float v = tl_v + dv; + float x, y; + + v /= scale; + u /= scale; + + float cosv, sinv = sincos(v, &cosv), cosu, sinu = sincos(u, &cosu); + float x_ = sinv * sinu; + float y_ = -cosv; + float z_ = sinv * cosu; + + float z; + x = fma(ck_rinv[0], x_, fma(ck_rinv[1], y_, ck_rinv[2] * z_)); + y = fma(ck_rinv[3], x_, fma(ck_rinv[4], y_, ck_rinv[5] * z_)); + z = fma(ck_rinv[6], x_, fma(ck_rinv[7], y_, ck_rinv[8] * z_)); + + if (z > 0) + x /= z, y /= z; + else + x = y = -1; + + xmap[0] = x; + ymap[0] = y; + } } } diff --git a/modules/stitching/src/warpers.cpp b/modules/stitching/src/warpers.cpp index b6d1f8a8ad..8b2c77e759 100644 --- a/modules/stitching/src/warpers.cpp +++ b/modules/stitching/src/warpers.cpp @@ -103,16 +103,16 @@ Rect PlaneWarper::buildMaps(Size src_size, InputArray K, InputArray R, InputArra ocl::Kernel k("buildWarpPlaneMaps", ocl::stitching::warpers_oclsrc); if (!k.empty()) { - + int rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1; Mat k_rinv(1, 9, CV_32FC1, projector_.k_rinv), t(1, 3, CV_32FC1, projector_.t); UMat uxmap = _xmap.getUMat(), uymap = _ymap.getUMat(), uk_rinv = k_rinv.getUMat(ACCESS_READ), ut = t.getUMat(ACCESS_READ); k.args(ocl::KernelArg::WriteOnlyNoSize(uxmap), ocl::KernelArg::WriteOnly(uymap), ocl::KernelArg::PtrReadOnly(uk_rinv), ocl::KernelArg::PtrReadOnly(ut), - dst_tl.x, dst_tl.y, projector_.scale); + dst_tl.x, dst_tl.y, projector_.scale, rowsPerWI); - size_t globalsize[2] = { dsize.width, dsize.height }; + size_t globalsize[2] = { dsize.width, (dsize.height + rowsPerWI - 1) / rowsPerWI }; if (k.run(2, globalsize, NULL, true)) return Rect(dst_tl, dst_br); } @@ -371,6 +371,7 @@ Rect SphericalWarper::buildMaps(Size src_size, InputArray K, InputArray R, Outpu ocl::Kernel k("buildWarpSphericalMaps", ocl::stitching::warpers_oclsrc); if (!k.empty()) { + int rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1; projector_.setCameraParams(K, R); Point dst_tl, dst_br; @@ -384,9 +385,9 @@ Rect SphericalWarper::buildMaps(Size src_size, InputArray K, InputArray R, Outpu UMat uxmap = xmap.getUMat(), uymap = ymap.getUMat(), uk_rinv = k_rinv.getUMat(ACCESS_READ); k.args(ocl::KernelArg::WriteOnlyNoSize(uxmap), ocl::KernelArg::WriteOnly(uymap), - ocl::KernelArg::PtrReadOnly(uk_rinv), dst_tl.x, dst_tl.y, projector_.scale); + ocl::KernelArg::PtrReadOnly(uk_rinv), dst_tl.x, dst_tl.y, projector_.scale, rowsPerWI); - size_t globalsize[2] = { dsize.width, dsize.height }; + size_t globalsize[2] = { dsize.width, (dsize.height + rowsPerWI - 1) / rowsPerWI }; if (k.run(2, globalsize, NULL, true)) return Rect(dst_tl, dst_br); } @@ -415,6 +416,7 @@ Rect CylindricalWarper::buildMaps(Size src_size, InputArray K, InputArray R, Out ocl::Kernel k("buildWarpCylindricalMaps", ocl::stitching::warpers_oclsrc); if (!k.empty()) { + int rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1; projector_.setCameraParams(K, R); Point dst_tl, dst_br; @@ -428,9 +430,10 @@ Rect CylindricalWarper::buildMaps(Size src_size, InputArray K, InputArray R, Out UMat uxmap = xmap.getUMat(), uymap = ymap.getUMat(), uk_rinv = k_rinv.getUMat(ACCESS_READ); k.args(ocl::KernelArg::WriteOnlyNoSize(uxmap), ocl::KernelArg::WriteOnly(uymap), - ocl::KernelArg::PtrReadOnly(uk_rinv), dst_tl.x, dst_tl.y, projector_.scale); + ocl::KernelArg::PtrReadOnly(uk_rinv), dst_tl.x, dst_tl.y, projector_.scale, + rowsPerWI); - size_t globalsize[2] = { dsize.width, dsize.height }; + size_t globalsize[2] = { dsize.width, (dsize.height + rowsPerWI - 1) / rowsPerWI }; if (k.run(2, globalsize, NULL, true)) return Rect(dst_tl, dst_br); }