OCL: fix incompatibility with Mali runtime

pull/24730/head
Maksim Shabunin 1 year ago
parent 9434c89ba0
commit adde942e34
  1. 2
      modules/core/src/arithm.cpp
  2. 48
      modules/core/src/opencl/inrange.cl
  3. 2
      modules/imgproc/src/color.hpp
  4. 10
      modules/imgproc/src/color_hsv.dispatch.cpp
  5. 12
      modules/imgproc/src/color_lab.cpp
  6. 18
      modules/imgproc/src/color_rgb.dispatch.cpp
  7. 18
      modules/imgproc/src/color_yuv.dispatch.cpp
  8. 10
      modules/imgproc/src/deriv.cpp
  9. 14
      modules/imgproc/src/imgwarp.cpp
  10. 60
      modules/imgproc/src/opencl/color_hsv.cl
  11. 22
      modules/imgproc/src/opencl/color_lab.cl
  12. 44
      modules/imgproc/src/opencl/color_rgb.cl
  13. 198
      modules/imgproc/src/opencl/color_yuv.cl
  14. 42
      modules/imgproc/src/opencl/laplacian5.cl
  15. 28
      modules/imgproc/src/opencl/pyr_down.cl
  16. 22
      modules/imgproc/src/opencl/pyr_up.cl
  17. 88
      modules/imgproc/src/opencl/remap.cl
  18. 68
      modules/imgproc/src/opencl/resize.cl
  19. 50
      modules/imgproc/src/opencl/warp_affine.cl
  20. 24
      modules/imgproc/src/opencl/warp_perspective.cl
  21. 8
      modules/imgproc/src/pyramids.cpp
  22. 20
      modules/imgproc/src/resize.cpp

@ -1643,7 +1643,7 @@ static bool ocl_inRange( InputArray _src, InputArray _lowerb,
if (kercn % cn != 0)
kercn = cn;
int colsPerWI = kercn / cn;
String opts = format("%s-D cn=%d -D srcT=%s -D srcT1=%s -D dstT=%s -D kercn=%d -D depth=%d%s -D colsPerWI=%d",
String opts = format("%s-D CN=%d -D SRC_T=%s -D SRC_T1=%s -D DST_T=%s -D KERCN=%d -D DEPTH=%d%s -D COLS_PER_WI=%d",
haveScalar ? "-D HAVE_SCALAR " : "", cn, ocl::typeToStr(CV_MAKE_TYPE(sdepth, kercn)),
ocl::typeToStr(sdepth), ocl::typeToStr(CV_8UC(colsPerWI)), kercn, sdepth,
doubleSupport ? " -D DOUBLE_SUPPORT" : "", colsPerWI);

@ -52,7 +52,7 @@
__kernel void inrange(__global const uchar * src1ptr, int src1_step, int src1_offset,
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
#ifdef HAVE_SCALAR
__global const srcT1 * src2, __global const srcT1 * src3,
__global const SRC_T1 * src2, __global const SRC_T1 * src3,
#else
__global const uchar * src2ptr, int src2_step, int src2_offset,
__global const uchar * src3ptr, int src3_step, int src3_offset,
@ -64,56 +64,56 @@ __kernel void inrange(__global const uchar * src1ptr, int src1_step, int src1_of
if (x < dst_cols)
{
int src1_index = mad24(y0, src1_step, mad24(x, (int)sizeof(srcT1) * kercn, src1_offset));
int dst_index = mad24(y0, dst_step, mad24(x, colsPerWI, dst_offset));
int src1_index = mad24(y0, src1_step, mad24(x, (int)sizeof(SRC_T1) * KERCN, src1_offset));
int dst_index = mad24(y0, dst_step, mad24(x, COLS_PER_WI, dst_offset));
#ifndef HAVE_SCALAR
int src2_index = mad24(y0, src2_step, mad24(x, (int)sizeof(srcT1) * kercn, src2_offset));
int src3_index = mad24(y0, src3_step, mad24(x, (int)sizeof(srcT1) * kercn, src3_offset));
int src2_index = mad24(y0, src2_step, mad24(x, (int)sizeof(SRC_T1) * KERCN, src2_offset));
int src3_index = mad24(y0, src3_step, mad24(x, (int)sizeof(SRC_T1) * KERCN, src3_offset));
#endif
for (int y = y0, y1 = min(dst_rows, y0 + rowsPerWI); y < y1; ++y, src1_index += src1_step, dst_index += dst_step)
{
#if kercn >= cn && kercn == 4 && depth <= 4 && !defined HAVE_SCALAR
srcT src1 = *(__global const srcT *)(src1ptr + src1_index);
srcT src2 = *(__global const srcT *)(src2ptr + src2_index);
srcT src3 = *(__global const srcT *)(src3ptr + src3_index);
__global dstT * dst = (__global dstT *)(dstptr + dst_index);
#if cn == 1
dst[0] = src2 > src1 || src3 < src1 ? (dstT)(0) : (dstT)(255);
#elif cn == 2
dst[0] = (dstT)(src2.xy > src1.xy || src3.xy < src1.xy ||
src2.zw > src1.zw || src3.zw < src1.zw ? (dstT)(0) : (dstT)(255);
#elif cn == 4
dst[0] = (dstT)(src2.x > src1.x || src3.x < src1.x ||
#if KERCN >= CN && KERCN == 4 && DEPTH <= 4 && !defined HAVE_SCALAR
SRC_T src1 = *(__global const SRC_T *)(src1ptr + src1_index);
SRC_T src2 = *(__global const SRC_T *)(src2ptr + src2_index);
SRC_T src3 = *(__global const SRC_T *)(src3ptr + src3_index);
__global DST_T * dst = (__global DST_T *)(dstptr + dst_index);
#if CN == 1
dst[0] = src2 > src1 || src3 < src1 ? (DST_T)(0) : (DST_T)(255);
#elif CN == 2
dst[0] = (DST_T)(src2.xy > src1.xy || src3.xy < src1.xy ||
src2.zw > src1.zw || src3.zw < src1.zw ? (DST_T)(0) : (DST_T)(255);
#elif CN == 4
dst[0] = (DST_T)(src2.x > src1.x || src3.x < src1.x ||
src2.y > src1.y || src3.y < src1.y ||
src2.z > src1.z || src3.z < src1.z ||
src2.w > src1.w || src3.w < src1.w ? 0 : 255);
#endif
#else
__global const srcT1 * src1 = (__global const srcT1 *)(src1ptr + src1_index);
__global const SRC_T1 * src1 = (__global const SRC_T1 *)(src1ptr + src1_index);
__global uchar * dst = dstptr + dst_index;
#ifndef HAVE_SCALAR
__global const srcT1 * src2 = (__global const srcT1 *)(src2ptr + src2_index);
__global const srcT1 * src3 = (__global const srcT1 *)(src3ptr + src3_index);
__global const SRC_T1 * src2 = (__global const SRC_T1 *)(src2ptr + src2_index);
__global const SRC_T1 * src3 = (__global const SRC_T1 *)(src3ptr + src3_index);
#endif
#pragma unroll
for (int px = 0; px < colsPerWI; ++px, src1 += cn
for (int px = 0; px < COLS_PER_WI; ++px, src1 += CN
#ifndef HAVE_SCALAR
, src2 += cn, src3 += cn
, src2 += CN, src3 += CN
#endif
)
{
dst[px] = 255;
for (int c = 0; c < cn; ++c)
for (int c = 0; c < CN; ++c)
if (src2[c] > src1[c] || src3[c] < src1[c])
{
dst[px] = 0;
break;
}
}
#endif // kercn >= cn
#endif // KERCN >= CN
#ifndef HAVE_SCALAR
src2_index += src2_step;
src3_index += src3_step;

@ -277,7 +277,7 @@ struct OclHelper
int pxPerWIy = dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) ? 4 : 1;
int pxPerWIx = 1;
cv::String baseOptions = format("-D depth=%d -D scn=%d -D PIX_PER_WI_Y=%d ",
cv::String baseOptions = format("-D SRC_DEPTH=%d -D SCN=%d -D PIX_PER_WI_Y=%d ",
src.depth(), src.channels(), pxPerWIy);
switch (sizePolicy)

@ -219,7 +219,7 @@ bool oclCvtColorHSV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, b
int hrange = _src.depth() == CV_32F ? 360 : (!full ? 180 : 255);
if(!h.createKernel("HSV2RGB", ocl::imgproc::color_hsv_oclsrc,
format("-D dcn=%d -D bidx=%d -D hrange=%d -D hscale=%ff", dcn, bidx, hrange, 6.f/hrange)))
format("-D DCN=%d -D BIDX=%d -D HRANGE=%d -D HSCALE=%ff", dcn, bidx, hrange, 6.f/hrange)))
{
return false;
}
@ -234,7 +234,7 @@ bool oclCvtColorHLS2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, b
int hrange = _src.depth() == CV_32F ? 360 : (!full ? 180 : 255);
if(!h.createKernel("HLS2RGB", ocl::imgproc::color_hsv_oclsrc,
format("-D dcn=%d -D bidx=%d -D hrange=%d -D hscale=%ff", dcn, bidx, hrange, 6.f/hrange)))
format("-D DCN=%d -D BIDX=%d -D HRANGE=%d -D HSCALE=%ff", dcn, bidx, hrange, 6.f/hrange)))
{
return false;
}
@ -249,7 +249,7 @@ bool oclCvtColorBGR2HLS( InputArray _src, OutputArray _dst, int bidx, bool full
float hscale = (_src.depth() == CV_32F ? 360.f : (!full ? 180.f : 256.f))/360.f;
if(!h.createKernel("RGB2HLS", ocl::imgproc::color_hsv_oclsrc,
format("-D hscale=%ff -D bidx=%d -D dcn=3", hscale, bidx)))
format("-D HSCALE=%ff -D BIDX=%d -D DCN=3", hscale, bidx)))
{
return false;
}
@ -264,8 +264,8 @@ bool oclCvtColorBGR2HSV( InputArray _src, OutputArray _dst, int bidx, bool full
int hrange = _src.depth() == CV_32F ? 360 : (!full ? 180 : 256);
cv::String options = (_src.depth() == CV_8U ?
format("-D hrange=%d -D bidx=%d -D dcn=3", hrange, bidx) :
format("-D hscale=%ff -D bidx=%d -D dcn=3", hrange*(1.f/360.f), bidx));
format("-D HRANGE=%d -D BIDX=%d -D DCN=3", hrange, bidx) :
format("-D HSCALE=%ff -D BIDX=%d -D DCN=3", hrange*(1.f/360.f), bidx));
if(!h.createKernel("RGB2HSV", ocl::imgproc::color_hsv_oclsrc, options))
{

@ -4424,7 +4424,7 @@ bool oclCvtColorBGR2Luv( InputArray _src, OutputArray _dst, int bidx, bool srgb)
OclHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_32F> > h(_src, _dst, 3);
if(!h.createKernel("BGR2Luv", ocl::imgproc::color_lab_oclsrc,
format("-D dcn=3 -D bidx=%d%s", bidx, srgb ? " -D SRGB" : "")))
format("-D DCN=3 -D BIDX=%d%s", bidx, srgb ? " -D SRGB" : "")))
{
return false;
}
@ -4492,7 +4492,7 @@ bool oclCvtColorBGR2Lab( InputArray _src, OutputArray _dst, int bidx, bool srgb
OclHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_32F> > h(_src, _dst, 3);
if(!h.createKernel("BGR2Lab", ocl::imgproc::color_lab_oclsrc,
format("-D dcn=3 -D bidx=%d%s", bidx, srgb ? " -D SRGB" : "")))
format("-D DCN=3 -D BIDX=%d%s", bidx, srgb ? " -D SRGB" : "")))
{
return false;
}
@ -4587,7 +4587,7 @@ bool oclCvtColorLab2BGR(InputArray _src, OutputArray _dst, int dcn, int bidx, bo
OclHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_32F> > h(_src, _dst, dcn);
if(!h.createKernel("Lab2BGR", ocl::imgproc::color_lab_oclsrc,
format("-D dcn=%d -D bidx=%d%s", dcn, bidx, srgb ? " -D SRGB" : "")))
format("-D DCN=%d -D BIDX=%d%s", dcn, bidx, srgb ? " -D SRGB" : "")))
{
return false;
}
@ -4638,7 +4638,7 @@ bool oclCvtColorLuv2BGR(InputArray _src, OutputArray _dst, int dcn, int bidx, bo
OclHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_32F> > h(_src, _dst, dcn);
if(!h.createKernel("Luv2BGR", ocl::imgproc::color_lab_oclsrc,
format("-D dcn=%d -D bidx=%d%s", dcn, bidx, srgb ? " -D SRGB" : "")))
format("-D DCN=%d -D BIDX=%d%s", dcn, bidx, srgb ? " -D SRGB" : "")))
{
return false;
}
@ -4692,7 +4692,7 @@ bool oclCvtColorBGR2XYZ( InputArray _src, OutputArray _dst, int bidx )
OclHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, 3);
if(!h.createKernel("RGB2XYZ", ocl::imgproc::color_lab_oclsrc,
format("-D dcn=3 -D bidx=%d", bidx)))
format("-D DCN=3 -D BIDX=%d", bidx)))
{
return false;
}
@ -4740,7 +4740,7 @@ bool oclCvtColorXYZ2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx )
OclHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);
if(!h.createKernel("XYZ2RGB", ocl::imgproc::color_lab_oclsrc,
format("-D dcn=%d -D bidx=%d", dcn, bidx)))
format("-D DCN=%d -D BIDX=%d", dcn, bidx)))
{
return false;
}

@ -428,7 +428,7 @@ bool oclCvtColorBGR2BGR( InputArray _src, OutputArray _dst, int dcn, bool revers
OclHelper< Set<3, 4>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);
if(!h.createKernel("RGB", ocl::imgproc::color_rgb_oclsrc,
format("-D dcn=%d -D bidx=0 -D %s", dcn, reverse ? "REVERSE" : "ORDER")))
format("-D DCN=%d -D BIDX=0 -D %s", dcn, reverse ? "REVERSE" : "ORDER")))
{
return false;
}
@ -441,7 +441,7 @@ bool oclCvtColorBGR25x5( InputArray _src, OutputArray _dst, int bidx, int gbits
OclHelper< Set<3, 4>, Set<2>, Set<CV_8U> > h(_src, _dst, 2);
if(!h.createKernel("RGB2RGB5x5", ocl::imgproc::color_rgb_oclsrc,
format("-D dcn=2 -D bidx=%d -D greenbits=%d", bidx, gbits)))
format("-D DCN=2 -D BIDX=%d -D GREENBITS=%d", bidx, gbits)))
{
return false;
}
@ -454,7 +454,7 @@ bool oclCvtColor5x52BGR( InputArray _src, OutputArray _dst, int dcn, int bidx, i
OclHelper< Set<2>, Set<3, 4>, Set<CV_8U> > h(_src, _dst, dcn);
if(!h.createKernel("RGB5x52RGB", ocl::imgproc::color_rgb_oclsrc,
format("-D dcn=%d -D bidx=%d -D greenbits=%d", dcn, bidx, gbits)))
format("-D DCN=%d -D BIDX=%d -D GREENBITS=%d", dcn, bidx, gbits)))
{
return false;
}
@ -467,7 +467,7 @@ bool oclCvtColor5x52Gray( InputArray _src, OutputArray _dst, int gbits)
OclHelper< Set<2>, Set<1>, Set<CV_8U> > h(_src, _dst, 1);
if(!h.createKernel("BGR5x52Gray", ocl::imgproc::color_rgb_oclsrc,
format("-D dcn=1 -D bidx=0 -D greenbits=%d", gbits)))
format("-D DCN=1 -D BIDX=0 -D GREENBITS=%d", gbits)))
{
return false;
}
@ -480,7 +480,7 @@ bool oclCvtColorGray25x5( InputArray _src, OutputArray _dst, int gbits)
OclHelper< Set<1>, Set<2>, Set<CV_8U> > h(_src, _dst, 2);
if(!h.createKernel("Gray2BGR5x5", ocl::imgproc::color_rgb_oclsrc,
format("-D dcn=2 -D bidx=0 -D greenbits=%d", gbits)))
format("-D DCN=2 -D BIDX=0 -D GREENBITS=%d", gbits)))
{
return false;
}
@ -494,7 +494,7 @@ bool oclCvtColorBGR2Gray( InputArray _src, OutputArray _dst, int bidx)
int stripeSize = 1;
if(!h.createKernel("RGB2Gray", ocl::imgproc::color_rgb_oclsrc,
format("-D dcn=1 -D bidx=%d -D STRIPE_SIZE=%d", bidx, stripeSize)))
format("-D DCN=1 -D BIDX=%d -D STRIPE_SIZE=%d", bidx, stripeSize)))
{
return false;
}
@ -507,7 +507,7 @@ bool oclCvtColorGray2BGR( InputArray _src, OutputArray _dst, int dcn)
{
OclHelper< Set<1>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);
if(!h.createKernel("Gray2RGB", ocl::imgproc::color_rgb_oclsrc,
format("-D bidx=0 -D dcn=%d", dcn)))
format("-D BIDX=0 -D DCN=%d", dcn)))
{
return false;
}
@ -520,7 +520,7 @@ bool oclCvtColorRGBA2mRGBA( InputArray _src, OutputArray _dst)
OclHelper< Set<4>, Set<4>, Set<CV_8U> > h(_src, _dst, 4);
if(!h.createKernel("RGBA2mRGBA", ocl::imgproc::color_rgb_oclsrc,
"-D dcn=4 -D bidx=3"))
"-D DCN=4 -D BIDX=3"))
{
return false;
}
@ -533,7 +533,7 @@ bool oclCvtColormRGBA2RGBA( InputArray _src, OutputArray _dst)
OclHelper< Set<4>, Set<4>, Set<CV_8U> > h(_src, _dst, 4);
if(!h.createKernel("mRGBA2RGBA", ocl::imgproc::color_rgb_oclsrc,
"-D dcn=4 -D bidx=3"))
"-D DCN=4 -D BIDX=3"))
{
return false;
}

@ -232,7 +232,7 @@ bool oclCvtColorYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx )
OclHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);
if(!h.createKernel("YUV2RGB", ocl::imgproc::color_yuv_oclsrc,
format("-D dcn=%d -D bidx=%d", dcn, bidx)))
format("-D DCN=%d -D BIDX=%d", dcn, bidx)))
{
return false;
}
@ -245,7 +245,7 @@ bool oclCvtColorBGR2YUV( InputArray _src, OutputArray _dst, int bidx )
OclHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, 3);
if(!h.createKernel("RGB2YUV", ocl::imgproc::color_yuv_oclsrc,
format("-D dcn=3 -D bidx=%d", bidx)))
format("-D DCN=3 -D BIDX=%d", bidx)))
{
return false;
}
@ -258,7 +258,7 @@ bool oclCvtcolorYCrCb2BGR( InputArray _src, OutputArray _dst, int dcn, int bidx)
OclHelper< Set<3>, Set<3, 4>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);
if(!h.createKernel("YCrCb2RGB", ocl::imgproc::color_yuv_oclsrc,
format("-D dcn=%d -D bidx=%d", dcn, bidx)))
format("-D DCN=%d -D BIDX=%d", dcn, bidx)))
{
return false;
}
@ -271,7 +271,7 @@ bool oclCvtColorBGR2YCrCb( InputArray _src, OutputArray _dst, int bidx)
OclHelper< Set<3, 4>, Set<3>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, 3);
if(!h.createKernel("RGB2YCrCb", ocl::imgproc::color_yuv_oclsrc,
format("-D dcn=3 -D bidx=%d", bidx)))
format("-D DCN=3 -D BIDX=%d", bidx)))
{
return false;
}
@ -285,7 +285,7 @@ bool oclCvtColorOnePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int
bool optimized = _src.offset() % 4 == 0 && _src.step() % 4 == 0;
if(!h.createKernel("YUV2RGB_422", ocl::imgproc::color_yuv_oclsrc,
format("-D dcn=%d -D bidx=%d -D uidx=%d -D yidx=%d%s", dcn, bidx, uidx, yidx,
format("-D DCN=%d -D BIDX=%d -D UIDX=%d -D YIDX=%d%s", dcn, bidx, uidx, yidx,
optimized ? " -D USE_OPTIMIZED_LOAD" : "")))
{
return false;
@ -299,7 +299,7 @@ bool oclCvtColorOnePlaneBGR2YUV( InputArray _src, OutputArray _dst, int dcn, int
OclHelper< Set<3, 4>, Set<2>, Set<CV_8U, CV_16U, CV_32F> > h(_src, _dst, dcn);
if(!h.createKernel("RGB2YUV_422", ocl::imgproc::color_yuv_oclsrc,
format("-D dcn=%d -D bidx=%d -D uidx=%d -D yidx=%d", dcn, bidx, uidx, yidx
format("-D DCN=%d -D BIDX=%d -D UIDX=%d -D YIDX=%d", dcn, bidx, uidx, yidx
)))
{
return false;
@ -321,7 +321,7 @@ bool oclCvtColorTwoPlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, int
OclHelper< Set<1>, Set<3, 4>, Set<CV_8U>, FROM_YUV > h(_src, _dst, dcn);
if(!h.createKernel("YUV2RGB_NVx", ocl::imgproc::color_yuv_oclsrc,
format("-D dcn=%d -D bidx=%d -D uidx=%d", dcn, bidx, uidx)))
format("-D DCN=%d -D BIDX=%d -D UIDX=%d", dcn, bidx, uidx)))
{
return false;
}
@ -334,7 +334,7 @@ bool oclCvtColorThreePlaneYUV2BGR( InputArray _src, OutputArray _dst, int dcn, i
OclHelper< Set<1>, Set<3, 4>, Set<CV_8U>, FROM_YUV > h(_src, _dst, dcn);
if(!h.createKernel("YUV2RGB_YV12_IYUV", ocl::imgproc::color_yuv_oclsrc,
format("-D dcn=%d -D bidx=%d -D uidx=%d%s", dcn, bidx, uidx,
format("-D DCN=%d -D BIDX=%d -D UIDX=%d%s", dcn, bidx, uidx,
_src.isContinuous() ? " -D SRC_CONT" : "")))
{
return false;
@ -348,7 +348,7 @@ bool oclCvtColorBGR2ThreePlaneYUV( InputArray _src, OutputArray _dst, int bidx,
OclHelper< Set<3, 4>, Set<1>, Set<CV_8U>, TO_YUV > h(_src, _dst, 1);
if(!h.createKernel("RGB2YUV_YV12_IYUV", ocl::imgproc::color_yuv_oclsrc,
format("-D dcn=1 -D bidx=%d -D uidx=%d", bidx, uidx)))
format("-D DCN=1 -D BIDX=%d -D UIDX=%d", bidx, uidx)))
{
return false;
}

@ -583,9 +583,9 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst,
"BORDER_REFLECT_101" };
String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUS=%d%s%s"
" -D convertToWT=%s -D convertToDT=%s"
" -D %s -D srcT1=%s -D dstT1=%s -D WT1=%s"
" -D srcT=%s -D dstT=%s -D WT=%s"
" -D CONVERT_TO_WT=%s -D CONVERT_TO_DT=%s"
" -D %s -D SRC_T1=%s -D DST_T1=%s -D WT1=%s"
" -D SRC_T=%s -D DST_T=%s -D WT=%s"
" -D CN=%d ",
(int)lt2[0], (int)lt2[1], kernelX.cols / 2,
ocl::kernelToStr(kernelX, wdepth, "KERNEL_MATRIX_X").c_str(),
@ -627,8 +627,8 @@ static bool ocl_Laplacian5(InputArray _src, OutputArray _dst,
char cvt[2][50];
ocl::Kernel k("sumConvert", ocl::imgproc::laplacian5_oclsrc,
format("-D ONLY_SUM_CONVERT "
"-D srcT=%s -D WT=%s -D dstT=%s -D coeffT=%s -D wdepth=%d "
"-D convertToWT=%s -D convertToDT=%s%s",
"-D SRC_T=%s -D WT=%s -D DST_T=%s -D COEFF_T=%s -D WDEPTH=%d "
"-D CONVERT_TO_WT=%s -D CONVERT_TO_DT=%s%s",
ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)),
ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)),

@ -1354,7 +1354,7 @@ static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, Input
static const char * const interMap[] = { "INTER_NEAREST", "INTER_LINEAR", "INTER_CUBIC", "INTER_LINEAR", "INTER_LANCZOS" };
static const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
"BORDER_REFLECT_101", "BORDER_TRANSPARENT" };
String buildOptions = format("-D %s -D %s -D T=%s -D rowsPerWI=%d",
String buildOptions = format("-D %s -D %s -D T=%s -D ROWS_PER_WI=%d",
interMap[interpolation], borderMap[borderType],
ocl::typeToStr(type), rowsPerWI);
@ -1363,8 +1363,8 @@ static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, Input
char cvt[3][50];
int wdepth = std::max(CV_32F, depth);
buildOptions = buildOptions
+ format(" -D WT=%s -D convertToT=%s -D convertToWT=%s"
" -D convertToWT2=%s -D WT2=%s",
+ format(" -D WT=%s -D CONVERT_TO_T=%s -D CONVERT_TO_WT=%s"
" -D CONVERT_TO_WT2=%s -D WT2=%s",
ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
ocl::convertTypeStr(wdepth, depth, cn, cvt[0], sizeof(cvt[0])),
ocl::convertTypeStr(depth, wdepth, cn, cvt[1], sizeof(cvt[1])),
@ -1373,7 +1373,7 @@ static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, Input
}
int scalarcn = cn == 3 ? 4 : cn;
int sctype = CV_MAKETYPE(depth, scalarcn);
buildOptions += format(" -D T=%s -D T1=%s -D cn=%d -D ST=%s -D depth=%d",
buildOptions += format(" -D T=%s -D T1=%s -D CN=%d -D ST=%s -D SRC_DEPTH=%d",
ocl::typeToStr(type), ocl::typeToStr(depth),
cn, ocl::typeToStr(sctype), depth);
@ -2494,7 +2494,7 @@ static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
String opts;
if (interpolation == INTER_NEAREST)
{
opts = format("-D INTER_NEAREST -D T=%s%s -D CT=%s -D T1=%s -D ST=%s -D cn=%d -D rowsPerWI=%d",
opts = format("-D INTER_NEAREST -D T=%s%s -D CT=%s -D T1=%s -D ST=%s -D CN=%d -D ROWS_PER_WI=%d",
ocl::typeToStr(type),
doubleSupport ? " -D DOUBLE_SUPPORT" : "",
useDouble ? "double" : "float",
@ -2504,8 +2504,8 @@ static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
else
{
char cvt[2][50];
opts = format("-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D depth=%d"
" -D convertToWT=%s -D convertToT=%s%s -D CT=%s -D cn=%d -D rowsPerWI=%d",
opts = format("-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D SRC_DEPTH=%d"
" -D CONVERT_TO_WT=%s -D CONVERT_TO_T=%s%s -D CT=%s -D CN=%d -D ROWS_PER_WI=%d",
interpolationMap[interpolation], ocl::typeToStr(type),
ocl::typeToStr(CV_MAT_DEPTH(type)),
ocl::typeToStr(sctype),

@ -46,21 +46,21 @@
/**************************************PUBLICFUNC*************************************/
#if depth == 0
#if SRC_DEPTH == 0
#define DATA_TYPE uchar
#define MAX_NUM 255
#define HALF_MAX_NUM 128
#define COEFF_TYPE int
#define SAT_CAST(num) convert_uchar_sat(num)
#define DEPTH_0
#elif depth == 2
#elif SRC_DEPTH == 2
#define DATA_TYPE ushort
#define MAX_NUM 65535
#define HALF_MAX_NUM 32768
#define COEFF_TYPE int
#define SAT_CAST(num) convert_ushort_sat(num)
#define DEPTH_2
#elif depth == 5
#elif SRC_DEPTH == 5
#define DATA_TYPE float
#define MAX_NUM 1.0f
#define HALF_MAX_NUM 0.5f
@ -78,18 +78,18 @@ enum
hsv_shift = 12
};
#define scnbytes ((int)sizeof(DATA_TYPE)*scn)
#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)
#define scnbytes ((int)sizeof(DATA_TYPE)*SCN)
#define dcnbytes ((int)sizeof(DATA_TYPE)*DCN)
#ifndef hscale
#define hscale 0
#ifndef HSCALE
#define HSCALE 0
#endif
#ifndef hrange
#define hrange 0
#ifndef HRANGE
#define HRANGE 0
#endif
#if bidx == 0
#if BIDX == 0
#define R_COMP z
#define G_COMP y
#define B_COMP x
@ -148,7 +148,7 @@ __kernel void RGB2HSV(__global const uchar* src, int src_step, int src_offset,
h = (vr & (g - b)) +
(~vr & ((vg & mad24(diff, 2, b - r)) + ((~vg) & mad24(4, diff, r - g))));
h = mad24(h, hdiv_table[diff], (1 << (hsv_shift-1))) >> hsv_shift;
h += h < 0 ? hrange : 0;
h += h < 0 ? HRANGE : 0;
dst[dst_index] = convert_uchar_sat_rte(h);
dst[dst_index + 1] = (uchar)s;
@ -188,7 +188,7 @@ __kernel void HSV2RGB(__global const uchar* src, int src_step, int src_offset,
{
float tab[4];
int sector;
h *= hscale;
h *= HSCALE;
if( h < 0 )
do h += 6; while( h < 0 );
else if( h >= 6 )
@ -213,10 +213,10 @@ __kernel void HSV2RGB(__global const uchar* src, int src_step, int src_offset,
else
b = g = r = v;
dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f);
dst[dst_index + BIDX] = convert_uchar_sat_rte(b*255.f);
dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f);
dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
#if dcn == 4
dst[dst_index + (BIDX^2)] = convert_uchar_sat_rte(r*255.f);
#if DCN == 4
dst[dst_index + 3] = MAX_NUM;
#endif
@ -275,7 +275,7 @@ __kernel void RGB2HSV(__global const uchar* srcptr, int src_step, int src_offset
if( h < 0 )
h += 360.f;
dst[0] = h*hscale;
dst[0] = h*HSCALE;
dst[1] = s;
dst[2] = v;
@ -316,7 +316,7 @@ __kernel void HSV2RGB(__global const uchar* srcptr, int src_step, int src_offset
{
float tab[4];
int sector;
h *= hscale;
h *= HSCALE;
if(h < 0)
do h += 6; while (h < 0);
else if (h >= 6)
@ -341,10 +341,10 @@ __kernel void HSV2RGB(__global const uchar* srcptr, int src_step, int src_offset
else
b = g = r = v;
dst[bidx] = b;
dst[BIDX] = b;
dst[1] = g;
dst[bidx^2] = r;
#if dcn == 4
dst[BIDX^2] = r;
#if DCN == 4
dst[3] = MAX_NUM;
#endif
@ -410,7 +410,7 @@ __kernel void RGB2HLS(__global const uchar* src, int src_step, int src_offset,
h += 360.f;
}
dst[dst_index] = convert_uchar_sat_rte(h*hscale);
dst[dst_index] = convert_uchar_sat_rte(h*HSCALE);
dst[dst_index + 1] = convert_uchar_sat_rte(l*255.f);
dst[dst_index + 2] = convert_uchar_sat_rte(s*255.f);
@ -451,7 +451,7 @@ __kernel void HLS2RGB(__global const uchar* src, int src_step, int src_offset,
float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
float p1 = 2*l - p2;
h *= hscale;
h *= HSCALE;
if( h < 0 )
do h += 6; while( h < 0 );
else if( h >= 6 )
@ -472,10 +472,10 @@ __kernel void HLS2RGB(__global const uchar* src, int src_step, int src_offset,
else
b = g = r = l;
dst[dst_index + bidx] = convert_uchar_sat_rte(b*255.f);
dst[dst_index + BIDX] = convert_uchar_sat_rte(b*255.f);
dst[dst_index + 1] = convert_uchar_sat_rte(g*255.f);
dst[dst_index + (bidx^2)] = convert_uchar_sat_rte(r*255.f);
#if dcn == 4
dst[dst_index + (BIDX^2)] = convert_uchar_sat_rte(r*255.f);
#if DCN == 4
dst[dst_index + 3] = MAX_NUM;
#endif
@ -538,7 +538,7 @@ __kernel void RGB2HLS(__global const uchar* srcptr, int src_step, int src_offset
if( h < 0.f ) h += 360.f;
}
dst[0] = h*hscale;
dst[0] = h*HSCALE;
dst[1] = l;
dst[2] = s;
@ -582,7 +582,7 @@ __kernel void HLS2RGB(__global const uchar* srcptr, int src_step, int src_offset
float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
float p1 = 2*l - p2;
h *= hscale;
h *= HSCALE;
if( h < 0 )
do h += 6; while( h < 0 );
else if( h >= 6 )
@ -603,10 +603,10 @@ __kernel void HLS2RGB(__global const uchar* srcptr, int src_step, int src_offset
else
b = g = r = l;
dst[bidx] = b;
dst[BIDX] = b;
dst[1] = g;
dst[bidx^2] = r;
#if dcn == 4
dst[BIDX^2] = r;
#if DCN == 4
dst[3] = MAX_NUM;
#endif

@ -44,21 +44,21 @@
//
//M*/
#if depth == 0
#if SRC_DEPTH == 0
#define DATA_TYPE uchar
#define MAX_NUM 255
#define HALF_MAX_NUM 128
#define COEFF_TYPE int
#define SAT_CAST(num) convert_uchar_sat(num)
#define DEPTH_0
#elif depth == 2
#elif SRC_DEPTH == 2
#define DATA_TYPE ushort
#define MAX_NUM 65535
#define HALF_MAX_NUM 32768
#define COEFF_TYPE int
#define SAT_CAST(num) convert_ushort_sat(num)
#define DEPTH_2
#elif depth == 5
#elif SRC_DEPTH == 5
#define DATA_TYPE float
#define MAX_NUM 1.0f
#define HALF_MAX_NUM 0.5f
@ -76,8 +76,8 @@ enum
xyz_shift = 12,
};
#define scnbytes ((int)sizeof(DATA_TYPE)*scn)
#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)
#define scnbytes ((int)sizeof(DATA_TYPE)*SCN)
#define dcnbytes ((int)sizeof(DATA_TYPE)*DCN)
#define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y)
@ -167,11 +167,11 @@ __kernel void XYZ2RGB(__global const uchar * srcptr, int src_step, int src_offse
DATA_TYPE dst0 = SAT_CAST(b);
DATA_TYPE dst1 = SAT_CAST(g);
DATA_TYPE dst2 = SAT_CAST(r);
#if dcn == 3 || defined DEPTH_5
#if DCN == 3 || defined DEPTH_5
dst[0] = dst0;
dst[1] = dst1;
dst[2] = dst2;
#if dcn == 4
#if DCN == 4
dst[3] = MAX_NUM;
#endif
#else
@ -403,7 +403,7 @@ __kernel void Lab2BGR(__global const uchar * src, int src_step, int src_offset,
#endif
coeffs, lThresh, fThresh);
#if dcn == 3
#if DCN == 3
dst_ptr[0] = SAT_CAST(dstbuf[0] * 255.0f);
dst_ptr[1] = SAT_CAST(dstbuf[1] * 255.0f);
dst_ptr[2] = SAT_CAST(dstbuf[2] * 255.0f);
@ -455,7 +455,7 @@ __kernel void Lab2BGR(__global const uchar * srcptr, int src_step, int src_offse
coeffs, lThresh, fThresh);
dst[0] = dstbuf[0], dst[1] = dstbuf[1], dst[2] = dstbuf[2];
#if dcn == 4
#if DCN == 4
dst[3] = MAX_NUM;
#endif
++y;
@ -644,7 +644,7 @@ __kernel void Luv2BGR(__global const uchar * srcptr, int src_step, int src_offse
dst[0] = R;
dst[1] = G;
dst[2] = B;
#if dcn == 4
#if DCN == 4
dst[3] = MAX_NUM;
#endif
++y;
@ -717,7 +717,7 @@ __kernel void Luv2BGR(__global const uchar * src, int src_step, int src_offset,
uchar dst1 = SAT_CAST(G * 255.0f);
uchar dst2 = SAT_CAST(B * 255.0f);
#if dcn == 4
#if DCN == 4
*(__global uchar4 *)dst = (uchar4)(dst0, dst1, dst2, MAX_NUM);
#else
dst[0] = dst0;

@ -46,21 +46,21 @@
/**************************************PUBLICFUNC*************************************/
#if depth == 0
#if SRC_DEPTH == 0
#define DATA_TYPE uchar
#define MAX_NUM 255
#define HALF_MAX_NUM 128
#define COEFF_TYPE int
#define SAT_CAST(num) convert_uchar_sat(num)
#define DEPTH_0
#elif depth == 2
#elif SRC_DEPTH == 2
#define DATA_TYPE ushort
#define MAX_NUM 65535
#define HALF_MAX_NUM 32768
#define COEFF_TYPE int
#define SAT_CAST(num) convert_ushort_sat(num)
#define DEPTH_2
#elif depth == 5
#elif SRC_DEPTH == 5
#define DATA_TYPE float
#define MAX_NUM 1.0f
#define HALF_MAX_NUM 0.5f
@ -86,10 +86,10 @@ enum
#define G2YF 0.587f
#define R2YF 0.299f
#define scnbytes ((int)sizeof(DATA_TYPE)*scn)
#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)
#define scnbytes ((int)sizeof(DATA_TYPE)*SCN)
#define dcnbytes ((int)sizeof(DATA_TYPE)*DCN)
#if bidx == 0
#if BIDX == 0
#define R_COMP z
#define G_COMP y
#define B_COMP x
@ -160,9 +160,9 @@ __kernel void Gray2RGB(__global const uchar * srcptr, int src_step, int src_offs
__global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + src_index);
__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
DATA_TYPE val = src[0];
#if dcn == 3 || defined DEPTH_5
#if DCN == 3 || defined DEPTH_5
dst[0] = dst[1] = dst[2] = val;
#if dcn == 4
#if DCN == 4
dst[3] = MAX_NUM;
#endif
#else
@ -197,7 +197,7 @@ __kernel void RGB(__global const uchar* srcptr, int src_step, int src_offset,
{
__global const DATA_TYPE * src = (__global const DATA_TYPE *)(srcptr + src_index);
__global DATA_TYPE * dst = (__global DATA_TYPE *)(dstptr + dst_index);
#if scn == 3
#if SCN == 3
DATA_TYPE_3 src_pix = vload3(0, src);
#else
DATA_TYPE_4 src_pix = vload4(0, src);
@ -213,8 +213,8 @@ __kernel void RGB(__global const uchar* srcptr, int src_step, int src_offset,
dst[2] = src_pix.z;
#endif
#if dcn == 4
#if scn == 3
#if DCN == 4
#if SCN == 3
dst[3] = MAX_NUM;
#else
dst[3] = src[3];
@ -250,18 +250,18 @@ __kernel void RGB5x52RGB(__global const uchar* src, int src_step, int src_offset
{
ushort t = *((__global const ushort*)(src + src_index));
#if greenbits == 6
dst[dst_index + bidx] = (uchar)(t << 3);
#if GREENBITS == 6
dst[dst_index + BIDX] = (uchar)(t << 3);
dst[dst_index + 1] = (uchar)((t >> 3) & ~3);
dst[dst_index + (bidx^2)] = (uchar)((t >> 8) & ~7);
dst[dst_index + (BIDX^2)] = (uchar)((t >> 8) & ~7);
#else
dst[dst_index + bidx] = (uchar)(t << 3);
dst[dst_index + BIDX] = (uchar)(t << 3);
dst[dst_index + 1] = (uchar)((t >> 2) & ~7);
dst[dst_index + (bidx^2)] = (uchar)((t >> 7) & ~7);
dst[dst_index + (BIDX^2)] = (uchar)((t >> 7) & ~7);
#endif
#if dcn == 4
#if greenbits == 6
#if DCN == 4
#if GREENBITS == 6
dst[dst_index + 3] = 255;
#else
dst[dst_index + 3] = t & 0x8000 ? 255 : 0;
@ -295,9 +295,9 @@ __kernel void RGB2RGB5x5(__global const uchar* src, int src_step, int src_offset
{
uchar4 src_pix = vload4(0, src + src_index);
#if greenbits == 6
#if GREENBITS == 6
*((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~3) << 3)|((src_pix.R_COMP&~7) << 8));
#elif scn == 3
#elif SCN == 3
*((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|((src_pix.R_COMP&~7) << 7));
#else
*((__global ushort*)(dst + dst_index)) = (ushort)((src_pix.B_COMP >> 3)|((src_pix.G_COMP&~7) << 2)|
@ -333,7 +333,7 @@ __kernel void BGR5x52Gray(__global const uchar* src, int src_step, int src_offse
{
int t = *((__global const ushort*)(src + src_index));
#if greenbits == 6
#if GREENBITS == 6
dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, BY15, mad24((t >> 3) & 0xfc, GY15, ((t >> 8) & 0xf8) * RY15)), gray_shift);
#else
dst[dst_index] = (uchar)CV_DESCALE(mad24((t << 3) & 0xf8, BY15, mad24((t >> 2) & 0xf8, GY15, ((t >> 7) & 0xf8) * RY15)), gray_shift);
@ -365,7 +365,7 @@ __kernel void Gray2BGR5x5(__global const uchar* src, int src_step, int src_offse
{
int t = src[src_index];
#if greenbits == 6
#if GREENBITS == 6
*((__global ushort*)(dst + dst_index)) = (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));
#else
t >>= 3;

@ -46,21 +46,21 @@
/**************************************PUBLICFUNC*************************************/
#if depth == 0
#if SRC_DEPTH == 0
#define DATA_TYPE uchar
#define MAX_NUM 255
#define HALF_MAX_NUM 128
#define COEFF_TYPE int
#define SAT_CAST(num) convert_uchar_sat(num)
#define DEPTH_0
#elif depth == 2
#elif SRC_DEPTH == 2
#define DATA_TYPE ushort
#define MAX_NUM 65535
#define HALF_MAX_NUM 32768
#define COEFF_TYPE int
#define SAT_CAST(num) convert_ushort_sat(num)
#define DEPTH_2
#elif depth == 5
#elif SRC_DEPTH == 5
#define DATA_TYPE float
#define MAX_NUM 1.0f
#define HALF_MAX_NUM 0.5f
@ -114,10 +114,10 @@ enum
#define CR2GI -11698
#define CB2BI 29049
#define scnbytes ((int)sizeof(DATA_TYPE)*scn)
#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)
#define scnbytes ((int)sizeof(DATA_TYPE)*SCN)
#define dcnbytes ((int)sizeof(DATA_TYPE)*DCN)
#if bidx == 0
#if BIDX == 0
#define R_COMP z
#define G_COMP y
#define B_COMP x
@ -127,12 +127,12 @@ enum
#define B_COMP z
#endif
#ifndef uidx
#define uidx 0
#ifndef UIDX
#define UIDX 0
#endif
#ifndef yidx
#define yidx 0
#ifndef YIDX
#define YIDX 0
#endif
#ifndef PIX_PER_WI_X
@ -234,10 +234,10 @@ __kernel void YUV2RGB(__global const uchar* srcptr, int src_step, int src_offset
const int b = Y + CV_DESCALE(mul24(U - HALF_MAX_NUM, coeffs[0]), yuv_shift);
#endif
dst[bidx] = SAT_CAST( b );
dst[BIDX] = SAT_CAST( b );
dst[1] = SAT_CAST( g );
dst[bidx^2] = SAT_CAST( r );
#if dcn == 4
dst[BIDX^2] = SAT_CAST( r );
#if DCN == 4
dst[3] = MAX_NUM;
#endif
++y;
@ -266,7 +266,7 @@ __kernel void YUV2RGB_NVx(__global const uchar* srcptr, int src_step, int src_of
{
__global const uchar* ysrc = srcptr + mad24(y << 1, src_step, (x << 1) + src_offset);
__global const uchar* usrc = srcptr + mad24(rows + y, src_step, (x << 1) + src_offset);
__global uchar* dst1 = dstptr + mad24(y << 1, dst_step, mad24(x, dcn<<1, dt_offset));
__global uchar* dst1 = dstptr + mad24(y << 1, dst_step, mad24(x, DCN<<1, dt_offset));
__global uchar* dst2 = dst1 + dst_step;
float Y1 = ysrc[0];
@ -274,8 +274,8 @@ __kernel void YUV2RGB_NVx(__global const uchar* srcptr, int src_step, int src_of
float Y3 = ysrc[src_step];
float Y4 = ysrc[src_step + 1];
float U = ((float)usrc[uidx]) - HALF_MAX_NUM;
float V = ((float)usrc[1-uidx]) - HALF_MAX_NUM;
float U = ((float)usrc[UIDX]) - HALF_MAX_NUM;
float V = ((float)usrc[1-UIDX]) - HALF_MAX_NUM;
__constant float* coeffs = c_YUV2RGBCoeffs_420;
float ruv = fma(coeffs[4], V, 0.5f);
@ -283,34 +283,34 @@ __kernel void YUV2RGB_NVx(__global const uchar* srcptr, int src_step, int src_of
float buv = fma(coeffs[1], U, 0.5f);
Y1 = max(0.f, Y1 - 16.f) * coeffs[0];
dst1[2 - bidx] = convert_uchar_sat(Y1 + ruv);
dst1[2 - BIDX] = convert_uchar_sat(Y1 + ruv);
dst1[1] = convert_uchar_sat(Y1 + guv);
dst1[bidx] = convert_uchar_sat(Y1 + buv);
#if dcn == 4
dst1[BIDX] = convert_uchar_sat(Y1 + buv);
#if DCN == 4
dst1[3] = 255;
#endif
Y2 = max(0.f, Y2 - 16.f) * coeffs[0];
dst1[dcn + 2 - bidx] = convert_uchar_sat(Y2 + ruv);
dst1[dcn + 1] = convert_uchar_sat(Y2 + guv);
dst1[dcn + bidx] = convert_uchar_sat(Y2 + buv);
#if dcn == 4
dst1[DCN + 2 - BIDX] = convert_uchar_sat(Y2 + ruv);
dst1[DCN + 1] = convert_uchar_sat(Y2 + guv);
dst1[DCN + BIDX] = convert_uchar_sat(Y2 + buv);
#if DCN == 4
dst1[7] = 255;
#endif
Y3 = max(0.f, Y3 - 16.f) * coeffs[0];
dst2[2 - bidx] = convert_uchar_sat(Y3 + ruv);
dst2[2 - BIDX] = convert_uchar_sat(Y3 + ruv);
dst2[1] = convert_uchar_sat(Y3 + guv);
dst2[bidx] = convert_uchar_sat(Y3 + buv);
#if dcn == 4
dst2[BIDX] = convert_uchar_sat(Y3 + buv);
#if DCN == 4
dst2[3] = 255;
#endif
Y4 = max(0.f, Y4 - 16.f) * coeffs[0];
dst2[dcn + 2 - bidx] = convert_uchar_sat(Y4 + ruv);
dst2[dcn + 1] = convert_uchar_sat(Y4 + guv);
dst2[dcn + bidx] = convert_uchar_sat(Y4 + buv);
#if dcn == 4
dst2[DCN + 2 - BIDX] = convert_uchar_sat(Y4 + ruv);
dst2[DCN + 1] = convert_uchar_sat(Y4 + guv);
dst2[DCN + BIDX] = convert_uchar_sat(Y4 + buv);
#if DCN == 4
dst2[7] = 255;
#endif
}
@ -319,7 +319,7 @@ __kernel void YUV2RGB_NVx(__global const uchar* srcptr, int src_step, int src_of
}
}
#if uidx < 2
#if UIDX < 2
__kernel void YUV2RGB_YV12_IYUV(__global const uchar* srcptr, int src_step, int src_offset,
__global uchar* dstptr, int dst_step, int dt_offset,
@ -336,7 +336,7 @@ __kernel void YUV2RGB_YV12_IYUV(__global const uchar* srcptr, int src_step, int
if (y < rows / 2 )
{
__global const uchar* ysrc = srcptr + mad24(y << 1, src_step, (x << 1) + src_offset);
__global uchar* dst1 = dstptr + mad24(y << 1, dst_step, x * (dcn<<1) + dt_offset);
__global uchar* dst1 = dstptr + mad24(y << 1, dst_step, x * (DCN<<1) + dt_offset);
__global uchar* dst2 = dst1 + dst_step;
float Y1 = ysrc[0];
@ -354,8 +354,8 @@ __kernel void YUV2RGB_YV12_IYUV(__global const uchar* srcptr, int src_step, int
__global const uchar* vsrc = usrc + mad24(rows >> 2, src_step, rows % 4 ? vsteps[y%2] : 0);
float uv[2] = { ((float)usrc[0]) - HALF_MAX_NUM, ((float)vsrc[0]) - HALF_MAX_NUM };
#endif
float U = uv[uidx];
float V = uv[1-uidx];
float U = uv[UIDX];
float V = uv[1-UIDX];
__constant float* coeffs = c_YUV2RGBCoeffs_420;
float ruv = fma(coeffs[4], V, 0.5f);
@ -363,34 +363,34 @@ __kernel void YUV2RGB_YV12_IYUV(__global const uchar* srcptr, int src_step, int
float buv = fma(coeffs[1], U, 0.5f);
Y1 = max(0.f, Y1 - 16.f) * coeffs[0];
dst1[2 - bidx] = convert_uchar_sat(Y1 + ruv);
dst1[2 - BIDX] = convert_uchar_sat(Y1 + ruv);
dst1[1] = convert_uchar_sat(Y1 + guv);
dst1[bidx] = convert_uchar_sat(Y1 + buv);
#if dcn == 4
dst1[BIDX] = convert_uchar_sat(Y1 + buv);
#if DCN == 4
dst1[3] = 255;
#endif
Y2 = max(0.f, Y2 - 16.f) * coeffs[0];
dst1[dcn + 2 - bidx] = convert_uchar_sat(Y2 + ruv);
dst1[dcn + 1] = convert_uchar_sat(Y2 + guv);
dst1[dcn + bidx] = convert_uchar_sat(Y2 + buv);
#if dcn == 4
dst1[DCN + 2 - BIDX] = convert_uchar_sat(Y2 + ruv);
dst1[DCN + 1] = convert_uchar_sat(Y2 + guv);
dst1[DCN + BIDX] = convert_uchar_sat(Y2 + buv);
#if DCN == 4
dst1[7] = 255;
#endif
Y3 = max(0.f, Y3 - 16.f) * coeffs[0];
dst2[2 - bidx] = convert_uchar_sat(Y3 + ruv);
dst2[2 - BIDX] = convert_uchar_sat(Y3 + ruv);
dst2[1] = convert_uchar_sat(Y3 + guv);
dst2[bidx] = convert_uchar_sat(Y3 + buv);
#if dcn == 4
dst2[BIDX] = convert_uchar_sat(Y3 + buv);
#if DCN == 4
dst2[3] = 255;
#endif
Y4 = max(0.f, Y4 - 16.f) * coeffs[0];
dst2[dcn + 2 - bidx] = convert_uchar_sat(Y4 + ruv);
dst2[dcn + 1] = convert_uchar_sat(Y4 + guv);
dst2[dcn + bidx] = convert_uchar_sat(Y4 + buv);
#if dcn == 4
dst2[DCN + 2 - BIDX] = convert_uchar_sat(Y4 + ruv);
dst2[DCN + 1] = convert_uchar_sat(Y4 + guv);
dst2[DCN + BIDX] = convert_uchar_sat(Y4 + buv);
#if DCN == 4
dst2[7] = 255;
#endif
}
@ -401,7 +401,7 @@ __kernel void YUV2RGB_YV12_IYUV(__global const uchar* srcptr, int src_step, int
#endif
#if uidx < 2
#if UIDX < 2
__constant float c_RGB2YUVCoeffs_420[8] = { 0.256999969f, 0.50399971f, 0.09799957f, -0.1479988098f, -0.2909994125f,
0.438999176f, -0.3679990768f, -0.0709991455f };
@ -415,7 +415,7 @@ __kernel void RGB2YUV_YV12_IYUV(__global const uchar* srcptr, int src_step, int
if (x < cols/2)
{
int src_index = mad24(y << 1, src_step, mad24(x << 1, scn, src_offset));
int src_index = mad24(y << 1, src_step, mad24(x << 1, SCN, src_offset));
int ydst_index = mad24(y << 1, dst_step, (x << 1) + dst_offset);
int y_rows = rows / 3 * 2;
int vsteps[2] = { cols >> 1, dst_step - (cols >> 1)};
@ -438,56 +438,56 @@ __kernel void RGB2YUV_YV12_IYUV(__global const uchar* srcptr, int src_step, int
int s11 = *((__global const int*) src1);
int s12 = *((__global const int*) src1 + 1);
int s13 = *((__global const int*) src1 + 2);
#if scn == 4
#if SCN == 4
int s14 = *((__global const int*) src1 + 3);
#endif
int s21 = *((__global const int*) src2);
int s22 = *((__global const int*) src2 + 1);
int s23 = *((__global const int*) src2 + 2);
#if scn == 4
#if SCN == 4
int s24 = *((__global const int*) src2 + 3);
#endif
float src_pix1[scn * 4], src_pix2[scn * 4];
float src_pix1[SCN * 4], src_pix2[SCN * 4];
*((float4*) src_pix1) = convert_float4(as_uchar4(s11));
*((float4*) src_pix1 + 1) = convert_float4(as_uchar4(s12));
*((float4*) src_pix1 + 2) = convert_float4(as_uchar4(s13));
#if scn == 4
#if SCN == 4
*((float4*) src_pix1 + 3) = convert_float4(as_uchar4(s14));
#endif
*((float4*) src_pix2) = convert_float4(as_uchar4(s21));
*((float4*) src_pix2 + 1) = convert_float4(as_uchar4(s22));
*((float4*) src_pix2 + 2) = convert_float4(as_uchar4(s23));
#if scn == 4
#if SCN == 4
*((float4*) src_pix2 + 3) = convert_float4(as_uchar4(s24));
#endif
uchar4 y1, y2;
y1.x = convert_uchar_sat(fma(coeffs[0], src_pix1[ 2-bidx], fma(coeffs[1], src_pix1[ 1], fma(coeffs[2], src_pix1[ bidx], 16.5f))));
y1.y = convert_uchar_sat(fma(coeffs[0], src_pix1[ scn+2-bidx], fma(coeffs[1], src_pix1[ scn+1], fma(coeffs[2], src_pix1[ scn+bidx], 16.5f))));
y1.z = convert_uchar_sat(fma(coeffs[0], src_pix1[2*scn+2-bidx], fma(coeffs[1], src_pix1[2*scn+1], fma(coeffs[2], src_pix1[2*scn+bidx], 16.5f))));
y1.w = convert_uchar_sat(fma(coeffs[0], src_pix1[3*scn+2-bidx], fma(coeffs[1], src_pix1[3*scn+1], fma(coeffs[2], src_pix1[3*scn+bidx], 16.5f))));
y2.x = convert_uchar_sat(fma(coeffs[0], src_pix2[ 2-bidx], fma(coeffs[1], src_pix2[ 1], fma(coeffs[2], src_pix2[ bidx], 16.5f))));
y2.y = convert_uchar_sat(fma(coeffs[0], src_pix2[ scn+2-bidx], fma(coeffs[1], src_pix2[ scn+1], fma(coeffs[2], src_pix2[ scn+bidx], 16.5f))));
y2.z = convert_uchar_sat(fma(coeffs[0], src_pix2[2*scn+2-bidx], fma(coeffs[1], src_pix2[2*scn+1], fma(coeffs[2], src_pix2[2*scn+bidx], 16.5f))));
y2.w = convert_uchar_sat(fma(coeffs[0], src_pix2[3*scn+2-bidx], fma(coeffs[1], src_pix2[3*scn+1], fma(coeffs[2], src_pix2[3*scn+bidx], 16.5f))));
y1.x = convert_uchar_sat(fma(coeffs[0], src_pix1[ 2-BIDX], fma(coeffs[1], src_pix1[ 1], fma(coeffs[2], src_pix1[ BIDX], 16.5f))));
y1.y = convert_uchar_sat(fma(coeffs[0], src_pix1[ SCN+2-BIDX], fma(coeffs[1], src_pix1[ SCN+1], fma(coeffs[2], src_pix1[ SCN+BIDX], 16.5f))));
y1.z = convert_uchar_sat(fma(coeffs[0], src_pix1[2*SCN+2-BIDX], fma(coeffs[1], src_pix1[2*SCN+1], fma(coeffs[2], src_pix1[2*SCN+BIDX], 16.5f))));
y1.w = convert_uchar_sat(fma(coeffs[0], src_pix1[3*SCN+2-BIDX], fma(coeffs[1], src_pix1[3*SCN+1], fma(coeffs[2], src_pix1[3*SCN+BIDX], 16.5f))));
y2.x = convert_uchar_sat(fma(coeffs[0], src_pix2[ 2-BIDX], fma(coeffs[1], src_pix2[ 1], fma(coeffs[2], src_pix2[ BIDX], 16.5f))));
y2.y = convert_uchar_sat(fma(coeffs[0], src_pix2[ SCN+2-BIDX], fma(coeffs[1], src_pix2[ SCN+1], fma(coeffs[2], src_pix2[ SCN+BIDX], 16.5f))));
y2.z = convert_uchar_sat(fma(coeffs[0], src_pix2[2*SCN+2-BIDX], fma(coeffs[1], src_pix2[2*SCN+1], fma(coeffs[2], src_pix2[2*SCN+BIDX], 16.5f))));
y2.w = convert_uchar_sat(fma(coeffs[0], src_pix2[3*SCN+2-BIDX], fma(coeffs[1], src_pix2[3*SCN+1], fma(coeffs[2], src_pix2[3*SCN+BIDX], 16.5f))));
*((__global int*) ydst1) = as_int(y1);
*((__global int*) ydst2) = as_int(y2);
float uv[4] = { fma(coeffs[3], src_pix1[ 2-bidx], fma(coeffs[4], src_pix1[ 1], fma(coeffs[5], src_pix1[ bidx], 128.5f))),
fma(coeffs[5], src_pix1[ 2-bidx], fma(coeffs[6], src_pix1[ 1], fma(coeffs[7], src_pix1[ bidx], 128.5f))),
fma(coeffs[3], src_pix1[2*scn+2-bidx], fma(coeffs[4], src_pix1[2*scn+1], fma(coeffs[5], src_pix1[2*scn+bidx], 128.5f))),
fma(coeffs[5], src_pix1[2*scn+2-bidx], fma(coeffs[6], src_pix1[2*scn+1], fma(coeffs[7], src_pix1[2*scn+bidx], 128.5f))) };
float uv[4] = { fma(coeffs[3], src_pix1[ 2-BIDX], fma(coeffs[4], src_pix1[ 1], fma(coeffs[5], src_pix1[ BIDX], 128.5f))),
fma(coeffs[5], src_pix1[ 2-BIDX], fma(coeffs[6], src_pix1[ 1], fma(coeffs[7], src_pix1[ BIDX], 128.5f))),
fma(coeffs[3], src_pix1[2*SCN+2-BIDX], fma(coeffs[4], src_pix1[2*SCN+1], fma(coeffs[5], src_pix1[2*SCN+BIDX], 128.5f))),
fma(coeffs[5], src_pix1[2*SCN+2-BIDX], fma(coeffs[6], src_pix1[2*SCN+1], fma(coeffs[7], src_pix1[2*SCN+BIDX], 128.5f))) };
udst[0] = convert_uchar_sat(uv[uidx] );
vdst[0] = convert_uchar_sat(uv[1 - uidx]);
udst[1] = convert_uchar_sat(uv[2 + uidx]);
vdst[1] = convert_uchar_sat(uv[3 - uidx]);
udst[0] = convert_uchar_sat(uv[UIDX] );
vdst[0] = convert_uchar_sat(uv[1 - UIDX]);
udst[1] = convert_uchar_sat(uv[2 + UIDX]);
vdst[1] = convert_uchar_sat(uv[3 - UIDX]);
#else
float4 src_pix1 = convert_float4(vload4(0, src1));
float4 src_pix2 = convert_float4(vload4(0, src1+scn));
float4 src_pix2 = convert_float4(vload4(0, src1+SCN));
float4 src_pix3 = convert_float4(vload4(0, src2));
float4 src_pix4 = convert_float4(vload4(0, src2+scn));
float4 src_pix4 = convert_float4(vload4(0, src2+SCN));
ydst1[0] = convert_uchar_sat(fma(coeffs[0], src_pix1.R_COMP, fma(coeffs[1], src_pix1.G_COMP, fma(coeffs[2], src_pix1.B_COMP, 16.5f))));
ydst1[1] = convert_uchar_sat(fma(coeffs[0], src_pix2.R_COMP, fma(coeffs[1], src_pix2.G_COMP, fma(coeffs[2], src_pix2.B_COMP, 16.5f))));
@ -497,8 +497,8 @@ __kernel void RGB2YUV_YV12_IYUV(__global const uchar* srcptr, int src_step, int
float uv[2] = { fma(coeffs[3], src_pix1.R_COMP, fma(coeffs[4], src_pix1.G_COMP, fma(coeffs[5], src_pix1.B_COMP, 128.5f))),
fma(coeffs[5], src_pix1.R_COMP, fma(coeffs[6], src_pix1.G_COMP, fma(coeffs[7], src_pix1.B_COMP, 128.5f))) };
udst[0] = convert_uchar_sat(uv[uidx] );
vdst[0] = convert_uchar_sat(uv[1-uidx]);
udst[0] = convert_uchar_sat(uv[UIDX] );
vdst[0] = convert_uchar_sat(uv[1-UIDX]);
#endif
++y;
src_index += 2*src_step;
@ -520,7 +520,7 @@ __kernel void YUV2RGB_422(__global const uchar* srcptr, int src_step, int src_of
if (x < cols / 2)
{
__global const uchar* src = srcptr + mad24(y, src_step, (x << 2) + src_offset);
__global uchar* dst = dstptr + mad24(y, dst_step, mad24(x << 1, dcn, dst_offset));
__global uchar* dst = dstptr + mad24(y, dst_step, mad24(x << 1, DCN, dst_offset));
#pragma unroll
for (int cy = 0; cy < PIX_PER_WI_Y; ++cy)
@ -530,34 +530,34 @@ __kernel void YUV2RGB_422(__global const uchar* srcptr, int src_step, int src_of
__constant float* coeffs = c_YUV2RGBCoeffs_420;
#ifndef USE_OPTIMIZED_LOAD
float U = ((float) src[uidx]) - HALF_MAX_NUM;
float V = ((float) src[(2 + uidx) % 4]) - HALF_MAX_NUM;
float y00 = max(0.f, ((float) src[yidx]) - 16.f) * coeffs[0];
float y01 = max(0.f, ((float) src[yidx + 2]) - 16.f) * coeffs[0];
float U = ((float) src[UIDX]) - HALF_MAX_NUM;
float V = ((float) src[(2 + UIDX) % 4]) - HALF_MAX_NUM;
float y00 = max(0.f, ((float) src[YIDX]) - 16.f) * coeffs[0];
float y01 = max(0.f, ((float) src[YIDX + 2]) - 16.f) * coeffs[0];
#else
int load_src = *((__global int*) src);
float vec_src[4] = { load_src & 0xff, (load_src >> 8) & 0xff, (load_src >> 16) & 0xff, (load_src >> 24) & 0xff};
float U = vec_src[uidx] - HALF_MAX_NUM;
float V = vec_src[(2 + uidx) % 4] - HALF_MAX_NUM;
float y00 = max(0.f, vec_src[yidx] - 16.f) * coeffs[0];
float y01 = max(0.f, vec_src[yidx + 2] - 16.f) * coeffs[0];
float U = vec_src[UIDX] - HALF_MAX_NUM;
float V = vec_src[(2 + UIDX) % 4] - HALF_MAX_NUM;
float y00 = max(0.f, vec_src[YIDX] - 16.f) * coeffs[0];
float y01 = max(0.f, vec_src[YIDX + 2] - 16.f) * coeffs[0];
#endif
float ruv = fma(coeffs[4], V, 0.5f);
float guv = fma(coeffs[3], V, fma(coeffs[2], U, 0.5f));
float buv = fma(coeffs[1], U, 0.5f);
dst[2 - bidx] = convert_uchar_sat(y00 + ruv);
dst[2 - BIDX] = convert_uchar_sat(y00 + ruv);
dst[1] = convert_uchar_sat(y00 + guv);
dst[bidx] = convert_uchar_sat(y00 + buv);
#if dcn == 4
dst[BIDX] = convert_uchar_sat(y00 + buv);
#if DCN == 4
dst[3] = 255;
#endif
dst[dcn + 2 - bidx] = convert_uchar_sat(y01 + ruv);
dst[dcn + 1] = convert_uchar_sat(y01 + guv);
dst[dcn + bidx] = convert_uchar_sat(y01 + buv);
#if dcn == 4
dst[DCN + 2 - BIDX] = convert_uchar_sat(y01 + ruv);
dst[DCN + 1] = convert_uchar_sat(y01 + guv);
dst[DCN + BIDX] = convert_uchar_sat(y01 + buv);
#if DCN == 4
dst[7] = 255;
#endif
}
@ -600,7 +600,7 @@ __kernel void RGB2YUV_422(__global const uchar* srcptr, int src_step, int src_of
__global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + dst_index);
DATA_TYPE_3 src_pix1 = vload3(0, src);
DATA_TYPE b1 = src_pix1.B_COMP, g1 = src_pix1.G_COMP, r1 = src_pix1.R_COMP;
DATA_TYPE_3 src_pix2 = vload3(0, src+scn);
DATA_TYPE_3 src_pix2 = vload3(0, src+SCN);
DATA_TYPE b2 = src_pix2.B_COMP, g2 = src_pix2.G_COMP, r2 = src_pix2.R_COMP;
@ -625,10 +625,10 @@ __kernel void RGB2YUV_422(__global const uchar* srcptr, int src_step, int src_of
const res_dtype U = MAC_fn(coeffs[5], sr, coeffs[1] + MAC_fn(coeffs[6], sg, mul_fn(coeffs[7], sb)));
const res_dtype V = MAC_fn(coeffs[7], sr, coeffs[1] + MAC_fn(coeffs[8], sg, mul_fn(coeffs[9], sb)));
dst[uidx] = output_scale_fn(U);
dst[(2 + uidx) % 4] = output_scale_fn(V);
dst[yidx] = output_scale_fn(Y1);
dst[yidx+2] = output_scale_fn(Y2);
dst[UIDX] = output_scale_fn(U);
dst[(2 + UIDX) % 4] = output_scale_fn(V);
dst[YIDX] = output_scale_fn(Y1);
dst[YIDX+2] = output_scale_fn(Y2);
++y;
dst_index += dst_step;
@ -728,10 +728,10 @@ __kernel void YCrCb2RGB(__global const uchar* src, int src_step, int src_offset,
int b = yp + CV_DESCALE(coeff[3] * (cb - HALF_MAX_NUM), yuv_shift);
#endif
dstptr[(bidx^2)] = SAT_CAST(r);
dstptr[(BIDX^2)] = SAT_CAST(r);
dstptr[1] = SAT_CAST(g);
dstptr[bidx] = SAT_CAST(b);
#if dcn == 4
dstptr[BIDX] = SAT_CAST(b);
#if DCN == 4
dstptr[3] = MAX_NUM;
#endif

@ -13,25 +13,25 @@
__kernel void sumConvert(__global const uchar * src1ptr, int src1_step, int src1_offset,
__global const uchar * src2ptr, int src2_step, int src2_offset,
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
coeffT scale, coeffT delta)
COEFF_T scale, COEFF_T delta)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (y < dst_rows && x < dst_cols)
{
int src1_index = mad24(y, src1_step, mad24(x, (int)sizeof(srcT), src1_offset));
int src2_index = mad24(y, src2_step, mad24(x, (int)sizeof(srcT), src2_offset));
int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT), dst_offset));
int src1_index = mad24(y, src1_step, mad24(x, (int)sizeof(SRC_T), src1_offset));
int src2_index = mad24(y, src2_step, mad24(x, (int)sizeof(SRC_T), src2_offset));
int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(DST_T), dst_offset));
__global const srcT * src1 = (__global const srcT *)(src1ptr + src1_index);
__global const srcT * src2 = (__global const srcT *)(src2ptr + src2_index);
__global dstT * dst = (__global dstT *)(dstptr + dst_index);
__global const SRC_T * src1 = (__global const SRC_T *)(src1ptr + src1_index);
__global const SRC_T * src2 = (__global const SRC_T *)(src2ptr + src2_index);
__global DST_T * dst = (__global DST_T *)(dstptr + dst_index);
#if wdepth <= 4
dst[0] = convertToDT( mad24((WT)(scale), convertToWT(src1[0]) + convertToWT(src2[0]), (WT)(delta)) );
#if WDEPTH <= 4
dst[0] = CONVERT_TO_DT( mad24((WT)(scale), CONVERT_TO_WT(src1[0]) + CONVERT_TO_WT(src2[0]), (WT)(delta)) );
#else
dst[0] = convertToDT( mad((WT)(scale), convertToWT(src1[0]) + convertToWT(src2[0]), (WT)(delta)) );
dst[0] = CONVERT_TO_DT( mad((WT)(scale), CONVERT_TO_WT(src1[0]) + CONVERT_TO_WT(src2[0]), (WT)(delta)) );
#endif
}
}
@ -74,18 +74,18 @@ __kernel void sumConvert(__global const uchar * src1ptr, int src1_step, int src1
#endif
#if CN != 3
#define loadpix(addr) *(__global const srcT *)(addr)
#define storepix(val, addr) *(__global dstT *)(addr) = val
#define SRCSIZE (int)sizeof(srcT)
#define DSTSIZE (int)sizeof(dstT)
#define loadpix(addr) *(__global const SRC_T *)(addr)
#define storepix(val, addr) *(__global DST_T *)(addr) = val
#define SRCSIZE (int)sizeof(SRC_T)
#define DSTSIZE (int)sizeof(DST_T)
#else
#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
#define SRCSIZE (int)sizeof(srcT1)*3
#define DSTSIZE (int)sizeof(dstT1)*3
#define loadpix(addr) vload3(0, (__global const SRC_T1 *)(addr))
#define storepix(val, addr) vstore3(val, 0, (__global DST_T1 *)(addr))
#define SRCSIZE (int)sizeof(SRC_T1)*3
#define DSTSIZE (int)sizeof(DST_T1)*3
#endif
#define SRC(_x,_y) convertToWT(loadpix(Src + mad24(_y, src_step, SRCSIZE * _x)))
#define SRC(_x,_y) CONVERT_TO_WT(loadpix(Src + mad24(_y, src_step, SRCSIZE * _x)))
#ifdef BORDER_CONSTANT
// CCCCCC|abcdefgh|CCCCCCC
@ -173,7 +173,7 @@ __kernel void laplacian(__global uchar* Src, int src_step, int srcOffsetX, int s
}
WT sum = mad(scale_v, (sum1 + sum2), delta_v);
storepix(convertToDT(sum), Dst + mad24(y + liy, dst_step, mad24(x, DSTSIZE, dst_offset)));
storepix(CONVERT_TO_DT(sum), Dst + mad24(y + liy, dst_step, mad24(x, DSTSIZE, dst_offset)));
}
for (int i = liy * BLK_X + lix; i < (RADIUS*2) * (BLK_X+(RADIUS*2)); i += BLK_X * BLK_Y)
@ -203,4 +203,4 @@ __kernel void laplacian(__global uchar* Src, int src_step, int srcOffsetX, int s
}
}
#endif
#endif

@ -67,7 +67,7 @@
#error No extrapolation method
#endif
#if cn != 3
#if CN != 3
#define loadpix(addr) *(__global const T*)(addr)
#define storepix(val, addr) *(__global T*)(addr) = (val)
#define PIXSIZE ((int)sizeof(T))
@ -77,9 +77,9 @@
#define PIXSIZE ((int)sizeof(T1)*3)
#endif
#define SRC(_x,_y) convertToFT(loadpix(srcData + mad24(_y, src_step, PIXSIZE * _x)))
#define SRC(_x,_y) CONVERT_TO_FT(loadpix(srcData + mad24(_y, src_step, PIXSIZE * _x)))
#if kercn == 4
#if KERCN == 4
#define SRC4(_x,_y) convert_float4(vload4(0, srcData + mad24(_y, src_step, PIXSIZE * _x)))
#endif
@ -107,7 +107,7 @@
smem[1][col_lcl] = sum1;
#if kercn == 4
#if KERCN == 4
#define LOAD_LOCAL4(col_gl, col_lcl) \
sum40 = co3* SRC4(col_gl, EXTRAPOLATE_(src_y - 2, src_rows)); \
sum40 = MAD(co2, SRC4(col_gl, EXTRAPOLATE_(src_y - 1, src_rows)), sum40); \
@ -131,7 +131,7 @@
__kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)
{
const int x = get_global_id(0)*kercn;
const int x = get_global_id(0)*KERCN;
const int y = 2*get_global_id(1);
__local FT smem[2][LOCAL_SIZE + 4];
@ -150,7 +150,7 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
{
#undef EXTRAPOLATE_
#define EXTRAPOLATE_(val, maxVal) val
#if kercn == 1
#if KERCN == 1
col = EXTRAPOLATE(x, src_cols);
LOAD_LOCAL(col, 2 + get_local_id(0))
#else
@ -183,7 +183,7 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
{
#undef EXTRAPOLATE_
#define EXTRAPOLATE_(val, maxVal) EXTRAPOLATE(val, maxVal)
#if kercn == 1
#if KERCN == 1
col = EXTRAPOLATE(x, src_cols);
LOAD_LOCAL(col, 2 + get_local_id(0))
#else
@ -215,7 +215,7 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
barrier(CLK_LOCAL_MEM_FENCE);
#if kercn == 1
#if KERCN == 1
if (get_local_id(0) < LOCAL_SIZE / 2)
{
const int tid2 = get_local_id(0) * 2;
@ -226,8 +226,8 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
{
for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)
{
#if cn == 1
#if fdepth <= 5
#if CN == 1
#if FDEPTH <= 5
FT sum = dot(vload4(0, (__local float*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (float4)(co3, co2, co1, co2));
#else
FT sum = dot(vload4(0, (__local double*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (double4)(co3, co2, co1, co2));
@ -239,7 +239,7 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
sum = MAD(co2, smem[yin - y][2 + tid2 + 1], sum);
#endif
sum = MAD(co3, smem[yin - y][2 + tid2 + 2], sum);
storepix(convertToT(sum), dstData + yin * dst_step + dst_x * PIXSIZE);
storepix(CONVERT_TO_T(sum), dstData + yin * dst_step + dst_x * PIXSIZE);
}
}
}
@ -256,7 +256,7 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
sum = MAD(co2, smem[yin - y][2 + tid4 - 1], sum);
sum = MAD(co1, smem[yin - y][2 + tid4 ], sum);
sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);
storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
storepix(CONVERT_TO_T(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
dst_x ++;
sum = co3* smem[yin - y][2 + tid4 + 4];
@ -264,7 +264,7 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);
sum = MAD(co1, smem[yin - y][2 + tid4 + 2], sum);
sum = MAD(co2, smem[yin - y][2 + tid4 + 3], sum);
storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
storepix(CONVERT_TO_T(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
dst_x --;
}
@ -279,7 +279,7 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
sum = MAD(co1, smem[yin - y][2 + tid4 ], sum);
sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);
storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
storepix(CONVERT_TO_T(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
}
}
#endif

@ -58,7 +58,7 @@
#endif
#endif
#if cn != 3
#if CN != 3
#define loadpix(addr) *(__global const T*)(addr)
#define storepix(val, addr) *(__global T*)(addr) = (val)
#define PIXSIZE ((int)sizeof(T))
@ -92,7 +92,7 @@ __kernel void pyrUp(__global const uchar * src, int src_step, int src_offset, in
int srcx = EXTRAPOLATE(mad24((int)get_group_id(0), LOCAL_SIZE/2, tidx) - 1, src_cols);
int srcy = EXTRAPOLATE(mad24((int)get_group_id(1), LOCAL_SIZE/2, tidy) - 1, src_rows);
s_srcPatch[tidy][tidx] = convertToFT(loadpix(srcData + srcy * src_step + srcx * PIXSIZE));
s_srcPatch[tidy][tidx] = CONVERT_TO_FT(loadpix(srcData + srcy * src_step + srcx * PIXSIZE));
}
barrier(CLK_LOCAL_MEM_FENCE);
@ -124,7 +124,7 @@ __kernel void pyrUp(__global const uchar * src, int src_step, int src_offset, in
sum = mad(coefy2, s_dstPatch[1 + ((tidy + 2) >> 1)][tidx], sum);
if ((x < dst_cols) && (y < dst_rows))
storepix(convertToT(sum), dstData + y * dst_step + x * PIXSIZE);
storepix(CONVERT_TO_T(sum), dstData + y * dst_step + x * PIXSIZE);
}
@ -149,10 +149,10 @@ __kernel void pyrUp_unrolled(__global const uchar * src, int src_step, int src_o
int srcx2 = EXTRAPOLATE(srcx+1, src_cols);
int srcy1 = EXTRAPOLATE(srcy, src_rows);
int srcy2 = EXTRAPOLATE(srcy+1, src_rows);
s_srcPatch[ly][lx] = convertToFT(loadpix(srcData + srcy1 * src_step + srcx1 * PIXSIZE));
s_srcPatch[ly+1][lx] = convertToFT(loadpix(srcData + srcy2 * src_step + srcx1 * PIXSIZE));
s_srcPatch[ly][lx+1] = convertToFT(loadpix(srcData + srcy1 * src_step + srcx2 * PIXSIZE));
s_srcPatch[ly+1][lx+1] = convertToFT(loadpix(srcData + srcy2 * src_step + srcx2 * PIXSIZE));
s_srcPatch[ly][lx] = CONVERT_TO_FT(loadpix(srcData + srcy1 * src_step + srcx1 * PIXSIZE));
s_srcPatch[ly+1][lx] = CONVERT_TO_FT(loadpix(srcData + srcy2 * src_step + srcx1 * PIXSIZE));
s_srcPatch[ly][lx+1] = CONVERT_TO_FT(loadpix(srcData + srcy1 * src_step + srcx2 * PIXSIZE));
s_srcPatch[ly+1][lx+1] = CONVERT_TO_FT(loadpix(srcData + srcy2 * src_step + srcx2 * PIXSIZE));
}
barrier(CLK_LOCAL_MEM_FENCE);
@ -213,22 +213,22 @@ __kernel void pyrUp_unrolled(__global const uchar * src, int src_step, int src_o
sum = co3 * s_dstPatch[1 + get_local_id(1) - 1][lx];
sum = mad(co1, s_dstPatch[1 + get_local_id(1) ][lx], sum);
sum = mad(co3, s_dstPatch[1 + get_local_id(1) + 1][lx], sum);
storepix(convertToT(sum), dstData + dst_y * dst_step + dst_x * PIXSIZE);
storepix(CONVERT_TO_T(sum), dstData + dst_y * dst_step + dst_x * PIXSIZE);
// (x+1,y)
sum = co3 * s_dstPatch[1 + get_local_id(1) - 1][lx+1];
sum = mad(co1, s_dstPatch[1 + get_local_id(1) ][lx+1], sum);
sum = mad(co3, s_dstPatch[1 + get_local_id(1) + 1][lx+1], sum);
storepix(convertToT(sum), dstData + dst_y * dst_step + (dst_x+1) * PIXSIZE);
storepix(CONVERT_TO_T(sum), dstData + dst_y * dst_step + (dst_x+1) * PIXSIZE);
// (x,y+1)
sum = co2 * s_dstPatch[1 + get_local_id(1) ][lx];
sum = mad(co2, s_dstPatch[1 + get_local_id(1) + 1][lx], sum);
storepix(convertToT(sum), dstData + (dst_y+1) * dst_step + dst_x * PIXSIZE);
storepix(CONVERT_TO_T(sum), dstData + (dst_y+1) * dst_step + dst_x * PIXSIZE);
// (x+1,y+1)
sum = co2 * s_dstPatch[1 + get_local_id(1) ][lx+1];
sum = mad(co2, s_dstPatch[1 + get_local_id(1) + 1][lx+1], sum);
storepix(convertToT(sum), dstData + (dst_y+1) * dst_step + (dst_x+1) * PIXSIZE);
storepix(CONVERT_TO_T(sum), dstData + (dst_y+1) * dst_step + (dst_x+1) * PIXSIZE);
}
}

@ -53,7 +53,7 @@
#define noconvert
#if cn != 3
#if CN != 3
#define loadpix(addr) *(__global const T*)(addr)
#define storepix(val, addr) *(__global T*)(addr) = val
#define TSIZE ((int)sizeof(T))
@ -73,7 +73,7 @@ enum
};
#ifdef INTER_NEAREST
#define convertToWT
#define CONVERT_TO_WT
#endif
#ifdef BORDER_CONSTANT
@ -82,7 +82,7 @@ enum
#define EXTRAPOLATE(v2, v) \
{ \
v2 = max(min(v2, (int2)(src_cols - 1, src_rows - 1)), (int2)(0)); \
v = convertToWT(loadpix((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * TSIZE + src_offset)))); \
v = CONVERT_TO_WT(loadpix((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * TSIZE + src_offset)))); \
}
#elif defined BORDER_WRAP
#define EXTRAPOLATE(v2, v) \
@ -96,7 +96,7 @@ enum
v2.y -= ((v2.y - src_rows + 1) / src_rows) * src_rows; \
if( v2.y >= src_rows ) \
v2.y %= src_rows; \
v = convertToWT(loadpix((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * TSIZE + src_offset)))); \
v = CONVERT_TO_WT(loadpix((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * TSIZE + src_offset)))); \
}
#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)
#ifdef BORDER_REFLECT
@ -130,7 +130,7 @@ enum
v2.y = src_rows - 1 - (v2.y - src_rows) - delta; \
} \
while (v2.y >= src_rows || v2.y < 0); \
v = convertToWT(loadpix((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * TSIZE + src_offset)))); \
v = CONVERT_TO_WT(loadpix((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * TSIZE + src_offset)))); \
}
#else
#error No extrapolation method
@ -147,7 +147,7 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
ST nVal)
{
int x = get_global_id(0);
int y = get_global_id(1) * rowsPerWI;
int y = get_global_id(1) * ROWS_PER_WI;
if (x < dst_cols)
{
@ -158,7 +158,7 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
#pragma unroll
for (int i = 0; i < rowsPerWI; ++i, ++y,
for (int i = 0; i < ROWS_PER_WI; ++i, ++y,
map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)
if (y < dst_rows)
{
@ -193,7 +193,7 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o
ST nVal)
{
int x = get_global_id(0);
int y = get_global_id(1) * rowsPerWI;
int y = get_global_id(1) * ROWS_PER_WI;
if (x < dst_cols)
{
@ -202,7 +202,7 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o
int map_index = mad24(y, map_step, mad24(x, (int)sizeof(float2), map_offset));
#pragma unroll
for (int i = 0; i < rowsPerWI; ++i, ++y,
for (int i = 0; i < ROWS_PER_WI; ++i, ++y,
map_index += map_step, dst_index += dst_step)
if (y < dst_rows)
{
@ -233,7 +233,7 @@ __kernel void remap_16SC2(__global const uchar * srcptr, int src_step, int src_o
ST nVal)
{
int x = get_global_id(0);
int y = get_global_id(1) * rowsPerWI;
int y = get_global_id(1) * ROWS_PER_WI;
if (x < dst_cols)
{
@ -242,7 +242,7 @@ __kernel void remap_16SC2(__global const uchar * srcptr, int src_step, int src_o
int map_index = mad24(y, map_step, mad24(x, (int)sizeof(short2), map_offset));
#pragma unroll
for (int i = 0; i < rowsPerWI; ++i, ++y,
for (int i = 0; i < ROWS_PER_WI; ++i, ++y,
map_index += map_step, dst_index += dst_step)
if (y < dst_rows)
{
@ -274,7 +274,7 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int
ST nVal)
{
int x = get_global_id(0);
int y = get_global_id(1) * rowsPerWI;
int y = get_global_id(1) * ROWS_PER_WI;
if (x < dst_cols)
{
@ -284,7 +284,7 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int
int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(ushort), map2_offset));
#pragma unroll
for (int i = 0; i < rowsPerWI; ++i, ++y,
for (int i = 0; i < ROWS_PER_WI; ++i, ++y,
map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)
if (y < dst_rows)
{
@ -330,17 +330,17 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int
ST nVal)
{
int x = get_global_id(0);
int y = get_global_id(1) * rowsPerWI;
int y = get_global_id(1) * ROWS_PER_WI;
if (x < dst_cols)
{
WT scalar = convertToWT(convertScalar(nVal));
WT scalar = CONVERT_TO_WT(convertScalar(nVal));
int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(short2), map1_offset));
int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(ushort), map2_offset));
#pragma unroll
for (int i = 0; i < rowsPerWI; ++i, ++y,
for (int i = 0; i < ROWS_PER_WI; ++i, ++y,
map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)
if (y < dst_rows)
{
@ -359,22 +359,22 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int
WT a = scalar, b = scalar, c = scalar, d = scalar;
if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))
a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));
a = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));
else
EXTRAPOLATE(map_dataA, a);
if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))
b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));
b = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));
else
EXTRAPOLATE(map_dataB, b);
if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))
c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));
c = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));
else
EXTRAPOLATE(map_dataC, c);
if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))
d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));
d = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));
else
EXTRAPOLATE(map_dataD, d);
@ -382,7 +382,7 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int
b * (u.x) * (1 - u.y) +
c * (1 - u.x) * (u.y) +
d * (u.x) * (u.y);
storepix(convertToT(dst_data), dst);
storepix(CONVERT_TO_T(dst_data), dst);
}
}
}
@ -394,17 +394,17 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
ST nVal)
{
int x = get_global_id(0);
int y = get_global_id(1) * rowsPerWI;
int y = get_global_id(1) * ROWS_PER_WI;
if (x < dst_cols)
{
WT scalar = convertToWT(convertScalar(nVal));
WT scalar = CONVERT_TO_WT(convertScalar(nVal));
int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(float), map1_offset));
int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(float), map2_offset));
#pragma unroll
for (int i = 0; i < rowsPerWI; ++i, ++y,
for (int i = 0; i < ROWS_PER_WI; ++i, ++y,
map1_index += map1_step, map2_index += map2_step, dst_index += dst_step)
if (y < dst_rows)
{
@ -431,13 +431,13 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
xsum = (WT)(0);
if (sx >= 0 && sx + 2 < src_cols)
{
#if depth == 0 && cn == 1
#if SRC_DEPTH == 0 && CN == 1
uchar2 value = vload2(0, srcptr + src_index);
xsum = dot(convert_float2(value), (float2)(coeffs_x[0], coeffs_x[1]));
#else
#pragma unroll
for (int xp = 0; xp < 2; ++xp)
xsum = fma(convertToWT(loadpix(srcptr + mad24(xp, TSIZE, src_index))), coeffs_x[xp], xsum);
xsum = fma(CONVERT_TO_WT(loadpix(srcptr + mad24(xp, TSIZE, src_index))), coeffs_x[xp], xsum);
#endif
}
else
@ -445,7 +445,7 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
#pragma unroll
for (int xp = 0; xp < 2; ++xp)
xsum = fma(sx + xp >= 0 && sx + xp < src_cols ?
convertToWT(loadpix(srcptr + mad24(xp, TSIZE, src_index))) : scalar, coeffs_x[xp], xsum);
CONVERT_TO_WT(loadpix(srcptr + mad24(xp, TSIZE, src_index))) : scalar, coeffs_x[xp], xsum);
}
sum = fma(xsum, coeffs_y[yp], sum);
}
@ -453,7 +453,7 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
sum = fma(scalar, coeffs_y[yp], sum);
}
storepix(convertToT(sum), dst);
storepix(CONVERT_TO_T(sum), dst);
#else
float2 map_data = (float2)(map1[0], map2[0]);
@ -463,27 +463,27 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);
float2 _u = map_data - convert_float2(map_dataA);
WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE;
WT scalar = convertToWT(convertScalar(nVal));
WT2 u = CONVERT_TO_WT2(convert_int2_rte(CONVERT_TO_WT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE;
WT scalar = CONVERT_TO_WT(convertScalar(nVal));
WT a = scalar, b = scalar, c = scalar, d = scalar;
if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))
a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));
a = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));
else
EXTRAPOLATE(map_dataA, a);
if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))
b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));
b = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));
else
EXTRAPOLATE(map_dataB, b);
if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))
c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));
c = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));
else
EXTRAPOLATE(map_dataC, c);
if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))
d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));
d = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));
else
EXTRAPOLATE(map_dataD, d);
@ -491,7 +491,7 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src
b * (u.x) * (1 - u.y) +
c * (1 - u.x) * (u.y) +
d * (u.x) * (u.y);
storepix(convertToT(dst_data), dst);
storepix(CONVERT_TO_T(dst_data), dst);
#endif
}
}
@ -503,16 +503,16 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o
ST nVal)
{
int x = get_global_id(0);
int y = get_global_id(1) * rowsPerWI;
int y = get_global_id(1) * ROWS_PER_WI;
if (x < dst_cols)
{
WT scalar = convertToWT(convertScalar(nVal));
WT scalar = CONVERT_TO_WT(convertScalar(nVal));
int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset));
int map_index = mad24(y, map_step, mad24(x, (int)sizeof(float2), map_offset));
#pragma unroll
for (int i = 0; i < rowsPerWI; ++i, ++y,
for (int i = 0; i < ROWS_PER_WI; ++i, ++y,
map_index += map_step, dst_index += dst_step)
if (y < dst_rows)
{
@ -526,26 +526,26 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o
int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);
float2 _u = map_data - convert_float2(map_dataA);
WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE;
WT2 u = CONVERT_TO_WT2(convert_int2_rte(CONVERT_TO_WT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE;
WT a = scalar, b = scalar, c = scalar, d = scalar;
if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))
a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));
a = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset))));
else
EXTRAPOLATE(map_dataA, a);
if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))
b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));
b = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset))));
else
EXTRAPOLATE(map_dataB, b);
if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))
c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));
c = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset))));
else
EXTRAPOLATE(map_dataC, c);
if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))
d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));
d = CONVERT_TO_WT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset))));
else
EXTRAPOLATE(map_dataD, d);
@ -553,7 +553,7 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o
b * (u.x) * (1 - u.y) +
c * (1 - u.x) * (u.y) +
d * (u.x) * (u.y);
storepix(convertToT(dst_data), dst);
storepix(CONVERT_TO_T(dst_data), dst);
}
}
}

@ -55,44 +55,44 @@
#define noconvert
#if cn != 3
#if CN != 3
#define loadpix(addr) *(__global const T *)(addr)
#define storepix(val, addr) *(__global T *)(addr) = val
#define TSIZE (int)sizeof(T)
#else
#define loadpix(addr) vload3(0, (__global const T1 *)(addr))
#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
#define TSIZE (int)sizeof(T1)*cn
#define TSIZE (int)sizeof(T1)*CN
#endif
#if defined USE_SAMPLER
#if cn == 1
#if CN == 1
#define READ_IMAGE(X,Y,Z) read_imagef(X,Y,Z).x
#define INTERMEDIATE_TYPE float
#elif cn == 2
#elif CN == 2
#define READ_IMAGE(X,Y,Z) read_imagef(X,Y,Z).xy
#define INTERMEDIATE_TYPE float2
#elif cn == 3
#elif CN == 3
#define READ_IMAGE(X,Y,Z) read_imagef(X,Y,Z).xyz
#define INTERMEDIATE_TYPE float3
#elif cn == 4
#elif CN == 4
#define READ_IMAGE(X,Y,Z) read_imagef(X,Y,Z)
#define INTERMEDIATE_TYPE float4
#endif
#define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y)
//#define INTERMEDIATE_TYPE CAT(float, cn)
//#define INTERMEDIATE_TYPE CAT(float, CN)
#define float1 float
#if depth == 0
#if SRC_DEPTH == 0
#define RESULT_SCALE 255.0f
#elif depth == 1
#elif SRC_DEPTH == 1
#define RESULT_SCALE 127.0f
#elif depth == 2
#elif SRC_DEPTH == 2
#define RESULT_SCALE 65535.0f
#elif depth == 3
#elif SRC_DEPTH == 3
#define RESULT_SCALE 32767.0f
#else
#define RESULT_SCALE 1.0f
@ -114,10 +114,10 @@ __kernel void resizeSampler(__read_only image2d_t srcImage,
INTERMEDIATE_TYPE intermediate = READ_IMAGE(srcImage, sampler, (float2)(sx, sy));
#if depth <= 4
T uval = convertToDT(round(intermediate * RESULT_SCALE));
#if SRC_DEPTH <= 4
T uval = CONVERT_TO_DT(round(intermediate * RESULT_SCALE));
#else
T uval = convertToDT(intermediate * RESULT_SCALE);
T uval = CONVERT_TO_DT(intermediate * RESULT_SCALE);
#endif
if(dx < dstcols && dy < dstrows)
@ -149,15 +149,15 @@ __kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offs
int src_index0 = mad24(sy0, src_step, mad24(sx0, TSIZE, src_offset)),
src_index1 = mad24(sy1, src_step, mad24(sx0, TSIZE, src_offset));
WT data0 = convertToWT(loadpix(srcptr + src_index0));
WT data1 = convertToWT(loadpix(srcptr + src_index0 + TSIZE));
WT data2 = convertToWT(loadpix(srcptr + src_index1));
WT data3 = convertToWT(loadpix(srcptr + src_index1 + TSIZE));
WT data0 = CONVERT_TO_WT(loadpix(srcptr + src_index0));
WT data1 = CONVERT_TO_WT(loadpix(srcptr + src_index0 + TSIZE));
WT data2 = CONVERT_TO_WT(loadpix(srcptr + src_index1));
WT data3 = CONVERT_TO_WT(loadpix(srcptr + src_index1 + TSIZE));
WT val = ( (((data0 * a0 + data1 * a1) >> 4) * b0) >> 16) +
( (((data2 * a0 + data3 * a1) >> 4) * b1) >> 16);
storepix(convertToDT((val + 2) >> 2),
storepix(CONVERT_TO_DT((val + 2) >> 2),
dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));
}
}
@ -186,7 +186,7 @@ __kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offs
int y_ = INC(y, src_rows);
int x_ = INC(x, src_cols);
#if depth <= 1 // 8U/8S only, 16U+ cause integer overflows
#if SRC_DEPTH <= 1 // 8U/8S only, 16U+ cause integer overflows
#define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS)
#define CAST_BITS (INTER_RESIZE_COEF_BITS << 1)
u = u * INTER_RESIZE_COEF_SCALE;
@ -197,24 +197,24 @@ __kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offs
int U1 = rint(INTER_RESIZE_COEF_SCALE - u);
int V1 = rint(INTER_RESIZE_COEF_SCALE - v);
WT data0 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))));
WT data1 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x_, TSIZE, src_offset))));
WT data2 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x, TSIZE, src_offset))));
WT data3 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x_, TSIZE, src_offset))));
WT data0 = CONVERT_TO_WT(loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))));
WT data1 = CONVERT_TO_WT(loadpix(srcptr + mad24(y, src_step, mad24(x_, TSIZE, src_offset))));
WT data2 = CONVERT_TO_WT(loadpix(srcptr + mad24(y_, src_step, mad24(x, TSIZE, src_offset))));
WT data3 = CONVERT_TO_WT(loadpix(srcptr + mad24(y_, src_step, mad24(x_, TSIZE, src_offset))));
WT val = mul24((WT)mul24(U1, V1), data0) + mul24((WT)mul24(U, V1), data1) +
mul24((WT)mul24(U1, V), data2) + mul24((WT)mul24(U, V), data3);
T uval = convertToDT((val + (1<<(CAST_BITS-1)))>>CAST_BITS);
T uval = CONVERT_TO_DT((val + (1<<(CAST_BITS-1)))>>CAST_BITS);
#else
float u1 = 1.f - u;
float v1 = 1.f - v;
WT data0 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))));
WT data1 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x_, TSIZE, src_offset))));
WT data2 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x, TSIZE, src_offset))));
WT data3 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x_, TSIZE, src_offset))));
WT data0 = CONVERT_TO_WT(loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))));
WT data1 = CONVERT_TO_WT(loadpix(srcptr + mad24(y, src_step, mad24(x_, TSIZE, src_offset))));
WT data2 = CONVERT_TO_WT(loadpix(srcptr + mad24(y_, src_step, mad24(x, TSIZE, src_offset))));
WT data3 = CONVERT_TO_WT(loadpix(srcptr + mad24(y_, src_step, mad24(x_, TSIZE, src_offset))));
T uval = convertToDT((u1 * v1) * data0 + (u * v1) * data1 + (u1 * v) * data2 + (u * v) * data3);
T uval = CONVERT_TO_DT((u1 * v1) * data0 + (u * v1) * data1 + (u1 * v) * data2 + (u * v) * data3);
#endif
storepix(uval, dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));
}
@ -268,11 +268,11 @@ __kernel void resizeAREA_FAST(__global const uchar * src, int src_step, int src_
for (int px = 0; px < XSCALE; ++px)
{
int x = min(sx + px, src_cols - 1);
sum += convertToWTV(loadpix(src + src_index + x*TSIZE));
sum += CONVERT_TO_WTV(loadpix(src + src_index + x*TSIZE));
}
}
storepix(convertToT(convertToWT2V(sum) * (WT2V)(SCALE)), dst + mad24(dx, TSIZE, dst_index));
storepix(CONVERT_TO_T(CONVERT_TO_WT2V(sum) * (WT2V)(SCALE)), dst + mad24(dx, TSIZE, dst_index));
}
}
@ -314,12 +314,12 @@ __kernel void resizeAREA(__global const uchar * src, int src_step, int src_offse
for (int sx = sx0, xk = xk0; sx <= sx1; ++sx, ++xk)
{
WTV alpha = (WTV)(xalpha_tab[xk]);
buf += convertToWTV(loadpix(src + mad24(sx, TSIZE, src_index))) * alpha;
buf += CONVERT_TO_WTV(loadpix(src + mad24(sx, TSIZE, src_index))) * alpha;
}
sum += buf * beta;
}
storepix(convertToT(sum), dst + mad24(dx, TSIZE, dst_index));
storepix(CONVERT_TO_T(sum), dst + mad24(dx, TSIZE, dst_index));
}
}

@ -66,7 +66,7 @@
#define ST T
#endif
#if cn != 3
#if CN != 3
#define loadpix(addr) *(__global const T*)(addr)
#define storepix(val, addr) *(__global T*)(addr) = val
#define scalar scalar_
@ -89,7 +89,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
__constant CT * M, ST scalar_)
{
int dx = get_global_id(0);
int dy0 = get_global_id(1) * rowsPerWI;
int dy0 = get_global_id(1) * ROWS_PER_WI;
if (dx < dst_cols)
{
@ -99,7 +99,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
int Y0_ = rint(M[3] * dx * AB_SCALE);
int dst_index = mad24(dy0, dst_step, mad24(dx, pixsize, dst_offset));
for (int dy = dy0, dy1 = min(dst_rows, dy0 + rowsPerWI); dy < dy1; ++dy, dst_index += dst_step)
for (int dy = dy0, dy1 = min(dst_rows, dy0 + ROWS_PER_WI); dy < dy1; ++dy, dst_index += dst_step)
{
int X0 = X0_ + rint(fma(M[1], (CT)dy, M[2]) * AB_SCALE) + round_delta;
int Y0 = Y0_ + rint(fma(M[4], (CT)dy, M[5]) * AB_SCALE) + round_delta;
@ -133,7 +133,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
__constant CT * M, ST scalar_)
{
int dx = get_global_id(0);
int dy0 = get_global_id(1) * rowsPerWI;
int dy0 = get_global_id(1) * ROWS_PER_WI;
if (dx < dst_cols)
{
@ -141,7 +141,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
int X0_ = rint(M[0] * tmp);
int Y0_ = rint(M[3] * tmp);
for (int dy = dy0, dy1 = min(dst_rows, dy0 + rowsPerWI); dy < dy1; ++dy)
for (int dy = dy0, dy1 = min(dst_rows, dy0 + ROWS_PER_WI); dy < dy1; ++dy)
{
int X0 = X0_ + rint(fma(M[1], (CT)dy, M[2]) * AB_SCALE) + ROUND_DELTA;
int Y0 = Y0_ + rint(fma(M[4], (CT)dy, M[5]) * AB_SCALE) + ROUND_DELTA;
@ -151,21 +151,21 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
short sx = convert_short_sat(X0 >> INTER_BITS), sy = convert_short_sat(Y0 >> INTER_BITS);
short ax = convert_short(X0 & (INTER_TAB_SIZE-1)), ay = convert_short(Y0 & (INTER_TAB_SIZE-1));
#if defined AMD_DEVICE || depth > 4
#if defined AMD_DEVICE || SRC_DEPTH > 4
WT v0 = scalar, v1 = scalar, v2 = scalar, v3 = scalar;
if (sx >= 0 && sx < src_cols)
{
if (sy >= 0 && sy < src_rows)
v0 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(sx, pixsize, src_offset))));
v0 = CONVERT_TO_WT(loadpix(srcptr + mad24(sy, src_step, mad24(sx, pixsize, src_offset))));
if (sy+1 >= 0 && sy+1 < src_rows)
v2 = convertToWT(loadpix(srcptr + mad24(sy+1, src_step, mad24(sx, pixsize, src_offset))));
v2 = CONVERT_TO_WT(loadpix(srcptr + mad24(sy+1, src_step, mad24(sx, pixsize, src_offset))));
}
if (sx+1 >= 0 && sx+1 < src_cols)
{
if (sy >= 0 && sy < src_rows)
v1 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(sx+1, pixsize, src_offset))));
v1 = CONVERT_TO_WT(loadpix(srcptr + mad24(sy, src_step, mad24(sx+1, pixsize, src_offset))));
if (sy+1 >= 0 && sy+1 < src_rows)
v3 = convertToWT(loadpix(srcptr + mad24(sy+1, src_step, mad24(sx+1, pixsize, src_offset))));
v3 = CONVERT_TO_WT(loadpix(srcptr + mad24(sy+1, src_step, mad24(sx+1, pixsize, src_offset))));
}
float taby = 1.f/INTER_TAB_SIZE*ay;
@ -173,18 +173,18 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));
#if depth <= 4
#if SRC_DEPTH <= 4
int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
int itab1 = convert_short_sat_rte( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE );
int itab2 = convert_short_sat_rte( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE );
WT val = mad24(v0, itab0, mad24(v1, itab1, mad24(v2, itab2, v3 * itab3)));
storepix(convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index);
storepix(CONVERT_TO_T((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index);
#else
float tabx2 = 1.0f - tabx, taby2 = 1.0f - taby;
WT val = fma(tabx2, fma(v0, taby2, v2 * taby), tabx * fma(v1, taby2, v3 * taby));
storepix(convertToT(val), dstptr + dst_index);
storepix(CONVERT_TO_T(val), dstptr + dst_index);
#endif
#else // INTEL_DEVICE
__constant float * coeffs_y = coeffs + (ay << 1), * coeffs_x = coeffs + (ax << 1);
@ -202,13 +202,13 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
xsum = (WT)(0);
if (sx >= 0 && sx + 2 < src_cols)
{
#if depth == 0 && cn == 1
#if SRC_DEPTH == 0 && CN == 1
uchar2 value = vload2(0, srcptr + src_index);
xsum = dot(convert_float2(value), (float2)(coeffs_x[0], coeffs_x[1]));
#else
#pragma unroll
for (int x = 0; x < 2; x++)
xsum = fma(convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);
xsum = fma(CONVERT_TO_WT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);
#endif
}
else
@ -216,7 +216,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
#pragma unroll
for (int x = 0; x < 2; x++)
xsum = fma(sx + x >= 0 && sx + x < src_cols ?
convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);
CONVERT_TO_WT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);
}
sum = fma(xsum, coeffs_y[y], sum);
}
@ -224,7 +224,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
sum = fma(scalar, coeffs_y[y], sum);
}
storepix(convertToT(sum), dstptr + dst_index);
storepix(CONVERT_TO_T(sum), dstptr + dst_index);
#endif
}
}
@ -290,7 +290,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
#pragma unroll
for (int x = 0; x < 4; x++)
v[mad24(y, 4, x)] = sx+x >= 0 && sx+x < src_cols ?
convertToWT(loadpix(srcptr + mad24(sy+y, src_step, mad24(sx+x, pixsize, src_offset)))) : scalar;
CONVERT_TO_WT(loadpix(srcptr + mad24(sy+y, src_step, mad24(sx+x, pixsize, src_offset)))) : scalar;
}
else
{
@ -310,7 +310,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));
WT sum = (WT)(0);
#if depth <= 4
#if SRC_DEPTH <= 4
int itab[16];
#pragma unroll
@ -320,12 +320,12 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
#pragma unroll
for (int i = 0; i < 16; i++)
sum = mad24(v[i], itab[i], sum);
storepix(convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ), dstptr + dst_index);
storepix(CONVERT_TO_T( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ), dstptr + dst_index);
#else
#pragma unroll
for (int i = 0; i < 16; i++)
sum = fma(v[i], tab1y[(i>>2)] * tab1x[(i&3)], sum);
storepix(convertToT( sum ), dstptr + dst_index);
storepix(CONVERT_TO_T( sum ), dstptr + dst_index);
#endif
#else // INTEL_DEVICE
__constant float * coeffs_y = coeffs + (ay << 2), * coeffs_x = coeffs + (ax << 2);
@ -343,13 +343,13 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
xsum = (WT)(0);
if (sx >= 0 && sx + 4 < src_cols)
{
#if depth == 0 && cn == 1
#if SRC_DEPTH == 0 && CN == 1
uchar4 value = vload4(0, srcptr + src_index);
xsum = dot(convert_float4(value), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3]));
#else
#pragma unroll
for (int x = 0; x < 4; x++)
xsum = fma(convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);
xsum = fma(CONVERT_TO_WT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);
#endif
}
else
@ -357,7 +357,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
#pragma unroll
for (int x = 0; x < 4; x++)
xsum = fma(sx + x >= 0 && sx + x < src_cols ?
convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);
CONVERT_TO_WT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);
}
sum = fma(xsum, coeffs_y[y], sum);
}
@ -365,7 +365,7 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
sum = fma(scalar, coeffs_y[y], sum);
}
storepix(convertToT(sum), dstptr + dst_index);
storepix(CONVERT_TO_T(sum), dstptr + dst_index);
#endif
}
}

@ -65,7 +65,7 @@
#define ST T
#endif
#if cn != 3
#if CN != 3
#define loadpix(addr) *(__global const T*)(addr)
#define storepix(val, addr) *(__global T*)(addr) = val
#define scalar scalar_
@ -134,31 +134,31 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s
short ax = (short)(X & (INTER_TAB_SIZE - 1));
WT v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ?
convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + sx * pixsize))) : scalar;
CONVERT_TO_WT(loadpix(srcptr + mad24(sy, src_step, src_offset + sx * pixsize))) : scalar;
WT v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ?
convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + (sx+1) * pixsize))) : scalar;
CONVERT_TO_WT(loadpix(srcptr + mad24(sy, src_step, src_offset + (sx+1) * pixsize))) : scalar;
WT v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ?
convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + sx * pixsize))) : scalar;
CONVERT_TO_WT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + sx * pixsize))) : scalar;
WT v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ?
convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + (sx+1) * pixsize))) : scalar;
CONVERT_TO_WT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + (sx+1) * pixsize))) : scalar;
float taby = 1.f/INTER_TAB_SIZE*ay;
float tabx = 1.f/INTER_TAB_SIZE*ax;
int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize);
#if depth <= 4
#if SRC_DEPTH <= 4
int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
int itab1 = convert_short_sat_rte( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE );
int itab2 = convert_short_sat_rte( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE );
WT val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3;
storepix(convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index);
storepix(CONVERT_TO_T((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index);
#else
float tabx2 = 1.0f - tabx, taby2 = 1.0f - taby;
WT val = v0 * tabx2 * taby2 + v1 * tabx * taby2 + v2 * tabx2 * taby + v3 * tabx * taby;
storepix(convertToT(val), dstptr + dst_index);
storepix(CONVERT_TO_T(val), dstptr + dst_index);
#endif
}
}
@ -201,7 +201,7 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s
#pragma unroll
for (int x = 0; x < 4; x++)
v[mad24(y, 4, x)] = (sx+x >= 0 && sx+x < src_cols && sy+y >= 0 && sy+y < src_rows) ?
convertToWT(loadpix(srcptr + mad24(sy+y, src_step, src_offset + (sx+x) * pixsize))) : scalar;
CONVERT_TO_WT(loadpix(srcptr + mad24(sy+y, src_step, src_offset + (sx+x) * pixsize))) : scalar;
float tab1y[4], tab1x[4];
@ -213,7 +213,7 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s
int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize);
WT sum = (WT)(0);
#if depth <= 4
#if SRC_DEPTH <= 4
int itab[16];
#pragma unroll
@ -223,12 +223,12 @@ __kernel void warpPerspective(__global const uchar * srcptr, int src_step, int s
#pragma unroll
for (int i = 0; i < 16; i++)
sum += v[i] * itab[i];
storepix(convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ), dstptr + dst_index);
storepix(CONVERT_TO_T( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ), dstptr + dst_index);
#else
#pragma unroll
for (int i = 0; i < 16; i++)
sum += v[i] * tab1y[(i>>2)] * tab1x[(i&3)];
storepix(convertToT( sum ), dstptr + dst_index);
storepix(CONVERT_TO_T( sum ), dstptr + dst_index);
#endif
}
}

@ -1189,8 +1189,8 @@ static bool ocl_pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, in
"BORDER_REFLECT_101" };
char cvt[2][50];
String buildOptions = format(
"-D T=%s -D FT=%s -D convertToT=%s -D convertToFT=%s%s "
"-D T1=%s -D cn=%d -D kercn=%d -D fdepth=%d -D %s -D LOCAL_SIZE=%d",
"-D T=%s -D FT=%s -D CONVERT_TO_T=%s -D CONVERT_TO_FT=%s%s "
"-D T1=%s -D CN=%d -D KERCN=%d -D FDEPTH=%d -D %s -D LOCAL_SIZE=%d",
ocl::typeToStr(type), ocl::typeToStr(CV_MAKETYPE(float_depth, cn)),
ocl::convertTypeStr(float_depth, depth, cn, cvt[0], sizeof(cvt[0])),
ocl::convertTypeStr(depth, float_depth, cn, cvt[1], sizeof(cvt[1])),
@ -1232,8 +1232,8 @@ static bool ocl_pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int
const int local_size = channels == 1 ? 16 : 8;
char cvt[2][50];
String buildOptions = format(
"-D T=%s -D FT=%s -D convertToT=%s -D convertToFT=%s%s "
"-D T1=%s -D cn=%d -D LOCAL_SIZE=%d",
"-D T=%s -D FT=%s -D CONVERT_TO_T=%s -D CONVERT_TO_FT=%s%s "
"-D T1=%s -D CN=%d -D LOCAL_SIZE=%d",
ocl::typeToStr(type), ocl::typeToStr(CV_MAKETYPE(float_depth, channels)),
ocl::convertTypeStr(float_depth, depth, channels, cvt[0], sizeof(cvt[0])),
ocl::convertTypeStr(depth, float_depth, channels, cvt[1], sizeof(cvt[1])),

@ -3396,8 +3396,8 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
{
int wdepth = std::max(depth, CV_32S);
char buf[2][50];
cv::String compileOpts = format("-D USE_SAMPLER -D depth=%d -D T=%s -D T1=%s "
"-D convertToDT=%s -D cn=%d",
cv::String compileOpts = format("-D USE_SAMPLER -D SRC_DEPTH=%d -D T=%s -D T1=%s "
"-D CONVERT_TO_DT=%s -D CN=%d",
depth, ocl::typeToStr(type), ocl::typeToStr(depth),
ocl::convertTypeStr(wdepth, depth, cn, buf[1], sizeof(buf[1])),
cn);
@ -3461,8 +3461,8 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
Mat(1, static_cast<int>(_buffer.size()), CV_8UC1, _buffer.data()).copyTo(coeffs);
k.create("resizeLN", ocl::imgproc::resize_oclsrc,
format("-D INTER_LINEAR_INTEGER -D depth=%d -D T=%s -D T1=%s "
"-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
format("-D INTER_LINEAR_INTEGER -D SRC_DEPTH=%d -D T=%s -D T1=%s "
"-D WT=%s -D CONVERT_TO_WT=%s -D CONVERT_TO_DT=%s -D CN=%d "
"-D INTER_RESIZE_COEF_BITS=%d",
depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
ocl::convertTypeStr(depth, wdepth, cn, buf[0], sizeof(buf[0])),
@ -3479,8 +3479,8 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
int wdepth = depth <= CV_8S ? CV_32S : std::max(depth, CV_32F);
int wtype = CV_MAKETYPE(wdepth, cn);
k.create("resizeLN", ocl::imgproc::resize_oclsrc,
format("-D INTER_LINEAR -D depth=%d -D T=%s -D T1=%s "
"-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
format("-D INTER_LINEAR -D SRC_DEPTH=%d -D T=%s -D T1=%s "
"-D WT=%s -D CONVERT_TO_WT=%s -D CONVERT_TO_DT=%s -D CN=%d "
"-D INTER_RESIZE_COEF_BITS=%d",
depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
ocl::convertTypeStr(depth, wdepth, cn, buf[0], sizeof(buf[0])),
@ -3496,7 +3496,7 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
else if (interpolation == INTER_NEAREST)
{
k.create("resizeNN", ocl::imgproc::resize_oclsrc,
format("-D INTER_NEAREST -D T=%s -D T1=%s -D cn=%d",
format("-D INTER_NEAREST -D T=%s -D T1=%s -D CN=%d",
ocl::vecopTypeToStr(type), ocl::vecopTypeToStr(depth), cn));
if (k.empty())
return false;
@ -3510,7 +3510,7 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
int wtype = CV_MAKE_TYPE(wdepth, cn);
char cvt[2][50];
String buildOption = format("-D INTER_AREA -D T=%s -D T1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d",
String buildOption = format("-D INTER_AREA -D T=%s -D T1=%s -D WTV=%s -D CONVERT_TO_WTV=%s -D CN=%d",
ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
ocl::convertTypeStr(depth, wdepth, cn, cvt[0], sizeof(cvt[0])), cn);
@ -3520,7 +3520,7 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
if (is_area_fast)
{
int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn);
buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
buildOption = buildOption + format(" -D CONVERT_TO_T=%s -D WT2V=%s -D CONVERT_TO_WT2V=%s -D INTER_AREA_FAST"
" -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff",
ocl::convertTypeStr(wdepth2, depth, cn, cvt[0], sizeof(cvt[0])),
ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1], sizeof(cvt[1])),
@ -3532,7 +3532,7 @@ static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
}
else
{
buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0], sizeof(cvt[0])));
buildOption = buildOption + format(" -D CONVERT_TO_T=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0], sizeof(cvt[0])));
k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption);
if (k.empty())
return false;

Loading…
Cancel
Save