Merge pull request #2635 from akarsakov:gaussian_float_intel

pull/2688/merge
Alexander Alekhin 11 years ago committed by OpenCV Buildbot
commit 347d5b96f3
Changed files (added + removed lines per file):
  1. modules/imgproc/src/filter.cpp: 10
  2. modules/imgproc/src/opencl/filterSepCol.cl: 10
  3. modules/imgproc/src/opencl/filterSepRow.cl: 20
  4. modules/imgproc/src/opencl/filterSep_singlePass.cl: 18

modules/imgproc/src/filter.cpp
@@ -3493,10 +3493,20 @@ static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
if( sdepth == CV_8U && ddepth == CV_8U && if( sdepth == CV_8U && ddepth == CV_8U &&
rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL) ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL)
{
if (ocl::Device::getDefault().isIntel())
{
for (int i=0; i<kernelX.cols; i++)
kernelX.at<float>(0, i) = (float) cvRound(kernelX.at<float>(0, i) * (1 << shift_bits));
if (kernelX.data != kernelY.data)
for (int i=0; i<kernelX.cols; i++)
kernelY.at<float>(0, i) = (float) cvRound(kernelY.at<float>(0, i) * (1 << shift_bits));
} else
{ {
bdepth = CV_32S; bdepth = CV_32S;
kernelX.convertTo( kernelX, bdepth, 1 << shift_bits ); kernelX.convertTo( kernelX, bdepth, 1 << shift_bits );
kernelY.convertTo( kernelY, bdepth, 1 << shift_bits ); kernelY.convertTo( kernelY, bdepth, 1 << shift_bits );
}
int_arithm = true; int_arithm = true;
} }

modules/imgproc/src/opencl/filterSepCol.cl
@@ -97,15 +97,19 @@ __kernel void col_filter(__global const uchar * src, int src_step, int src_offse
{ {
temp[0] = LDS_DAT[l_y + RADIUSY - i][l_x]; temp[0] = LDS_DAT[l_y + RADIUSY - i][l_x];
temp[1] = LDS_DAT[l_y + RADIUSY + i][l_x]; temp[1] = LDS_DAT[l_y + RADIUSY + i][l_x];
#ifndef INTEGER_ARITHMETIC #if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
sum += mad(temp[0], mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);
#else
sum += mad24(temp[0],mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]); sum += mad24(temp[0],mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);
#else
sum += mad(temp[0], mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);
#endif #endif
} }
#ifdef INTEGER_ARITHMETIC #ifdef INTEGER_ARITHMETIC
#ifdef INTEL_DEVICE
sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS);
#else
sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS; sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;
#endif
#endif #endif
// write the result to dst // write the result to dst

modules/imgproc/src/opencl/filterSepRow.cl
@@ -141,12 +141,12 @@
#define DIG(a) a, #define DIG(a) a,
__constant dstT1 mat_kernel[] = { COEFF }; __constant dstT1 mat_kernel[] = { COEFF };
#ifndef INTEGER_ARITHMETIC #if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
#define dstT4 float4
#define convertDstVec convert_float4
#else
#define dstT4 int4 #define dstT4 int4
#define convertDstVec convert_int4 #define convertDstVec convert_int4
#else
#define dstT4 float4
#define convertDstVec convert_float4
#endif #endif
__kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel, int src_offset_x, int src_offset_y, __kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel, int src_offset_x, int src_offset_y,
@@ -263,10 +263,10 @@ __kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel
{ {
temp[0] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset - i); temp[0] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset - i);
temp[1] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset + i); temp[1] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset + i);
#ifndef INTEGER_ARITHMETIC #if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
sum += mad(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]);
#else
sum += mad24(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]); sum += mad24(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]);
#else
sum += mad(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]);
#endif #endif
} }
@@ -368,10 +368,10 @@ __kernel void row_filter(__global const uchar * src, int src_step, int src_offse
{ {
temp[0] = LDS_DAT[l_y][l_x + RADIUSX - i]; temp[0] = LDS_DAT[l_y][l_x + RADIUSX - i];
temp[1] = LDS_DAT[l_y][l_x + RADIUSX + i]; temp[1] = LDS_DAT[l_y][l_x + RADIUSX + i];
#ifndef INTEGER_ARITHMETIC #if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
sum += mad(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);
#else
sum += mad24(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]); sum += mad24(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);
#else
sum += mad(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);
#endif #endif
} }

modules/imgproc/src/opencl/filterSep_singlePass.cl
@@ -162,10 +162,10 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int
{ {
sum = (WT) 0; sum = (WT) 0;
for (i=0; i<=2*RADIUSY; i++) for (i=0; i<=2*RADIUSY; i++)
#ifndef INTEGER_ARITHMETIC #if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
sum = mad(lsmem[liy+i][clocX], mat_kernelY[i], sum);
#else
sum = mad24(lsmem[liy+i][clocX], mat_kernelY[i], sum); sum = mad24(lsmem[liy+i][clocX], mat_kernelY[i], sum);
#else
sum = mad(lsmem[liy+i][clocX], mat_kernelY[i], sum);
#endif #endif
lsmemDy[liy][clocX] = sum; lsmemDy[liy][clocX] = sum;
clocX += BLK_X; clocX += BLK_X;
@@ -182,12 +182,18 @@ __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int
// and calculate final result // and calculate final result
sum = 0.0f; sum = 0.0f;
for (i=0; i<=2*RADIUSX; i++) for (i=0; i<=2*RADIUSX; i++)
#ifndef INTEGER_ARITHMETIC #if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
#else
sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum); sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
#else
sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
#endif
#ifdef INTEGER_ARITHMETIC
#ifdef INTEL_DEVICE
sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS);
#else
sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS; sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;
#endif
#endif #endif
// store result into destination image // store result into destination image

Loading…
Cancel
Save