optimization of cv::warpAffine INTER_CUBIC

pull/2869/head
Ilya Lavrenov 11 years ago
parent 14671e0cd6
commit 93712e913a
  1. 3
      modules/imgproc/src/imgwarp.cpp
  2. 140
      modules/imgproc/src/opencl/warp_affine.cl

@ -4187,7 +4187,8 @@ static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
const char * const kernelName = op_type == OCL_OP_AFFINE ? "warpAffine" : "warpPerspective";
int scalarcn = cn == 3 ? 4 : cn;
int wdepth = interpolation == INTER_NEAREST ? depth : std::max(CV_32S, depth);
bool is32f = !dev.isAMD() && (interpolation == INTER_CUBIC || interpolation == INTER_LINEAR);
int wdepth = interpolation == INTER_NEAREST ? depth : std::max(is32f ? CV_32F : CV_32S, depth);
int sctype = CV_MAKETYPE(wdepth, scalarcn);
ocl::Kernel k;

@ -61,6 +61,7 @@
#define AB_SCALE (1 << AB_BITS)
#define INTER_REMAP_COEF_BITS 15
#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
#define ROUND_DELTA (1 << (AB_BITS - INTER_BITS - 1))
#define noconvert
@ -122,6 +123,14 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
#elif defined INTER_LINEAR
__constant float coeffs[64] =
{ 1.000000f, 0.000000f, 0.968750f, 0.031250f, 0.937500f, 0.062500f, 0.906250f, 0.093750f, 0.875000f, 0.125000f, 0.843750f, 0.156250f,
0.812500f, 0.187500f, 0.781250f, 0.218750f, 0.750000f, 0.250000f, 0.718750f, 0.281250f, 0.687500f, 0.312500f, 0.656250f, 0.343750f,
0.625000f, 0.375000f, 0.593750f, 0.406250f, 0.562500f, 0.437500f, 0.531250f, 0.468750f, 0.500000f, 0.500000f, 0.468750f, 0.531250f,
0.437500f, 0.562500f, 0.406250f, 0.593750f, 0.375000f, 0.625000f, 0.343750f, 0.656250f, 0.312500f, 0.687500f, 0.281250f, 0.718750f,
0.250000f, 0.750000f, 0.218750f, 0.781250f, 0.187500f, 0.812500f, 0.156250f, 0.843750f, 0.125000f, 0.875000f, 0.093750f, 0.906250f,
0.062500f, 0.937500f, 0.031250f, 0.968750f };
__kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
__constant CT * M, ST scalar_)
@ -131,24 +140,21 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
if (dx < dst_cols)
{
int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
int tmp = (dx << AB_BITS);
int tmp = dx << AB_BITS;
int X0_ = rint(M[0] * tmp);
int Y0_ = rint(M[3] * tmp);
for (int dy = dy0, dy1 = min(dst_rows, dy0 + rowsPerWI); dy < dy1; ++dy)
{
int X0 = X0_ + rint(fma(M[1], dy, M[2]) * AB_SCALE) + round_delta;
int Y0 = Y0_ + rint(fma(M[4], dy, M[5]) * AB_SCALE) + round_delta;
int X0 = X0_ + rint(fma(M[1], dy, M[2]) * AB_SCALE) + ROUND_DELTA;
int Y0 = Y0_ + rint(fma(M[4], dy, M[5]) * AB_SCALE) + ROUND_DELTA;
X0 = X0 >> (AB_BITS - INTER_BITS);
Y0 = Y0 >> (AB_BITS - INTER_BITS);
short sx = convert_short_sat(X0 >> INTER_BITS);
short sy = convert_short_sat(Y0 >> INTER_BITS);
short ax = convert_short(X0 & (INTER_TAB_SIZE-1));
short ay = convert_short(Y0 & (INTER_TAB_SIZE-1));
short sx = convert_short_sat(X0 >> INTER_BITS), sy = convert_short_sat(Y0 >> INTER_BITS);
short ax = convert_short(X0 & (INTER_TAB_SIZE-1)), ay = convert_short(Y0 & (INTER_TAB_SIZE-1));
#if defined AMD_DEVICE || depth > 4
WT v0 = scalar, v1 = scalar, v2 = scalar, v3 = scalar;
if (sx >= 0 && sx < src_cols)
{
@ -180,8 +186,48 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
storepix(convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index);
#else
float tabx2 = 1.0f - tabx, taby2 = 1.0f - taby;
WT val = fma(v0, tabx2 * taby2, fma(v1, tabx * taby2, fma(v2, tabx2 * taby, v3 * tabx * taby)));
WT val = fma(tabx2, fma(v0, taby2, v2 * taby), tabx * fma(v1, taby2, v3 * taby));
storepix(convertToT(val), dstptr + dst_index);
#endif
#else // INTEL_DEVICE
__constant float * coeffs_y = coeffs + (ay << 1), * coeffs_x = coeffs + (ax << 1);
int src_index0 = mad24(sy, src_step, mad24(sx, pixsize, src_offset)), src_index;
int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));
WT sum = (WT)(0), xsum;
#pragma unroll
for (int y = 0; y < 2; y++)
{
src_index = mad24(y, src_step, src_index0);
if (sy + y >= 0 && sy + y < src_rows)
{
xsum = (WT)(0);
if (sx >= 0 && sx + 2 < src_cols)
{
#if depth == 0 && cn == 1
uchar2 value = vload2(0, srcptr + src_index);
xsum = dot(convert_float2(value), (float2)(coeffs_x[0], coeffs_x[1]));
#else
#pragma unroll
for (int x = 0; x < 2; x++)
xsum = fma(convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);
#endif
}
else
{
#pragma unroll
for (int x = 0; x < 2; x++)
xsum = fma(sx + x >= 0 && sx + x < src_cols ?
convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);
}
sum = fma(xsum, coeffs_y[y], sum);
}
else
sum = fma(scalar, coeffs_y[y], sum);
}
storepix(convertToT(sum), dstptr + dst_index);
#endif
}
}
@ -189,6 +235,8 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
#elif defined INTER_CUBIC
#ifdef AMD_DEVICE
inline void interpolateCubic( float x, float* coeffs )
{
const float A = -0.75f;
@ -199,6 +247,23 @@ inline void interpolateCubic( float x, float* coeffs )
coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
}
#else
__constant float coeffs[128] =
{ 0.000000f, 1.000000f, 0.000000f, 0.000000f, -0.021996f, 0.997841f, 0.024864f, -0.000710f, -0.041199f, 0.991516f, 0.052429f, -0.002747f,
-0.057747f, 0.981255f, 0.082466f, -0.005974f, -0.071777f, 0.967285f, 0.114746f, -0.010254f, -0.083427f, 0.949837f, 0.149040f, -0.015450f,
-0.092834f, 0.929138f, 0.185120f, -0.021423f, -0.100136f, 0.905418f, 0.222755f, -0.028038f, -0.105469f, 0.878906f, 0.261719f, -0.035156f,
-0.108971f, 0.849831f, 0.301781f, -0.042641f, -0.110779f, 0.818420f, 0.342712f, -0.050354f, -0.111031f, 0.784904f, 0.384285f, -0.058159f,
-0.109863f, 0.749512f, 0.426270f, -0.065918f, -0.107414f, 0.712471f, 0.468437f, -0.073494f, -0.103821f, 0.674011f, 0.510559f, -0.080750f,
-0.099220f, 0.634361f, 0.552406f, -0.087547f, -0.093750f, 0.593750f, 0.593750f, -0.093750f, -0.087547f, 0.552406f, 0.634361f, -0.099220f,
-0.080750f, 0.510559f, 0.674011f, -0.103821f, -0.073494f, 0.468437f, 0.712471f, -0.107414f, -0.065918f, 0.426270f, 0.749512f, -0.109863f,
-0.058159f, 0.384285f, 0.784904f, -0.111031f, -0.050354f, 0.342712f, 0.818420f, -0.110779f, -0.042641f, 0.301781f, 0.849831f, -0.108971f,
-0.035156f, 0.261719f, 0.878906f, -0.105469f, -0.028038f, 0.222755f, 0.905418f, -0.100136f, -0.021423f, 0.185120f, 0.929138f, -0.092834f,
-0.015450f, 0.149040f, 0.949837f, -0.083427f, -0.010254f, 0.114746f, 0.967285f, -0.071777f, -0.005974f, 0.082466f, 0.981255f, -0.057747f,
-0.002747f, 0.052429f, 0.991516f, -0.041199f, -0.000710f, 0.024864f, 0.997841f, -0.021996f };
#endif
__kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
__constant CT * M, ST scalar_)
@ -208,22 +273,17 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
if (dx < dst_cols && dy < dst_rows)
{
int round_delta = ((AB_SCALE>>INTER_BITS)>>1);
int tmp = (dx << AB_BITS);
int X0 = rint(M[0] * tmp);
int Y0 = rint(M[3] * tmp);
int X0 = rint(M[0] * tmp) + rint(fma(M[1], dy, M[2]) * AB_SCALE) + ROUND_DELTA;
int Y0 = rint(M[3] * tmp) + rint(fma(M[4], dy, M[5]) * AB_SCALE) + ROUND_DELTA;
X0 += rint(fma(M[1], dy, M[2]) * AB_SCALE) + round_delta;
Y0 += rint(fma(M[4], dy, M[5]) * AB_SCALE) + round_delta;
X0 = X0 >> (AB_BITS - INTER_BITS);
Y0 = Y0 >> (AB_BITS - INTER_BITS);
int sx = (short)(X0 >> INTER_BITS) - 1;
int sy = (short)(Y0 >> INTER_BITS) - 1;
int ay = (short)(Y0 & (INTER_TAB_SIZE-1));
int ax = (short)(X0 & (INTER_TAB_SIZE-1));
int sx = (short)(X0 >> INTER_BITS) - 1, sy = (short)(Y0 >> INTER_BITS) - 1;
int ay = (short)(Y0 & (INTER_TAB_SIZE - 1)), ax = (short)(X0 & (INTER_TAB_SIZE - 1));
#ifdef AMD_DEVICE
WT v[16];
#pragma unroll
for (int y = 0; y < 4; y++)
@ -269,6 +329,46 @@ __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_of
for (int i = 0; i < 16; i++)
sum = fma(v[i], tab1y[(i>>2)] * tab1x[(i&3)], sum);
storepix(convertToT( sum ), dstptr + dst_index);
#endif
#else // INTEL_DEVICE
__constant float * coeffs_y = coeffs + (ay << 2), * coeffs_x = coeffs + (ax << 2);
int src_index0 = mad24(sy, src_step, mad24(sx, pixsize, src_offset)), src_index;
int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));
WT sum = (WT)(0), xsum;
#pragma unroll
for (int y = 0; y < 4; y++)
{
src_index = mad24(y, src_step, src_index0);
if (sy + y >= 0 && sy + y < src_rows)
{
xsum = (WT)(0);
if (sx >= 0 && sx + 4 < src_cols)
{
#if depth == 0 && cn == 1
uchar4 value = vload4(0, srcptr + src_index);
xsum = dot(convert_float4(value), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3]));
#else
#pragma unroll
for (int x = 0; x < 4; x++)
xsum = fma(convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);
#endif
}
else
{
#pragma unroll
for (int x = 0; x < 4; x++)
xsum = fma(sx + x >= 0 && sx + x < src_cols ?
convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);
}
sum = fma(xsum, coeffs_y[y], sum);
}
else
sum = fma(scalar, coeffs_y[y], sum);
}
storepix(convertToT(sum), dstptr + dst_index);
#endif
}
}

Loading…
Cancel
Save