implemented gpu::remap for all types

pull/13383/head
Vladislav Vinogradov 13 years ago
parent 785428546b
commit 47d68f6967
  1. 21
      modules/gpu/include/opencv2/gpu/devmem2d.hpp
  2. 9
      modules/gpu/include/opencv2/gpu/gpu.hpp
  3. 8
      modules/gpu/src/cuda/filters.cu
  4. 376
      modules/gpu/src/cuda/imgproc.cu
  5. 46
      modules/gpu/src/cuda/surf.cu
  6. 2
      modules/gpu/src/gpumat.cpp
  7. 96
      modules/gpu/src/imgproc.cpp
  8. 422
      modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp
  9. 55
      modules/gpu/src/opencv2/gpu/device/utility.hpp
  10. 56
      modules/gpu/src/opencv2/gpu/device/vec_traits.hpp
  11. 65
      modules/gpu/test/test_imgproc.cpp
  12. 113
      samples/gpu/performance/tests.cpp

@ -66,6 +66,9 @@ namespace cv
template <typename T> struct DevMem2D_
{
typedef T elem_type;
typedef int index_type;
int cols;
int rows;
T* data;
@ -79,8 +82,7 @@ namespace cv
template <typename U>
explicit DevMem2D_(const DevMem2D_<U>& d)
: cols(d.cols), rows(d.rows), data((T*)d.data), step(d.step) {}
typedef T elem_type;
enum { elem_size = sizeof(elem_type) };
__CV_GPU_HOST_DEVICE__ size_t elemSize() const { return elem_size; }
@ -89,6 +91,9 @@ namespace cv
__CV_GPU_HOST_DEVICE__ operator T*() const { return data; }
__CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
__CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
#if defined(__DEVCLASES_ADD_THRUST_BEGIN_END__)
thrust::device_ptr<T> begin() const { return thrust::device_ptr<T>(data); }
thrust::device_ptr<T> end() const { return thrust::device_ptr<T>(data) + cols * rows; }
@ -97,19 +102,24 @@ namespace cv
template<typename T> struct PtrStep_
{
typedef T elem_type;
typedef int index_type;
T* data;
size_t step;
PtrStep_() : data(0), step(0) {}
PtrStep_(const DevMem2D_<T>& mem) : data(mem.data), step(mem.step) {}
typedef T elem_type;
enum { elem_size = sizeof(elem_type) };
__CV_GPU_HOST_DEVICE__ size_t elemSize() const { return elem_size; }
__CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return (T*)( (char*)data + y * step); }
__CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)data + y * step); }
__CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
__CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
#if defined(__DEVCLASES_ADD_THRUST_BEGIN_END__)
thrust::device_ptr<T> begin() const { return thrust::device_ptr<T>(data); }
#endif
@ -124,7 +134,10 @@ namespace cv
PtrStep_<T>::step /= PtrStep_<T>::elem_size;
}
__CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return PtrStep_<T>::data + y * PtrStep_<T>::step; }
__CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return PtrStep_<T>::data + y * PtrStep_<T>::step; }
__CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return PtrStep_<T>::data + y * PtrStep_<T>::step; }
__CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
__CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
};
typedef DevMem2D_<unsigned char> DevMem2D;

@ -596,8 +596,9 @@ namespace cv
////////////////////////////// Image processing //////////////////////////////
//! DST[x,y] = SRC[xmap[x,y],ymap[x,y]] with bilinear interpolation.
//! supports CV_8UC1, CV_8UC3 source types and CV_32FC1 map type
CV_EXPORTS void remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap);
//! supports CV_32FC1 map type
CV_EXPORTS void remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap,
int interpolation, int borderMode = BORDER_CONSTANT, const Scalar& borderValue = Scalar());
//! Does mean shift filtering on GPU.
CV_EXPORTS void meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
@ -761,10 +762,10 @@ namespace cv
CV_EXPORTS void upsample(const GpuMat& src, GpuMat &dst, Stream& stream = Stream::Null());
//! smoothes the source image and downsamples it
CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
//! upsamples the source image and then smoothes it
CV_EXPORTS void pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
CV_EXPORTS void pyrUp(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
//! performs linear blending of two images
//! to avoid accuracy errors sum of weigths shouldn't be very close to zero

@ -242,9 +242,9 @@ namespace filter_krnls
{
const T* srcCol = src.ptr() + x;
sDataColumn[ threadIdx.y * BLOCK_DIM_X] = b.at_low(y - BLOCK_DIM_Y, srcCol);
sDataColumn[(threadIdx.y + BLOCK_DIM_Y) * BLOCK_DIM_X] = b.at_high(y, srcCol);
sDataColumn[(threadIdx.y + BLOCK_DIM_Y * 2) * BLOCK_DIM_X] = b.at_high(y + BLOCK_DIM_Y, srcCol);
sDataColumn[ threadIdx.y * BLOCK_DIM_X] = b.at_low(y - BLOCK_DIM_Y, srcCol, src.step);
sDataColumn[(threadIdx.y + BLOCK_DIM_Y) * BLOCK_DIM_X] = b.at_high(y, srcCol, src.step);
sDataColumn[(threadIdx.y + BLOCK_DIM_Y * 2) * BLOCK_DIM_X] = b.at_high(y + BLOCK_DIM_Y, srcCol, src.step);
__syncthreads();
@ -273,7 +273,7 @@ namespace cv { namespace gpu { namespace filters
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
B<T> b(src.rows, src.step);
B<T> b(src.rows);
if (!b.is_range_safe(-BLOCK_DIM_Y, (grid.y + 1) * BLOCK_DIM_Y - 1))
{

@ -44,6 +44,8 @@
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/utility.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
@ -51,109 +53,119 @@ using namespace cv::gpu::device;
/////////////////////////////////// Remap ///////////////////////////////////////////////
namespace cv { namespace gpu { namespace imgproc
{
texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex_remap(0, cudaFilterModeLinear, cudaAddressModeWrap);
// cudaAddressModeClamp == BrdReplicate
/*texture<uchar, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_remap_uchar_LinearFilter(0, cudaFilterModeLinear, cudaAddressModeClamp);
__global__ void remap_1c(const float* mapx, const float* mapy, size_t map_step, uchar* out, size_t out_step, int width, int height)
__global__ void remap_uchar_LinearFilter(const PtrStepf mapx, const PtrStepf mapy, DevMem2D dst)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height)
{
int idx = y * (map_step >> 2) + x; /* map_step >> 2 <=> map_step / sizeof(float)*/
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
float xcoo = mapx[idx];
float ycoo = mapy[idx];
if (x < dst.cols && y < dst.rows)
{
const float xcoo = mapx.ptr(y)[x];
const float ycoo = mapy.ptr(y)[x];
out[y * out_step + x] = (unsigned char)(255.f * tex2D(tex_remap, xcoo, ycoo));
dst.ptr(y)[x] = 255.0f * tex2D(tex_remap_uchar_LinearFilter, xcoo, ycoo);
}
}
}*/
__global__ void remap_3c(const uchar* src, size_t src_step, const float* mapx, const float* mapy,
size_t map_step, uchar* dst, size_t dst_step, int width, int height)
{
template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, DevMem2D_<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height)
if (x < dst.cols && y < dst.rows)
{
const int idx = y * (map_step >> 2) + x; /* map_step >> 2 <=> map_step / sizeof(float)*/
const float xcoo = mapx[idx];
const float ycoo = mapy[idx];
uchar3 out = make_uchar3(0, 0, 0);
const float xcoo = mapx.ptr(y)[x];
const float ycoo = mapy.ptr(y)[x];
if (xcoo >= 0 && xcoo < width - 1 && ycoo >= 0 && ycoo < height - 1)
{
const int x1 = __float2int_rd(xcoo);
const int y1 = __float2int_rd(ycoo);
const int x2 = x1 + 1;
const int y2 = y1 + 1;
uchar src_reg = *(src + y1 * src_step + 3 * x1);
out.x += src_reg * (x2 - xcoo) * (y2 - ycoo);
src_reg = *(src + y1 * src_step + 3 * x1 + 1);
out.y += src_reg * (x2 - xcoo) * (y2 - ycoo);
src_reg = *(src + y1 * src_step + 3 * x1 + 2);
out.z += src_reg * (x2 - xcoo) * (y2 - ycoo);
src_reg = *(src + y1 * src_step + 3 * x2);
out.x += src_reg * (xcoo - x1) * (y2 - ycoo);
src_reg = *(src + y1 * src_step + 3 * x2 + 1);
out.y += src_reg * (xcoo - x1) * (y2 - ycoo);
src_reg = *(src + y1 * src_step + 3 * x2 + 2);
out.z += src_reg * (xcoo - x1) * (y2 - ycoo);
src_reg = *(src + y2 * src_step + 3 * x1);
out.x += src_reg * (x2 - xcoo) * (ycoo - y1);
src_reg = *(src + y2 * src_step + 3 * x1 + 1);
out.y += src_reg * (x2 - xcoo) * (ycoo - y1);
src_reg = *(src + y2 * src_step + 3 * x1 + 2);
out.z += src_reg * (x2 - xcoo) * (ycoo - y1);
src_reg = *(src + y2 * src_step + 3 * x2);
out.x += src_reg * (xcoo - x1) * (ycoo - y1);
src_reg = *(src + y2 * src_step + 3 * x2 + 1);
out.y += src_reg * (xcoo - x1) * (ycoo - y1);
src_reg = *(src + y2 * src_step + 3 * x2 + 2);
out.z += src_reg * (xcoo - x1) * (ycoo - y1);
}
/**(uchar3*)(dst + y * dst_step + 3 * x) = out;*/
*(dst + y * dst_step + 3 * x) = out.x;
*(dst + y * dst_step + 3 * x + 1) = out.y;
*(dst + y * dst_step + 3 * x + 2) = out.z;
dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
}
}
void remap_gpu_1c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst)
template <template <typename> class Filter, template <typename> class B, typename T>
void remap_caller(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, T borderValue)
{
dim3 threads(16, 16, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(dst.cols, threads.x);
grid.y = divUp(dst.rows, threads.y);
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
TextureBinder tex(&tex_remap, src);
B<T> brd(src.rows, src.cols, borderValue);
BorderReader< PtrStep_<T>, B<T> > brd_src(src, brd);
Filter< BorderReader< PtrStep_<T>, B<T> > > filter_src(brd_src);
remap_1c<<<grid, threads>>>(xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
void remap_gpu_3c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst)
#define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type, filter) \
template <> void remap_caller<filter, BrdReplicate>(const DevMem2D_<type>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<type>& dst, type) \
{ \
const dim3 block(16, 16); \
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
TextureBinder tex(&tex_remap_ ## type ## _ ## filter ## , src); \
remap_ ## type ## _ ## filter ## <<<grid, block>>>(mapx, mapy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
}
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar, LinearFilter)
#undef OPENCV_GPU_IMPLEMENT_REMAP_TEX
template <typename T> void remap_gpu(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst,
int interpolation, int borderMode, const double borderValue[4])
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(dst.cols, threads.x);
grid.y = divUp(dst.rows, threads.y);
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D_<T>& dst, T borderValue);
remap_3c<<<grid, threads>>>(src.data, src.step, xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
cudaSafeCall( cudaGetLastError() );
static const caller_t callers[2][3] =
{
{ remap_caller<PointFilter, BrdReflect101>, remap_caller<PointFilter, BrdReplicate>, remap_caller<PointFilter, BrdConstant> },
{ remap_caller<LinearFilter, BrdReflect101>, remap_caller<LinearFilter, BrdReplicate>, remap_caller<LinearFilter, BrdConstant> }
};
cudaSafeCall( cudaDeviceSynchronize() );
typename VecTraits<T>::elem_type brd[] = {(typename VecTraits<T>::elem_type)borderValue[0], (typename VecTraits<T>::elem_type)borderValue[1], (typename VecTraits<T>::elem_type)borderValue[2], (typename VecTraits<T>::elem_type)borderValue[3]};
callers[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), xmap, ymap, static_cast< DevMem2D_<T> >(dst), VecTraits<T>::make(brd));
}
template void remap_gpu<uchar >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<uchar2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<uchar3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<uchar4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<schar>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<char2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<char3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<char4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<ushort >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<ushort2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<ushort3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<ushort4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<short >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<short2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<short3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<short4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<uint >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<uint2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<uint3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<uint4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<int >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<int2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<int3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<int4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<float >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<float2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<float3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
template void remap_gpu<float4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
/////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
texture<uchar4, 2> tex_meanshift;
@ -541,9 +553,9 @@ namespace cv { namespace gpu { namespace imgproc
}
}
template <typename B>
template <typename BR, typename BC>
__global__ void cornerHarris_kernel(const int cols, const int rows, const int block_size, const float k,
PtrStep dst, B border_row, B border_col)
PtrStep dst, BR border_row, BC border_col)
{
const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -561,10 +573,10 @@ namespace cv { namespace gpu { namespace imgproc
for (int i = ibegin; i < iend; ++i)
{
int y = border_col.idx(i);
int y = border_col.idx_row(i);
for (int j = jbegin; j < jend; ++j)
{
int x = border_row.idx(j);
int x = border_row.idx_col(j);
float dx = tex2D(harrisDxTex, x, y);
float dy = tex2D(harrisDyTex, x, y);
a += dx * dx;
@ -596,7 +608,7 @@ namespace cv { namespace gpu { namespace imgproc
{
case BORDER_REFLECT101_GPU:
cornerHarris_kernel<<<grid, threads>>>(
cols, rows, block_size, k, dst, BrdReflect101(cols), BrdReflect101(rows));
cols, rows, block_size, k, dst, BrdRowReflect101<void>(cols), BrdColReflect101<void>(rows));
break;
case BORDER_REPLICATE_GPU:
harrisDxTex.addressMode[0] = cudaAddressModeClamp;
@ -656,9 +668,9 @@ namespace cv { namespace gpu { namespace imgproc
}
template <typename B>
template <typename BR, typename BC>
__global__ void cornerMinEigenVal_kernel(const int cols, const int rows, const int block_size,
PtrStep dst, B border_row, B border_col)
PtrStep dst, BR border_row, BC border_col)
{
const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -676,10 +688,10 @@ namespace cv { namespace gpu { namespace imgproc
for (int i = ibegin; i < iend; ++i)
{
int y = border_col.idx(i);
int y = border_col.idx_row(i);
for (int j = jbegin; j < jend; ++j)
{
int x = border_row.idx(j);
int x = border_row.idx_col(j);
float dx = tex2D(minEigenValDxTex, x, y);
float dy = tex2D(minEigenValDyTex, x, y);
a += dx * dx;
@ -713,7 +725,7 @@ namespace cv { namespace gpu { namespace imgproc
{
case BORDER_REFLECT101_GPU:
cornerMinEigenVal_kernel<<<grid, threads>>>(
cols, rows, block_size, dst, BrdReflect101(cols), BrdReflect101(rows));
cols, rows, block_size, dst, BrdRowReflect101<void>(cols), BrdColReflect101<void>(rows));
break;
case BORDER_REPLICATE_GPU:
minEigenValDxTex.addressMode[0] = cudaAddressModeClamp;
@ -981,7 +993,7 @@ namespace cv { namespace gpu { namespace imgproc
//////////////////////////////////////////////////////////////////////////
// pyrDown
template <typename T> __global__ void pyrDown(const PtrStep_<T> src, PtrStep_<T> dst, const BrdReflect101 rowBrd, const BrdReflect101 colBrd, int dst_cols)
template <typename T, typename B> __global__ void pyrDown(const PtrStep_<T> src, PtrStep_<T> dst, const B b, int dst_cols)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
@ -996,11 +1008,11 @@ namespace cv { namespace gpu { namespace imgproc
sum = VecTraits<value_type>::all(0);
sum = sum + 0.0625f * src.ptr(colBrd.idx(src_y - 2))[rowBrd.idx(x)];
sum = sum + 0.25f * src.ptr(colBrd.idx(src_y - 1))[rowBrd.idx(x)];
sum = sum + 0.375f * src.ptr(colBrd.idx(src_y ))[rowBrd.idx(x)];
sum = sum + 0.25f * src.ptr(colBrd.idx(src_y + 1))[rowBrd.idx(x)];
sum = sum + 0.0625f * src.ptr(colBrd.idx(src_y + 2))[rowBrd.idx(x)];
sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y - 1, x, src.data, src.step);
sum = sum + 0.375f * b.at(src_y , x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y + 1, x, src.data, src.step);
sum = sum + 0.0625f * b.at(src_y + 2, x, src.data, src.step);
smem[2 + threadIdx.x] = sum;
@ -1010,11 +1022,11 @@ namespace cv { namespace gpu { namespace imgproc
sum = VecTraits<value_type>::all(0);
sum = sum + 0.0625f * src.ptr(colBrd.idx(src_y - 2))[rowBrd.idx(left_x)];
sum = sum + 0.25f * src.ptr(colBrd.idx(src_y - 1))[rowBrd.idx(left_x)];
sum = sum + 0.375f * src.ptr(colBrd.idx(src_y ))[rowBrd.idx(left_x)];
sum = sum + 0.25f * src.ptr(colBrd.idx(src_y + 1))[rowBrd.idx(left_x)];
sum = sum + 0.0625f * src.ptr(colBrd.idx(src_y + 2))[rowBrd.idx(left_x)];
sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y - 1, left_x, src.data, src.step);
sum = sum + 0.375f * b.at(src_y , left_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y + 1, left_x, src.data, src.step);
sum = sum + 0.0625f * b.at(src_y + 2, left_x, src.data, src.step);
smem[threadIdx.x] = sum;
}
@ -1025,11 +1037,11 @@ namespace cv { namespace gpu { namespace imgproc
sum = VecTraits<value_type>::all(0);
sum = sum + 0.0625f * src.ptr(colBrd.idx(src_y - 2))[rowBrd.idx(right_x)];
sum = sum + 0.25f * src.ptr(colBrd.idx(src_y - 1))[rowBrd.idx(right_x)];
sum = sum + 0.375f * src.ptr(colBrd.idx(src_y ))[rowBrd.idx(right_x)];
sum = sum + 0.25f * src.ptr(colBrd.idx(src_y + 1))[rowBrd.idx(right_x)];
sum = sum + 0.0625f * src.ptr(colBrd.idx(src_y + 2))[rowBrd.idx(right_x)];
sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y - 1, right_x, src.data, src.step);
sum = sum + 0.375f * b.at(src_y , right_x, src.data, src.step);
sum = sum + 0.25f * b.at(src_y + 1, right_x, src.data, src.step);
sum = sum + 0.0625f * b.at(src_y + 2, right_x, src.data, src.step);
smem[4 + threadIdx.x] = sum;
}
@ -1055,58 +1067,68 @@ namespace cv { namespace gpu { namespace imgproc
}
}
template <typename T, int cn> void pyrDown_gpu(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
{
const dim3 block(256);
const dim3 grid(divUp(src.cols, block.x), dst.rows);
BrdReflect101 rowBrd(src.cols);
BrdReflect101 colBrd(src.rows);
B<T> b(src.rows, src.cols);
pyrDown<typename TypeVec<T, cn>::vec_type><<<grid, block, 0, stream>>>(
static_cast< DevMem2D_<typename TypeVec<T, cn>::vec_type> >(src),
static_cast< DevMem2D_<typename TypeVec<T, cn>::vec_type> >(dst),
rowBrd, colBrd, dst.cols);
pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void pyrDown_gpu<uchar, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrDown_gpu<uchar, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrDown_gpu<uchar, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrDown_gpu<uchar, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template <typename T, int cn> void pyrDown_gpu(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type type;
typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
static const caller_t callers[] =
{
pyrDown_caller<type, BrdReflect101>, pyrDown_caller<type, BrdReplicate>, pyrDown_caller<type, BrdConstant>
};
callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
}
template void pyrDown_gpu<uchar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrDown_gpu<schar, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrDown_gpu<schar, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrDown_gpu<schar, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrDown_gpu<schar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrDown_gpu<ushort, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrDown_gpu<ushort, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrDown_gpu<ushort, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrDown_gpu<ushort, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrDown_gpu<short, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrDown_gpu<short, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrDown_gpu<short, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrDown_gpu<short, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrDown_gpu<int, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrDown_gpu<int, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrDown_gpu<int, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrDown_gpu<int, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrDown_gpu<float, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrDown_gpu<float, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrDown_gpu<float, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrDown_gpu<float, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
//////////////////////////////////////////////////////////////////////////
// pyrUp
template <typename T> __global__ void pyrUp(const PtrStep_<T> src, DevMem2D_<T> dst, const BrdReflect101 rowBrd, const BrdReflect101 colBrd)
template <typename T, typename B> __global__ void pyrUp(const PtrStep_<T> src, DevMem2D_<T> dst, const B b)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
@ -1119,7 +1141,7 @@ namespace cv { namespace gpu { namespace imgproc
value_type sum;
if (threadIdx.x < 10 && threadIdx.y < 10)
smem1[threadIdx.y][threadIdx.x] = src.ptr(colBrd.idx(blockIdx.y * blockDim.y / 2 + threadIdx.y - 1))[rowBrd.idx(blockIdx.x * blockDim.x / 2 + threadIdx.x - 1)];
smem1[threadIdx.y][threadIdx.x] = b.at(blockIdx.y * blockDim.y / 2 + threadIdx.y - 1, blockIdx.x * blockDim.x / 2 + threadIdx.x - 1, src.data, src.step);
__syncthreads();
@ -1175,53 +1197,63 @@ namespace cv { namespace gpu { namespace imgproc
dst.ptr(y)[x] = saturate_cast<T>(4.0f * sum);
}
template <typename T, int cn> void pyrUp_gpu(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
template <typename T, template <typename> class B> void pyrUp_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
{
const dim3 block(16, 16);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReflect101 rowBrd(src.cols);
BrdReflect101 colBrd(src.rows);
B<T> b(src.rows, src.cols);
pyrUp<typename TypeVec<T, cn>::vec_type><<<grid, block, 0, stream>>>(
static_cast< DevMem2D_<typename TypeVec<T, cn>::vec_type> >(src),
static_cast< DevMem2D_<typename TypeVec<T, cn>::vec_type> >(dst),
rowBrd, colBrd);
pyrUp<T><<<grid, block, 0, stream>>>(src, dst, b);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void pyrUp_gpu<uchar, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrUp_gpu<uchar, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrUp_gpu<uchar, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrUp_gpu<uchar, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrUp_gpu<schar, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrUp_gpu<schar, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrUp_gpu<schar, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrUp_gpu<schar, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrUp_gpu<ushort, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrUp_gpu<ushort, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrUp_gpu<ushort, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrUp_gpu<ushort, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrUp_gpu<short, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrUp_gpu<short, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrUp_gpu<short, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrUp_gpu<short, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrUp_gpu<int, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrUp_gpu<int, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrUp_gpu<int, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrUp_gpu<int, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrUp_gpu<float, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrUp_gpu<float, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrUp_gpu<float, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template void pyrUp_gpu<float, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template <typename T, int cn> void pyrUp_gpu(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type type;
typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
static const caller_t callers[] =
{
pyrUp_caller<type, BrdReflect101>, pyrUp_caller<type, BrdReplicate>, pyrUp_caller<type, BrdConstant>
};
callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
}
template void pyrUp_gpu<uchar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<uchar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<uchar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<uchar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
//////////////////////////////////////////////////////////////////////////
// buildWarpMaps

@ -675,31 +675,29 @@ namespace cv { namespace gpu { namespace surf
3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f
};
__device__ __forceinline__ unsigned char calcWin(int i, int j, float centerX, float centerY, float win_offset, float cos_dir, float sin_dir)
struct WinReader
{
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
typedef uchar elem_type;
return tex2D(imgTex, pixel_x, pixel_y);
}
__device__ unsigned char calcPATCH(int i1, int j1, float centerX, float centerY, float win_offset, float cos_dir, float sin_dir, int win_size)
{
/* Scale the window to size PATCH_SZ so each pixel's size is s. This
makes calculating the gradients with wavelets of size 2s easy */
const float icoo = ((float)i1 / (PATCH_SZ + 1)) * win_size;
const float jcoo = ((float)j1 / (PATCH_SZ + 1)) * win_size;
__device__ __forceinline__ WinReader(float centerX_, float centerY_, float win_offset_, float cos_dir_, float sin_dir_) :
centerX(centerX_), centerY(centerY_), win_offset(win_offset_), cos_dir(cos_dir_), sin_dir(sin_dir_)
{
}
const int i = __float2int_rd(icoo);
const int j = __float2int_rd(jcoo);
__device__ __forceinline__ uchar operator ()(int i, int j) const
{
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
float res = calcWin(i, j, centerX, centerY, win_offset, cos_dir, sin_dir) * (i + 1 - icoo) * (j + 1 - jcoo);
res += calcWin(i + 1, j, centerX, centerY, win_offset, cos_dir, sin_dir) * (icoo - i) * (j + 1 - jcoo);
res += calcWin(i + 1, j + 1, centerX, centerY, win_offset, cos_dir, sin_dir) * (icoo - i) * (jcoo - j);
res += calcWin(i, j + 1, centerX, centerY, win_offset, cos_dir, sin_dir) * (i + 1 - icoo) * (jcoo - j);
return tex2D(imgTex, pixel_x, pixel_y);
}
return saturate_cast<unsigned char>(res);
}
float centerX;
float centerY;
float win_offset;
float cos_dir;
float sin_dir;
};
__device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25],
const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
@ -732,7 +730,13 @@ namespace cv { namespace gpu { namespace surf
const int xIndex = xBlock * 5 + threadIdx.x;
const int yIndex = yBlock * 5 + threadIdx.y;
s_PATCH[threadIdx.y][threadIdx.x] = calcPATCH(yIndex, xIndex, centerX, centerY, win_offset, cos_dir, sin_dir, win_size);
const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;
const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;
LinearFilter<WinReader> filter(WinReader(centerX, centerY, win_offset, cos_dir, sin_dir));
s_PATCH[threadIdx.y][threadIdx.x] = filter(icoo, jcoo);
__syncthreads();
if (threadIdx.x < 5 && threadIdx.y < 5)

@ -885,7 +885,7 @@ void cv::gpu::GpuMat::release()
if( refcount && CV_XADD(refcount, -1) == 1 )
{
fastFree(refcount);
cudaSafeCall( cudaFree(datastart) );
cudaFree(datastart);
}
data = datastart = dataend = 0;
step = rows = cols = 0;

@ -47,7 +47,7 @@ using namespace cv::gpu;
#if !defined (HAVE_CUDA)
void cv::gpu::remap(const GpuMat&, GpuMat&, const GpuMat&, const GpuMat&){ throw_nogpu(); }
void cv::gpu::remap(const GpuMat&, GpuMat&, const GpuMat&, const GpuMat&, int, int, const Scalar&){ throw_nogpu(); }
void cv::gpu::meanShiftFiltering(const GpuMat&, GpuMat&, int, int, TermCriteria) { throw_nogpu(); }
void cv::gpu::meanShiftProc(const GpuMat&, GpuMat&, GpuMat&, int, int, TermCriteria) { throw_nogpu(); }
void cv::gpu::drawColorDisp(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
@ -92,8 +92,8 @@ void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_nogp
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&) { throw_nogpu(); }
void cv::gpu::downsample(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::upsample(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::pyrDown(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::pyrUp(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::pyrDown(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
void cv::gpu::pyrUp(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_nogpu(); }
void cv::gpu::Canny(const GpuMat&, CannyBuf&, GpuMat&, double, double, int, bool) { throw_nogpu(); }
void cv::gpu::Canny(const GpuMat&, const GpuMat&, GpuMat&, double, double, bool) { throw_nogpu(); }
@ -104,39 +104,52 @@ void cv::gpu::CannyBuf::release() { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */
////////////////////////////////////////////////////////////////////////
// remap
namespace cv { namespace gpu { namespace imgproc
{
void remap_gpu_1c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst);
void remap_gpu_3c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst);
extern "C" void meanShiftFiltering_gpu(const DevMem2D& src, DevMem2D dst, int sp, int sr, int maxIter, float eps);
extern "C" void meanShiftProc_gpu(const DevMem2D& src, DevMem2D dstr, DevMem2D dstsp, int sp, int sr, int maxIter, float eps);
template <typename T> void remap_gpu(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst,
int interpolation, int borderMode, const double borderValue[4]);
}}}
void drawColorDisp_gpu(const DevMem2D& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream);
void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream);
void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap, int interpolation, int borderMode, const Scalar& borderValue)
{
using namespace cv::gpu::imgproc;
void reprojectImageTo3D_gpu(const DevMem2D& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);
void reprojectImageTo3D_gpu(const DevMem2D_<short>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);
}}}
typedef void (*caller_t)(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);;
static const caller_t callers[6][4] =
{
{remap_gpu<uchar>, remap_gpu<uchar2>, remap_gpu<uchar3>, remap_gpu<uchar4>},
{remap_gpu<schar>, remap_gpu<char2>, remap_gpu<char3>, remap_gpu<char4>},
{remap_gpu<ushort>, remap_gpu<ushort2>, remap_gpu<ushort3>, remap_gpu<ushort4>},
{remap_gpu<short>, remap_gpu<short2>, remap_gpu<short3>, remap_gpu<short4>},
{remap_gpu<int>, remap_gpu<int2>, remap_gpu<int3>, remap_gpu<int4>},
{remap_gpu<float>, remap_gpu<float2>, remap_gpu<float3>, remap_gpu<float4>}
};
////////////////////////////////////////////////////////////////////////
// remap
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
CV_Assert(xmap.type() == CV_32F && ymap.type() == CV_32F && xmap.size() == ymap.size());
void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap)
{
typedef void (*remap_gpu_t)(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst);
static const remap_gpu_t callers[] = {imgproc::remap_gpu_1c, 0, imgproc::remap_gpu_3c};
CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR);
CV_Assert((src.type() == CV_8U || src.type() == CV_8UC3) && xmap.type() == CV_32F && ymap.type() == CV_32F);
CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT);
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
dst.create(xmap.size(), src.type());
callers[src.channels() - 1](src, xmap, ymap, dst);
callers[src.depth()][src.channels() - 1](src, xmap, ymap, dst, interpolation, gpuBorderType, borderValue.val);
}
////////////////////////////////////////////////////////////////////////
// meanShiftFiltering_GPU
namespace cv { namespace gpu { namespace imgproc
{
extern "C" void meanShiftFiltering_gpu(const DevMem2D& src, DevMem2D dst, int sp, int sr, int maxIter, float eps);
}}}
void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, TermCriteria criteria)
{
if( src.empty() )
@ -163,6 +176,11 @@ void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
////////////////////////////////////////////////////////////////////////
// meanShiftProc_GPU
namespace cv { namespace gpu { namespace imgproc
{
extern "C" void meanShiftProc_gpu(const DevMem2D& src, DevMem2D dstr, DevMem2D dstsp, int sp, int sr, int maxIter, float eps);
}}}
void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, TermCriteria criteria)
{
if( src.empty() )
@ -190,6 +208,12 @@ void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int
////////////////////////////////////////////////////////////////////////
// drawColorDisp
namespace cv { namespace gpu { namespace imgproc
{
void drawColorDisp_gpu(const DevMem2D& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream);
void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream);
}}}
namespace
{
template <typename T>
@ -215,6 +239,12 @@ void cv::gpu::drawColorDisp(const GpuMat& src, GpuMat& dst, int ndisp, Stream& s
////////////////////////////////////////////////////////////////////////
// reprojectImageTo3D
namespace cv { namespace gpu { namespace imgproc
{
void reprojectImageTo3D_gpu(const DevMem2D& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);
void reprojectImageTo3D_gpu(const DevMem2D_<short>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);
}}}
namespace
{
template <typename T>
@ -1596,14 +1626,14 @@ void cv::gpu::upsample(const GpuMat& src, GpuMat& dst, Stream& stream)
namespace cv { namespace gpu { namespace imgproc
{
template <typename T, int cn> void pyrDown_gpu(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template <typename T, int cn> void pyrDown_gpu(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
}}}
void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream)
void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, int borderType, Stream& stream)
{
using namespace cv::gpu::imgproc;
typedef void (*func_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
typedef void (*func_t)(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
static const func_t funcs[6][4] =
{
@ -1617,9 +1647,13 @@ void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream)
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT);
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());
funcs[src.depth()][src.channels() - 1](src, dst, StreamAccessor::getStream(stream));
funcs[src.depth()][src.channels() - 1](src, dst, gpuBorderType, StreamAccessor::getStream(stream));
}
@ -1628,14 +1662,14 @@ void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream)
namespace cv { namespace gpu { namespace imgproc
{
template <typename T, int cn> void pyrUp_gpu(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
template <typename T, int cn> void pyrUp_gpu(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
}}}
void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream)
void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, int borderType, Stream& stream)
{
using namespace cv::gpu::imgproc;
typedef void (*func_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
typedef void (*func_t)(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
static const func_t funcs[6][4] =
{
@ -1649,9 +1683,13 @@ void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream)
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT);
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
dst.create(src.rows*2, src.cols*2, src.type());
funcs[src.depth()][src.channels() - 1](src, dst, StreamAccessor::getStream(stream));
funcs[src.depth()][src.channels() - 1](src, dst, gpuBorderType, StreamAccessor::getStream(stream));
}

@ -48,82 +48,131 @@
namespace cv { namespace gpu { namespace device
{
struct BrdReflect101
//////////////////////////////////////////////////////////////
// BrdConstant
template <typename D> struct BrdRowConstant
{
explicit __host__ __device__ __forceinline__ BrdReflect101(int len): last(len - 1) {}
typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdRowConstant(int width_, const D& val_ = VecTraits<D>::all(0)) : width(width_), val(val_) {}
__device__ __forceinline__ int idx_low(int i) const
template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const
{
return abs(i);
return x >= 0 ? saturate_cast<D>(data[x]) : val;
}
__device__ __forceinline__ int idx_high(int i) const
template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const
{
return last - abs(last - i);
return x < width ? saturate_cast<D>(data[x]) : val;
}
__device__ __forceinline__ int idx(int i) const
template <typename T> __device__ __forceinline__ D at(int x, const T* data) const
{
return idx_low(idx_high(i));
return (x >= 0 && x < width) ? saturate_cast<D>(data[x]) : val;
}
__host__ __device__ __forceinline__ bool is_range_safe(int mini, int maxi) const
{
return -last <= mini && maxi <= 2 * last;
return true;
}
const int last;
const int width;
const D val;
};
template <typename D> struct BrdRowReflect101 : BrdReflect101
template <typename D> struct BrdColConstant
{
explicit __host__ __device__ __forceinline__ BrdRowReflect101(int len): BrdReflect101(len) {}
typedef D result_type;
template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const
explicit __host__ __device__ __forceinline__ BrdColConstant(int height_, const D& val_ = VecTraits<D>::all(0)) : height(height_), val(val_) {}
template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const
{
return saturate_cast<D>(data[idx_low(i)]);
return y >= 0 ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;
}
template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const
template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const
{
return saturate_cast<D>(data[idx_high(i)]);
return y < height ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;
}
template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const
{
return (y >= 0 && y < height) ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;
}
__host__ __device__ __forceinline__ bool is_range_safe(int mini, int maxi) const
{
return true;
}
const int height;
const D val;
};
template <typename D> struct BrdColReflect101 : BrdReflect101
template <typename D> struct BrdConstant
{
__host__ __device__ __forceinline__ BrdColReflect101(int len, size_t step): BrdReflect101(len), step(step) {}
typedef D result_type;
template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const
__host__ __device__ __forceinline__ BrdConstant(int height_, int width_, const D& val_ = VecTraits<D>::all(0)) :
height(height_), width(width_), val(val_)
{
return saturate_cast<D>(*(const D*)((const char*)data + idx_low(i)*step));
}
template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const
template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const
{
return saturate_cast<D>(*(const D*)((const char*)data + idx_high(i)*step));
return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(((const T*)((const uchar*)data + y * step))[x]) : val;
}
const size_t step;
template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const
{
return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(src(y, x)) : val;
}
const int height;
const int width;
const D val;
};
struct BrdReplicate
//////////////////////////////////////////////////////////////
// BrdReplicate
template <typename D> struct BrdRowReplicate
{
explicit __host__ __device__ __forceinline__ BrdReplicate(int len): last(len - 1) {}
typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdRowReplicate(int width) : last_col(width - 1) {}
template <typename U> __host__ __device__ __forceinline__ BrdRowReplicate(int width, U) : last_col(width - 1) {}
__device__ __forceinline__ int idx_col_low(int x) const
{
return ::max(x, 0);
}
__device__ __forceinline__ int idx_col_high(int x) const
{
return ::min(x, last_col);
}
__device__ __forceinline__ int idx_col(int x) const
{
return idx_col_low(idx_col_high(x));
}
__device__ __forceinline__ int idx_low(int i) const
template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const
{
return ::max(i, 0);
return saturate_cast<D>(data[idx_col_low(x)]);
}
__device__ __forceinline__ int idx_high(int i) const
template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const
{
return ::min(i, last);
return saturate_cast<D>(data[idx_col_high(x)]);
}
__device__ __forceinline__ int idx(int i) const
template <typename T> __device__ __forceinline__ D at(int x, const T* data) const
{
return idx_low(idx_high(i));
return saturate_cast<D>(data[idx_col(x)]);
}
bool is_range_safe(int mini, int maxi) const
@ -131,103 +180,328 @@ namespace cv { namespace gpu { namespace device
return true;
}
const int last;
const int last_col;
};
template <typename D> struct BrdRowReplicate : BrdReplicate
template <typename D> struct BrdColReplicate
{
explicit __host__ __device__ __forceinline__ BrdRowReplicate(int len): BrdReplicate(len) {}
typedef D result_type;
template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const
explicit __host__ __device__ __forceinline__ BrdColReplicate(int height) : last_row(height - 1) {}
template <typename U> __host__ __device__ __forceinline__ BrdColReplicate(int height, U) : last_row(height - 1) {}
__device__ __forceinline__ int idx_row_low(int y) const
{
return saturate_cast<D>(data[idx_low(i)]);
return ::max(y, 0);
}
template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const
__device__ __forceinline__ int idx_row_high(int y) const
{
return saturate_cast<D>(data[idx_high(i)]);
return ::min(y, last_row);
}
__device__ __forceinline__ int idx_row(int y) const
{
return idx_row_low(idx_row_high(y));
}
template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const
{
return saturate_cast<D>(*(const T*)((const char*)data + idx_row_low(y) * step));
}
template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const
{
return saturate_cast<D>(*(const T*)((const char*)data + idx_row_high(y) * step));
}
};
template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const
{
return saturate_cast<D>(*(const T*)((const char*)data + idx_row(y) * step));
}
template <typename D> struct BrdColReplicate : BrdReplicate
bool is_range_safe(int mini, int maxi) const
{
return true;
}
const int last_row;
};
template <typename D> struct BrdReplicate
{
__host__ __device__ __forceinline__ BrdColReplicate(int len, size_t step): BrdReplicate(len), step(step) {}
typedef D result_type;
__host__ __device__ __forceinline__ BrdReplicate(int height, int width) :
last_row(height - 1), last_col(width - 1)
{
}
template <typename U>
__host__ __device__ __forceinline__ BrdReplicate(int height, int width, U) :
last_row(height - 1), last_col(width - 1)
{
}
__device__ __forceinline__ int idx_row_low(int y) const
{
return ::max(y, 0);
}
__device__ __forceinline__ float idx_row_low(float y) const
{
return ::fmax(y, 0.0f);
}
__device__ __forceinline__ int idx_row_high(int y) const
{
return ::min(y, last_row);
}
__device__ __forceinline__ float idx_row_high(float y) const
{
return ::fmin(y, last_row);
}
template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const
__device__ __forceinline__ int idx_row(int y) const
{
return idx_row_low(idx_row_high(y));
}
__device__ __forceinline__ float idx_row(float y) const
{
return idx_row_low(idx_row_high(y));
}
__device__ __forceinline__ int idx_col_low(int x) const
{
return ::max(x, 0);
}
__device__ __forceinline__ float idx_col_low(float x) const
{
return ::fmax(x, 0);
}
__device__ __forceinline__ int idx_col_high(int x) const
{
return ::min(x, last_col);
}
__device__ __forceinline__ float idx_col_high(float x) const
{
return ::fmin(x, last_col);
}
__device__ __forceinline__ int idx_col(int x) const
{
return idx_col_low(idx_col_high(x));
}
__device__ __forceinline__ float idx_col(float x) const
{
return saturate_cast<D>(*(const D*)((const char*)data + idx_low(i)*step));
return idx_col_low(idx_col_high(x));
}
template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const
template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const
{
return saturate_cast<D>(*(const D*)((const char*)data + idx_high(i)*step));
return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);
}
const size_t step;
template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const
{
return saturate_cast<D>(src(idx_row(y), idx_col(x)));
}
const int last_row;
const int last_col;
};
template <typename D> struct BrdRowConstant
//////////////////////////////////////////////////////////////
// BrdReflect101
template <typename D> struct BrdRowReflect101
{
explicit __host__ __device__ __forceinline__ BrdRowConstant(int len_, const D& val_ = VecTraits<D>::all(0)): len(len_), val(val_) {}
typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdRowReflect101(int width) : last_col(width - 1) {}
template <typename U> __host__ __device__ __forceinline__ BrdRowReflect101(int width, U) : last_col(width - 1) {}
template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const
__device__ __forceinline__ int idx_col_low(int x) const
{
return i >= 0 ? saturate_cast<D>(data[i]) : val;
return ::abs(x);
}
template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const
__device__ __forceinline__ int idx_col_high(int x) const
{
return i < len ? saturate_cast<D>(data[i]) : val;
return last_col - ::abs(last_col - x);
}
__device__ __forceinline__ int idx_col(int x) const
{
return idx_col_low(idx_col_high(x));
}
template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const
{
return saturate_cast<D>(data[idx_col_low(x)]);
}
template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const
{
return saturate_cast<D>(data[idx_col_high(x)]);
}
template <typename T> __device__ __forceinline__ D at(int x, const T* data) const
{
return saturate_cast<D>(data[idx_col(x)]);
}
__host__ __device__ __forceinline__ bool is_range_safe(int mini, int maxi) const
{
return true;
return -last_col <= mini && maxi <= 2 * last_col;
}
const int len;
const D val;
const int last_col;
};
template <typename D> struct BrdColConstant
template <typename D> struct BrdColReflect101
{
__host__ __device__ __forceinline__ BrdColConstant(int len_, size_t step_, const D& val_ = VecTraits<D>::all(0)): len(len_), step(step_), val(val_) {}
typedef D result_type;
template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const
explicit __host__ __device__ __forceinline__ BrdColReflect101(int height) : last_row(height - 1) {}
template <typename U> __host__ __device__ __forceinline__ BrdColReflect101(int height, U) : last_row(height - 1) {}
__device__ __forceinline__ int idx_row_low(int y) const
{
return ::abs(y);
}
__device__ __forceinline__ int idx_row_high(int y) const
{
return i >= 0 ? saturate_cast<D>(*(const D*)((const char*)data + i*step)) : val;
return last_row - ::abs(last_row - y);
}
template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const
__device__ __forceinline__ int idx_row(int y) const
{
return i < len ? saturate_cast<D>(*(const D*)((const char*)data + i*step)) : val;
return idx_row_low(idx_row_high(y));
}
template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const
{
return saturate_cast<D>(*(const D*)((const char*)data + idx_row_low(y) * step));
}
template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const
{
return saturate_cast<D>(*(const D*)((const char*)data + idx_row_high(y) * step));
}
template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const
{
return saturate_cast<D>(*(const D*)((const char*)data + idx_row(y) * step));
}
__host__ __device__ __forceinline__ bool is_range_safe(int mini, int maxi) const
{
return true;
return -last_row <= mini && maxi <= 2 * last_row;
}
const int len;
const size_t step;
const D val;
const int last_row;
};
template <typename OutT> struct BrdConstant
template <typename D> struct BrdReflect101
{
__host__ __device__ __forceinline__ BrdConstant(int w, int h, const OutT &val = VecTraits<OutT>::all(0)) : w(w), h(h), val(val) {}
typedef D result_type;
__host__ __device__ __forceinline__ BrdReflect101(int height, int width) :
last_row(height - 1), last_col(width - 1)
{
}
template <typename U>
__host__ __device__ __forceinline__ BrdReflect101(int height, int width, U) :
last_row(height - 1), last_col(width - 1)
{
}
__device__ __forceinline__ int idx_row_low(int y) const
{
return ::abs(y);
}
__device__ __forceinline__ float idx_row_low(float y) const
{
return ::fabs(y);
}
__device__ __forceinline__ int idx_row_high(int y) const
{
return last_row - ::abs(last_row - y);
}
__device__ __forceinline__ float idx_row_high(float y) const
{
return last_row - ::fabs(last_row - y);
}
__device__ __forceinline__ int idx_row(int y) const
{
return idx_row_low(idx_row_high(y));
}
__device__ __forceinline__ float idx_row(float y) const
{
return idx_row_low(idx_row_high(y));
}
__device__ __forceinline__ int idx_col_low(int x) const
{
return ::abs(x);
}
__device__ __forceinline__ float idx_col_low(float x) const
{
return ::fabs(x);
}
__device__ __forceinline__ int idx_col_high(int x) const
{
return last_col - ::abs(last_col - x);
}
__device__ __forceinline__ float idx_col_high(float x) const
{
return last_col - ::fabs(last_col - x);
}
__device__ __forceinline__ int idx_col(int x) const
{
return idx_col_low(idx_col_high(x));
}
__device__ __forceinline__ float idx_col(float x) const
{
return idx_col_low(idx_col_high(x));
}
template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const
{
return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);
}
template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const
{
return saturate_cast<D>(src(idx_row(y), idx_col(x)));
}
const int last_row;
const int last_col;
};
//////////////////////////////////////////////////////////////
// BorderReader
template <typename Ptr2D, typename B> struct BorderReader
{
typedef typename B::result_type elem_type;
typedef typename Ptr2D::index_type index_type;
__host__ __device__ __forceinline__ BorderReader(const Ptr2D& ptr_, const B& b_) : ptr(ptr_), b(b_) {}
__device__ __forceinline__ OutT at(int x, int y, const uchar* data, int step) const
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const
{
if (x >= 0 && x <= w - 1 && y >= 0 && y <= h - 1)
return ((const OutT*)(data + y * step))[x];
return val;
return b.at(y, x, ptr);
}
const int w;
const int h;
OutT val;
const Ptr2D ptr;
const B b;
};
}}}

@ -309,7 +309,6 @@ namespace cv { namespace gpu { namespace device
U vec1Vals[MAX_LEN / THREAD_DIM];
};
///////////////////////////////////////////////////////////////////////////////
// Solve linear system
@ -364,6 +363,60 @@ namespace cv { namespace gpu { namespace device
return false;
}
///////////////////////////////////////////////////////////////////////////////
// Filters
template <typename Ptr2D> struct PointFilter
{
typedef typename Ptr2D::elem_type elem_type;
typedef float index_type;
explicit __host__ __device__ __forceinline__ PointFilter(const Ptr2D& src_) : src(src_) {}
__device__ __forceinline__ elem_type operator ()(float y, float x) const
{
return src(__float2int_rn(y), __float2int_rn(x));
}
const Ptr2D src;
};
template <typename Ptr2D> struct LinearFilter
{
typedef typename Ptr2D::elem_type elem_type;
typedef float index_type;
explicit __host__ __device__ __forceinline__ LinearFilter(const Ptr2D& src_) : src(src_) {}
__device__ __forceinline__ elem_type operator ()(float y, float x) const
{
typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;
work_type out = VecTraits<work_type>::all(0);
const int x1 = __float2int_rd(x);
const int y1 = __float2int_rd(y);
const int x2 = x1 + 1;
const int y2 = y1 + 1;
elem_type src_reg = src(y1, x1);
out = out + src_reg * ((x2 - x) * (y2 - y));
src_reg = src(y1, x2);
out = out + src_reg * ((x - x1) * (y2 - y));
src_reg = src(y2, x1);
out = out + src_reg * ((x2 - x) * (y - y1));
src_reg = src(y2, x2);
out = out + src_reg * ((x - x1) * (y - y1));
return saturate_cast<elem_type>(out);
}
const Ptr2D src;
};
}}}
#endif // __OPENCV_GPU_UTILITY_HPP__

@ -166,6 +166,7 @@ namespace cv { namespace gpu { namespace device
enum {cn=1}; \
static __device__ __host__ __forceinline__ type all(type v) {return v;} \
static __device__ __host__ __forceinline__ type make(type x) {return x;} \
static __device__ __host__ __forceinline__ type make(const type* v) {return *v;} \
}; \
template<> struct VecTraits<type ## 1> \
{ \
@ -173,6 +174,7 @@ namespace cv { namespace gpu { namespace device
enum {cn=1}; \
static __device__ __host__ __forceinline__ type ## 1 all(type v) {return make_ ## type ## 1(v);} \
static __device__ __host__ __forceinline__ type ## 1 make(type x) {return make_ ## type ## 1(x);} \
static __device__ __host__ __forceinline__ type ## 1 make(const type* v) {return make_ ## type ## 1(*v);} \
}; \
template<> struct VecTraits<type ## 2> \
{ \
@ -180,6 +182,7 @@ namespace cv { namespace gpu { namespace device
enum {cn=2}; \
static __device__ __host__ __forceinline__ type ## 2 all(type v) {return make_ ## type ## 2(v, v);} \
static __device__ __host__ __forceinline__ type ## 2 make(type x, type y) {return make_ ## type ## 2(x, y);} \
static __device__ __host__ __forceinline__ type ## 2 make(const type* v) {return make_ ## type ## 2(v[0], v[1]);} \
}; \
template<> struct VecTraits<type ## 3> \
{ \
@ -187,6 +190,7 @@ namespace cv { namespace gpu { namespace device
enum {cn=3}; \
static __device__ __host__ __forceinline__ type ## 3 all(type v) {return make_ ## type ## 3(v, v, v);} \
static __device__ __host__ __forceinline__ type ## 3 make(type x, type y, type z) {return make_ ## type ## 3(x, y, z);} \
static __device__ __host__ __forceinline__ type ## 3 make(const type* v) {return make_ ## type ## 3(v[0], v[1], v[2]);} \
}; \
template<> struct VecTraits<type ## 4> \
{ \
@ -194,6 +198,7 @@ namespace cv { namespace gpu { namespace device
enum {cn=4}; \
static __device__ __host__ __forceinline__ type ## 4 all(type v) {return make_ ## type ## 4(v, v, v, v);} \
static __device__ __host__ __forceinline__ type ## 4 make(type x, type y, type z, type w) {return make_ ## type ## 4(x, y, z, w);} \
static __device__ __host__ __forceinline__ type ## 4 make(const type* v) {return make_ ## type ## 4(v[0], v[1], v[2], v[3]);} \
}; \
template<> struct VecTraits<type ## 8> \
{ \
@ -201,10 +206,10 @@ namespace cv { namespace gpu { namespace device
enum {cn=8}; \
static __device__ __host__ __forceinline__ type ## 8 all(type v) {return make_ ## type ## 8(v, v, v, v, v, v, v, v);} \
static __device__ __host__ __forceinline__ type ## 8 make(type a0, type a1, type a2, type a3, type a4, type a5, type a6, type a7) {return make_ ## type ## 8(a0, a1, a2, a3, a4, a5, a6, a7);} \
static __device__ __host__ __forceinline__ type ## 8 make(const type* v) {return make_ ## type ## 8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);} \
};
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uchar)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(char)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(ushort)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(short)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(int)
@ -214,12 +219,61 @@ namespace cv { namespace gpu { namespace device
#undef OPENCV_GPU_IMPLEMENT_VEC_TRAITS
template<> struct VecTraits<char>
{
typedef char elem_type;
enum {cn=1};
static __device__ __host__ __forceinline__ char all(char v) {return v;}
static __device__ __host__ __forceinline__ char make(char x) {return x;}
static __device__ __host__ __forceinline__ char make(const char* x) {return *x;}
};
template<> struct VecTraits<schar>
{
typedef schar elem_type;
enum {cn=1};
static __device__ __host__ __forceinline__ schar all(schar v) {return v;}
static __device__ __host__ __forceinline__ schar make(schar x) {return x;}
static __device__ __host__ __forceinline__ schar make(const schar* x) {return *x;}
};
template<> struct VecTraits<char1>
{
typedef schar elem_type;
enum {cn=1};
static __device__ __host__ __forceinline__ char1 all(schar v) {return make_char1(v);}
static __device__ __host__ __forceinline__ char1 make(schar x) {return make_char1(x);}
static __device__ __host__ __forceinline__ char1 make(const schar* v) {return make_char1(v[0]);}
};
template<> struct VecTraits<char2>
{
typedef schar elem_type;
enum {cn=2};
static __device__ __host__ __forceinline__ char2 all(schar v) {return make_char2(v, v);}
static __device__ __host__ __forceinline__ char2 make(schar x, schar y) {return make_char2(x, y);}
static __device__ __host__ __forceinline__ char2 make(const schar* v) {return make_char2(v[0], v[1]);}
};
template<> struct VecTraits<char3>
{
typedef schar elem_type;
enum {cn=3};
static __device__ __host__ __forceinline__ char3 all(schar v) {return make_char3(v, v, v);}
static __device__ __host__ __forceinline__ char3 make(schar x, schar y, schar z) {return make_char3(x, y, z);}
static __device__ __host__ __forceinline__ char3 make(const schar* v) {return make_char3(v[0], v[1], v[2]);}
};
template<> struct VecTraits<char4>
{
typedef schar elem_type;
enum {cn=4};
static __device__ __host__ __forceinline__ char4 all(schar v) {return make_char4(v, v, v, v);}
static __device__ __host__ __forceinline__ char4 make(schar x, schar y, schar z, schar w) {return make_char4(x, y, z, w);}
static __device__ __host__ __forceinline__ char4 make(const schar* v) {return make_char4(v[0], v[1], v[2], v[3]);}
};
template<> struct VecTraits<char8>
{
typedef schar elem_type;
enum {cn=8};
static __device__ __host__ __forceinline__ char8 all(schar v) {return make_char8(v, v, v, v, v, v, v, v);}
static __device__ __host__ __forceinline__ char8 make(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7) {return make_char8(a0, a1, a2, a3, a4, a5, a6, a7);}
static __device__ __host__ __forceinline__ char8 make(const schar* v) {return make_char8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);}
};
}}}

@ -181,15 +181,18 @@ INSTANTIATE_TEST_CASE_P(ImgProc, Resize, testing::Combine(
///////////////////////////////////////////////////////////////////////////////////////////////////////
// remap
struct Remap : testing::TestWithParam< std::tr1::tuple<cv::gpu::DeviceInfo, int> >
struct Remap : testing::TestWithParam< std::tr1::tuple<cv::gpu::DeviceInfo, int, int, int> >
{
cv::gpu::DeviceInfo devInfo;
int type;
int interpolation;
int borderType;
cv::Size size;
cv::Mat src;
cv::Mat xmap;
cv::Mat ymap;
cv::Scalar borderValue;
cv::Mat dst_gold;
@ -197,43 +200,83 @@ struct Remap : testing::TestWithParam< std::tr1::tuple<cv::gpu::DeviceInfo, int>
{
devInfo = std::tr1::get<0>(GetParam());
type = std::tr1::get<1>(GetParam());
interpolation = std::tr1::get<2>(GetParam());
borderType = std::tr1::get<3>(GetParam());
cv::gpu::setDevice(devInfo.deviceID());
cv::RNG& rng = cvtest::TS::ptr()->get_rng();
size = cv::Size(rng.uniform(20, 150), rng.uniform(20, 150));
size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200));
src = cvtest::randomMat(rng, size, type, 0.0, 127.0, false);
xmap = cvtest::randomMat(rng, size, CV_32FC1, 0.0, src.cols - 1, false);
ymap = cvtest::randomMat(rng, size, CV_32FC1, 0.0, src.rows - 1, false);
src = cvtest::randomMat(rng, size, type, 0.0, 256.0, false);
xmap.create(size, CV_32FC1);
ymap.create(size, CV_32FC1);
for (int y = 0; y < src.rows; ++y)
{
float* xmap_row = xmap.ptr<float>(y);
float* ymap_row = ymap.ptr<float>(y);
for (int x = 0; x < src.cols; ++x)
{
xmap_row[x] = src.cols - 1 - x;
ymap_row[x] = src.rows - 1 - y;
}
}
borderValue[0] = rng.uniform(0.0, 256.0);
borderValue[1] = rng.uniform(0.0, 256.0);
borderValue[2] = rng.uniform(0.0, 256.0);
borderValue[3] = rng.uniform(0.0, 256.0);
cv::remap(src, dst_gold, xmap, ymap, cv::INTER_LINEAR, cv::BORDER_WRAP);
cv::remap(src, dst_gold, xmap, ymap, interpolation, borderType, borderValue);
}
};
TEST_P(Remap, Accuracy)
{
static const char* interpolations_str[] = {"INTER_NEAREST", "INTER_LINEAR", "INTER_CUBIC"};
static const char* borderTypes_str[] = {"BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101"};
const char* interpolationStr = interpolations_str[interpolation];
const char* borderTypeStr = borderTypes_str[borderType];
PRINT_PARAM(devInfo);
PRINT_TYPE(type);
PRINT_PARAM(interpolationStr);
PRINT_PARAM(borderTypeStr);
PRINT_PARAM(size);
PRINT_PARAM(borderValue);
cv::Mat dst;
ASSERT_NO_THROW(
cv::gpu::GpuMat gpuRes;
cv::gpu::remap(cv::gpu::GpuMat(src), gpuRes, cv::gpu::GpuMat(xmap), cv::gpu::GpuMat(ymap));
cv::gpu::remap(cv::gpu::GpuMat(src), gpuRes, cv::gpu::GpuMat(xmap), cv::gpu::GpuMat(ymap), interpolation, borderType, borderValue);
gpuRes.download(dst);
);
EXPECT_MAT_SIMILAR(dst_gold, dst, 0.5);
EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
}
INSTANTIATE_TEST_CASE_P(ImgProc, Remap, testing::Combine(
testing::ValuesIn(devices()),
testing::Values(CV_8UC1, CV_8UC3)));
INSTANTIATE_TEST_CASE_P
(
ImgProc, Remap, testing::Combine
(
testing::ValuesIn(devices()),
testing::Values
(
CV_8UC1, CV_8UC3, CV_8UC4,
CV_32FC1, CV_32FC3, CV_32FC4
),
testing::Values(cv::INTER_NEAREST, cv::INTER_LINEAR),
testing::Values(cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONSTANT)
)
);
///////////////////////////////////////////////////////////////////////////////////////////////////////
// copyMakeBorder

@ -79,9 +79,9 @@ TEST(remap)
Mat src, dst, xmap, ymap;
gpu::GpuMat d_src, d_dst, d_xmap, d_ymap;
for (int size = 1000; size <= 8000; size *= 2)
for (int size = 1000; size <= 4000; size *= 2)
{
SUBTEST << "src " << size << " and 8U, 32F maps";
SUBTEST << "src " << size << ", 8UC1";
gen(src, size, size, CV_8UC1, 0, 256);
@ -101,7 +101,112 @@ TEST(remap)
dst.create(xmap.size(), src.type());
CPU_ON;
remap(src, dst, xmap, ymap, INTER_LINEAR);
remap(src, dst, xmap, ymap, INTER_LINEAR, BORDER_REPLICATE);
CPU_OFF;
d_src = src;
d_xmap = xmap;
d_ymap = ymap;
d_dst.create(d_xmap.size(), d_src.type());
GPU_ON;
gpu::remap(d_src, d_dst, d_xmap, d_ymap, INTER_LINEAR, BORDER_REPLICATE);
GPU_OFF;
}
for (int size = 1000; size <= 4000; size *= 2)
{
SUBTEST << "src " << size << ", 8UC3";
gen(src, size, size, CV_8UC3, 0, 256);
xmap.create(size, size, CV_32F);
ymap.create(size, size, CV_32F);
for (int i = 0; i < size; ++i)
{
float* xmap_row = xmap.ptr<float>(i);
float* ymap_row = ymap.ptr<float>(i);
for (int j = 0; j < size; ++j)
{
xmap_row[j] = (j - size * 0.5f) * 0.75f + size * 0.5f;
ymap_row[j] = (i - size * 0.5f) * 0.75f + size * 0.5f;
}
}
dst.create(xmap.size(), src.type());
CPU_ON;
remap(src, dst, xmap, ymap, INTER_LINEAR, BORDER_REPLICATE);
CPU_OFF;
d_src = src;
d_xmap = xmap;
d_ymap = ymap;
d_dst.create(d_xmap.size(), d_src.type());
GPU_ON;
gpu::remap(d_src, d_dst, d_xmap, d_ymap, INTER_LINEAR, BORDER_REPLICATE);
GPU_OFF;
}
for (int size = 1000; size <= 4000; size *= 2)
{
SUBTEST << "src " << size << ", 8UC4";
gen(src, size, size, CV_8UC4, 0, 256);
xmap.create(size, size, CV_32F);
ymap.create(size, size, CV_32F);
for (int i = 0; i < size; ++i)
{
float* xmap_row = xmap.ptr<float>(i);
float* ymap_row = ymap.ptr<float>(i);
for (int j = 0; j < size; ++j)
{
xmap_row[j] = (j - size * 0.5f) * 0.75f + size * 0.5f;
ymap_row[j] = (i - size * 0.5f) * 0.75f + size * 0.5f;
}
}
dst.create(xmap.size(), src.type());
CPU_ON;
remap(src, dst, xmap, ymap, INTER_LINEAR, BORDER_REPLICATE);
CPU_OFF;
d_src = src;
d_xmap = xmap;
d_ymap = ymap;
d_dst.create(d_xmap.size(), d_src.type());
GPU_ON;
gpu::remap(d_src, d_dst, d_xmap, d_ymap, INTER_LINEAR, BORDER_REPLICATE);
GPU_OFF;
}
for (int size = 1000; size <= 4000; size *= 2)
{
SUBTEST << "src " << size << ", 16SC3";
gen(src, size, size, CV_16SC3, 0, 256);
xmap.create(size, size, CV_32F);
ymap.create(size, size, CV_32F);
for (int i = 0; i < size; ++i)
{
float* xmap_row = xmap.ptr<float>(i);
float* ymap_row = ymap.ptr<float>(i);
for (int j = 0; j < size; ++j)
{
xmap_row[j] = (j - size * 0.5f) * 0.75f + size * 0.5f;
ymap_row[j] = (i - size * 0.5f) * 0.75f + size * 0.5f;
}
}
dst.create(xmap.size(), src.type());
CPU_ON;
remap(src, dst, xmap, ymap, INTER_LINEAR, BORDER_REPLICATE);
CPU_OFF;
d_src = src;
@ -110,7 +215,7 @@ TEST(remap)
d_dst.create(d_xmap.size(), d_src.type());
GPU_ON;
gpu::remap(d_src, d_dst, d_xmap, d_ymap);
gpu::remap(d_src, d_dst, d_xmap, d_ymap, INTER_LINEAR, BORDER_REPLICATE);
GPU_OFF;
}
}

Loading…
Cancel
Save