optimized gpu pyrDown and sepFilter2D

pull/31/head^2
Vladislav Vinogradov 13 years ago
parent 7591ee1688
commit fd77ef7ece
  1. 2
      modules/gpu/perf/perf_imgproc.cpp
  2. 49
      modules/gpu/src/cuda/column_filter.cu
  3. 117
      modules/gpu/src/cuda/pyr_down.cu
  4. 49
      modules/gpu/src/cuda/row_filter.cu

@ -1720,7 +1720,7 @@ CV_FLAGS(GHMethod, cv::GHT_POSITION, cv::GHT_SCALE, cv::GHT_ROTATION);
DEF_PARAM_TEST(Method_Sz, GHMethod, cv::Size); DEF_PARAM_TEST(Method_Sz, GHMethod, cv::Size);
PERF_TEST_P(Method_Sz, GeneralizedHough, Combine( PERF_TEST_P(Method_Sz, ImgProc_GeneralizedHough, Combine(
Values(GHMethod(cv::GHT_POSITION), GHMethod(cv::GHT_POSITION | cv::GHT_SCALE), GHMethod(cv::GHT_POSITION | cv::GHT_ROTATION), GHMethod(cv::GHT_POSITION | cv::GHT_SCALE | cv::GHT_ROTATION)), Values(GHMethod(cv::GHT_POSITION), GHMethod(cv::GHT_POSITION | cv::GHT_SCALE), GHMethod(cv::GHT_POSITION | cv::GHT_ROTATION), GHMethod(cv::GHT_POSITION | cv::GHT_SCALE | cv::GHT_ROTATION)),
GPU_TYPICAL_MAT_SIZES)) GPU_TYPICAL_MAT_SIZES))
{ {

@ -89,20 +89,45 @@ namespace cv { namespace gpu { namespace device
const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y; const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y;
//Upper halo if (blockIdx.y > 0)
#pragma unroll {
for (int j = 0; j < HALO_SIZE; ++j) //Upper halo
smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step)); #pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x));
}
else
{
//Upper halo
#pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step));
}
//Main data if (blockIdx.y + 2 < gridDim.y)
#pragma unroll {
for (int j = 0; j < PATCH_PER_BLOCK; ++j) //Main data
smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step)); #pragma unroll
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + j * BLOCK_DIM_Y, x));
//Lower halo //Lower halo
#pragma unroll #pragma unroll
for (int j = 0; j < HALO_SIZE; ++j) for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step)); smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x));
}
else
{
//Main data
#pragma unroll
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step));
//Lower halo
#pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step));
}
__syncthreads(); __syncthreads();

@ -40,7 +40,7 @@
// //
//M*/ //M*/
#include "internal_shared.hpp" #include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp" #include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp" #include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp" #include "opencv2/gpu/device/vec_math.hpp"
@ -50,57 +50,104 @@ namespace cv { namespace gpu { namespace device
{ {
namespace imgproc namespace imgproc
{ {
template <typename T, typename B> __global__ void pyrDown(const PtrStep<T> src, PtrStep<T> dst, const B b, int dst_cols) template <typename T, typename B> __global__ void pyrDown(const PtrStepSz<T> src, PtrStep<T> dst, const B b, int dst_cols)
{ {
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type; typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_t;
__shared__ work_t smem[256 + 4];
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y; const int y = blockIdx.y;
__shared__ value_type smem[256 + 4]; const int src_y = 2 * y;
value_type sum; if (src_y >= 2 && src_y < src.rows - 2 && x >= 2 && x < src.cols - 2)
{
{
work_t sum;
const int src_y = 2*y; sum = 0.0625f * src(src_y - 2, x);
sum = sum + 0.25f * src(src_y - 1, x);
sum = sum + 0.375f * src(src_y , x);
sum = sum + 0.25f * src(src_y + 1, x);
sum = sum + 0.0625f * src(src_y + 2, x);
sum = VecTraits<value_type>::all(0); smem[2 + threadIdx.x] = sum;
}
sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step); if (threadIdx.x < 2)
sum = sum + 0.25f * b.at(src_y - 1, x, src.data, src.step); {
sum = sum + 0.375f * b.at(src_y , x, src.data, src.step); const int left_x = x - 2;
sum = sum + 0.25f * b.at(src_y + 1, x, src.data, src.step);
sum = sum + 0.0625f * b.at(src_y + 2, x, src.data, src.step);
smem[2 + threadIdx.x] = sum; work_t sum;
if (threadIdx.x < 2) sum = 0.0625f * src(src_y - 2, left_x);
{ sum = sum + 0.25f * src(src_y - 1, left_x);
const int left_x = x - 2; sum = sum + 0.375f * src(src_y , left_x);
sum = sum + 0.25f * src(src_y + 1, left_x);
sum = sum + 0.0625f * src(src_y + 2, left_x);
sum = VecTraits<value_type>::all(0); smem[threadIdx.x] = sum;
}
sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step); if (threadIdx.x > 253)
sum = sum + 0.25f * b.at(src_y - 1, left_x, src.data, src.step); {
sum = sum + 0.375f * b.at(src_y , left_x, src.data, src.step); const int right_x = x + 2;
sum = sum + 0.25f * b.at(src_y + 1, left_x, src.data, src.step);
sum = sum + 0.0625f * b.at(src_y + 2, left_x, src.data, src.step);
smem[threadIdx.x] = sum; work_t sum;
}
if (threadIdx.x > 253) sum = 0.0625f * src(src_y - 2, right_x);
sum = sum + 0.25f * src(src_y - 1, right_x);
sum = sum + 0.375f * src(src_y , right_x);
sum = sum + 0.25f * src(src_y + 1, right_x);
sum = sum + 0.0625f * src(src_y + 2, right_x);
smem[4 + threadIdx.x] = sum;
}
}
else
{ {
const int right_x = x + 2; {
work_t sum;
sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(x));
sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col_high(x));
sum = sum + 0.375f * src(src_y , b.idx_col_high(x));
sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col_high(x));
sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(x));
smem[2 + threadIdx.x] = sum;
}
if (threadIdx.x < 2)
{
const int left_x = x - 2;
work_t sum;
sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col(left_x));
sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col(left_x));
sum = sum + 0.375f * src(src_y , b.idx_col(left_x));
sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col(left_x));
sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col(left_x));
smem[threadIdx.x] = sum;
}
if (threadIdx.x > 253)
{
const int right_x = x + 2;
sum = VecTraits<value_type>::all(0); work_t sum;
sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step); sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(right_x));
sum = sum + 0.25f * b.at(src_y - 1, right_x, src.data, src.step); sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col_high(right_x));
sum = sum + 0.375f * b.at(src_y , right_x, src.data, src.step); sum = sum + 0.375f * src(src_y , b.idx_col_high(right_x));
sum = sum + 0.25f * b.at(src_y + 1, right_x, src.data, src.step); sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col_high(right_x));
sum = sum + 0.0625f * b.at(src_y + 2, right_x, src.data, src.step); sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(right_x));
smem[4 + threadIdx.x] = sum; smem[4 + threadIdx.x] = sum;
}
} }
__syncthreads(); __syncthreads();
@ -109,9 +156,9 @@ namespace cv { namespace gpu { namespace device
{ {
const int tid2 = threadIdx.x * 2; const int tid2 = threadIdx.x * 2;
sum = VecTraits<value_type>::all(0); work_t sum;
sum = sum + 0.0625f * smem[2 + tid2 - 2]; sum = 0.0625f * smem[2 + tid2 - 2];
sum = sum + 0.25f * smem[2 + tid2 - 1]; sum = sum + 0.25f * smem[2 + tid2 - 1];
sum = sum + 0.375f * smem[2 + tid2 ]; sum = sum + 0.375f * smem[2 + tid2 ];
sum = sum + 0.25f * smem[2 + tid2 + 1]; sum = sum + 0.25f * smem[2 + tid2 + 1];

@ -89,20 +89,45 @@ namespace cv { namespace gpu { namespace device
const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x; const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x;
//Load left halo if (blockIdx.x > 0)
#pragma unroll {
for (int j = 0; j < HALO_SIZE; ++j) //Load left halo
smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row)); #pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X]);
}
else
{
//Load left halo
#pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row));
}
//Load main data if (blockIdx.x + 2 < gridDim.x)
#pragma unroll {
for (int j = 0; j < PATCH_PER_BLOCK; ++j) //Load main data
smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + j * BLOCK_DIM_X, src_row)); #pragma unroll
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + j * BLOCK_DIM_X]);
//Load right halo //Load right halo
#pragma unroll #pragma unroll
for (int j = 0; j < HALO_SIZE; ++j) for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row)); smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X]);
}
else
{
//Load main data
#pragma unroll
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + j * BLOCK_DIM_X, src_row));
//Load right halo
#pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row));
}
__syncthreads(); __syncthreads();

Loading…
Cancel
Save