diff --git a/modules/gpu/src/cuda/resize.cu b/modules/gpu/src/cuda/resize.cu
index c244ca7d6e..ce4d96c248 100644
--- a/modules/gpu/src/cuda/resize.cu
+++ b/modules/gpu/src/cuda/resize.cu
@@ -54,17 +54,68 @@ namespace cv { namespace gpu { namespace device
 {
     namespace imgproc
     {
+        template <typename T> __global__ void resize_nearest(const PtrStep<T> src, const float fx, const float fy, PtrStepSz<T> dst)
+        {
+            const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
+            const int dst_y = blockDim.y * blockIdx.y + threadIdx.y;
+
+            if (dst_x < dst.cols && dst_y < dst.rows)
+            {
+                const float src_x = dst_x * fx;
+                const float src_y = dst_y * fy;
+
+                dst(dst_y, dst_x) = src(__float2int_rz(src_y), __float2int_rz(src_x));
+            }
+        }
+
+        template <typename T> __global__ void resize_linear(const PtrStepSz<T> src, const float fx, const float fy, PtrStepSz<T> dst)
+        {
+            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
+
+            const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
+            const int dst_y = blockDim.y * blockIdx.y + threadIdx.y;
+
+            if (dst_x < dst.cols && dst_y < dst.rows)
+            {
+                const float src_x = dst_x * fx;
+                const float src_y = dst_y * fy;
+
+                work_type out = VecTraits<work_type>::all(0);
+
+                const int x1 = __float2int_rd(src_x);
+                const int y1 = __float2int_rd(src_y);
+                const int x2 = x1 + 1;
+                const int y2 = y1 + 1;
+                const int x2_read = ::min(x2, src.cols - 1);
+                const int y2_read = ::min(y2, src.rows - 1);
+
+                T src_reg = src(y1, x1);
+                out = out + src_reg * ((x2 - src_x) * (y2 - src_y));
+
+                src_reg = src(y1, x2_read);
+                out = out + src_reg * ((src_x - x1) * (y2 - src_y));
+
+                src_reg = src(y2_read, x1);
+                out = out + src_reg * ((x2 - src_x) * (src_y - y1));
+
+                src_reg = src(y2_read, x2_read);
+                out = out + src_reg * ((src_x - x1) * (src_y - y1));
+
+                dst(dst_y, dst_x) = saturate_cast<T>(out);
+            }
+        }
+
         template <class Ptr2D, typename T> __global__ void resize(const Ptr2D src, const float fx, const float fy, PtrStepSz<T> dst)
         {
-            const int x = blockDim.x * blockIdx.x + threadIdx.x;
-            const int y = blockDim.y * blockIdx.y + threadIdx.y;
+            const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
+            const int dst_y = blockDim.y * blockIdx.y + threadIdx.y;
 
-            if (x < dst.cols && y < dst.rows)
+            if (dst_x < dst.cols && dst_y < dst.rows)
             {
-                const float xcoo = x * fx;
-                const float ycoo = y * fy;
+                const float src_x = dst_x * fx;
+                const float src_y = dst_y * fy;
 
-                dst(y, x) = saturate_cast<T>(src(ycoo, xcoo));
+                dst(dst_y, dst_x) = src(src_y, src_x);
             }
         }
 
@@ -77,12 +128,34 @@ namespace cv { namespace gpu { namespace device
 
                 BrdReplicate<T> brd(src.rows, src.cols);
                 BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
-                Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc, fx, fy);
+                Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
 
                 resize<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
                 cudaSafeCall( cudaGetLastError() );
             }
         };
+        template <typename T> struct ResizeDispatcherStream<PointFilter, T>
+        {
+            static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
+            {
+                const dim3 block(32, 8);
+                const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+                resize_nearest<<<grid, block, 0, stream>>>(src, fx, fy, dst);
+                cudaSafeCall( cudaGetLastError() );
+            }
+        };
+        template <typename T> struct ResizeDispatcherStream<LinearFilter, T>
+        {
+            static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
+            {
+                const dim3 block(32, 8);
+                const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+                resize_linear<<<grid, block, 0, stream>>>(src, fx, fy, dst);
+                cudaSafeCall( cudaGetLastError() );
+            }
+        };
         template