diff --git a/modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu b/modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu new file mode 100644 index 0000000000..916734b4f8 --- /dev/null +++ b/modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu @@ -0,0 +1,1136 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +//////////////////////////////////////////////////////////////////////////////// +// +// NVIDIA CUDA implementation of Brox et al Optical Flow algorithm +// +// Algorithm is explained in the original paper: +// T. Brox, A. Bruhn, N. Papenberg, J. Weickert: +// High accuracy optical flow estimation based on a theory for warping. +// ECCV 2004. +// +// Implementation by Mikhail Smirnov +// email: msmirnov@nvidia.com, devsupport@nvidia.com +// +// Credits for help with the code to: +// Alexey Mendelenko, Anton Obukhov, and Alexander Kharlamov. 
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include <vector>
+#include <memory>
+#include <algorithm>
+
+#include "NPP_staging/NPP_staging.hpp"
+#include "NCVBroxOpticalFlow.hpp"
+
+using std::tr1::shared_ptr;
+
+typedef NCVVectorAlloc<Ncv32f> FloatVector;
+
+/////////////////////////////////////////////////////////////////////////////////////////
+// Implementation specific constants
+/////////////////////////////////////////////////////////////////////////////////////////
+__device__ const float eps2 = 1e-6f;
+
+/////////////////////////////////////////////////////////////////////////////////////////
+// Additional defines
+/////////////////////////////////////////////////////////////////////////////////////////
+
+// rounded up division
+inline int iDivUp(int a, int b)
+{
+    return (a + b - 1)/b;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////
+// Texture references
+/////////////////////////////////////////////////////////////////////////////////////////
+
+texture<float, 2, cudaReadModeElementType> tex_coarse;
+texture<float, 2, cudaReadModeElementType> tex_fine;
+
+texture<float, 2, cudaReadModeElementType> tex_I1;
+texture<float, 2, cudaReadModeElementType> tex_I0;
+
+texture<float, 2, cudaReadModeElementType> tex_Ix;
+texture<float, 2, cudaReadModeElementType> tex_Ixx;
+texture<float, 2, cudaReadModeElementType> tex_Ix0;
+
+texture<float, 2, cudaReadModeElementType> tex_Iy;
+texture<float, 2, cudaReadModeElementType> tex_Iyy;
+texture<float, 2, cudaReadModeElementType> tex_Iy0;
+
+texture<float, 2, cudaReadModeElementType> tex_Ixy;
+
+texture<float, 1, cudaReadModeElementType> tex_u;
+texture<float, 1, cudaReadModeElementType> tex_v;
+texture<float, 1, cudaReadModeElementType> tex_du;
+texture<float, 1, cudaReadModeElementType> tex_dv;
+texture<float, 1, cudaReadModeElementType> tex_numerator_dudv;
+texture<float, 1, cudaReadModeElementType> tex_numerator_u;
+texture<float, 1, cudaReadModeElementType> tex_numerator_v;
+texture<float, 1, cudaReadModeElementType> tex_inv_denominator_u;
+texture<float, 1, cudaReadModeElementType> tex_inv_denominator_v;
+texture<float, 1, cudaReadModeElementType> tex_diffusivity_x;
+texture<float, 1, cudaReadModeElementType> tex_diffusivity_y;
+
+
+/////////////////////////////////////////////////////////////////////////////////////////
+// SUPPLEMENTARY FUNCTIONS
+/////////////////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////
+/// \brief performs pointwise summation of two vectors stored in device memory
+/// \param d_res - pointer to resulting vector (device memory)
+/// \param d_op1 - term #1 (device memory)
+/// \param d_op2 - term #2 (device memory)
+/// \param len - vector size
+///////////////////////////////////////////////////////////////////////////////
+__global__ void pointwise_add(float *d_res, const float *d_op1, const float *d_op2, const int len)
+{
+    const int pos = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if(pos >= len) return;
+
+    d_res[pos] = d_op1[pos] + d_op2[pos];
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// \brief wrapper for summation kernel.
+/// Computes \b op1 + \b op2 and stores result to \b res
+/// \param res array, containing op1 + op2 (device memory)
+/// \param op1 term #1 (device memory)
+/// \param op2 term #2 (device memory)
+/// \param count vector size
+///////////////////////////////////////////////////////////////////////////////
+static void add(float *res, const float *op1, const float *op2, const int count, cudaStream_t stream)
+{
+    dim3 threads(256);
+    dim3 blocks(iDivUp(count, threads.x));
+
+    pointwise_add<<<blocks, threads, 0, stream>>>(res, op1, op2, count);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// \brief wrapper for summation kernel.
+/// Increments \b res by \b rhs +/// \param res initial vector, will be replaced with result (device memory) +/// \param rhs increment (device memory) +/// \param count vector size +/////////////////////////////////////////////////////////////////////////////// +static void add(float *res, const float *rhs, const int count, cudaStream_t stream) +{ + add(res, res, rhs, count, stream); +} + +/////////////////////////////////////////////////////////////////////////////// +/// \brief kernel for scaling vector by scalar +/// \param d_res scaled vector (device memory) +/// \param d_src source vector (device memory) +/// \param scale scalar to scale by +/// \param len vector size (number of elements) +/////////////////////////////////////////////////////////////////////////////// +__global__ void scaleVector(float *d_res, const float *d_src, float scale, const int len) +{ + const int pos = blockIdx.x * blockDim.x + threadIdx.x; + + if (pos >= len) return; + + d_res[pos] = d_src[pos] * scale; +} + +/////////////////////////////////////////////////////////////////////////////// +/// \brief scale vector by scalar +/// +/// kernel wrapper +/// \param d_res scaled vector (device memory) +/// \param d_src source vector (device memory) +/// \param scale scalar to scale by +/// \param len vector size (number of elements) +/// \param stream CUDA stream +/////////////////////////////////////////////////////////////////////////////// +static void ScaleVector(float *d_res, const float *d_src, float scale, const int len, cudaStream_t stream) +{ + dim3 threads(256); + dim3 blocks(iDivUp(len, threads.x)); + + scaleVector<<>>(d_res, d_src, scale, len); +} + +const int SOR_TILE_WIDTH = 32; +const int SOR_TILE_HEIGHT = 6; +const int PSOR_TILE_WIDTH = 32; +const int PSOR_TILE_HEIGHT = 6; +const int PSOR_PITCH = PSOR_TILE_WIDTH + 4; +const int PSOR_HEIGHT = PSOR_TILE_HEIGHT + 4; + +/////////////////////////////////////////////////////////////////////////////// +///\brief Utility function. Compute smooth term diffusivity along x axis +///\param s (out) pointer to memory location for result (diffusivity) +///\param pos (in) position within shared memory array containing \b u +///\param u (in) shared memory array containing \b u +///\param v (in) shared memory array containing \b v +///\param du (in) shared memory array containing \b du +///\param dv (in) shared memory array containing \b dv +/////////////////////////////////////////////////////////////////////////////// +__forceinline__ __device__ void diffusivity_along_x(float *s, int pos, const float *u, const float *v, const float *du, const float *dv) +{ + //x derivative between pixels (i,j) and (i-1,j) + const int left = pos-1; + float u_x = u[pos] + du[pos] - u[left] - du[left]; + float v_x = v[pos] + dv[pos] - v[left] - dv[left]; + const int up = pos + PSOR_PITCH; + const int down = pos - PSOR_PITCH; + const int up_left = up - 1; + const int down_left = down-1; + //y derivative between pixels (i,j) and (i-1,j) + float u_y = 0.25f*(u[up] + du[up] + u[up_left] + du[up_left] - u[down] - du[down] - u[down_left] - du[down_left]); + float v_y = 0.25f*(v[up] + dv[up] + v[up_left] + dv[up_left] - v[down] - dv[down] - v[down_left] - dv[down_left]); + *s = 0.5f / sqrtf(u_x*u_x + v_x*v_x + u_y*u_y + v_y*v_y + eps2); +} + +/////////////////////////////////////////////////////////////////////////////// +///\brief Utility function. 
Compute smooth term diffusivity along y axis +///\param s (out) pointer to memory location for result (diffusivity) +///\param pos (in) position within shared memory array containing \b u +///\param u (in) shared memory array containing \b u +///\param v (in) shared memory array containing \b v +///\param du (in) shared memory array containing \b du +///\param dv (in) shared memory array containing \b dv +/////////////////////////////////////////////////////////////////////////////// +__forceinline__ __device__ void diffusivity_along_y(float *s, int pos, const float *u, const float *v, const float *du, const float *dv) +{ + //y derivative between pixels (i,j) and (i,j-1) + const int down = pos-PSOR_PITCH; + float u_y = u[pos] + du[pos] - u[down] - du[down]; + float v_y = v[pos] + dv[pos] - v[down] - dv[down]; + const int right = pos + 1; + const int left = pos - 1; + const int down_right = down + 1; + const int down_left = down - 1; + //x derivative between pixels (i,j) and (i,j-1); + float u_x = 0.25f*(u[right] + u[down_right] + du[right] + du[down_right] - u[left] - u[down_left] - du[left] - du[down_left]); + float v_x = 0.25f*(v[right] + v[down_right] + dv[right] + dv[down_right] - v[left] - v[down_left] - dv[left] - dv[down_left]); + *s = 0.5f/sqrtf(u_x*u_x + v_x*v_x + u_y*u_y + v_y*v_y + eps2); +} + +/////////////////////////////////////////////////////////////////////////////// +///\brief Utility function. Load element of 2D global memory to shared memory +///\param smem pointer to shared memory array +///\param is shared memory array column +///\param js shared memory array row +///\param w number of columns in global memory array +///\param h number of rows in global memory array +///\param p global memory array pitch in floats +/////////////////////////////////////////////////////////////////////////////// +template +__forceinline__ __device__ void load_array_element(float *smem, int is, int js, int i, int j, int w, int h, int p) +{ + //position within shared memory array + const int ijs = js * PSOR_PITCH + is; + //mirror reflection across borders + i = max(i, -i-1); + i = min(i, w-i+w-1); + j = max(j, -j-1); + j = min(j, h-j+h-1); + const int pos = j * p + i; + switch(tex_id){ + case 0: + smem[ijs] = tex1Dfetch(tex_u, pos); + break; + case 1: + smem[ijs] = tex1Dfetch(tex_v, pos); + break; + case 2: + smem[ijs] = tex1Dfetch(tex_du, pos); + break; + case 3: + smem[ijs] = tex1Dfetch(tex_dv, pos); + break; + } +} + +/////////////////////////////////////////////////////////////////////////////// +///\brief Utility function. 
Load part (tile) of 2D global memory to shared memory +///\param smem pointer to target shared memory array +///\param ig column number within source +///\param jg row number within source +///\param w number of columns in global memory array +///\param h number of rows in global memory array +///\param p global memory array pitch in floats +/////////////////////////////////////////////////////////////////////////////// +template +__forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, int h, int p) +{ + const int i = threadIdx.x + 2; + const int j = threadIdx.y + 2; + load_array_element(smem, i, j, ig, jg, w, h, p);//load current pixel + __syncthreads(); + if(threadIdx.y < 2) + { + //load bottom shadow elements + load_array_element(smem, i, j-2, ig, jg-2, w, h, p); + if(threadIdx.x < 2) + { + //load bottom right shadow elements + load_array_element(smem, i+PSOR_TILE_WIDTH, j-2, ig+PSOR_TILE_WIDTH, jg-2, w, h, p); + //load middle right shadow elements + load_array_element(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p); + } + else if(threadIdx.x >= PSOR_TILE_WIDTH-2) + { + //load bottom left shadow elements + load_array_element(smem, i-PSOR_TILE_WIDTH, j-2, ig-PSOR_TILE_WIDTH, jg-2, w, h, p); + //load middle left shadow elements + load_array_element(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p); + } + } + else if(threadIdx.y >= PSOR_TILE_HEIGHT-2) + { + //load upper shadow elements + load_array_element(smem, i, j+2, ig, jg+2, w, h, p); + if(threadIdx.x < 2) + { + //load upper right shadow elements + load_array_element(smem, i+PSOR_TILE_WIDTH, j+2, ig+PSOR_TILE_WIDTH, jg+2, w, h, p); + //load middle right shadow elements + load_array_element(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p); + } + else if(threadIdx.x >= PSOR_TILE_WIDTH-2) + { + //load upper left shadow elements + load_array_element(smem, i-PSOR_TILE_WIDTH, j+2, ig-PSOR_TILE_WIDTH, jg+2, w, h, p); + //load middle left shadow elements + load_array_element(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p); + } + } + else + { + //load middle shadow elements + if(threadIdx.x < 2) + { + //load middle right shadow elements + load_array_element(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p); + } + else if(threadIdx.x >= PSOR_TILE_WIDTH-2) + { + //load middle left shadow elements + load_array_element(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p); + } + } + __syncthreads(); +} + +/////////////////////////////////////////////////////////////////////////////// +/// \brief computes matrix of linearised system for \c du, \c dv +/// Computed values reside in GPU memory. \n +/// Matrix computation is divided into two steps. 
This kernel performs first step\n +/// - compute smoothness term diffusivity between pixels - psi dash smooth +/// - compute robustness factor in the data term - psi dash data +/// \param diffusivity_x (in/out) diffusivity between pixels along x axis in smoothness term +/// \param diffusivity_y (in/out) diffusivity between pixels along y axis in smoothness term +/// \param denominator_u (in/out) precomputed part of expression for new du value in SOR iteration +/// \param denominator_v (in/out) precomputed part of expression for new dv value in SOR iteration +/// \param numerator_dudv (in/out) precomputed part of expression for new du and dv value in SOR iteration +/// \param numerator_u (in/out) precomputed part of expression for new du value in SOR iteration +/// \param numerator_v (in/out) precomputed part of expression for new dv value in SOR iteration +/// \param w (in) frame width +/// \param h (in) frame height +/// \param pitch (in) pitch in floats +/// \param alpha (in) alpha in Brox model (flow smoothness) +/// \param gamma (in) gamma in Brox model (edge importance) +/////////////////////////////////////////////////////////////////////////////// + +__global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity_y, + float *denominator_u, float *denominator_v, + float *numerator_dudv, + float *numerator_u, float *numerator_v, + int w, int h, int s, + float alpha, float gamma) +{ + __shared__ float u[PSOR_PITCH * PSOR_HEIGHT]; + __shared__ float v[PSOR_PITCH * PSOR_HEIGHT]; + __shared__ float du[PSOR_PITCH * PSOR_HEIGHT]; + __shared__ float dv[PSOR_PITCH * PSOR_HEIGHT]; + + //position within tile + const int i = threadIdx.x; + const int j = threadIdx.y; + //position within smem arrays + const int ijs = (j+2) * PSOR_PITCH + i + 2; + //position within global memory + const int ig = blockIdx.x * blockDim.x + threadIdx.x; + const int jg = blockIdx.y * blockDim.y + threadIdx.y; + const int ijg = jg * s + ig; + //position within texture + float x = (float)ig + 0.5f; + float y = (float)jg + 0.5f; + //load u and v to smem + load_array<0>(u, ig, jg, w, h, s); + load_array<1>(v, ig, jg, w, h, s); + load_array<2>(du, ig, jg, w, h, s); + load_array<3>(dv, ig, jg, w, h, s); + //warped position + float wx = (x + u[ijs])/(float)w; + float wy = (y + v[ijs])/(float)h; + x /= (float)w; + y /= (float)h; + //compute image derivatives + const float Iz = tex2D(tex_I1, wx, wy) - tex2D(tex_I0, x, y); + const float Ix = tex2D(tex_Ix, wx, wy); + const float Ixz = Ix - tex2D(tex_Ix0, x, y); + const float Ixy = tex2D(tex_Ixy, wx, wy); + const float Ixx = tex2D(tex_Ixx, wx, wy); + const float Iy = tex2D(tex_Iy, wx, wy); + const float Iyz = Iy - tex2D(tex_Iy0, x, y); + const float Iyy = tex2D(tex_Iyy, wx, wy); + //compute data term + float q0, q1, q2; + q0 = Iz + Ix * du[ijs] + Iy * dv[ijs]; + q1 = Ixz + Ixx * du[ijs] + Ixy * dv[ijs]; + q2 = Iyz + Ixy * du[ijs] + Iyy * dv[ijs]; + float data_term = 0.5f * rsqrtf(q0*q0 + gamma*(q1*q1 + q2*q2) + eps2); + //scale data term by 1/alpha + data_term /= alpha; + //compute smoothness term (diffusivity) + float sx, sy; + + if(ig >= w || jg >= h) return; + + diffusivity_along_x(&sx, ijs, u, v, du, dv); + diffusivity_along_y(&sy, ijs, u, v, du, dv); + + if(ig == 0) sx = 0.0f; + if(jg == 0) sy = 0.0f; + + numerator_dudv[ijg] = data_term * (Ix*Iy + gamma * Ixy*(Ixx + Iyy)); + numerator_u[ijg] = data_term * (Ix*Iz + gamma * (Ixx*Ixz + Ixy*Iyz)); + numerator_v[ijg] = data_term * (Iy*Iz + gamma * (Iyy*Iyz + Ixy*Ixz)); + denominator_u[ijg] = data_term * (Ix*Ix + 
gamma * (Ixy*Ixy + Ixx*Ixx)); + denominator_v[ijg] = data_term * (Iy*Iy + gamma * (Ixy*Ixy + Iyy*Iyy)); + diffusivity_x[ijg] = sx; + diffusivity_y[ijg] = sy; +} + +/////////////////////////////////////////////////////////////////////////////// +///\brief computes matrix of linearised system for \c du, \c dv +///\param inv_denominator_u +///\param inv_denominator_v +///\param w +///\param h +///\param s +/////////////////////////////////////////////////////////////////////////////// +__global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denominator_v, + int w, int h, int s) +{ + __shared__ float sx[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)]; + __shared__ float sy[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)]; + //position within tile + const int i = threadIdx.x; + const int j = threadIdx.y; + //position within smem arrays + const int ijs = j*(PSOR_TILE_WIDTH+1) + i; + //position within global memory + const int ig = blockIdx.x * blockDim.x + threadIdx.x; + const int jg = blockIdx.y * blockDim.y + threadIdx.y; + const int ijg = jg*s + ig; + int inside = ig < w && jg < h; + float denom_u; + float denom_v; + if(inside) + { + denom_u = inv_denominator_u[ijg]; + denom_v = inv_denominator_v[ijg]; + } + if(inside) + { + sx[ijs] = tex1Dfetch(tex_diffusivity_x, ijg); + sy[ijs] = tex1Dfetch(tex_diffusivity_y, ijg); + } + else + { + sx[ijs] = 0.0f; + sy[ijs] = 0.0f; + } + int up = ijs+PSOR_TILE_WIDTH+1; + if(j == PSOR_TILE_HEIGHT-1) + { + if(jg < h-1 && inside) + { + sy[up] = tex1Dfetch(tex_diffusivity_y, ijg + s); + } + else + { + sy[up] = 0.0f; + } + } + int right = ijs + 1; + if(threadIdx.x == PSOR_TILE_WIDTH-1) + { + if(ig < w-1 && inside) + { + sx[right] = tex1Dfetch(tex_diffusivity_x, ijg + 1); + } + else + { + sx[right] = 0.0f; + } + } + __syncthreads(); + float diffusivity_sum; + diffusivity_sum = sx[ijs] + sx[ijs+1] + sy[ijs] + sy[ijs+PSOR_TILE_WIDTH+1]; + if(inside) + { + denom_u += diffusivity_sum; + denom_v += diffusivity_sum; + inv_denominator_u[ijg] = 1.0f/denom_u; + inv_denominator_v[ijg] = 1.0f/denom_v; + } +} + +///////////////////////////////////////////////////////////////////////////////////////// +// Red-Black SOR +///////////////////////////////////////////////////////////////////////////////////////// + +template __global__ void sor_pass(float *new_du, + float *new_dv, + const float *g_inv_denominator_u, + const float *g_inv_denominator_v, + const float *g_numerator_u, + const float *g_numerator_v, + const float *g_numerator_dudv, + float omega, + int width, + int height, + int stride) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + + if(i >= width || j >= height) + return; + + const int pos = j * stride + i; + const int pos_r = pos + 1; + const int pos_u = pos + stride; + const int pos_d = pos - stride; + const int pos_l = pos - 1; + + //load smooth term + float s_up, s_left, s_right, s_down; + s_left = tex1Dfetch(tex_diffusivity_x, pos); + s_down = tex1Dfetch(tex_diffusivity_y, pos); + if(i < width-1) + s_right = tex1Dfetch(tex_diffusivity_x, pos_r); + else + s_right = 0.0f; //Neumann BC + if(j < height-1) + s_up = tex1Dfetch(tex_diffusivity_y, pos_u); + else + s_up = 0.0f; //Neumann BC + + //load u, v and du, dv + float u_up, u_left, u_right, u_down, u; + float v_up, v_left, v_right, v_down, v; + float du_up, du_left, du_right, du_down, du; + float dv_up, dv_left, dv_right, dv_down, dv; + + u_left = tex1Dfetch(tex_u, pos_l); + u_right = tex1Dfetch(tex_u, pos_r); + u_down = tex1Dfetch(tex_u, pos_d); + 
u_up = tex1Dfetch(tex_u, pos_u);
+    u = tex1Dfetch(tex_u, pos);
+
+    v_left = tex1Dfetch(tex_v, pos_l);
+    v_right = tex1Dfetch(tex_v, pos_r);
+    v_down = tex1Dfetch(tex_v, pos_d);
+    v = tex1Dfetch(tex_v, pos);
+    v_up = tex1Dfetch(tex_v, pos_u);
+
+    du = tex1Dfetch(tex_du, pos);
+    du_left = tex1Dfetch(tex_du, pos_l);
+    du_right = tex1Dfetch(tex_du, pos_r);
+    du_down = tex1Dfetch(tex_du, pos_d);
+    du_up = tex1Dfetch(tex_du, pos_u);
+
+    dv = tex1Dfetch(tex_dv, pos);
+    dv_left = tex1Dfetch(tex_dv, pos_l);
+    dv_right = tex1Dfetch(tex_dv, pos_r);
+    dv_down = tex1Dfetch(tex_dv, pos_d);
+    dv_up = tex1Dfetch(tex_dv, pos_u);
+
+    float numerator_dudv = g_numerator_dudv[pos];
+
+    if((i+j)%2 == isBlack)
+    {
+        // update du
+        float numerator_u = (s_left*(u_left + du_left) + s_up*(u_up + du_up) + s_right*(u_right + du_right) + s_down*(u_down + du_down) -
+                             u * (s_left + s_right + s_up + s_down) - g_numerator_u[pos] - numerator_dudv*dv);
+
+        du = (1.0f - omega) * du + omega * g_inv_denominator_u[pos] * numerator_u;
+
+        // update dv
+        float numerator_v = (s_left*(v_left + dv_left) + s_up*(v_up + dv_up) + s_right*(v_right + dv_right) + s_down*(v_down + dv_down) -
+                             v * (s_left + s_right + s_up + s_down) - g_numerator_v[pos] - numerator_dudv*du);
+
+        dv = (1.0f - omega) * dv + omega * g_inv_denominator_v[pos] * numerator_v;
+    }
+    new_du[pos] = du;
+    new_dv[pos] = dv;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// utility functions
+///////////////////////////////////////////////////////////////////////////////
+
+void initTexture1D(texture<float, 1, cudaReadModeElementType> &tex)
+{
+    tex.addressMode[0] = cudaAddressModeClamp;
+    tex.filterMode = cudaFilterModePoint;
+    tex.normalized = false;
+}
+
+void initTexture2D(texture<float, 2, cudaReadModeElementType> &tex)
+{
+    tex.addressMode[0] = cudaAddressModeMirror;
+    tex.addressMode[1] = cudaAddressModeMirror;
+    tex.filterMode = cudaFilterModeLinear;
+    tex.normalized = true;
+}
+
+void InitTextures()
+{
+    initTexture2D(tex_I0);
+    initTexture2D(tex_I1);
+    initTexture2D(tex_fine);   // for downsampling
+    initTexture2D(tex_coarse); // for prolongation
+
+    initTexture2D(tex_Ix);
+    initTexture2D(tex_Ixx);
+    initTexture2D(tex_Ix0);
+
+    initTexture2D(tex_Iy);
+    initTexture2D(tex_Iyy);
+    initTexture2D(tex_Iy0);
+
+    initTexture2D(tex_Ixy);
+
+    initTexture1D(tex_u);
+    initTexture1D(tex_v);
+    initTexture1D(tex_du);
+    initTexture1D(tex_dv);
+    initTexture1D(tex_diffusivity_x);
+    initTexture1D(tex_diffusivity_y);
+    initTexture1D(tex_inv_denominator_u);
+    initTexture1D(tex_inv_denominator_v);
+    initTexture1D(tex_numerator_dudv);
+    initTexture1D(tex_numerator_u);
+    initTexture1D(tex_numerator_v);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////
+// MAIN FUNCTION
+/////////////////////////////////////////////////////////////////////////////////////////
+NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
+                             INCVMemAllocator &gpu_mem_allocator,
+                             const NCVMatrix<Ncv32f> &frame0,
+                             const NCVMatrix<Ncv32f> &frame1,
+                             NCVMatrix<Ncv32f> &uOut,
+                             NCVMatrix<Ncv32f> &vOut,
+                             cudaStream_t stream)
+{
+    ncvAssertPrintReturn(desc.alpha > 0.0f , "Invalid alpha" , NCV_INCONSISTENT_INPUT);
+    ncvAssertPrintReturn(desc.gamma >= 0.0f , "Invalid gamma" , NCV_INCONSISTENT_INPUT);
+    ncvAssertPrintReturn(desc.number_of_inner_iterations > 0 , "Invalid number of inner iterations" , NCV_INCONSISTENT_INPUT);
+    ncvAssertPrintReturn(desc.number_of_outer_iterations > 0 , "Invalid number of outer iterations" , NCV_INCONSISTENT_INPUT);
+    ncvAssertPrintReturn(desc.number_of_solver_iterations > 0, "Invalid
number of solver iterations", NCV_INCONSISTENT_INPUT); + + const Ncv32u kSourceWidth = frame0.width(); + const Ncv32u kSourceHeight = frame0.height(); + + ncvAssertPrintReturn(frame1.width() == kSourceWidth && frame1.height() == kSourceHeight, "Frame dims do not match", NCV_INCONSISTENT_INPUT); + ncvAssertReturn(uOut.width() == kSourceWidth && vOut.width() == kSourceWidth && + uOut.height() == kSourceHeight && vOut.height() == kSourceHeight, NCV_INCONSISTENT_INPUT); + + ncvAssertReturn(gpu_mem_allocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED); + + bool kSkipProcessing = gpu_mem_allocator.isCounting(); + + cudaDeviceProp device_props; + int cuda_device; + + ncvAssertCUDAReturn(cudaGetDevice(&cuda_device), NCV_CUDA_ERROR); + + ncvAssertCUDAReturn(cudaGetDeviceProperties(&device_props, cuda_device), NCV_CUDA_ERROR); + + + Ncv32u alignmentValue = gpu_mem_allocator.alignment (); + + const Ncv32u kStrideAlignmentFloat = alignmentValue / sizeof(float); + const Ncv32u kSourcePitch = alignUp(kSourceWidth, kStrideAlignmentFloat) * sizeof(float); + + const Ncv32f scale_factor = desc.scale_factor; + const Ncv32f alpha = desc.alpha; + const Ncv32f gamma = desc.gamma; + + const Ncv32u kSizeInPixelsAligned = alignUp(kSourceWidth, kStrideAlignmentFloat)*kSourceHeight; + +#if defined SAFE_VECTOR_DECL +#undef SAFE_VECTOR_DECL +#endif +#define SAFE_VECTOR_DECL(name, allocator, size) \ + FloatVector name((allocator), (size)); \ + ncvAssertReturn(name##.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + + // matrix elements + SAFE_VECTOR_DECL(diffusivity_x, gpu_mem_allocator, kSizeInPixelsAligned); + SAFE_VECTOR_DECL(diffusivity_y, gpu_mem_allocator, kSizeInPixelsAligned); + SAFE_VECTOR_DECL(denom_u, gpu_mem_allocator, kSizeInPixelsAligned); + SAFE_VECTOR_DECL(denom_v, gpu_mem_allocator, kSizeInPixelsAligned); + SAFE_VECTOR_DECL(num_dudv, gpu_mem_allocator, kSizeInPixelsAligned); + SAFE_VECTOR_DECL(num_u, gpu_mem_allocator, kSizeInPixelsAligned); + SAFE_VECTOR_DECL(num_v, gpu_mem_allocator, kSizeInPixelsAligned); + + // flow components + SAFE_VECTOR_DECL(u, gpu_mem_allocator, kSizeInPixelsAligned); + SAFE_VECTOR_DECL(v, gpu_mem_allocator, kSizeInPixelsAligned); + + SAFE_VECTOR_DECL(u_new, gpu_mem_allocator, kSizeInPixelsAligned); + SAFE_VECTOR_DECL(v_new, gpu_mem_allocator, kSizeInPixelsAligned); + + // flow increments + SAFE_VECTOR_DECL(du, gpu_mem_allocator, kSizeInPixelsAligned); + SAFE_VECTOR_DECL(dv, gpu_mem_allocator, kSizeInPixelsAligned); + + SAFE_VECTOR_DECL(du_new, gpu_mem_allocator, kSizeInPixelsAligned); + SAFE_VECTOR_DECL(dv_new, gpu_mem_allocator, kSizeInPixelsAligned); + + // temporary storage + SAFE_VECTOR_DECL(device_buffer, gpu_mem_allocator, + alignUp(kSourceWidth, kStrideAlignmentFloat) + * alignUp(kSourceHeight, kStrideAlignmentFloat)); + + // image derivatives + SAFE_VECTOR_DECL(Ix, gpu_mem_allocator, kSizeInPixelsAligned); + SAFE_VECTOR_DECL(Ixx, gpu_mem_allocator, kSizeInPixelsAligned); + SAFE_VECTOR_DECL(Ix0, gpu_mem_allocator, kSizeInPixelsAligned); + SAFE_VECTOR_DECL(Iy, gpu_mem_allocator, kSizeInPixelsAligned); + SAFE_VECTOR_DECL(Iyy, gpu_mem_allocator, kSizeInPixelsAligned); + SAFE_VECTOR_DECL(Iy0, gpu_mem_allocator, kSizeInPixelsAligned); + SAFE_VECTOR_DECL(Ixy, gpu_mem_allocator, kSizeInPixelsAligned); + + // spatial derivative filter size + const int kDFilterSize = 5; + const float derivativeFilterHost[kDFilterSize] = {1.0f, -8.0f, 0.0f, 8.0f, -1.0f}; + SAFE_VECTOR_DECL(derivativeFilter, gpu_mem_allocator, kDFilterSize); + + ncvAssertCUDAReturn( + 
cudaMemcpy(derivativeFilter.ptr(), + derivativeFilterHost, + sizeof(float) * kDFilterSize, + cudaMemcpyHostToDevice), + NCV_CUDA_ERROR); + + InitTextures(); + + //prepare image pyramid + std::vector< shared_ptr > img0_pyramid; + std::vector< shared_ptr > img1_pyramid; + + std::vector w_pyramid; + std::vector h_pyramid; + + cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(); + + float scale = 1.0f; + + //cuda arrays for frames + shared_ptr I0(new FloatVector(gpu_mem_allocator, kSizeInPixelsAligned)); + ncvAssertReturn(I0->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + + shared_ptr I1(new FloatVector(gpu_mem_allocator, kSizeInPixelsAligned)); + ncvAssertReturn(I1->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + + if (!kSkipProcessing) + { + //copy frame data to device + size_t dst_width_in_bytes = alignUp(kSourceWidth, kStrideAlignmentFloat) * sizeof(float); + size_t src_width_in_bytes = kSourceWidth * sizeof(float); + size_t src_pitch_in_bytes = frame0.pitch(); + ncvAssertCUDAReturn( cudaMemcpy2DAsync(I0->ptr(), dst_width_in_bytes, frame0.ptr(), + src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR ); + + ncvAssertCUDAReturn( cudaMemcpy2DAsync(I1->ptr(), dst_width_in_bytes, frame1.ptr(), + src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR ); + } + + //prepare pyramid + img0_pyramid.push_back(I0); + img1_pyramid.push_back(I1); + + w_pyramid.push_back(kSourceWidth); + h_pyramid.push_back(kSourceHeight); + + scale *= scale_factor; + + Ncv32u prev_level_width = kSourceWidth; + Ncv32u prev_level_height = kSourceHeight; + while((prev_level_width > 15) && (prev_level_height > 15) && (static_cast(img0_pyramid.size()) < desc.number_of_outer_iterations)) + { + //current resolution + Ncv32u level_width = static_cast(ceilf(kSourceWidth * scale)); + Ncv32u level_height = static_cast(ceilf(kSourceHeight * scale)); + + Ncv32u level_width_aligned = alignUp(level_width, kStrideAlignmentFloat); + + Ncv32u buffer_size = alignUp(level_width, kStrideAlignmentFloat) * level_height; // buffer size in floats + + Ncv32u prev_level_pitch = alignUp(prev_level_width, kStrideAlignmentFloat) * sizeof(float); + + shared_ptr level_frame0(new FloatVector(gpu_mem_allocator, buffer_size)); + ncvAssertReturn(level_frame0->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + + shared_ptr level_frame1(new FloatVector(gpu_mem_allocator, buffer_size)); + ncvAssertReturn(level_frame1->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + + ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR); + + if (!kSkipProcessing) + { + NcvSize32u srcSize (prev_level_width, prev_level_height); + NcvSize32u dstSize (level_width, level_height); + NcvRect32u srcROI (0, 0, prev_level_width, prev_level_height); + NcvRect32u dstROI (0, 0, level_width, level_height); + + // frame 0 + nppiStResize_32f_C1R (I0->ptr(), srcSize, prev_level_pitch, srcROI, + level_frame0->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample); + + // frame 1 + nppiStResize_32f_C1R (I1->ptr(), srcSize, prev_level_pitch, srcROI, + level_frame1->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample); + } + + //store pointers + img0_pyramid.push_back(level_frame0); + img1_pyramid.push_back(level_frame1); + + w_pyramid.push_back(level_width); + h_pyramid.push_back(level_height); + + scale *= scale_factor; + + prev_level_width = level_width; + prev_level_height = 
level_height; + + I0 = level_frame0; + I1 = level_frame1; + } + + if (!kSkipProcessing) + { + //initial values for flow is 0 + ncvAssertCUDAReturn(cudaMemsetAsync(u.ptr(), 0, kSizeInPixelsAligned * sizeof(float), stream), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaMemsetAsync(v.ptr(), 0, kSizeInPixelsAligned * sizeof(float), stream), NCV_CUDA_ERROR); + + //select images with lowest resolution + size_t pitch = alignUp(w_pyramid.back(), kStrideAlignmentFloat) * sizeof(float); + ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, img0_pyramid.back()->ptr(), channel_desc, w_pyramid.back(), h_pyramid.back(), pitch), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, img1_pyramid.back()->ptr(), channel_desc, w_pyramid.back(), h_pyramid.back(), pitch), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR); + } + + FloatVector* ptrU = &u; + FloatVector* ptrV = &v; + FloatVector* ptrUNew = &u_new; + FloatVector* ptrVNew = &v_new; + + std::vector< shared_ptr >::const_reverse_iterator img0Iter = img0_pyramid.rbegin(); + std::vector< shared_ptr >::const_reverse_iterator img1Iter = img1_pyramid.rbegin(); + //outer loop + //warping fixed point iteration + while(!w_pyramid.empty()) + { + //current grid dimensions + const Ncv32u kLevelWidth = w_pyramid.back(); + const Ncv32u kLevelHeight = h_pyramid.back(); + const Ncv32u kLevelStride = alignUp(kLevelWidth, kStrideAlignmentFloat); + + //size of current image in bytes + const int kLevelSizeInBytes = kLevelStride * kLevelHeight * sizeof(float); + + //number of points at current resolution + const int kLevelSizeInPixels = kLevelStride * kLevelHeight; + + if (!kSkipProcessing) + { + //initial guess for du and dv + ncvAssertCUDAReturn(cudaMemsetAsync(du.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaMemsetAsync(dv.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR); + } + + //texture format descriptor + cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(); + + I0 = *img0Iter; + I1 = *img1Iter; + + ++img0Iter; + ++img1Iter; + + if (!kSkipProcessing) + { + ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, I0->ptr(), channel_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, I1->ptr(), channel_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR); + } + //compute derivatives + dim3 dBlocks(iDivUp(kLevelWidth, 32), iDivUp(kLevelHeight, 6)); + dim3 dThreads(32, 6); + + const int kPitchTex = kLevelStride * sizeof(float); + if (!kSkipProcessing) + { + NcvSize32u srcSize(kLevelWidth, kLevelHeight); + Ncv32u nSrcStep = kLevelStride * sizeof(float); + NcvRect32u oROI(0, 0, kLevelWidth, kLevelHeight); + + // Ix0 + nppiStFilterRowBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Ix0.ptr(), srcSize, nSrcStep, oROI, + nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f); + + // Iy0 + nppiStFilterColumnBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Iy0.ptr(), srcSize, nSrcStep, oROI, + nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f); + + // Ix + nppiStFilterRowBorder_32f_C1R (I1->ptr(), srcSize, nSrcStep, Ix.ptr(), srcSize, nSrcStep, oROI, + nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f); + + // Iy + nppiStFilterColumnBorder_32f_C1R (I1->ptr(), srcSize, nSrcStep, Iy.ptr(), srcSize, nSrcStep, oROI, + nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f); + 
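+            // Second derivatives Ixx, Iyy and the cross derivative Ixy are obtained by
+            // running the same 5-tap derivative stencil over Ix and Iy of the second
+            // frame; they feed the gradient-constancy (gamma) part of the data term
+            // computed in prepare_sor_stage_1_tex.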
+ // Ixx + nppiStFilterRowBorder_32f_C1R (Ix.ptr(), srcSize, nSrcStep, Ixx.ptr(), srcSize, nSrcStep, oROI, + nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f); + + // Iyy + nppiStFilterColumnBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Iyy.ptr(), srcSize, nSrcStep, oROI, + nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f); + + // Ixy + nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI, + nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f); + } + + if (!kSkipProcessing) + { + ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix, Ix.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixx, Ixx.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix0, Ix0.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy, Iy.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iyy, Iyy.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy0, Iy0.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixy, Ixy.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); + + // flow + ncvAssertCUDAReturn(cudaBindTexture(0, tex_u, ptrU->ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaBindTexture(0, tex_v, ptrV->ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); + // flow increments + ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); + } + + + dim3 psor_blocks(iDivUp(kLevelWidth, PSOR_TILE_WIDTH), iDivUp(kLevelHeight, PSOR_TILE_HEIGHT)); + dim3 psor_threads(PSOR_TILE_WIDTH, PSOR_TILE_HEIGHT); + + dim3 sor_blocks(iDivUp(kLevelWidth, SOR_TILE_WIDTH), iDivUp(kLevelHeight, SOR_TILE_HEIGHT)); + dim3 sor_threads(SOR_TILE_WIDTH, SOR_TILE_HEIGHT); + + // inner loop + // lagged nonlinearity fixed point iteration + ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR); + for (Ncv32u current_inner_iteration = 0; current_inner_iteration < desc.number_of_inner_iterations; ++current_inner_iteration) + { + //compute coefficients + if (!kSkipProcessing) + { + prepare_sor_stage_1_tex<<>> + (diffusivity_x.ptr(), + diffusivity_y.ptr(), + denom_u.ptr(), + denom_v.ptr(), + num_dudv.ptr(), + num_u.ptr(), + num_v.ptr(), + kLevelWidth, + kLevelHeight, + kLevelStride, + alpha, + gamma); + + ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); + + ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); + + ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); + + 
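+                    // prepare_sor_stage_2 adds the four neighbouring diffusivities to each
+                    // pixel's denominator and stores the reciprocal, so that the red-black
+                    // SOR passes below can multiply by inv_denominator instead of dividing.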
+                    prepare_sor_stage_2<<<psor_blocks, psor_threads, 0, stream>>>(denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride);
+
+                    // linear system coefficients
+                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
+                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
+
+                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
+
+                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
+                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
+
+                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_u, denom_u.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
+                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_v, denom_v.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
+                }
+                //solve linear system
+                for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration)
+                {
+                    float omega = 1.99f;
+                    if (!kSkipProcessing)
+                    {
+                        ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
+                        ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
+
+                        sor_pass<0><<<sor_blocks, sor_threads, 0, stream>>>
+                            (du_new.ptr(),
+                             dv_new.ptr(),
+                             denom_u.ptr(),
+                             denom_v.ptr(),
+                             num_u.ptr(),
+                             num_v.ptr(),
+                             num_dudv.ptr(),
+                             omega,
+                             kLevelWidth,
+                             kLevelHeight,
+                             kLevelStride);
+
+                        ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du_new.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
+                        ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv_new.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
+
+                        sor_pass<1><<<sor_blocks, sor_threads, 0, stream>>>
+                            (du.ptr(),
+                             dv.ptr(),
+                             denom_u.ptr(),
+                             denom_v.ptr(),
+                             num_u.ptr(),
+                             num_v.ptr(),
+                             num_dudv.ptr(),
+                             omega,
+                             kLevelWidth,
+                             kLevelHeight,
+                             kLevelStride);
+                    }
+
+                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
+                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
+                }//end of solver loop
+            }// end of inner loop
+
+            //update u and v
+            if (!kSkipProcessing)
+            {
+                add(ptrU->ptr(), du.ptr(), kLevelSizeInPixels, stream);
+                add(ptrV->ptr(), dv.ptr(), kLevelSizeInPixels, stream);
+            }
+            //prolongate using texture
+            w_pyramid.pop_back();
+            h_pyramid.pop_back();
+            if (!w_pyramid.empty())
+            {
+                //compute new image size
+                Ncv32u nw = w_pyramid.back();
+                Ncv32u nh = h_pyramid.back();
+                Ncv32u ns = alignUp(nw, kStrideAlignmentFloat);
+
+                dim3 p_blocks(iDivUp(nw, 32), iDivUp(nh, 8));
+                dim3 p_threads(32, 8);
+                if (!kSkipProcessing)
+                {
+                    NcvSize32u srcSize (kLevelWidth, kLevelHeight);
+                    NcvSize32u dstSize (nw, nh);
+                    NcvRect32u srcROI (0, 0, kLevelWidth, kLevelHeight);
+                    NcvRect32u dstROI (0, 0, nw, nh);
+
+                    nppiStResize_32f_C1R (ptrU->ptr(), srcSize, kLevelStride * sizeof (float), srcROI,
+                                          ptrUNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic);
+
+                    ScaleVector(ptrUNew->ptr(), ptrUNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
+
+                    nppiStResize_32f_C1R (ptrV->ptr(), srcSize, kLevelStride * sizeof (float), srcROI,
+                                          ptrVNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic);
+
+                    ScaleVector(ptrVNew->ptr(),
1.0f/scale_factor, ns * nh, stream); + } + + std::swap(ptrU, ptrUNew); + std::swap(ptrV, ptrVNew); + } + scale /= scale_factor; + } + + // end of warping iterations + ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR); + + ncvAssertCUDAReturn( cudaMemcpy2DAsync + (uOut.ptr(), uOut.pitch(), ptrU->ptr(), + kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR ); + + ncvAssertCUDAReturn( cudaMemcpy2DAsync + (vOut.ptr(), vOut.pitch(), ptrV->ptr(), + kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR ); + + ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR); + ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR); + + return NCV_SUCCESS; +} diff --git a/modules/gpu/src/nvidia/NCVBroxOpticalFlow.hpp b/modules/gpu/src/nvidia/NCVBroxOpticalFlow.hpp new file mode 100644 index 0000000000..0c8ad59d1c --- /dev/null +++ b/modules/gpu/src/nvidia/NCVBroxOpticalFlow.hpp @@ -0,0 +1,103 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +//////////////////////////////////////////////////////////////////////////////// +// +// NVIDIA CUDA implementation of Brox et al Optical Flow algorithm +// +// Algorithm is explained in the original paper: +// T. Brox, A. Bruhn, N. Papenberg, J. Weickert: +// High accuracy optical flow estimation based on a theory for warping. +// ECCV 2004. 
+//
+// Implementation by Mikhail Smirnov
+// email: msmirnov@nvidia.com, devsupport@nvidia.com
+//
+// Credits for help with the code to:
+// Alexey Mendelenko, Anton Obukhov, and Alexander Kharlamov.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _ncv_optical_flow_h_
+#define _ncv_optical_flow_h_
+
+#include "NCV.hpp"
+
+/// \brief Model and solver parameters
+struct NCVBroxOpticalFlowDescriptor
+{
+    /// flow smoothness
+    Ncv32f alpha;
+    /// gradient constancy importance
+    Ncv32f gamma;
+    /// pyramid scale factor
+    Ncv32f scale_factor;
+    /// number of lagged non-linearity iterations (inner loop)
+    Ncv32u number_of_inner_iterations;
+    /// number of warping iterations (number of pyramid levels)
+    Ncv32u number_of_outer_iterations;
+    /// number of linear system solver iterations
+    Ncv32u number_of_solver_iterations;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////
+/// \brief Compute optical flow
+///
+/// Based on method by Brox et al [2004]
+/// \param [in] desc model and solver parameters
+/// \param [in] gpu_mem_allocator GPU memory allocator
+/// \param [in] frame0 source frame
+/// \param [in] frame1 frame to track
+/// \param [out] u flow horizontal component (along \b x axis)
+/// \param [out] v flow vertical component (along \b y axis)
+/// \return computation status
+/////////////////////////////////////////////////////////////////////////////////////////
+
+NCV_EXPORTS
+NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
+                             INCVMemAllocator &gpu_mem_allocator,
+                             const NCVMatrix<Ncv32f> &frame0,
+                             const NCVMatrix<Ncv32f> &frame1,
+                             NCVMatrix<Ncv32f> &u,
+                             NCVMatrix<Ncv32f> &v,
+                             cudaStream_t stream);
+
+#endif
diff --git a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
index b0772d6359..ac29ce235d 100644
--- a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
+++ b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
@@ -1610,3 +1610,952 @@ NCVStatus nppsStCompact_32f_host(Ncv32f *h_src, Ncv32u srcLen,
 {
     return nppsStCompact_32u_host((Ncv32u *)h_src, srcLen, (Ncv32u *)h_dst, dstLen, *(Ncv32u *)&elemRemove);
 }
+
+
+//==============================================================================
+//
+// Filter.cu
+//
+//==============================================================================
+
+
+texture<float, 1, cudaReadModeElementType> texSrc;
+texture<float, 1, cudaReadModeElementType> texKernel;
+
+
+__forceinline__ __device__ float getValueMirrorRow(const int rowOffset,
+                                                   int i,
+                                                   int w)
+{
+    if (i < 0) i = 1 - i;
+    if (i >= w) i = w + w - i - 1;
+    return tex1Dfetch (texSrc, rowOffset + i);
+}
+
+
+__forceinline__ __device__ float getValueMirrorColumn(const int offset,
+                                                      const int rowStep,
+                                                      int j,
+                                                      int h)
+{
+    if (j < 0) j = 1 - j;
+    if (j >= h) j = h + h - j - 1;
+    return tex1Dfetch (texSrc, offset + j * rowStep);
+}
+
+
+__global__ void FilterRowBorderMirror_32f_C1R(Ncv32u srcStep,
+                                              Ncv32f *pDst,
+                                              NcvSize32u dstSize,
+                                              Ncv32u dstStep,
+                                              NcvRect32u roi,
+                                              Ncv32s nKernelSize,
+                                              Ncv32s nAnchor,
+                                              Ncv32f multiplier)
+{
+    // position within ROI
+    const int ix = blockDim.x * blockIdx.x + threadIdx.x;
+    const int iy = blockDim.y * blockIdx.y + threadIdx.y;
+
+    if (ix >= roi.width || iy >= roi.height)
+    {
+        return;
+    }
+
+    const int p = nKernelSize - nAnchor - 1;
+
+    const int j = roi.y + iy;
+
+    const int rowOffset = j * srcStep + roi.x;
+
+    float sum = 0.0f;
+    for (int m = 0; m < nKernelSize; ++m)
+    {
+        sum += getValueMirrorRow (rowOffset, ix + m - p, roi.width)
+             * tex1Dfetch (texKernel, m);
+    }
+
+
pDst[iy * dstStep + ix] = sum * multiplier; +} + + +__global__ void FilterColumnBorderMirror_32f_C1R(Ncv32u srcStep, + Ncv32f *pDst, + NcvSize32u dstSize, + Ncv32u dstStep, + NcvRect32u roi, + Ncv32s nKernelSize, + Ncv32s nAnchor, + Ncv32f multiplier) +{ + const int ix = blockDim.x * blockIdx.x + threadIdx.x; + const int iy = blockDim.y * blockIdx.y + threadIdx.y; + + if (ix >= roi.width || iy >= roi.height) + { + return; + } + + const int p = nKernelSize - nAnchor - 1; + const int i = roi.x + ix; + const int offset = i + roi.y * srcStep; + + float sum = 0.0f; + for (int m = 0; m < nKernelSize; ++m) + { + sum += getValueMirrorColumn (offset, srcStep, iy + m - p, roi.height) + * tex1Dfetch (texKernel, m); + } + + pDst[ix + iy * dstStep] = sum * multiplier; +} + + +NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc, + NcvSize32u srcSize, + Ncv32u nSrcStep, + Ncv32f *pDst, + NcvSize32u dstSize, + Ncv32u nDstStep, + NcvRect32u oROI, + NppStBorderType borderType, + const Ncv32f *pKernel, + Ncv32s nKernelSize, + Ncv32s nAnchor, + Ncv32f multiplier) +{ + ncvAssertReturn (pSrc != NULL && + pDst != NULL && + pKernel != NULL, NCV_NULL_PTR); + + ncvAssertReturn (oROI.width > 0 && oROI.height > 0, NPPST_INVALID_ROI); + + ncvAssertReturn (srcSize.width * sizeof (Ncv32f) <= nSrcStep && + dstSize.width * sizeof (Ncv32f) <= nDstStep && + oROI.width * sizeof (Ncv32f) <= nSrcStep && + oROI.width * sizeof (Ncv32f) <= nDstStep && + nSrcStep % sizeof (Ncv32f) == 0 && + nDstStep % sizeof (Ncv32f) == 0, NPPST_INVALID_STEP); + + Ncv32u srcStep = nSrcStep / sizeof (Ncv32f); + Ncv32u dstStep = nDstStep / sizeof (Ncv32f); + + // adjust ROI size to be within source image + if (oROI.x + oROI.width > srcSize.width) + { + oROI.width = srcSize.width - oROI.x; + } + + if (oROI.y + oROI.height > srcSize.height) + { + oROI.height = srcSize.height - oROI.y; + } + + cudaChannelFormatDesc floatChannel = cudaCreateChannelDesc (); + texSrc.normalized = false; + texKernel.normalized = false; + + cudaBindTexture (0, texSrc, pSrc, floatChannel, srcSize.height * nSrcStep); + cudaBindTexture (0, texKernel, pKernel, floatChannel, nKernelSize * sizeof (Ncv32f)); + + dim3 ctaSize (32, 6); + dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x, + (oROI.height + ctaSize.y - 1) / ctaSize.y); + + switch (borderType) + { + case nppStBorderNone: + return NPPST_ERROR; + case nppStBorderClamp: + return NPPST_ERROR; + case nppStBorderWrap: + return NPPST_ERROR; + case nppStBorderMirror: + FilterRowBorderMirror_32f_C1R <<>> + (srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier); + break; + default: + return NPPST_ERROR; + } + + return NPPST_SUCCESS; +} + + +NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc, + NcvSize32u srcSize, + Ncv32u nSrcStep, + Ncv32f *pDst, + NcvSize32u dstSize, + Ncv32u nDstStep, + NcvRect32u oROI, + NppStBorderType borderType, + const Ncv32f *pKernel, + Ncv32s nKernelSize, + Ncv32s nAnchor, + Ncv32f multiplier) +{ + ncvAssertReturn (pSrc != NULL && + pDst != NULL && + pKernel != NULL, NCV_NULL_PTR); + + ncvAssertReturn (oROI.width > 0 && oROI.height > 0, NPPST_INVALID_ROI); + + ncvAssertReturn (srcSize.width * sizeof (Ncv32f) <= nSrcStep && + dstSize.width * sizeof (Ncv32f) <= nDstStep && + oROI.width * sizeof (Ncv32f) <= nSrcStep && + oROI.width * sizeof (Ncv32f) <= nDstStep && + nSrcStep % sizeof (Ncv32f) == 0 && + nDstStep % sizeof (Ncv32f) == 0, NPPST_INVALID_STEP); + + Ncv32u srcStep = nSrcStep / sizeof (Ncv32f); + Ncv32u dstStep = nDstStep / sizeof (Ncv32f); + + // adjust 
ROI size to be within source image + if (oROI.x + oROI.width > srcSize.width) + { + oROI.width = srcSize.width - oROI.x; + } + + if (oROI.y + oROI.height > srcSize.height) + { + oROI.height = srcSize.height - oROI.y; + } + + cudaChannelFormatDesc floatChannel = cudaCreateChannelDesc (); + texSrc.normalized = false; + texKernel.normalized = false; + + cudaBindTexture (0, texSrc, pSrc, floatChannel, srcSize.height * nSrcStep); + cudaBindTexture (0, texKernel, pKernel, floatChannel, nKernelSize * sizeof (Ncv32f)); + + dim3 ctaSize (32, 6); + dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x, + (oROI.height + ctaSize.y - 1) / ctaSize.y); + + switch (borderType) + { + case nppStBorderClamp: + return NPPST_ERROR; + case nppStBorderWrap: + return NPPST_ERROR; + case nppStBorderMirror: + FilterColumnBorderMirror_32f_C1R <<>> + (srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier); + break; + default: + return NPPST_ERROR; + } + + return NPPST_SUCCESS; +} + + +//============================================================================== +// +// FrameInterpolate.cu +// +//============================================================================== + + +inline Ncv32u iDivUp(Ncv32u num, Ncv32u denom) +{ + return (num + denom - 1)/denom; +} + + +texture tex_src1; +texture tex_src0; + + +__global__ void BlendFramesKernel(const float *u, const float *v, // forward flow + const float *ur, const float *vr, // backward flow + const float *o0, const float *o1, // coverage masks + int w, int h, int s, + float theta, float *out) +{ + const int ix = threadIdx.x + blockDim.x * blockIdx.x; + const int iy = threadIdx.y + blockDim.y * blockIdx.y; + + const int pos = ix + s * iy; + + if (ix >= w || iy >= h) return; + + float _u = u[pos]; + float _v = v[pos]; + + float _ur = ur[pos]; + float _vr = vr[pos]; + + float x = (float)ix + 0.5f; + float y = (float)iy + 0.5f; + bool b0 = o0[pos] > 1e-4f; + bool b1 = o1[pos] > 1e-4f; + + if (b0 && b1) + { + // pixel is visible on both frames + out[pos] = tex2D(tex_src0, x - _u * theta, y - _v * theta) * (1.0f - theta) + + tex2D(tex_src1, x + _u * (1.0f - theta), y + _v * (1.0f - theta)) * theta; + } + else if (b0) + { + // visible on the first frame only + out[pos] = tex2D(tex_src0, x - _u * theta, y - _v * theta); + } + else + { + // visible on the second frame only + out[pos] = tex2D(tex_src1, x - _ur * (1.0f - theta), y - _vr * (1.0f - theta)); + } +} + + +NCVStatus BlendFrames(const Ncv32f *src0, + const Ncv32f *src1, + const Ncv32f *ufi, + const Ncv32f *vfi, + const Ncv32f *ubi, + const Ncv32f *vbi, + const Ncv32f *o1, + const Ncv32f *o2, + Ncv32u width, + Ncv32u height, + Ncv32u stride, + Ncv32f theta, + Ncv32f *out) +{ + tex_src1.addressMode[0] = cudaAddressModeClamp; + tex_src1.addressMode[1] = cudaAddressModeClamp; + tex_src1.filterMode = cudaFilterModeLinear; + tex_src1.normalized = false; + + tex_src0.addressMode[0] = cudaAddressModeClamp; + tex_src0.addressMode[1] = cudaAddressModeClamp; + tex_src0.filterMode = cudaFilterModeLinear; + tex_src0.normalized = false; + + cudaChannelFormatDesc desc = cudaCreateChannelDesc (); + const Ncv32u pitch = stride * sizeof (float); + ncvAssertCUDAReturn (cudaBindTexture2D (0, tex_src1, src1, desc, width, height, pitch), NPPST_TEXTURE_BIND_ERROR); + ncvAssertCUDAReturn (cudaBindTexture2D (0, tex_src0, src0, desc, width, height, pitch), NPPST_TEXTURE_BIND_ERROR); + + dim3 threads (32, 4); + dim3 blocks (iDivUp (width, threads.x), iDivUp (height, threads.y)); + + BlendFramesKernel<<>> + (ufi, vfi, 
ubi, vbi, o1, o2, width, height, stride, theta, out);
+
+    ncvAssertCUDAReturn (cudaGetLastError (), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
+
+    return NPPST_SUCCESS;
+}
+
+
+NCVStatus nppiStGetInterpolationBufferSize(NcvSize32u srcSize,
+                                           Ncv32u nStep,
+                                           Ncv32u *hpSize)
+{
+    NCVStatus status = NPPST_ERROR;
+    status = nppiStVectorWarpGetBufferSize(srcSize, nStep, hpSize);
+    return status;
+}
+
+
+NCVStatus nppiStInterpolateFrames(const NppStInterpolationState *pState)
+{
+    // check state validity
+    ncvAssertReturn (pState->pSrcFrame0 != 0 &&
+                     pState->pSrcFrame1 != 0 &&
+                     pState->pFU != 0 &&
+                     pState->pFV != 0 &&
+                     pState->pBU != 0 &&
+                     pState->pBV != 0 &&
+                     pState->pNewFrame != 0 &&
+                     pState->ppBuffers[0] != 0 &&
+                     pState->ppBuffers[1] != 0 &&
+                     pState->ppBuffers[2] != 0 &&
+                     pState->ppBuffers[3] != 0 &&
+                     pState->ppBuffers[4] != 0 &&
+                     pState->ppBuffers[5] != 0, NPPST_NULL_POINTER_ERROR);
+
+    ncvAssertReturn (pState->size.width > 0 &&
+                     pState->size.height > 0, NPPST_ERROR);
+
+    ncvAssertReturn (pState->nStep >= pState->size.width * sizeof (Ncv32f) &&
+                     pState->nStep > 0 &&
+                     pState->nStep % sizeof (Ncv32f) == 0,
+                     NPPST_INVALID_STEP);
+
+    // change notation
+    Ncv32f *cov0 = pState->ppBuffers[0];
+    Ncv32f *cov1 = pState->ppBuffers[1];
+    Ncv32f *fwdU = pState->ppBuffers[2]; // forward u
+    Ncv32f *fwdV = pState->ppBuffers[3]; // forward v
+    Ncv32f *bwdU = pState->ppBuffers[4]; // backward u
+    Ncv32f *bwdV = pState->ppBuffers[5]; // backward v
+    // warp forward flow
+    ncvAssertReturnNcvStat (
+        nppiStVectorWarp_PSF2x2_32f_C1 (pState->pFU,
+                                        pState->size,
+                                        pState->nStep,
+                                        pState->pFU,
+                                        pState->pFV,
+                                        pState->nStep,
+                                        cov0,
+                                        pState->pos,
+                                        fwdU) );
+    ncvAssertReturnNcvStat (
+        nppiStVectorWarp_PSF2x2_32f_C1 (pState->pFV,
+                                        pState->size,
+                                        pState->nStep,
+                                        pState->pFU,
+                                        pState->pFV,
+                                        pState->nStep,
+                                        cov0,
+                                        pState->pos,
+                                        fwdV) );
+    // warp backward flow
+    ncvAssertReturnNcvStat (
+        nppiStVectorWarp_PSF2x2_32f_C1 (pState->pBU,
+                                        pState->size,
+                                        pState->nStep,
+                                        pState->pBU,
+                                        pState->pBV,
+                                        pState->nStep,
+                                        cov1,
+                                        1.0f - pState->pos,
+                                        bwdU) );
+    ncvAssertReturnNcvStat (
+        nppiStVectorWarp_PSF2x2_32f_C1 (pState->pBV,
+                                        pState->size,
+                                        pState->nStep,
+                                        pState->pBU,
+                                        pState->pBV,
+                                        pState->nStep,
+                                        cov1,
+                                        1.0f - pState->pos,
+                                        bwdV) );
+    // interpolate frame
+    ncvAssertReturnNcvStat (
+        BlendFrames (pState->pSrcFrame0,
+                     pState->pSrcFrame1,
+                     fwdU,
+                     fwdV,
+                     bwdU,
+                     bwdV,
+                     cov0,
+                     cov1,
+                     pState->size.width,
+                     pState->size.height,
+                     pState->nStep / sizeof (Ncv32f),
+                     pState->pos,
+                     pState->pNewFrame) );
+
+    return NPPST_SUCCESS;
+}
+
+
+//==============================================================================
+//
+// VectorWarpFrame.cu
+//
+//==============================================================================
+
+
+#if __CUDA_ARCH__ < 200
+
+// FP32 atomic add
+static __forceinline__ __device__ float _atomicAdd(float *addr, float val)
+{
+    float old = *addr, assumed;
+
+    do {
+        assumed = old;
+        old = int_as_float(__iAtomicCAS((int*)addr,
+              float_as_int(assumed),
+              float_as_int(val+assumed)));
+    } while( assumed!=old );
+
+    return old;
+}
+#else
+#define _atomicAdd atomicAdd
+#endif
+
+
+__global__ void ForwardWarpKernel_PSF2x2(const float *u,
+                                         const float *v,
+                                         const float *src,
+                                         const int w,
+                                         const int h,
+                                         const int flow_stride,
+                                         const int image_stride,
+                                         const float time_scale,
+                                         float *normalization_factor,
+                                         float *dst)
+{
+    int j = threadIdx.x + blockDim.x * blockIdx.x;
+    int i = threadIdx.y + blockDim.y * blockIdx.y;
+
+    if (i >= h || j >= w) return;
+
+    int
flow_row_offset = i * flow_stride; + int image_row_offset = i * image_stride; + + //bottom left corner of a target pixel + float cx = u[flow_row_offset + j] * time_scale + (float)j + 1.0f; + float cy = v[flow_row_offset + j] * time_scale + (float)i + 1.0f; + // pixel containing bottom left corner + float px; + float py; + float dx = modff (cx, &px); + float dy = modff (cy, &py); + // target pixel integer coords + int tx; + int ty; + tx = (int) px; + ty = (int) py; + float value = src[image_row_offset + j]; + float weight; + // fill pixel containing bottom right corner + if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0))) + { + weight = dx * dy; + _atomicAdd (dst + ty * image_stride + tx, value * weight); + _atomicAdd (normalization_factor + ty * image_stride + tx, weight); + } + + // fill pixel containing bottom left corner + tx -= 1; + if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0))) + { + weight = (1.0f - dx) * dy; + _atomicAdd (dst + ty * image_stride + tx, value * weight); + _atomicAdd (normalization_factor + ty * image_stride + tx, weight); + } + + // fill pixel containing upper left corner + ty -= 1; + if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0))) + { + weight = (1.0f - dx) * (1.0f - dy); + _atomicAdd (dst + ty * image_stride + tx, value * weight); + _atomicAdd (normalization_factor + ty * image_stride + tx, weight); + } + + // fill pixel containing upper right corner + tx += 1; + if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0))) + { + weight = dx * (1.0f - dy); + _atomicAdd (dst + ty * image_stride + tx, value * weight); + _atomicAdd (normalization_factor + ty * image_stride + tx, weight); + } +} + + +__global__ void ForwardWarpKernel_PSF1x1(const float *u, + const float *v, + const float *src, + const int w, + const int h, + const int flow_stride, + const int image_stride, + const float time_scale, + float *dst) +{ + int j = threadIdx.x + blockDim.x * blockIdx.x; + int i = threadIdx.y + blockDim.y * blockIdx.y; + + if (i >= h || j >= w) return; + + int flow_row_offset = i * flow_stride; + int image_row_offset = i * image_stride; + + float u_ = u[flow_row_offset + j]; + float v_ = v[flow_row_offset + j]; + + //bottom left corner of target pixel + float cx = u_ * time_scale + (float)j + 1.0f; + float cy = v_ * time_scale + (float)i + 1.0f; + // pixel containing bottom left corner + int tx = __float2int_rn (cx); + int ty = __float2int_rn (cy); + + float value = src[image_row_offset + j]; + // fill pixel + if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0))) + { + _atomicAdd (dst + ty * image_stride + tx, value); + } +} + + +__global__ void NormalizeKernel(const float *normalization_factor, int w, int h, int s, float *image) +{ + int i = threadIdx.y + blockDim.y * blockIdx.y; + int j = threadIdx.x + blockDim.x * blockIdx.x; + + if (i >= h || j >= w) return; + + const int pos = i * s + j; + + float scale = normalization_factor[pos]; + + float invScale = (scale == 0.0f) ? 
1.0f : (1.0f / scale); + + image[pos] *= invScale; +} + + +__global__ void MemsetKernel(const float value, int w, int h, float *image) +{ + int i = threadIdx.y + blockDim.y * blockIdx.y; + int j = threadIdx.x + blockDim.x * blockIdx.x; + + if (i >= h || j >= w) return; + + const int pos = i * w + j; + + image[pos] = value; +} + + +NCVStatus nppiStVectorWarpGetBufferSize (NcvSize32u srcSize, Ncv32u nSrcStep, Ncv32u *hpSize) +{ + ncvAssertReturn (hpSize != NULL, NPPST_NULL_POINTER_ERROR); + ncvAssertReturn (srcSize.width * sizeof (Ncv32f) <= nSrcStep, + NPPST_INVALID_STEP); + + *hpSize = nSrcStep * srcSize.height; + + return NPPST_SUCCESS; +} + + +// does not require normalization +NCVStatus nppiStVectorWarp_PSF1x1_32f_C1(const Ncv32f *pSrc, + NcvSize32u srcSize, + Ncv32u nSrcStep, + const Ncv32f *pU, + const Ncv32f *pV, + Ncv32u nVFStep, + Ncv32f timeScale, + Ncv32f *pDst) +{ + ncvAssertReturn (pSrc != NULL && + pU != NULL && + pV != NULL && + pDst != NULL, NPPST_NULL_POINTER_ERROR); + + ncvAssertReturn (srcSize.width * sizeof (Ncv32f) <= nSrcStep && + srcSize.width * sizeof (Ncv32f) <= nVFStep, + NPPST_INVALID_STEP); + + Ncv32u srcStep = nSrcStep / sizeof (Ncv32f); + Ncv32u vfStep = nVFStep / sizeof (Ncv32f); + + dim3 ctaSize (32, 6); + dim3 gridSize (iDivUp (srcSize.width, ctaSize.x), iDivUp (srcSize.height, ctaSize.y)); + + ForwardWarpKernel_PSF1x1 <<>> + (pU, pV, pSrc, srcSize.width, srcSize.height, vfStep, srcStep, timeScale, pDst); + + return NPPST_SUCCESS; +} + + +NCVStatus nppiStVectorWarp_PSF2x2_32f_C1(const Ncv32f *pSrc, + NcvSize32u srcSize, + Ncv32u nSrcStep, + const Ncv32f *pU, + const Ncv32f *pV, + Ncv32u nVFStep, + Ncv32f *pBuffer, + Ncv32f timeScale, + Ncv32f *pDst) +{ + ncvAssertReturn (pSrc != NULL && + pU != NULL && + pV != NULL && + pDst != NULL && + pBuffer != NULL, NPPST_NULL_POINTER_ERROR); + + ncvAssertReturn (srcSize.width * sizeof (Ncv32f) <= nSrcStep && + srcSize.width * sizeof (Ncv32f) <= nVFStep, NPPST_INVALID_STEP); + + Ncv32u srcStep = nSrcStep / sizeof (Ncv32f); + Ncv32u vfStep = nVFStep / sizeof(Ncv32f); + + dim3 ctaSize(32, 6); + dim3 gridSize (iDivUp (srcSize.width, ctaSize.x), iDivUp (srcSize.height, ctaSize.y)); + + MemsetKernel <<>> + (0, srcSize.width, srcSize.height, pBuffer); + + ForwardWarpKernel_PSF2x2 <<>> + (pU, pV, pSrc, srcSize.width, srcSize.height, vfStep, srcStep, timeScale, pBuffer, pDst); + + NormalizeKernel <<>> + (pBuffer, srcSize.width, srcSize.height, srcStep, pDst); + + return NPPST_SUCCESS; +} + + +//============================================================================== +// +// Resize.cu +// +//============================================================================== + + +texture texSrc2D; + + +__forceinline__ +__device__ float processLine(int spos, + float xmin, + float xmax, + int ixmin, + int ixmax, + float fxmin, + float cxmax) +{ + // first element + float wsum = 1.0f - xmin + fxmin; + float sum = tex1Dfetch(texSrc, spos) * (1.0f - xmin + fxmin); + spos++; + for (int ix = ixmin + 1; ix < ixmax; ++ix) + { + sum += tex1Dfetch(texSrc, spos); + spos++; + wsum += 1.0f; + } + sum += tex1Dfetch(texSrc, spos) * (cxmax - xmax); + wsum += cxmax - xmax; + return sum / wsum; +} + + +__global__ void resizeSuperSample_32f(NcvSize32u srcSize, + Ncv32u srcStep, + NcvRect32u srcROI, + Ncv32f *dst, + NcvSize32u dstSize, + Ncv32u dstStep, + NcvRect32u dstROI, + Ncv32f scaleX, + Ncv32f scaleY) +{ + // position within dst ROI + const int ix = blockIdx.x * blockDim.x + threadIdx.x; + const int iy = blockIdx.y * blockDim.y + threadIdx.y; 
+ + if (ix >= dstROI.width || iy >= dstROI.height) + { + return; + } + + float rw = (float) srcROI.width; + float rh = (float) srcROI.height; + + // source position + float x = scaleX * (float) ix; + float y = scaleY * (float) iy; + + // x sampling range + float xBegin = fmax (x - scaleX, 0.0f); + float xEnd = fmin (x + scaleX, rw - 1.0f); + // y sampling range + float yBegin = fmax (y - scaleY, 0.0f); + float yEnd = fmin (y + scaleY, rh - 1.0f); + // x range of source samples + float floorXBegin = floorf (xBegin); + float ceilXEnd = ceilf (xEnd); + int iXBegin = srcROI.x + (int) floorXBegin; + int iXEnd = srcROI.x + (int) ceilXEnd; + // y range of source samples + float floorYBegin = floorf (yBegin); + float ceilYEnd = ceilf (yEnd); + int iYBegin = srcROI.y + (int) floorYBegin; + int iYEnd = srcROI.y + (int) ceilYEnd; + + // first row + int pos = iYBegin * srcStep + iXBegin; + + float wsum = 1.0f - yBegin + floorYBegin; + + float sum = processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin, + ceilXEnd) * (1.0f - yBegin + floorYBegin); + pos += srcStep; + for (int iy = iYBegin + 1; iy < iYEnd; ++iy) + { + sum += processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin, + ceilXEnd); + pos += srcStep; + wsum += 1.0f; + } + + sum += processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin, + ceilXEnd) * (ceilYEnd - yEnd); + wsum += ceilYEnd - yEnd; + sum /= wsum; + + dst[(ix + dstROI.x) + (iy + dstROI.y) * dstStep] = sum; +} + + +// bicubic interpolation +__forceinline__ +__device__ float bicubicCoeff(float x_) +{ + float x = fabsf(x_); + if (x <= 1.0f) + { + return x * x * (1.5f * x - 2.5f) + 1.0f; + } + else if (x < 2.0f) + { + return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f; + } + else + { + return 0.0f; + } +} + + +__global__ void resizeBicubic(NcvSize32u srcSize, + NcvRect32u srcROI, + NcvSize32u dstSize, + Ncv32u dstStep, + Ncv32f *dst, + NcvRect32u dstROI, + Ncv32f scaleX, + Ncv32f scaleY) +{ + const int ix = blockIdx.x * blockDim.x + threadIdx.x; + const int iy = blockIdx.y * blockDim.y + threadIdx.y; + + if (ix >= dstROI.width || iy >= dstROI.height) + { + return; + } + + const float dx = 1.0f / srcROI.width; + const float dy = 1.0f / srcROI.height; + + float rx = (float) srcROI.x; + float ry = (float) srcROI.y; + + float rw = (float) srcROI.width; + float rh = (float) srcROI.height; + + float x = scaleX * (float) ix; + float y = scaleY * (float) iy; + + // sampling range + // border mode is clamp + float xmin = fmax (ceilf (x - 2.0f), 0.0f); + float xmax = fmin (floorf (x + 2.0f), rw - 1.0f); + + float ymin = fmax (ceilf (y - 2.0f), 0.0f); + float ymax = fmin (floorf (y + 2.0f), rh - 1.0f); + + // shift data window to match ROI + rx += 0.5f; + ry += 0.5f; + + x += rx; + y += ry; + + xmin += rx; + xmax += rx; + ymin += ry; + ymax += ry; + + float sum = 0.0f; + float wsum = 0.0f; + + for (float cy = ymin; cy <= ymax; cy += 1.0f) + { + for (float cx = xmin; cx <= xmax; cx += 1.0f) + { + float xDist = x - cx; + float yDist = y - cy; + float wx = bicubicCoeff (xDist); + float wy = bicubicCoeff (yDist); + wx *= wy; + sum += wx * tex2D (texSrc2D, cx * dx, cy * dy); + wsum += wx; + } + } + dst[(ix + dstROI.x)+ (iy + dstROI.y) * dstStep] = sum / wsum; +} + + +NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc, + NcvSize32u srcSize, + Ncv32u nSrcStep, + NcvRect32u srcROI, + Ncv32f *pDst, + NcvSize32u dstSize, + Ncv32u nDstStep, + NcvRect32u dstROI, + Ncv32f xFactor, + Ncv32f yFactor, + NppStInterpMode interpolation) +{ + NCVStatus status = NPPST_SUCCESS; + + ncvAssertReturn (pSrc 
!= NULL && pDst != NULL, NPPST_NULL_POINTER_ERROR); + ncvAssertReturn (xFactor != 0.0 && yFactor != 0.0, NPPST_INVALID_SCALE); + + ncvAssertReturn (nSrcStep >= sizeof (Ncv32f) * (Ncv32u) srcSize.width && + nDstStep >= sizeof (Ncv32f) * (Ncv32f) dstSize.width, + NPPST_INVALID_STEP); + + Ncv32u srcStep = nSrcStep / sizeof (Ncv32f); + Ncv32u dstStep = nDstStep / sizeof (Ncv32f); + + // TODO: preprocess ROI to prevent out of bounds access + + if (interpolation == nppStSupersample) + { + // bind texture + cudaBindTexture (0, texSrc, pSrc, srcSize.height * nSrcStep); + // invoke kernel + dim3 ctaSize (32, 6); + dim3 gridSize ((dstROI.width + ctaSize.x - 1) / ctaSize.x, + (dstROI.height + ctaSize.y - 1) / ctaSize.y); + + resizeSuperSample_32f <<>> + (srcSize, srcStep, srcROI, pDst, dstSize, dstStep, dstROI, 1.0f / xFactor, 1.0f / yFactor); + } + else if (interpolation == nppStBicubic) + { + texSrc2D.addressMode[0] = cudaAddressModeMirror; + texSrc2D.addressMode[1] = cudaAddressModeMirror; + texSrc2D.normalized = true; + + cudaChannelFormatDesc desc = cudaCreateChannelDesc (); + + cudaBindTexture2D (0, texSrc2D, pSrc, desc, srcSize.width, srcSize.height, + nSrcStep); + + dim3 ctaSize (32, 6); + dim3 gridSize ((dstSize.width + ctaSize.x - 1) / ctaSize.x, + (dstSize.height + ctaSize.y - 1) / ctaSize.y); + + resizeBicubic <<>> + (srcSize, srcROI, dstSize, dstStep, pDst, dstROI, 1.0f / xFactor, 1.0f / yFactor); + } + else + { + status = NPPST_ERROR; + } + + return status; +} diff --git a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.hpp b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.hpp index fedc7685d3..e7a65e583a 100644 --- a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.hpp +++ b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.hpp @@ -84,6 +84,255 @@ cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream); */ +/** Border type + * + * Filtering operations assume that each pixel has a neighborhood of pixels. + * The following structure describes possible ways to define non-existent pixels. + */ +enum NppStBorderType +{ + nppStBorderNone = 0, ///< There is no need to define additional pixels, image is extended already + nppStBorderClamp = 1, ///< Clamp out of range position to borders + nppStBorderWrap = 2, ///< Wrap out of range position. Image becomes periodic. + nppStBorderMirror = 3 ///< reflect out of range position across borders +}; + + +/** + * Filter types for image resizing + */ +enum NppStInterpMode +{ + nppStSupersample, ///< Supersampling. For downscaling only + nppStBicubic ///< Bicubic convolution filter, a = -0.5 (cubic Hermite spline) +}; + + +/** Frame interpolation state + * + * This structure holds parameters required for frame interpolation. + * Forward displacement field is a per-pixel mapping from frame 0 to frame 1. + * Backward displacement field is a per-pixel mapping from frame 1 to frame 0. + */ + + struct NppStInterpolationState +{ + NcvSize32u size; ///< frame size + Ncv32u nStep; ///< pitch + Ncv32f pos; ///< new frame position + Ncv32f *pSrcFrame0; ///< frame 0 + Ncv32f *pSrcFrame1; ///< frame 1 + Ncv32f *pFU; ///< forward horizontal displacement + Ncv32f *pFV; ///< forward vertical displacement + Ncv32f *pBU; ///< backward horizontal displacement + Ncv32f *pBV; ///< backward vertical displacement + Ncv32f *pNewFrame; ///< new frame + Ncv32f *ppBuffers[6]; ///< temporary buffers +}; + + +/** Size of a buffer required for interpolation. + * + * Requires several such buffers. See \see NppStInterpolationState. 
+ * + * \param srcSize [IN] Frame size (both frames must be of the same size) + * \param nStep [IN] Frame line step + * \param hpSize [OUT] Where to store computed size (host memory) + * + * \return NCV status code + */ +NCV_EXPORTS +NCVStatus nppiStGetInterpolationBufferSize(NcvSize32u srcSize, + Ncv32u nStep, + Ncv32u *hpSize); + + +/** Interpolate frames (images) using provided optical flow (displacement field). + * 32-bit floating point images, single channel + * + * \param pState [IN] structure containing all required parameters (host memory) + * + * \return NCV status code + */ +NCV_EXPORTS +NCVStatus nppiStInterpolateFrames(const NppStInterpolationState *pState); + + +/** Row linear filter. 32-bit floating point image, single channel + * + * Apply horizontal linear filter + * + * \param pSrc [IN] Source image pointer (CUDA device memory) + * \param srcSize [IN] Source image size + * \param nSrcStep [IN] Source image line step + * \param pDst [OUT] Destination image pointer (CUDA device memory) + * \param dstSize [OUT] Destination image size + * \param oROI [IN] Region of interest in the source image + * \param borderType [IN] Type of border + * \param pKernel [IN] Pointer to row kernel values (CUDA device memory) + * \param nKernelSize [IN] Size of the kernel in pixels + * \param nAnchor [IN] The kernel row alignment with respect to the position of the input pixel + * \param multiplier [IN] Value by which the computed result is multiplied + * + * \return NCV status code + */ +NCV_EXPORTS +NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc, + NcvSize32u srcSize, + Ncv32u nSrcStep, + Ncv32f *pDst, + NcvSize32u dstSize, + Ncv32u nDstStep, + NcvRect32u oROI, + NppStBorderType borderType, + const Ncv32f *pKernel, + Ncv32s nKernelSize, + Ncv32s nAnchor, + Ncv32f multiplier); + + +/** Column linear filter. 32-bit floating point image, single channel + * + * Apply vertical linear filter + * + * \param pSrc [IN] Source image pointer (CUDA device memory) + * \param srcSize [IN] Source image size + * \param nSrcStep [IN] Source image line step + * \param pDst [OUT] Destination image pointer (CUDA device memory) + * \param dstSize [OUT] Destination image size + * \param oROI [IN] Region of interest in the source image + * \param borderType [IN] Type of border + * \param pKernel [IN] Pointer to column kernel values (CUDA device memory) + * \param nKernelSize [IN] Size of the kernel in pixels + * \param nAnchor [IN] The kernel column alignment with respect to the position of the input pixel + * \param multiplier [IN] Value by which the computed result is multiplied + * + * \return NCV status code + */ +NCV_EXPORTS +NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc, + NcvSize32u srcSize, + Ncv32u nSrcStep, + Ncv32f *pDst, + NcvSize32u dstSize, + Ncv32u nDstStep, + NcvRect32u oROI, + NppStBorderType borderType, + const Ncv32f *pKernel, + Ncv32s nKernelSize, + Ncv32s nAnchor, + Ncv32f multiplier); + + +/** Size of buffer required for vector image warping. + * + * \param srcSize [IN] Source image size + * \param nStep [IN] Source image line step + * \param hpSize [OUT] Where to store computed size (host memory) + * + * \return NCV status code + */ +NCV_EXPORTS +NCVStatus nppiStVectorWarpGetBufferSize(NcvSize32u srcSize, + Ncv32u nSrcStep, + Ncv32u *hpSize); + + +/** Warp image using provided 2D vector field and 1x1 point spread function. 
+ * 32-bit floating point image, single channel + * + * During warping pixels from the source image may fall between pixels of the destination image. + * PSF (point spread function) describes how the source image pixel affects pixels of the destination. + * For 1x1 PSF only single pixel with the largest intersection is affected (similar to nearest interpolation). + * + * Destination image size and line step must be the same as the source image size and line step + * + * \param pSrc [IN] Source image pointer (CUDA device memory) + * \param srcSize [IN] Source image size + * \param nSrcStep [IN] Source image line step + * \param pU [IN] Pointer to horizontal displacement field (CUDA device memory) + * \param pV [IN] Pointer to vertical displacement field (CUDA device memory) + * \param nVFStep [IN] Displacement field line step + * \param timeScale [IN] Value by which displacement field will be scaled for warping + * \param pDst [OUT] Destination image pointer (CUDA device memory) + * + * \return NCV status code + */ +NCV_EXPORTS +NCVStatus nppiStVectorWarp_PSF1x1_32f_C1(const Ncv32f *pSrc, + NcvSize32u srcSize, + Ncv32u nSrcStep, + const Ncv32f *pU, + const Ncv32f *pV, + Ncv32u nVFStep, + Ncv32f timeScale, + Ncv32f *pDst); + + +/** Warp image using provided 2D vector field and 2x2 point spread function. + * 32-bit floating point image, single channel + * + * During warping pixels from the source image may fall between pixels of the destination image. + * PSF (point spread function) describes how the source image pixel affects pixels of the destination. + * For 2x2 PSF all four intersected pixels will be affected. + * + * Destination image size and line step must be the same as the source image size and line step + * + * \param pSrc [IN] Source image pointer (CUDA device memory) + * \param srcSize [IN] Source image size + * \param nSrcStep [IN] Source image line step + * \param pU [IN] Pointer to horizontal displacement field (CUDA device memory) + * \param pV [IN] Pointer to vertical displacement field (CUDA device memory) + * \param nVFStep [IN] Displacement field line step + * \param timeScale [IN] Value by which displacement field will be scaled for warping + * \param pDst [OUT] Destination image pointer (CUDA device memory) + * + * \return NCV status code + */ +NCV_EXPORTS +NCVStatus nppiStVectorWarp_PSF2x2_32f_C1(const Ncv32f *pSrc, + NcvSize32u srcSize, + Ncv32u nSrcStep, + const Ncv32f *pU, + const Ncv32f *pV, + Ncv32u nVFStep, + Ncv32f *pBuffer, + Ncv32f timeScale, + Ncv32f *pDst); + + +/** Resize. 
32-bit floating point image, single channel + * + * Resizes image using specified filter (interpolation type) + * + * \param pSrc [IN] Source image pointer (CUDA device memory) + * \param srcSize [IN] Source image size + * \param nSrcStep [IN] Source image line step + * \param srcROI [IN] Source image region of interest + * \param pDst [OUT] Destination image pointer (CUDA device memory) + * \param dstSize [IN] Destination image size + * \param nDstStep [IN] Destination image line step + * \param dstROI [IN] Destination image region of interest + * \param xFactor [IN] Row scale factor + * \param yFactor [IN] Column scale factor + * \param interpolation [IN] Interpolation type + * + * \return NCV status code + */ +NCV_EXPORTS +NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc, + NcvSize32u srcSize, + Ncv32u nSrcStep, + NcvRect32u srcROI, + Ncv32f *pDst, + NcvSize32u dstSize, + Ncv32u nDstStep, + NcvRect32u dstROI, + Ncv32f xFactor, + Ncv32f yFactor, + NppStInterpMode interpolation); + + /** * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit unsigned pixels, single channel. * diff --git a/samples/gpu/opticalflow_nvidia_api.cpp b/samples/gpu/opticalflow_nvidia_api.cpp new file mode 100644 index 0000000000..5ae038e5f6 --- /dev/null +++ b/samples/gpu/opticalflow_nvidia_api.cpp @@ -0,0 +1,639 @@ +#if _MSC_VER >= 1400 +#pragma warning( disable : 4201 4408 4127 4100) +#endif + +#include +#include +#include +#include +#include + +#include "cvconfig.h" +#include +#include +#include "opencv2/opencv.hpp" +#include "opencv2/gpu/gpu.hpp" + +#ifdef HAVE_CUDA +#include "NPP_staging/NPP_staging.hpp" +#include "NCVBroxOpticalFlow.hpp" +#endif + +#if !defined(HAVE_CUDA) +int main( int argc, const char** argv ) +{ + cout << "Please compile the library with CUDA support" << endl; + return -1; +} +#else + +using std::tr1::shared_ptr; + +#define PARAM_INPUT "--input" +#define PARAM_SCALE "--scale" +#define PARAM_ALPHA "--alpha" +#define PARAM_GAMMA "--gamma" +#define PARAM_INNER "--inner" +#define PARAM_OUTER "--outer" +#define PARAM_SOLVER "--solver" +#define PARAM_TIME_STEP "--time-step" +#define PARAM_HELP "--help" + +shared_ptr g_pGPUMemAllocator; +shared_ptr g_pHostMemAllocator; + +class RgbToMonochrome +{ +public: + float operator ()(unsigned char b, unsigned char g, unsigned char r) + { + float _r = static_cast(r)/255.0f; + float _g = static_cast(g)/255.0f; + float _b = static_cast(b)/255.0f; + return (_r + _g + _b)/3.0f; + } +}; + +class RgbToR +{ +public: + float operator ()(unsigned char b, unsigned char g, unsigned char r) + { + return static_cast(r)/255.0f; + } +}; + + +class RgbToG +{ +public: + float operator ()(unsigned char b, unsigned char g, unsigned char r) + { + return static_cast(g)/255.0f; + } +}; + +class RgbToB +{ +public: + float operator ()(unsigned char b, unsigned char g, unsigned char r) + { + return static_cast(b)/255.0f; + } +}; + +template +NCVStatus CopyData(IplImage *image, shared_ptr> &dst) +{ + dst = shared_ptr> (new NCVMatrixAlloc (*g_pHostMemAllocator, image->width, image->height)); + ncvAssertReturn (dst->isMemAllocated (), NCV_ALLOCATOR_BAD_ALLOC); + + unsigned char *row = reinterpret_cast (image->imageData); + T convert; + for (int i = 0; i < image->height; ++i) + { + for (int j = 0; j < image->width; ++j) + { + if (image->nChannels < 3) + { + dst->ptr ()[j + i*dst->stride ()] = static_cast (*(row + j*image->nChannels))/255.0f; + } + else + { + unsigned char *color = row + j * image->nChannels; + dst->ptr ()[j +i*dst->stride ()] = convert 
(color[0], color[1], color[2]); + } + } + row += image->widthStep; + } + return NCV_SUCCESS; +} + +template +NCVStatus CopyData(const IplImage *image, const NCVMatrixAlloc &dst) +{ + unsigned char *row = reinterpret_cast (image->imageData); + T convert; + for (int i = 0; i < image->height; ++i) + { + for (int j = 0; j < image->width; ++j) + { + if (image->nChannels < 3) + { + dst.ptr ()[j + i*dst.stride ()] = static_cast(*(row + j*image->nChannels))/255.0f; + } + else + { + unsigned char *color = row + j * image->nChannels; + dst.ptr ()[j +i*dst.stride()] = convert (color[0], color[1], color[2]); + } + } + row += image->widthStep; + } + return NCV_SUCCESS; +} + +NCVStatus LoadImages (const char *frame0Name, + const char *frame1Name, + int &width, + int &height, + shared_ptr> &src, + shared_ptr> &dst, + IplImage *&firstFrame, + IplImage *&lastFrame) +{ + IplImage *image; + image = cvLoadImage (frame0Name); + if (image == 0) + { + std::cout << "Could not open '" << frame0Name << "'\n"; + return NCV_FILE_ERROR; + } + + firstFrame = image; + // copy data to src + ncvAssertReturnNcvStat (CopyData (image, src)); + + IplImage *image2; + image2 = cvLoadImage (frame1Name); + if (image2 == 0) + { + std::cout << "Could not open '" << frame1Name << "'\n"; + return NCV_FILE_ERROR; + } + lastFrame = image2; + + ncvAssertReturnNcvStat (CopyData (image2, dst)); + + width = image->width; + height = image->height; + + return NCV_SUCCESS; +} + +template +inline T Clamp (T x, T a, T b) +{ + return ((x) > (a) ? ((x) < (b) ? (x) : (b)) : (a)); +} + +template +inline T MapValue (T x, T a, T b, T c, T d) +{ + x = Clamp (x, a, b); + return c + (d - c) * (x - a) / (b - a); +} + +NCVStatus ShowFlow (NCVMatrixAlloc &u, NCVMatrixAlloc &v, const char *name) +{ + IplImage *flowField; + + NCVMatrixAlloc host_u(*g_pHostMemAllocator, u.width(), u.height()); + ncvAssertReturn(host_u.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); + + NCVMatrixAlloc host_v (*g_pHostMemAllocator, u.width (), u.height ()); + ncvAssertReturn (host_v.isMemAllocated (), NCV_ALLOCATOR_BAD_ALLOC); + + ncvAssertReturnNcvStat (u.copySolid (host_u, 0)); + ncvAssertReturnNcvStat (v.copySolid (host_v, 0)); + + float *ptr_u = host_u.ptr (); + float *ptr_v = host_v.ptr (); + + float maxDisplacement = 1.0f; + + for (Ncv32u i = 0; i < u.height (); ++i) + { + for (Ncv32u j = 0; j < u.width (); ++j) + { + float d = std::max ( fabsf(*ptr_u), fabsf(*ptr_v) ); + if (d > maxDisplacement) maxDisplacement = d; + ++ptr_u; + ++ptr_v; + } + ptr_u += u.stride () - u.width (); + ptr_v += v.stride () - v.width (); + } + + CvSize image_size = cvSize (u.width (), u.height ()); + flowField = cvCreateImage (image_size, IPL_DEPTH_8U, 4); + if (flowField == 0) return NCV_NULL_PTR; + + unsigned char *row = reinterpret_cast (flowField->imageData); + + ptr_u = host_u.ptr(); + ptr_v = host_v.ptr(); + for (int i = 0; i < flowField->height; ++i) + { + for (int j = 0; j < flowField->width; ++j) + { + (row + j * flowField->nChannels)[0] = 0; + (row + j * flowField->nChannels)[1] = static_cast (MapValue (-(*ptr_v), -maxDisplacement, maxDisplacement, 0.0f, 255.0f)); + (row + j * flowField->nChannels)[2] = static_cast (MapValue (*ptr_u , -maxDisplacement, maxDisplacement, 0.0f, 255.0f)); + (row + j * flowField->nChannels)[3] = 255; + ++ptr_u; + ++ptr_v; + } + row += flowField->widthStep; + ptr_u += u.stride () - u.width (); + ptr_v += v.stride () - v.width (); + } + + cvShowImage (name, flowField); + + return NCV_SUCCESS; +} + +IplImage *CreateImage (NCVMatrixAlloc &h_r, NCVMatrixAlloc &h_g, 
NCVMatrixAlloc &h_b) +{ + CvSize imageSize = cvSize (h_r.width (), h_r.height ()); + IplImage *image = cvCreateImage (imageSize, IPL_DEPTH_8U, 4); + if (image == 0) return 0; + + unsigned char *row = reinterpret_cast (image->imageData); + + for (int i = 0; i < image->height; ++i) + { + for (int j = 0; j < image->width; ++j) + { + int offset = j * image->nChannels; + int pos = i * h_r.stride () + j; + row[offset + 0] = static_cast (h_b.ptr ()[pos] * 255.0f); + row[offset + 1] = static_cast (h_g.ptr ()[pos] * 255.0f); + row[offset + 2] = static_cast (h_r.ptr ()[pos] * 255.0f); + row[offset + 3] = 255; + } + row += image->widthStep; + } + return image; +} + +void PrintHelp () +{ + std::cout << "Usage help:\n"; + std::cout << std::setiosflags(std::ios::left); + std::cout << "\t" << std::setw(15) << PARAM_ALPHA << " - set alpha\n"; + std::cout << "\t" << std::setw(15) << PARAM_GAMMA << " - set gamma\n"; + std::cout << "\t" << std::setw(15) << PARAM_INNER << " - set number of inner iterations\n"; + std::cout << "\t" << std::setw(15) << PARAM_INPUT << " - specify input file names (2 image files)\n"; + std::cout << "\t" << std::setw(15) << PARAM_OUTER << " - set number of outer iterations\n"; + std::cout << "\t" << std::setw(15) << PARAM_SCALE << " - set pyramid scale factor\n"; + std::cout << "\t" << std::setw(15) << PARAM_SOLVER << " - set number of basic solver iterations\n"; + std::cout << "\t" << std::setw(15) << PARAM_TIME_STEP << " - set frame interpolation time step\n"; + std::cout << "\t" << std::setw(15) << PARAM_HELP << " - display this help message\n"; +} + +int ProcessCommandLine(int argc, char **argv, + Ncv32f &timeStep, + char *&frame0Name, + char *&frame1Name, + NCVBroxOpticalFlowDescriptor &desc) +{ + timeStep = 0.25f; + for (int iarg = 1; iarg < argc; ++iarg) + { + if (strcmp(argv[iarg], PARAM_INPUT) == 0) + { + if (iarg + 2 < argc) + { + frame0Name = argv[++iarg]; + frame1Name = argv[++iarg]; + } + else + return -1; + } + else if(strcmp(argv[iarg], PARAM_SCALE) == 0) + { + if (iarg + 1 < argc) + desc.scale_factor = static_cast(atof(argv[++iarg])); + else + return -1; + } + else if(strcmp(argv[iarg], PARAM_ALPHA) == 0) + { + if (iarg + 1 < argc) + desc.alpha = static_cast(atof(argv[++iarg])); + else + return -1; + } + else if(strcmp(argv[iarg], PARAM_GAMMA) == 0) + { + if (iarg + 1 < argc) + desc.gamma = static_cast(atof(argv[++iarg])); + else + return -1; + } + else if(strcmp(argv[iarg], PARAM_INNER) == 0) + { + if (iarg + 1 < argc) + desc.number_of_inner_iterations = static_cast(atoi(argv[++iarg])); + else + return -1; + } + else if(strcmp(argv[iarg], PARAM_OUTER) == 0) + { + if (iarg + 1 < argc) + desc.number_of_outer_iterations = static_cast(atoi(argv[++iarg])); + else + return -1; + } + else if(strcmp(argv[iarg], PARAM_SOLVER) == 0) + { + if (iarg + 1 < argc) + desc.number_of_solver_iterations = static_cast(atoi(argv[++iarg])); + else + return -1; + } + else if(strcmp(argv[iarg], PARAM_TIME_STEP) == 0) + { + if (iarg + 1 < argc) + timeStep = static_cast(atof(argv[++iarg])); + else + return -1; + } + else if(strcmp(argv[iarg], PARAM_HELP) == 0) + { + PrintHelp (); + return 0; + } + } + return 0; +} + + +int main(int argc, char **argv) +{ + char *frame0Name = 0, *frame1Name = 0; + Ncv32f timeStep = 0.01f; + + NCVBroxOpticalFlowDescriptor desc; + + desc.alpha = 0.197f; + desc.gamma = 50.0f; + desc.number_of_inner_iterations = 10; + desc.number_of_outer_iterations = 77; + desc.number_of_solver_iterations = 10; + desc.scale_factor = 0.8f; + + int result = ProcessCommandLine 
(argc, argv, timeStep, frame0Name, frame1Name, desc); + if (argc == 1 || result) + { + PrintHelp(); + return result; + } + + std::cout << "OpenCV / NVIDIA Computer Vision\n"; + std::cout << "Optical Flow Demo: Frame Interpolation\n"; + std::cout << "=========================================\n"; + std::cout << "Press:\n ESC to quit\n 'a' to move to the previous frame\n 's' to move to the next frame\n"; + + int devId; + ncvAssertCUDAReturn(cudaGetDevice(&devId), -1); + cudaDeviceProp devProp; + ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), -1); + std::cout << "Using GPU: " << devId << "(" << devProp.name << + "), arch=" << devProp.major << "." << devProp.minor << std::endl; + + g_pGPUMemAllocator = shared_ptr (new NCVMemNativeAllocator (NCVMemoryTypeDevice, devProp.textureAlignment)); + ncvAssertPrintReturn (g_pGPUMemAllocator->isInitialized (), "Device memory allocator isn't initialized", -1); + + g_pHostMemAllocator = shared_ptr (new NCVMemNativeAllocator (NCVMemoryTypeHostPageable, devProp.textureAlignment)); + ncvAssertPrintReturn (g_pHostMemAllocator->isInitialized (), "Host memory allocator isn't initialized", -1); + + int width, height; + + shared_ptr> src_host; + shared_ptr> dst_host; + + IplImage *firstFrame, *lastFrame; + if (frame0Name != 0 && frame1Name != 0) + { + ncvAssertReturnNcvStat (LoadImages (frame0Name, frame1Name, width, height, src_host, dst_host, firstFrame, lastFrame)); + } + else + { + ncvAssertReturnNcvStat (LoadImages ("frame10.bmp", "frame11.bmp", width, height, src_host, dst_host, firstFrame, lastFrame)); + } + + shared_ptr> src (new NCVMatrixAlloc (*g_pGPUMemAllocator, src_host->width (), src_host->height ())); + ncvAssertReturn(src->isMemAllocated(), -1); + + shared_ptr> dst (new NCVMatrixAlloc (*g_pGPUMemAllocator, src_host->width (), src_host->height ())); + ncvAssertReturn (dst->isMemAllocated (), -1); + + ncvAssertReturnNcvStat (src_host->copySolid ( *src, 0 )); + ncvAssertReturnNcvStat (dst_host->copySolid ( *dst, 0 )); + +#if defined SAFE_MAT_DECL +#undef SAFE_MAT_DECL +#endif +#define SAFE_MAT_DECL(name, allocator, sx, sy) \ + NCVMatrixAlloc name(*allocator, sx, sy);\ + ncvAssertReturn(name##.isMemAllocated(), -1); + + SAFE_MAT_DECL (u, g_pGPUMemAllocator, width, height); + SAFE_MAT_DECL (v, g_pGPUMemAllocator, width, height); + + SAFE_MAT_DECL (uBck, g_pGPUMemAllocator, width, height); + SAFE_MAT_DECL (vBck, g_pGPUMemAllocator, width, height); + + SAFE_MAT_DECL (h_r, g_pHostMemAllocator, width, height); + SAFE_MAT_DECL (h_g, g_pHostMemAllocator, width, height); + SAFE_MAT_DECL (h_b, g_pHostMemAllocator, width, height); + + std::cout << "Estimating optical flow\nForward...\n"; + + if (NCV_SUCCESS != NCVBroxOpticalFlow (desc, *g_pGPUMemAllocator, *src, *dst, u, v, 0)) + { + std::cout << "Failed\n"; + return -1; + } + + std::cout << "Backward...\n"; + if (NCV_SUCCESS != NCVBroxOpticalFlow (desc, *g_pGPUMemAllocator, *dst, *src, uBck, vBck, 0)) + { + std::cout << "Failed\n"; + return -1; + } + + // matrix for temporary data + SAFE_MAT_DECL (d_temp, g_pGPUMemAllocator, width, height); + + // first frame color components (GPU memory) + SAFE_MAT_DECL (d_r, g_pGPUMemAllocator, width, height); + SAFE_MAT_DECL (d_g, g_pGPUMemAllocator, width, height); + SAFE_MAT_DECL (d_b, g_pGPUMemAllocator, width, height); + + // second frame color components (GPU memory) + SAFE_MAT_DECL (d_rt, g_pGPUMemAllocator, width, height); + SAFE_MAT_DECL (d_gt, g_pGPUMemAllocator, width, height); + SAFE_MAT_DECL (d_bt, g_pGPUMemAllocator, width, height); + + // 
intermediate frame color components (GPU memory) + SAFE_MAT_DECL (d_rNew, g_pGPUMemAllocator, width, height); + SAFE_MAT_DECL (d_gNew, g_pGPUMemAllocator, width, height); + SAFE_MAT_DECL (d_bNew, g_pGPUMemAllocator, width, height); + + // interpolated forward flow + SAFE_MAT_DECL (ui, g_pGPUMemAllocator, width, height); + SAFE_MAT_DECL (vi, g_pGPUMemAllocator, width, height); + + // interpolated backward flow + SAFE_MAT_DECL (ubi, g_pGPUMemAllocator, width, height); + SAFE_MAT_DECL (vbi, g_pGPUMemAllocator, width, height); + + // occlusion masks + SAFE_MAT_DECL (occ0, g_pGPUMemAllocator, width, height); + SAFE_MAT_DECL (occ1, g_pGPUMemAllocator, width, height); + + // prepare color components on host and copy them to device memory + ncvAssertReturnNcvStat (CopyData (firstFrame, h_r)); + ncvAssertReturnNcvStat (CopyData (firstFrame, h_g)); + ncvAssertReturnNcvStat (CopyData (firstFrame, h_b)); + + ncvAssertReturnNcvStat (h_r.copySolid ( d_r, 0 )); + ncvAssertReturnNcvStat (h_g.copySolid ( d_g, 0 )); + ncvAssertReturnNcvStat (h_b.copySolid ( d_b, 0 )); + + ncvAssertReturnNcvStat (CopyData (lastFrame, h_r)); + ncvAssertReturnNcvStat (CopyData (lastFrame, h_g)); + ncvAssertReturnNcvStat (CopyData (lastFrame, h_b)); + + ncvAssertReturnNcvStat (h_r.copySolid ( d_rt, 0 )); + ncvAssertReturnNcvStat (h_g.copySolid ( d_gt, 0 )); + ncvAssertReturnNcvStat (h_b.copySolid ( d_bt, 0 )); + + std::cout << "Interpolating...\n"; + std::cout.precision (4); + + std::vector frames; + frames.push_back (firstFrame); + + // compute interpolated frames + for (Ncv32f timePos = timeStep; timePos < 1.0f; timePos += timeStep) + { + ncvAssertCUDAReturn (cudaMemset (ui.ptr (), 0, ui.pitch () * ui.height ()), NCV_CUDA_ERROR); + ncvAssertCUDAReturn (cudaMemset (vi.ptr (), 0, vi.pitch () * vi.height ()), NCV_CUDA_ERROR); + + ncvAssertCUDAReturn (cudaMemset (ubi.ptr (), 0, ubi.pitch () * ubi.height ()), NCV_CUDA_ERROR); + ncvAssertCUDAReturn (cudaMemset (vbi.ptr (), 0, vbi.pitch () * vbi.height ()), NCV_CUDA_ERROR); + + ncvAssertCUDAReturn (cudaMemset (occ0.ptr (), 0, occ0.pitch () * occ0.height ()), NCV_CUDA_ERROR); + ncvAssertCUDAReturn (cudaMemset (occ1.ptr (), 0, occ1.pitch () * occ1.height ()), NCV_CUDA_ERROR); + + NppStInterpolationState state; + // interpolation state should be filled once except pSrcFrame0, pSrcFrame1, and pNewFrame + // we will only need to reset buffers content to 0 since interpolator doesn't do this itself + state.size = NcvSize32u (width, height); + state.nStep = d_r.pitch (); + state.pSrcFrame0 = d_r.ptr (); + state.pSrcFrame1 = d_rt.ptr (); + state.pFU = u.ptr (); + state.pFV = v.ptr (); + state.pBU = uBck.ptr (); + state.pBV = vBck.ptr (); + state.pos = timePos; + state.pNewFrame = d_rNew.ptr (); + state.ppBuffers[0] = occ0.ptr (); + state.ppBuffers[1] = occ1.ptr (); + state.ppBuffers[2] = ui.ptr (); + state.ppBuffers[3] = vi.ptr (); + state.ppBuffers[4] = ubi.ptr (); + state.ppBuffers[5] = vbi.ptr (); + + // interpolate red channel + nppiStInterpolateFrames (&state); + + // reset buffers + ncvAssertCUDAReturn (cudaMemset (ui.ptr (), 0, ui.pitch () * ui.height ()), NCV_CUDA_ERROR); + ncvAssertCUDAReturn (cudaMemset (vi.ptr (), 0, vi.pitch () * vi.height ()), NCV_CUDA_ERROR); + + ncvAssertCUDAReturn (cudaMemset (ubi.ptr (), 0, ubi.pitch () * ubi.height ()), NCV_CUDA_ERROR); + ncvAssertCUDAReturn (cudaMemset (vbi.ptr (), 0, vbi.pitch () * vbi.height ()), NCV_CUDA_ERROR); + + ncvAssertCUDAReturn (cudaMemset (occ0.ptr (), 0, occ0.pitch () * occ0.height ()), NCV_CUDA_ERROR); + ncvAssertCUDAReturn 
(cudaMemset (occ1.ptr (), 0, occ1.pitch () * occ1.height ()), NCV_CUDA_ERROR); + + // interpolate green channel + state.pSrcFrame0 = d_g.ptr (); + state.pSrcFrame1 = d_gt.ptr (); + state.pNewFrame = d_gNew.ptr (); + + nppiStInterpolateFrames (&state); + + // reset buffers + ncvAssertCUDAReturn (cudaMemset (ui.ptr (), 0, ui.pitch () * ui.height ()), NCV_CUDA_ERROR); + ncvAssertCUDAReturn (cudaMemset (vi.ptr (), 0, vi.pitch () * vi.height ()), NCV_CUDA_ERROR); + + ncvAssertCUDAReturn (cudaMemset (ubi.ptr (), 0, ubi.pitch () * ubi.height ()), NCV_CUDA_ERROR); + ncvAssertCUDAReturn (cudaMemset (vbi.ptr (), 0, vbi.pitch () * vbi.height ()), NCV_CUDA_ERROR); + + ncvAssertCUDAReturn (cudaMemset (occ0.ptr (), 0, occ0.pitch () * occ0.height ()), NCV_CUDA_ERROR); + ncvAssertCUDAReturn (cudaMemset (occ1.ptr (), 0, occ1.pitch () * occ1.height ()), NCV_CUDA_ERROR); + + // interpolate blue channel + state.pSrcFrame0 = d_b.ptr (); + state.pSrcFrame1 = d_bt.ptr (); + state.pNewFrame = d_bNew.ptr (); + + nppiStInterpolateFrames (&state); + + // copy to host memory + ncvAssertReturnNcvStat (d_rNew.copySolid (h_r, 0)); + ncvAssertReturnNcvStat (d_gNew.copySolid (h_g, 0)); + ncvAssertReturnNcvStat (d_bNew.copySolid (h_b, 0)); + + // convert to IplImage + IplImage *newFrame = CreateImage (h_r, h_g, h_b); + if (newFrame == 0) + { + std::cout << "Could not create new frame in host memory\n"; + break; + } + frames.push_back (newFrame); + std::cout << timePos * 100.0f << "%\r"; + } + std::cout << std::setw (5) << "100%\n"; + + frames.push_back (lastFrame); + + Ncv32u currentFrame; + currentFrame = 0; + + ShowFlow (u, v, "Forward flow"); + ShowFlow (uBck, vBck, "Backward flow"); + + cvShowImage ("Interpolated frame", frames[currentFrame]); + + bool qPressed = false; + while ( !qPressed ) + { + int key = toupper (cvWaitKey (10)); + switch (key) + { + case 27: + qPressed = true; + break; + case 'A': + if (currentFrame > 0) --currentFrame; + cvShowImage ("Interpolated frame", frames[currentFrame]); + break; + case 'S': + if (currentFrame < frames.size()-1) ++currentFrame; + cvShowImage ("Interpolated frame", frames[currentFrame]); + break; + } + } + + cvDestroyAllWindows (); + + std::vector::iterator iter; + for (iter = frames.begin (); iter != frames.end (); ++iter) + { + cvReleaseImage (&(*iter)); + } + + return 0; +} + +#endif diff --git a/samples/gpu/rubberwhale1.png b/samples/gpu/rubberwhale1.png new file mode 100644 index 0000000000..e2b08dba29 Binary files /dev/null and b/samples/gpu/rubberwhale1.png differ diff --git a/samples/gpu/rubberwhale2.png b/samples/gpu/rubberwhale2.png new file mode 100644 index 0000000000..f365370a4c Binary files /dev/null and b/samples/gpu/rubberwhale2.png differ
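Usage note (not part of the patch): the new row/column border filters are separable 1-D convolutions whose kernel taps must reside in CUDA device memory, and whose anchor enters the kernels as the offset p = nKernelSize - nAnchor - 1; only the mirror border mode is implemented in this revision, the other enum values currently return NPPST_ERROR. The sketch below chains the two passes into a 5-tap binomial smoothing of a single-channel float image. The helper name, the kernel taps, and the shared byte step are assumptions made for illustration only.

#include <cuda_runtime.h>
#include "NPP_staging/NPP_staging.hpp"

// Separable smoothing sketch: horizontal pass into d_tmp, vertical pass into d_dst.
// d_src, d_tmp and d_dst are device images of dimensions 'size' sharing the byte step 'step'.
NCVStatus smoothImage(const Ncv32f *d_src, Ncv32f *d_tmp, Ncv32f *d_dst,
                      NcvSize32u size, Ncv32u step)
{
    // normalized 5-tap binomial kernel, copied to device memory as the API requires
    const Ncv32f h_kernel[5] = {1.0f/16, 4.0f/16, 6.0f/16, 4.0f/16, 1.0f/16};
    Ncv32f *d_kernel = 0;
    ncvAssertCUDAReturn(cudaMalloc((void **)&d_kernel, sizeof(h_kernel)), NPPST_ERROR);
    ncvAssertCUDAReturn(cudaMemcpy(d_kernel, h_kernel, sizeof(h_kernel),
                                   cudaMemcpyHostToDevice), NPPST_ERROR);

    // full-image ROI; anchor 2 centers the 5-tap kernel on the input pixel
    NcvRect32u roi;
    roi.x = 0; roi.y = 0; roi.width = size.width; roi.height = size.height;

    NCVStatus stat = nppiStFilterRowBorder_32f_C1R(d_src, size, step, d_tmp, size, step,
                                                   roi, nppStBorderMirror, d_kernel, 5, 2, 1.0f);
    if (stat == NPPST_SUCCESS)
    {
        stat = nppiStFilterColumnBorder_32f_C1R(d_tmp, size, step, d_dst, size, step,
                                                roi, nppStBorderMirror, d_kernel, 5, 2, 1.0f);
    }

    cudaFree(d_kernel);
    return stat;
}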
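nppiStVectorWarp_PSF2x2_32f_C1 scatters each source pixel into the four destination pixels covered by its displaced 2x2 footprint (weighted atomic adds into the destination and into a weight buffer) and then divides by the accumulated weights. The caller therefore has to supply a scratch buffer sized by nppiStVectorWarpGetBufferSize and to clear the destination beforehand, because the kernels only accumulate into it (the interpolation sample above resets its flow buffers for the same reason). A minimal host-side sketch under those assumptions; the wrapper name and the reuse of a single byte step for image and flow are illustrative only.

#include <cuda_runtime.h>
#include "NPP_staging/NPP_staging.hpp"

// Forward-warp a single-channel float frame along its flow field, scaled by t in [0, 1].
// d_src, d_u, d_v and d_dst are device buffers of dimensions 'size' sharing the byte step 'step'.
NCVStatus warpForward(const Ncv32f *d_src, const Ncv32f *d_u, const Ncv32f *d_v,
                      NcvSize32u size, Ncv32u step, Ncv32f t, Ncv32f *d_dst)
{
    // size in bytes of the weight buffer; it also equals the size of one image plane
    Ncv32u bufSize = 0;
    ncvAssertReturnNcvStat(nppiStVectorWarpGetBufferSize(size, step, &bufSize));

    // the warp accumulates into d_dst with atomic adds, so it must start from zero
    ncvAssertCUDAReturn(cudaMemset(d_dst, 0, bufSize), NPPST_ERROR);

    // scratch buffer that receives the per-pixel sum of PSF weights
    Ncv32f *d_weights = 0;
    ncvAssertCUDAReturn(cudaMalloc((void **)&d_weights, bufSize), NPPST_ERROR);

    NCVStatus stat = nppiStVectorWarp_PSF2x2_32f_C1(d_src, size, step,
                                                    d_u, d_v, step,
                                                    d_weights, t, d_dst);
    cudaFree(d_weights);
    return stat;
}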
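For nppiStResize_32f_C1R the xFactor/yFactor arguments follow a destination-over-source convention: the kernels receive 1/xFactor as the source step per destination pixel, and the supersampling mode is meant for downscaling only, per the new header. A short hedged sketch of a 2x downscale over full-image ROIs; the helper name is an assumption, and dstSize is expected to be about half of srcSize.

#include "NPP_staging/NPP_staging.hpp"

// Downscale a single-channel float image to roughly half size using supersampling.
// d_src and d_dst are device images; srcStep and dstStep are their row steps in bytes.
NCVStatus downscaleHalf(const Ncv32f *d_src, NcvSize32u srcSize, Ncv32u srcStep,
                        Ncv32f *d_dst, NcvSize32u dstSize, Ncv32u dstStep)
{
    NcvRect32u srcROI;
    srcROI.x = 0; srcROI.y = 0; srcROI.width = srcSize.width; srcROI.height = srcSize.height;

    NcvRect32u dstROI;
    dstROI.x = 0; dstROI.y = 0; dstROI.width = dstSize.width; dstROI.height = dstSize.height;

    // factor 0.5 = dst/src, so each destination pixel integrates a correspondingly larger source window
    return nppiStResize_32f_C1R(d_src, srcSize, srcStep, srcROI,
                                d_dst, dstSize, dstStep, dstROI,
                                0.5f, 0.5f, nppStSupersample);
}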