@@ -45,9 +45,9 @@
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
namespace cv { namespace gpu { namespace device
{
namespace matrix_reductions
{
// Performs reduction in shared memory
template <int size, typename T>
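// A minimal sketch of such a tree reduction (illustrative only; the original
// body is elided here). It assumes `size` is a power of two equal to the
// block size: the active threads are halved each step, and the last warp
// finishes without __syncthreads() since it executes in lockstep on these
// devices.
template <int size, typename T>
__device__ void sumInSmemSketch(volatile T* data, const uint tid)
{
    T sum = data[tid];
    if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); }
    if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); }
    if (size >= 128) { if (tid <  64) { data[tid] = sum = sum + data[tid +  64]; } __syncthreads(); }
    if (tid < 32)
    {
        if (size >= 64) data[tid] = sum = sum + data[tid + 32];
        if (size >= 32) data[tid] = sum = sum + data[tid + 16];
        if (size >= 16) data[tid] = sum = sum + data[tid +  8];
        if (size >=  8) data[tid] = sum = sum + data[tid +  4];
        if (size >=  4) data[tid] = sum = sum + data[tid +  2];
        if (size >=  2) data[tid] = sum = sum + data[tid +  1];
    }
}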
@@ -74,19 +74,19 @@ namespace cv { namespace gpu { namespace device
{
explicit Mask8U(PtrStepb mask): mask(mask) {}
__device__ __forceinline__ bool operator()(int y, int x) const
{
return mask.ptr(y)[x];
}
PtrStepb mask;
};
struct MaskTrue
{
__device__ __forceinline__ bool operator()(int y, int x) const
{
return true;
}
__device__ __forceinline__ MaskTrue(){}
__device__ __forceinline__ MaskTrue(const MaskTrue& mask_){}
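// Usage note: Mask8U and MaskTrue expose the same (y, x) predicate interface,
// so a single kernel template serves both the masked and the unmasked paths,
// and with MaskTrue the test folds to a compile-time constant; see e.g. the
// minMaxLocKernel<256, T, MaskTrue> launch further below.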
@@ -95,7 +95,7 @@ namespace cv { namespace gpu { namespace device
//////////////////////////////////////////////////////////////////////////////
// Min max
// To avoid shared memory bank conflicts we convert each value into a value of
// an appropriate type (32 bits minimum)
template <typename T> struct MinMaxTypeTraits {};
template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
@@ -106,7 +106,7 @@ namespace cv { namespace gpu { namespace device
template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
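// Example: 8-bit data is widened before being staged in shared memory, e.g.
//   typedef MinMaxTypeTraits<uchar>::best_type best_type; // best_type == int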
namespace minmax
{
__constant__ int ctwidth;
__constant__ int ctheight;
@@ -131,19 +131,19 @@ namespace cv { namespace gpu { namespace device
{
dim3 threads, grid;
estimateThreadCfg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * elem_size;
bufrows = 2;
}
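// Illustrative sizing (numbers hypothetical): with an 8x8 block grid and
// 1-byte elements, buf is 64 bytes wide and 2 rows tall: one row collects the
// per-block minima, the other the per-block maxima, mirroring the
// locbuf.ptr(0)/locbuf.ptr(1) split used for locations further below.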
// Computes the device constants used by the kernels for the given thread configuration
void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight)));
}
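// divUp rounds the quotient up: divUp(a, b) == (a + b - 1) / b. Worked
// example (numbers hypothetical): for cols = 1000, grid.x = 8, threads.x = 32,
// twidth = divUp(divUp(1000, 8), 32) = divUp(125, 32) = 4, i.e. each thread
// strides over at most 4 columns.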
// Finds the min and max in shared memory
@@ -195,10 +195,10 @@ namespace cv { namespace gpu { namespace device
for (uint x = x0; x < x_end; x += blockDim.x)
{
T val = src_row[x];
if (mask(y, x))
{
mymin = ::min(mymin, val);
mymax = ::max(mymax, val);
}
}
}
@@ -209,7 +209,7 @@ namespace cv { namespace gpu { namespace device
findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);
if (tid == 0)
{
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
@@ -240,7 +240,7 @@ namespace cv { namespace gpu { namespace device
findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
@@ -248,7 +248,7 @@ namespace cv { namespace gpu { namespace device
}
}
#else
if (tid == 0)
{
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
@@ -256,7 +256,7 @@ namespace cv { namespace gpu { namespace device
#endif
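// Sketch of the single-pass finish the #if branch above relies on, assuming
// a __device__ uint blocks_finished zeroed before the launch (the exact body
// is elided here):
//   __shared__ bool is_last;
//   if (tid == 0)
//   {
//       uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
//       is_last = (ticket == gridDim.x * gridDim.y - 1);
//   }
//   __syncthreads();
//   if (is_last) { /* the last block alone folds the per-block partials */ }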
}
template <typename T>
void minMaxMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf)
{
@@ -277,7 +277,7 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
}
template void minMaxMaskCaller<uchar>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
template void minMaxMaskCaller<char>(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
@@ -308,7 +308,7 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
}
template void minMaxCaller<uchar>(const DevMem2Db, double*, double*, PtrStepb);
template void minMaxCaller<char>(const DevMem2Db, double*, double*, PtrStepb);
@@ -325,7 +325,7 @@ namespace cv { namespace gpu { namespace device
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
uint idx = ::min(tid, size - 1);
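// Clamping the index means threads past `size` merely re-read the last
// partial result, which cannot change a min/max reduction.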
@@ -335,7 +335,7 @@ namespace cv { namespace gpu { namespace device
findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
@@ -410,7 +410,7 @@ namespace cv { namespace gpu { namespace device
///////////////////////////////////////////////////////////////////////////////
// minMaxLoc
namespace minmaxloc
{
__constant__ int ctwidth;
__constant__ int ctheight;
@@ -431,7 +431,7 @@ namespace cv { namespace gpu { namespace device
// Returns required buffer sizes
void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols,
int& b1rows, int& b2cols, int& b2rows)
{
dim3 threads, grid;
@@ -445,16 +445,16 @@ namespace cv { namespace gpu { namespace device
// Computes the device constants used by the kernels for the given thread configuration
void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight)));
}
template <typename T>
__device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval,
volatile uint* minloc, volatile uint* maxloc)
{
T val = minval[tid + offset];
@@ -473,7 +473,7 @@ namespace cv { namespace gpu { namespace device
template <int size, typename T>
__device__ void findMinMaxLocInSmem(volatile T* minval, volatile T* maxval, volatile uint* minloc,
volatile uint* maxloc, const uint tid)
{
if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval, minloc, maxloc); } __syncthreads(); }
@@ -493,7 +493,7 @@ namespace cv { namespace gpu { namespace device
template <int nthreads, typename T, typename Mask>
__global__ void minMaxLocKernel(const DevMem2Db src, Mask mask, T* minval, T* maxval,
uint* minloc, uint* maxloc)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
@@ -507,7 +507,7 @@ namespace cv { namespace gpu { namespace device
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
T mymin = numeric_limits<T>::max();
T mymax = numeric_limits<T>::is_signed ? -numeric_limits<T>::max() : numeric_limits<T>::min();
uint myminloc = 0;
uint mymaxloc = 0;
uint y_end = ::min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
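// y_end is one past the last strided row this thread can visit, clamped to
// the image height; the x_end used below is analogous.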
@@ -527,7 +527,7 @@ namespace cv { namespace gpu { namespace device
}
}
sminval[tid] = mymin;
smaxval[tid] = mymax;
sminloc[tid] = myminloc;
smaxloc[tid] = mymaxloc;
@@ -564,7 +564,7 @@ namespace cv { namespace gpu { namespace device
findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
@@ -574,7 +574,7 @@ namespace cv { namespace gpu { namespace device
}
}
#else
if (tid == 0)
{
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
@@ -586,7 +586,7 @@ namespace cv { namespace gpu { namespace device
template <typename T>
void minMaxLocMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)
{
dim3 threads, grid;
@@ -598,7 +598,7 @@ namespace cv { namespace gpu { namespace device
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf,
minloc_buf, maxloc_buf);
cudaSafeCall( cudaGetLastError() );
@@ -627,7 +627,7 @@ namespace cv { namespace gpu { namespace device
template <typename T>
void minMaxLocCaller(const DevMem2Db src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)
{
dim3 threads, grid;
@@ -639,7 +639,7 @@ namespace cv { namespace gpu { namespace device
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf,
minloc_buf, maxloc_buf);
cudaSafeCall( cudaGetLastError() );
@@ -688,7 +688,7 @@ namespace cv { namespace gpu { namespace device
findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
@@ -699,7 +699,7 @@ namespace cv { namespace gpu { namespace device
template <typename T>
void minMaxLocMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)
{
dim3 threads, grid;
@@ -711,7 +711,7 @@ namespace cv { namespace gpu { namespace device
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf,
minloc_buf, maxloc_buf);
cudaSafeCall( cudaGetLastError() );
minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
@@ -741,7 +741,7 @@ namespace cv { namespace gpu { namespace device
template <typename T>
void minMaxLocMultipassCaller(const DevMem2Db src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)
{
dim3 threads, grid;
@@ -753,7 +753,7 @@ namespace cv { namespace gpu { namespace device
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf,
minloc_buf, maxloc_buf);
cudaSafeCall( cudaGetLastError() );
minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
@@ -785,7 +785,7 @@ namespace cv { namespace gpu { namespace device
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// countNonZero
namespace countnonzero
{
__constant__ int ctwidth;
__constant__ int ctheight;
@@ -811,11 +811,11 @@ namespace cv { namespace gpu { namespace device
void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
}
@@ -862,7 +862,7 @@ namespace cv { namespace gpu { namespace device
sumInSmem<nthreads, uint>(scount, tid);
if (tid == 0)
{
count[0] = scount[0];
blocks_finished = 0;
@@ -873,7 +873,7 @@ namespace cv { namespace gpu { namespace device
#endif
}
template <typename T>
int countNonZeroCaller(const DevMem2Db src, PtrStepb buf)
{
@@ -890,9 +890,9 @@ namespace cv { namespace gpu { namespace device
uint count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
return count;
}
template int countNonZeroCaller<uchar>(const DevMem2Db, PtrStepb);
template int countNonZeroCaller<char>(const DevMem2Db, PtrStepb);
@@ -914,7 +914,7 @@ namespace cv { namespace gpu { namespace device
sumInSmem<nthreads, uint>(scount, tid);
if (tid == 0)
count[0] = scount[0];
}
@@ -937,9 +937,9 @@ namespace cv { namespace gpu { namespace device
uint count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
return count;
}
template int countNonZeroMultipassCaller<uchar>(const DevMem2Db, PtrStepb);
template int countNonZeroMultipassCaller<char>(const DevMem2Db, PtrStepb);
@@ -965,16 +965,16 @@ namespace cv { namespace gpu { namespace device
template <> struct SumType<float> { typedef float R; };
template <> struct SumType<double> { typedef double R; };
template <typename R>
struct IdentityOp { static __device__ __forceinline__ R call(R x) { return x; } };
template <typename R>
struct AbsOp { static __device__ __forceinline__ R call(R x) { return ::abs(x); } };
template <>
struct AbsOp<uint> { static __device__ __forceinline__ uint call(uint x) { return x; } };
template <typename R>
struct SqrOp { static __device__ __forceinline__ R call(R x) { return x * x; } };
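// The same sum kernel template yields sum, absSum and sqrSum simply by
// swapping the functor; per element (values illustrative):
//   IdentityOp<int>::call(-3) == -3
//   AbsOp<int>::call(-3)      ==  3
//   SqrOp<int>::call(-3)      ==  9
// AbsOp<uint> is specialized to a no-op because unsigned values need no ::abs.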
__constant__ int ctwidth;
@@ -987,7 +987,7 @@ namespace cv { namespace gpu { namespace device
void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, threads.x * threads.y),
divUp(rows, threads.y * threads.x));
grid.x = std::min(grid.x, threads.x);
grid.y = std::min(grid.y, threads.y);
@@ -1004,11 +1004,11 @@ namespace cv { namespace gpu { namespace device
void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
}
template <typename T, typename R, typename Op, int nthreads>
@@ -1055,7 +1055,7 @@ namespace cv { namespace gpu { namespace device
sumInSmem<nthreads, R>(smem, tid);
if (tid == 0)
{
result[0] = smem[0];
blocks_finished = 0;
@@ -1078,7 +1078,7 @@ namespace cv { namespace gpu { namespace device
sumInSmem<nthreads, R>(smem, tid);
if (tid == 0)
result[0] = smem[0];
}
@@ -1142,7 +1142,7 @@ namespace cv { namespace gpu { namespace device
sumInSmem<nthreads, R>(smem, tid);
sumInSmem<nthreads, R>(smem + nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
@@ -1151,7 +1151,7 @@ namespace cv { namespace gpu { namespace device
}
}
#else
if (tid == 0)
{
DstType res;
res.x = smem[0];
@@ -1179,7 +1179,7 @@ namespace cv { namespace gpu { namespace device
sumInSmem<nthreads, R>(smem, tid);
sumInSmem<nthreads, R>(smem + nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
@@ -1252,7 +1252,7 @@ namespace cv { namespace gpu { namespace device
sumInSmem<nthreads, R>(smem + nthreads, tid);
sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
@@ -1262,7 +1262,7 @@ namespace cv { namespace gpu { namespace device
}
}
#else
if (tid == 0)
{
DstType res;
res.x = smem[0];
@@ -1293,7 +1293,7 @@ namespace cv { namespace gpu { namespace device
sumInSmem<nthreads, R>(smem + nthreads, tid);
sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
@@ -1323,7 +1323,7 @@ namespace cv { namespace gpu { namespace device
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
{
val = ptr[x0 + x * blockDim.x];
sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y),
Op::call(val.z), Op::call(val.w));
}
}
@@ -1372,7 +1372,7 @@ namespace cv { namespace gpu { namespace device
sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
sumInSmem<nthreads, R>(smem + 3 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
@@ -1383,7 +1383,7 @@ namespace cv { namespace gpu { namespace device
}
}
#else
if (tid == 0)
{
DstType res;
res.x = smem[0];
@@ -1417,7 +1417,7 @@ namespace cv { namespace gpu { namespace device
sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
sumInSmem<nthreads, R>(smem + 3 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
@@ -1488,7 +1488,7 @@ namespace cv { namespace gpu { namespace device
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sumMultipassCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);
template void sumMultipassCaller<char>(const DevMem2Db, PtrStepb, double*, int);
@@ -1537,7 +1537,7 @@ namespace cv { namespace gpu { namespace device
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sumCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);
template void sumCaller<char>(const DevMem2Db, PtrStepb, double*, int);
@@ -1608,7 +1608,7 @@ namespace cv { namespace gpu { namespace device
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void absSumMultipassCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);
template void absSumMultipassCaller<char>(const DevMem2Db, PtrStepb, double*, int);
@@ -1728,7 +1728,7 @@ namespace cv { namespace gpu { namespace device
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sqrSumMultipassCaller<uchar>(const DevMem2Db, PtrStepb, double*, int);
template void sqrSumMultipassCaller<char>(const DevMem2Db, PtrStepb, double*, int);
@@ -1894,7 +1894,7 @@ namespace cv { namespace gpu { namespace device
{
for (int y = threadIdx.y; y < src.rows; y += 16)
myVal = op(myVal, src.ptr(y)[x]);
}
smem[threadIdx.x * 16 + threadIdx.y] = myVal;
__syncthreads();
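// The transposed store (threadIdx.x * 16 + threadIdx.y) places the 16 partial
// sums that belong to one output column in consecutive smem cells, so the
// fold that follows the barrier can read them as one contiguous run.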
@@ -1931,11 +1931,11 @@ namespace cv { namespace gpu { namespace device
{
typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);
static const caller_t callers[] =
{
reduceRows_caller<SumReductor, T, S, D>,
reduceRows_caller<AvgReductor, T, S, D>,
reduceRows_caller<MaxReductor, T, S, D>,
reduceRows_caller<MinReductor, T, S, D>
};
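// Dispatch sketch: reduceOp indexes the table in its declared order
// (0 = sum, 1 = avg, 2 = max, 3 = min), e.g.
//   callers[reduceOp]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, stream);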
@@ -1944,15 +1944,15 @@ namespace cv { namespace gpu { namespace device
template void reduceRows_gpu<uchar, int, uchar>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<uchar, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<uchar, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<ushort, int, ushort>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<ushort, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<ushort, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<short, int, short>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<short, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<short, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<int, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<int, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
@@ -2068,7 +2068,7 @@ namespace cv { namespace gpu { namespace device
{
typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);
static const caller_t callers[4][4] =
{
{reduceCols_caller<1, SumReductor, T, S, D>, reduceCols_caller<1, AvgReductor, T, S, D>, reduceCols_caller<1, MaxReductor, T, S, D>, reduceCols_caller<1, MinReductor, T, S, D>},
{reduceCols_caller<2, SumReductor, T, S, D>, reduceCols_caller<2, AvgReductor, T, S, D>, reduceCols_caller<2, MaxReductor, T, S, D>, reduceCols_caller<2, MinReductor, T, S, D>},
@@ -2083,15 +2083,15 @@ namespace cv { namespace gpu { namespace device
template void reduceCols_gpu<uchar, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<uchar, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<ushort, int, ushort>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<ushort, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<ushort, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<short, int, short>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<short, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<short, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<int, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<int, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<float, float, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);