add PyrLK to ocl module

13 years ago · 78e89890b0
parent 0e9405e591
commit 78e89890b0
6 changed files with 1668 additions and 0 deletions
--- a/modules/ocl/include/opencv2/ocl/matrix_operations.hpp
+++ b/modules/ocl/include/opencv2/ocl/matrix_operations.hpp
@ -460,6 +460,18 @@ namespace cv
            a.swap(b);
        }

+		inline void ensureSizeIsEnough(int rows, int cols, int type, oclMat& m)
+		{
+			if (m.type() == type && m.rows >= rows && m.cols >= cols)
+				m = m(Rect(0, 0, cols, rows));
+			else
+				m.create(rows, cols, type);
+		}
+
+		inline void ensureSizeIsEnough(Size size, int type, oclMat& m)
+		{
+			ensureSizeIsEnough(size.height, size.width, type, m);
+		}
    } /* end of namespace ocl */

 } /* end of namespace cv */
--- a/modules/ocl/include/opencv2/ocl/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl/ocl.hpp
@ -1125,6 +1125,66 @@ namespace cv
 			explicit BruteForceMatcher_OCL(Hamming /*d*/) : BruteForceMatcher_OCL_base(HammingDist) {}
 		};

+		/////////////////////////////// PyrLKOpticalFlow /////////////////////////////////////
+		class CV_EXPORTS PyrLKOpticalFlow
+		{
+		public:
+			PyrLKOpticalFlow()
+			{
+				winSize = Size(21, 21);
+				maxLevel = 3;
+				iters = 30;
+				derivLambda = 0.5;
+				useInitialFlow = false;
+				minEigThreshold = 1e-4f;
+				getMinEigenVals = false;
+				isDeviceArch11_ = false;
+			}
+
+			void sparse(const oclMat& prevImg, const oclMat& nextImg, const oclMat& prevPts, oclMat& nextPts,
+				oclMat& status, oclMat* err = 0);
+
+			void dense(const oclMat& prevImg, const oclMat& nextImg, oclMat& u, oclMat& v, oclMat* err = 0);
+
+			Size winSize;
+			int maxLevel;
+			int iters;
+			double derivLambda;
+			bool useInitialFlow;
+			float minEigThreshold;
+			bool getMinEigenVals;
+
+			void releaseMemory()
+			{
+				dx_calcBuf_.release();
+				dy_calcBuf_.release();
+
+				prevPyr_.clear();
+				nextPyr_.clear();
+
+				dx_buf_.release();
+				dy_buf_.release();
+			}
+
+		private:
+			void calcSharrDeriv(const oclMat& src, oclMat& dx, oclMat& dy);
+
+			void buildImagePyramid(const oclMat& img0, vector<oclMat>& pyr, bool withBorder);
+
+			oclMat dx_calcBuf_;
+			oclMat dy_calcBuf_;
+
+			vector<oclMat> prevPyr_;
+			vector<oclMat> nextPyr_;
+
+			oclMat dx_buf_;
+			oclMat dy_buf_;
+
+			oclMat uPyr_[2];
+			oclMat vPyr_[2];
+
+			bool isDeviceArch11_;
+		};

    }
 }
--- a/modules/ocl/src/kernels/pyrlk.cl
+++ b/modules/ocl/src/kernels/pyrlk.cl
@ -0,0 +1,831 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Dachuan Zhao, dachuan@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+//#pragma OPENCL EXTENSION cl_amd_printf : enable
+
+
+__kernel void calcSharrDeriv_vertical_C1_D0(__global const uchar* src, int srcStep, int rows, int cols, int cn, __global short* dx_buf, int dx_bufStep, __global short* dy_buf, int dy_bufStep)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    if (y < rows && x < cols * cn)
+    {
+        const uchar src_val0 = (src + (y > 0 ? y-1 : rows > 1 ? 1 : 0) * srcStep)[x];
+        const uchar src_val1 = (src + y * srcStep)[x];
+        const uchar src_val2 = (src + (y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0) * srcStep)[x];
+        
+        ((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10;
+        ((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0;
+    }
+}
+
+__kernel void calcSharrDeriv_vertical_C4_D0(__global const uchar* src, int srcStep, int rows, int cols, int cn, __global short* dx_buf, int dx_bufStep, __global short* dy_buf, int dy_bufStep)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    if (y < rows && x < cols * cn)
+    {
+        const uchar src_val0 = (src + (y > 0 ? y - 1 : 1) * srcStep)[x];
+        const uchar src_val1 = (src + y * srcStep)[x];
+        const uchar src_val2 = (src + (y < rows - 1 ? y + 1 : rows - 2) * srcStep)[x];
+        
+        ((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10;
+        ((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0;
+    }
+}
+
+__kernel void calcSharrDeriv_horizontal_C1_D0(int rows, int cols, int cn, __global const short* dx_buf, int dx_bufStep, __global const short* dy_buf, int dy_bufStep, __global short* dIdx, int dIdxStep, __global short* dIdy, int dIdyStep)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    const int colsn = cols * cn;
+
+    if (y < rows && x < colsn)
+    {
+        __global const short* dx_buf_row = dx_buf + y * dx_bufStep;
+        __global const short* dy_buf_row = dy_buf + y * dy_bufStep;
+
+        const int xr = x + cn < colsn ? x + cn : (cols - 2) * cn + x + cn - colsn;
+        const int xl = x - cn >= 0 ? x - cn : cn + x;
+
+        ((__global short*)((__global char*)dIdx + y * dIdxStep / 2))[x] = dx_buf_row[xr] - dx_buf_row[xl];
+        ((__global short*)((__global char*)dIdy + y * dIdyStep / 2))[x] = (dy_buf_row[xr] + dy_buf_row[xl]) * 3 + dy_buf_row[x] * 10;
+    }
+}
+
+__kernel void calcSharrDeriv_horizontal_C4_D0(int rows, int cols, int cn, __global const short* dx_buf, int dx_bufStep, __global const short* dy_buf, int dy_bufStep, __global short* dIdx, int dIdxStep, __global short* dIdy, int dIdyStep)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    const int colsn = cols * cn;
+
+    if (y < rows && x < colsn)
+    {
+        __global const short* dx_buf_row = dx_buf + y * dx_bufStep;
+        __global const short* dy_buf_row = dy_buf + y * dy_bufStep;
+
+        const int xr = x + cn < colsn ? x + cn : (cols - 2) * cn + x + cn - colsn;
+        const int xl = x - cn >= 0 ? x - cn : cn + x;
+
+        ((__global short*)((__global char*)dIdx + y * dIdxStep / 2))[x] = dx_buf_row[xr] - dx_buf_row[xl];
+        ((__global short*)((__global char*)dIdy + y * dIdyStep / 2))[x] = (dy_buf_row[xr] + dy_buf_row[xl]) * 3 + dy_buf_row[x] * 10;
+    }
+}
+
+#define W_BITS 14
+#define W_BITS1 14
+
+#define  CV_DESCALE(x, n)     (((x) + (1 << ((n)-1))) >> (n))
+
+int linearFilter_uchar(__global const uchar* src, int srcStep, int cn, float2 pt, int x, int y)
+{
+    int2 ipt;
+    ipt.x = convert_int_sat_rtn(pt.x);
+    ipt.y = convert_int_sat_rtn(pt.y);
+
+    float a = pt.x - ipt.x;
+    float b = pt.y - ipt.y;
+
+    int iw00 = convert_int_sat_rte((1.0f - a) * (1.0f - b) * (1 << W_BITS));
+    int iw01 = convert_int_sat_rte(a * (1.0f - b) * (1 << W_BITS));
+    int iw10 = convert_int_sat_rte((1.0f - a) * b * (1 << W_BITS));
+    int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
+
+    __global const uchar* src_row = src + (ipt.y + y) * srcStep + ipt.x * cn;
+    __global const uchar* src_row1 = src + (ipt.y + y + 1) * srcStep + ipt.x * cn;
+
+    return CV_DESCALE(src_row[x] * iw00 + src_row[x + cn] * iw01 + src_row1[x] * iw10 + src_row1[x + cn] * iw11, W_BITS1 - 5);
+}
+
+int linearFilter_short(__global const short* src, int srcStep, int cn, float2 pt, int x, int y)
+{
+    int2 ipt;
+    ipt.x = convert_int_sat_rtn(pt.x);
+    ipt.y = convert_int_sat_rtn(pt.y);
+
+    float a = pt.x - ipt.x;
+    float b = pt.y - ipt.y;
+
+    int iw00 = convert_int_sat_rte((1.0f - a) * (1.0f - b) * (1 << W_BITS));
+    int iw01 = convert_int_sat_rte(a * (1.0f - b) * (1 << W_BITS));
+    int iw10 = convert_int_sat_rte((1.0f - a) * b * (1 << W_BITS));
+    int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
+
+    __global const short* src_row = src + (ipt.y + y) * srcStep + ipt.x * cn;
+    __global const short* src_row1 = src + (ipt.y + y + 1) * srcStep + ipt.x * cn;
+
+    return CV_DESCALE(src_row[x] * iw00 + src_row[x + cn] * iw01 + src_row1[x] * iw10 + src_row1[x + cn] * iw11, W_BITS1);
+}
+
+float linearFilter_float(__global const float* src, int srcStep, int cn, float2 pt, float x, float y)
+{
+    int2 ipt;
+    ipt.x = convert_int_sat_rtn(pt.x);
+    ipt.y = convert_int_sat_rtn(pt.y);
+
+    float a = pt.x - ipt.x;
+    float b = pt.y - ipt.y;
+
+    float iw00 = ((1.0f - a) * (1.0f - b) * (1 << W_BITS));
+    float iw01 = (a * (1.0f - b) * (1 << W_BITS));
+    float iw10 = ((1.0f - a) * b * (1 << W_BITS));
+    float iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
+
+    __global const float* src_row = src + (int)(ipt.y + y) * srcStep / 4 + ipt.x * cn;
+    __global const float* src_row1 = src + (int)(ipt.y + y + 1) * srcStep / 4 + ipt.x * cn;
+
+    return src_row[(int)x] * iw00 + src_row[(int)x + cn] * iw01 + src_row1[(int)x] * iw10 + src_row1[(int)x + cn] * iw11, W_BITS1 - 5;
+}
+
+void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid)
+{
+    smem1[tid] = val1;
+    smem2[tid] = val2;
+    smem3[tid] = val3;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 128) 
+    { 
+        smem1[tid] = val1 += smem1[tid + 128]; 
+        smem2[tid] = val2 += smem2[tid + 128]; 
+        smem3[tid] = val3 += smem3[tid + 128]; 
+    } 
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 64) 
+    { 
+        smem1[tid] = val1 += smem1[tid + 64]; 
+        smem2[tid] = val2 += smem2[tid + 64]; 
+        smem3[tid] = val3 += smem3[tid + 64];
+    } 
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 32)
+    {
+        volatile __local float* vmem1 = smem1;
+        volatile __local float* vmem2 = smem2;
+        volatile __local float* vmem3 = smem3;
+
+        vmem1[tid] = val1 += vmem1[tid + 32]; 
+        vmem2[tid] = val2 += vmem2[tid + 32]; 
+        vmem3[tid] = val3 += vmem3[tid + 32];
+
+        vmem1[tid] = val1 += vmem1[tid + 16]; 
+        vmem2[tid] = val2 += vmem2[tid + 16]; 
+        vmem3[tid] = val3 += vmem3[tid + 16];
+
+        vmem1[tid] = val1 += vmem1[tid + 8]; 
+        vmem2[tid] = val2 += vmem2[tid + 8]; 
+        vmem3[tid] = val3 += vmem3[tid + 8];
+
+        vmem1[tid] = val1 += vmem1[tid + 4]; 
+        vmem2[tid] = val2 += vmem2[tid + 4]; 
+        vmem3[tid] = val3 += vmem3[tid + 4];
+
+        vmem1[tid] = val1 += vmem1[tid + 2]; 
+        vmem2[tid] = val2 += vmem2[tid + 2]; 
+        vmem3[tid] = val3 += vmem3[tid + 2];
+
+        vmem1[tid] = val1 += vmem1[tid + 1]; 
+        vmem2[tid] = val2 += vmem2[tid + 1]; 
+        vmem3[tid] = val3 += vmem3[tid + 1];
+    }
+}
+
+void reduce2(float val1, float val2, __local float* smem1, __local float* smem2, int tid)
+{
+    smem1[tid] = val1;
+    smem2[tid] = val2;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 128) 
+    { 
+        smem1[tid] = val1 += smem1[tid + 128]; 
+        smem2[tid] = val2 += smem2[tid + 128];  
+    } 
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 64) 
+    { 
+        smem1[tid] = val1 += smem1[tid + 64]; 
+        smem2[tid] = val2 += smem2[tid + 64]; 
+    } 
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 32)
+    {
+        volatile __local float* vmem1 = smem1;
+        volatile __local float* vmem2 = smem2;
+
+        vmem1[tid] = val1 += vmem1[tid + 32]; 
+        vmem2[tid] = val2 += vmem2[tid + 32]; 
+
+        vmem1[tid] = val1 += vmem1[tid + 16]; 
+        vmem2[tid] = val2 += vmem2[tid + 16]; 
+
+        vmem1[tid] = val1 += vmem1[tid + 8]; 
+        vmem2[tid] = val2 += vmem2[tid + 8]; 
+
+        vmem1[tid] = val1 += vmem1[tid + 4]; 
+        vmem2[tid] = val2 += vmem2[tid + 4]; 
+
+        vmem1[tid] = val1 += vmem1[tid + 2]; 
+        vmem2[tid] = val2 += vmem2[tid + 2]; 
+
+        vmem1[tid] = val1 += vmem1[tid + 1]; 
+        vmem2[tid] = val2 += vmem2[tid + 1]; 
+    }
+}
+
+void reduce1(float val1, __local float* smem1, int tid)
+{
+    smem1[tid] = val1;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 128) 
+    { 
+        smem1[tid] = val1 += smem1[tid + 128]; 
+    } 
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 64) 
+    { 
+        smem1[tid] = val1 += smem1[tid + 64]; 
+    } 
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 32)
+    {
+        volatile __local float* vmem1 = smem1;
+
+        vmem1[tid] = val1 += vmem1[tid + 32]; 
+        vmem1[tid] = val1 += vmem1[tid + 16]; 
+        vmem1[tid] = val1 += vmem1[tid + 8]; 
+        vmem1[tid] = val1 += vmem1[tid + 4];
+        vmem1[tid] = val1 += vmem1[tid + 2]; 
+        vmem1[tid] = val1 += vmem1[tid + 1]; 
+    }
+}
+
+#define SCALE (1.0f / (1 << 20))
+
+// Image read mode
+__constant sampler_t sampler    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
+
+__kernel void lkSparse_C1_D5(image2d_t I, image2d_t J,
+    __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status/*, __global float* err*/, const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr, char GET_MIN_EIGENVALS)
+{
+    __local float smem1[256];
+    __local float smem2[256];
+    __local float smem3[256];
+
+	int c_halfWin_x = (c_winSize_x - 1) / 2;
+	int c_halfWin_y = (c_winSize_y - 1) / 2;
+
+    const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
+
+    float2 prevPt = prevPts[get_group_id(0)];
+    prevPt.x *= (1.0f / (1 << level));
+    prevPt.y *= (1.0f / (1 << level));
+
+    if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)
+    {
+        if (level == 0 && tid == 0)
+        {
+            status[get_group_id(0)] = 0;
+
+            //if (calcErr) 
+            //    err[get_group_id(0)] = 0;
+        }
+
+        return;
+    }
+    
+    prevPt.x -= c_halfWin_x;
+    prevPt.y -= c_halfWin_y;
+    
+    // extract the patch from the first image, compute covariation matrix of derivatives
+    
+    float A11 = 0;
+    float A12 = 0;
+    float A22 = 0;
+
+    float I_patch[21][21];
+    float dIdx_patch[21][21];
+    float dIdy_patch[21][21];
+
+    for (int yBase = get_local_id(1), i = 0; yBase < c_winSize_y; yBase += get_local_size(1), ++i)
+    {                
+        for (int xBase = get_local_id(0), j = 0; xBase < c_winSize_x; xBase += get_local_size(0), ++j)
+        {
+            float x = (prevPt.x + xBase + 0.5f);
+            float y = (prevPt.y + yBase + 0.5f);
+
+            I_patch[i][j] = read_imagef(I, sampler, (float2)(x, y)).x;
+            
+            float dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
+                             (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x);
+
+            float dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
+                            (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x);
+
+            dIdx_patch[i][j] = dIdx;
+            dIdy_patch[i][j] = dIdy;
+            
+            A11 += dIdx * dIdx;
+            A12 += dIdx * dIdy;
+            A22 += dIdy * dIdy;
+        }
+    }
+
+    reduce3(A11, A12, A22, smem1, smem2, smem3, tid);
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    A11 = smem1[0];
+    A12 = smem2[0];
+    A22 = smem3[0];
+    
+    float D = A11 * A22 - A12 * A12;
+
+    //if (calcErr && GET_MIN_EIGENVALS && tid == 0) 
+    //    err[get_group_id(0)] = minEig;
+
+    if (D < 1.192092896e-07f)
+    {
+        if (level == 0 && tid == 0)
+            status[get_group_id(0)] = 0;
+
+        return;
+    }
+
+    D = 1.f / D;
+
+    A11 *= D;
+    A12 *= D;
+    A22 *= D;
+
+    float2 nextPt = nextPts[get_group_id(0)];
+    nextPt.x *= 2.0f;
+    nextPt.y *= 2.0f; 
+    
+    nextPt.x -= c_halfWin_x;
+    nextPt.y -= c_halfWin_y;
+
+    for (int k = 0; k < c_iters; ++k)
+    {
+        if (nextPt.x < -c_halfWin_x || nextPt.x >= cols || nextPt.y < -c_halfWin_y || nextPt.y >= rows)
+        {
+            if (tid == 0 && level == 0)
+                status[get_group_id(0)] = 0;
+            return;
+        }
+
+        float b1 = 0;
+        float b2 = 0;
+        
+        for (int y = get_local_id(1), i = 0; y < c_winSize_y; y += get_local_size(1), ++i)
+        {
+            for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j)
+            {
+				float a = (nextPt.x + x + 0.5f);
+				float b = (nextPt.y + y + 0.5f);
+				
+                float I_val = I_patch[i][j];
+                float J_val = read_imagef(J, sampler, (float2)(a, b)).x;
+
+                float diff = (J_val - I_val) * 32.0f;
+
+                b1 += diff * dIdx_patch[i][j];
+                b2 += diff * dIdy_patch[i][j];
+            }
+        }
+        
+        reduce2(b1, b2, smem1, smem2, tid);
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        b1 = smem1[0];
+        b2 = smem2[0];
+
+        float2 delta;
+        delta.x = A12 * b2 - A22 * b1;
+        delta.y = A12 * b1 - A11 * b2;
+            
+        nextPt.x += delta.x;
+        nextPt.y += delta.y;
+
+        if (fabs(delta.x) < 0.01f && fabs(delta.y) < 0.01f)
+            break;
+    }
+
+    float errval = 0.0f;
+    if (calcErr)
+    {
+        for (int y = get_local_id(1), i = 0; y < c_winSize_y; y += get_local_size(1), ++i)
+        {
+            for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j)
+            {
+				float a = (nextPt.x + x + 0.5f);
+				float b = (nextPt.y + y + 0.5f);
+				
+                float I_val = I_patch[i][j];
+                float J_val = read_imagef(J, sampler, (float2)(a, b)).x;
+
+                float diff = J_val - I_val;
+
+                errval += fabs((float)diff);
+            }
+        }
+
+        reduce1(errval, smem1, tid);
+    }
+
+    if (tid == 0)
+    {
+        nextPt.x += c_halfWin_x;
+        nextPt.y += c_halfWin_y;
+
+        nextPts[get_group_id(0)] = nextPt;
+
+        //if (calcErr && !GET_MIN_EIGENVALS)
+        //    err[get_group_id(0)] = errval;
+    }
+}
+__kernel void lkSparse_C4_D5(image2d_t I, image2d_t J,
+    __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status/*, __global float* err*/, const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr, char GET_MIN_EIGENVALS)
+{
+    __local float smem1[256];
+    __local float smem2[256];
+    __local float smem3[256];
+
+	int c_halfWin_x = (c_winSize_x - 1) / 2;
+	int c_halfWin_y = (c_winSize_y - 1) / 2;
+
+    const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
+
+    float2 prevPt = prevPts[get_group_id(0)];
+    prevPt.x *= (1.0f / (1 << level));
+    prevPt.y *= (1.0f / (1 << level));
+
+    if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)
+    {
+        if (level == 0 && tid == 0)
+        {
+            status[get_group_id(0)] = 0;
+
+            //if (calcErr) 
+            //    err[get_group_id(0)] = 0;
+        }
+
+        return;
+    }
+    
+    prevPt.x -= c_halfWin_x;
+    prevPt.y -= c_halfWin_y;
+    
+    // extract the patch from the first image, compute covariation matrix of derivatives
+    
+    float A11 = 0;
+    float A12 = 0;
+    float A22 = 0;
+
+    float4 I_patch[21][21];
+    float4 dIdx_patch[21][21];
+    float4 dIdy_patch[21][21];
+
+    for (int yBase = get_local_id(1), i = 0; yBase < c_winSize_y; yBase += get_local_size(1), ++i)
+    {                
+        for (int xBase = get_local_id(0), j = 0; xBase < c_winSize_x; xBase += get_local_size(0), ++j)
+        {
+            float x = (prevPt.x + xBase + 0.5f);
+            float y = (prevPt.y + yBase + 0.5f);
+
+            I_patch[i][j] = read_imagef(I, sampler, (float2)(x, y)).x;
+            
+            float4 dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
+                             (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x);
+
+            float4 dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
+                            (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x);
+
+            dIdx_patch[i][j] = dIdx;
+            dIdy_patch[i][j] = dIdy;
+            
+            A11 += (dIdx * dIdx).x + (dIdx * dIdx).y + (dIdx * dIdx).z;
+            A12 += (dIdx * dIdy).x + (dIdx * dIdy).y + (dIdx * dIdy).z;
+            A22 += (dIdy * dIdy).x + (dIdy * dIdy).y + (dIdy * dIdy).z;
+        }
+    }
+
+    reduce3(A11, A12, A22, smem1, smem2, smem3, tid);
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    A11 = smem1[0];
+    A12 = smem2[0];
+    A22 = smem3[0];
+    
+    float D = A11 * A22 - A12 * A12;
+
+    //if (calcErr && GET_MIN_EIGENVALS && tid == 0) 
+    //    err[get_group_id(0)] = minEig;
+
+    if (D < 1.192092896e-07f)
+    {
+        if (level == 0 && tid == 0)
+            status[get_group_id(0)] = 0;
+
+        return;
+    }
+
+    D = 1.f / D;
+
+    A11 *= D;
+    A12 *= D;
+    A22 *= D;
+
+    float2 nextPt = nextPts[get_group_id(0)];
+    nextPt.x *= 2.0f;
+    nextPt.y *= 2.0f; 
+    
+    nextPt.x -= c_halfWin_x;
+    nextPt.y -= c_halfWin_y;
+
+    for (int k = 0; k < c_iters; ++k)
+    {
+        if (nextPt.x < -c_halfWin_x || nextPt.x >= cols || nextPt.y < -c_halfWin_y || nextPt.y >= rows)
+        {
+            if (tid == 0 && level == 0)
+                status[get_group_id(0)] = 0;
+            return;
+        }
+
+        float b1 = 0;
+        float b2 = 0;
+        
+        for (int y = get_local_id(1), i = 0; y < c_winSize_y; y += get_local_size(1), ++i)
+        {
+            for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j)
+            {
+				float a = (nextPt.x + x + 0.5f);
+				float b = (nextPt.y + y + 0.5f);
+				
+                float4 I_val = I_patch[i][j];
+                float4 J_val = read_imagef(J, sampler, (float2)(a, b)).x;
+
+                float4 diff = (J_val - I_val) * 32.0f;
+
+                b1 += (diff * dIdx_patch[i][j]).x + (diff * dIdx_patch[i][j]).y + (diff * dIdx_patch[i][j]).z;
+                b2 += (diff * dIdy_patch[i][j]).x + (diff * dIdy_patch[i][j]).y + (diff * dIdy_patch[i][j]).z;
+            }
+        }
+        
+        reduce2(b1, b2, smem1, smem2, tid);
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        b1 = smem1[0];
+        b2 = smem2[0];
+
+        float2 delta;
+        delta.x = A12 * b2 - A22 * b1;
+        delta.y = A12 * b1 - A11 * b2;
+            
+        nextPt.x += delta.x;
+        nextPt.y += delta.y;
+
+        if (fabs(delta.x) < 0.01f && fabs(delta.y) < 0.01f)
+            break;
+    }
+
+    float errval = 0.0f;
+    if (calcErr)
+    {
+        for (int y = get_local_id(1), i = 0; y < c_winSize_y; y += get_local_size(1), ++i)
+        {
+            for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j)
+            {
+				float a = (nextPt.x + x + 0.5f);
+				float b = (nextPt.y + y + 0.5f);
+				
+                float4 I_val = I_patch[i][j];
+                float4 J_val = read_imagef(J, sampler, (float2)(a, b)).x;
+
+                float4 diff = J_val - I_val;
+
+                errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z);
+            }
+        }
+
+        reduce1(errval, smem1, tid);
+    }
+
+    if (tid == 0)
+    {
+        nextPt.x += c_halfWin_x;
+        nextPt.y += c_halfWin_y;
+
+        nextPts[get_group_id(0)] = nextPt;
+
+        //if (calcErr && !GET_MIN_EIGENVALS)
+        //    err[get_group_id(0)] = errval;
+    }
+}
+
+__kernel void lkDense_C1_D0(image2d_t I, image2d_t J, __global float* u, int uStep, __global float* v, int vStep, __global const float* prevU, int prevUStep, __global const float* prevV, int prevVStep, 
+    const int rows, const int cols, /*__global float* err, int errStep, int cn,*/ int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
+{
+	int c_halfWin_x = (c_winSize_x - 1) / 2;
+	int c_halfWin_y = (c_winSize_y - 1) / 2;
+
+    const int patchWidth  = get_local_size(0) + 2 * c_halfWin_x;
+    const int patchHeight = get_local_size(1) + 2 * c_halfWin_y;
+
+    __local int smem[8192];
+
+    __local int* I_patch = smem;
+    __local int* dIdx_patch = I_patch + patchWidth * patchHeight;
+    __local int* dIdy_patch = dIdx_patch + patchWidth * patchHeight;
+
+    const int xBase = get_group_id(0) * get_local_size(0);
+    const int yBase = get_group_id(1) * get_local_size(1);
+
+	sampler_t sampleri    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
+	
+    for (int i = get_local_id(1); i < patchHeight; i += get_local_size(1))
+    {
+        for (int j = get_local_id(0); j < patchWidth; j += get_local_size(0))
+        {
+            float x = xBase - c_halfWin_x + j + 0.5f;
+            float y = yBase - c_halfWin_y + i + 0.5f;
+
+            I_patch[i * patchWidth + j] = read_imagei(I, sampleri, (float2)(x, y)).x;
+
+            // Sharr Deriv
+
+            dIdx_patch[i * patchWidth + j] = 3 * read_imagei(I, sampleri, (float2)(x+1, y-1)).x + 10 * read_imagei(I, sampleri, (float2)(x+1, y)).x + 3 * read_imagei(I, sampleri, (float2)(x+1, y+1)).x -
+                                            (3 * read_imagei(I, sampleri, (float2)(x-1, y-1)).x + 10 * read_imagei(I, sampleri, (float2)(x-1, y)).x + 3 * read_imagei(I, sampleri, (float2)(x-1, y+1)).x);
+
+            dIdy_patch[i * patchWidth + j] = 3 * read_imagei(I, sampleri, (float2)(x-1, y+1)).x + 10 * read_imagei(I, sampleri, (float2)(x, y+1)).x + 3 * read_imagei(I, sampleri, (float2)(x+1, y+1)).x -
+                                            (3 * read_imagei(I, sampleri, (float2)(x-1, y-1)).x + 10 * read_imagei(I, sampleri, (float2)(x, y-1)).x + 3 * read_imagei(I, sampleri, (float2)(x+1, y-1)).x);
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // extract the patch from the first image, compute covariation matrix of derivatives
+    
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    if (x >= cols || y >= rows)
+        return;
+
+    int A11i = 0;
+    int A12i = 0;
+    int A22i = 0;
+
+    for (int i = 0; i < c_winSize_y; ++i)
+    {                
+        for (int j = 0; j < c_winSize_x; ++j)
+        {
+            int dIdx = dIdx_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)];
+            int dIdy = dIdy_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)];
+            
+            A11i += dIdx * dIdx;
+            A12i += dIdx * dIdy;
+            A22i += dIdy * dIdy;
+        }
+    }
+    
+    float A11 = A11i;
+    float A12 = A12i;
+    float A22 = A22i;
+
+    float D = A11 * A22 - A12 * A12;
+    
+    //if (calcErr && GET_MIN_EIGENVALS)
+    //    (err + y * errStep)[x] = minEig;
+
+    if (D < 1.192092896e-07f)
+    {
+        //if (calcErr)
+        //    err(y, x) = 3.402823466e+38f;
+
+        return;
+    }
+
+    D = 1.f / D;
+
+    A11 *= D;
+    A12 *= D;
+    A22 *= D;
+
+    float2 nextPt;
+    nextPt.x = x + prevU[y/2 * prevUStep / 4 + x/2] * 2.0f;
+    nextPt.y = y + prevV[y/2 * prevVStep / 4 + x/2] * 2.0f;
+
+    for (int k = 0; k < c_iters; ++k)
+    {
+        if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows)
+        {
+            //if (calcErr)
+            //    err(y, x) = 3.402823466e+38f;
+
+            return;
+        }
+
+        int b1 = 0;
+        int b2 = 0;
+
+        for (int i = 0; i < c_winSize_y; ++i)
+        {
+            for (int j = 0; j < c_winSize_x; ++j)
+            {
+                int iI = I_patch[(get_local_id(1) + i) * patchWidth + get_local_id(0) + j];
+                int iJ = read_imagei(J, sampler, (float2)(nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f)).x;
+
+                int diff = (iJ - iI) * 32;
+
+                int dIdx = dIdx_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)];
+                int dIdy = dIdy_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)];
+
+                b1 += diff * dIdx;
+                b2 += diff * dIdy;
+            }
+        }
+
+        float2 delta;
+        delta.x = A12 * b2 - A22 * b1;
+        delta.y = A12 * b1 - A11 * b2;
+            
+        nextPt.x += delta.x;
+        nextPt.y += delta.y;
+
+        if (fabs(delta.x) < 0.01f && fabs(delta.y) < 0.01f)
+            break;
+    }
+
+    u[y * uStep / 4 + x] = nextPt.x - x;
+    v[y * vStep / 4 + x] = nextPt.y - y;
+
+    if (calcErr)
+    {
+        int errval = 0;
+
+        for (int i = 0; i < c_winSize_y; ++i)
+        {
+            for (int j = 0; j < c_winSize_x; ++j)
+            {
+                int iI = I_patch[(get_local_id(1) + i) * patchWidth + get_local_id(0) + j];
+                int iJ = read_imagei(J, sampler, (float2)(nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f)).x;
+
+                errval += abs(iJ - iI);
+            }
+        }
+
+        //err[y * errStep / 4 + x] = static_cast<float>(errval) / (c_winSize_x * c_winSize_y);
+    }
+}
--- a/modules/ocl/src/matrix_operations.cpp
+++ b/modules/ocl/src/matrix_operations.cpp
@ -916,6 +916,7 @@ oclMat cv::ocl::oclMat::reshape(int new_cn, int new_rows) const
        CV_Error(CV_BadNumChannels, "The total width is not divisible by the new number of channels");

    hdr.cols = new_width;
+	hdr.wholecols = new_width;
    hdr.flags = (hdr.flags & ~CV_MAT_CN_MASK) | ((new_cn - 1) << CV_CN_SHIFT);

    return hdr;
--- a/modules/ocl/src/pyrlk.cpp
+++ b/modules/ocl/src/pyrlk.cpp
@ -0,0 +1,597 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other GpuMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or bpied warranties, including, but not limited to, the bpied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace std;
+using namespace cv;
+using namespace cv::ocl;
+
+#if !defined (HAVE_OPENCL)
+
+void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat&, const oclMat&, const oclMat&, oclMat&, oclMat&, oclMat*) {  }
+void cv::ocl::PyrLKOpticalFlow::dense(const oclMat&, const oclMat&, oclMat&, oclMat&, oclMat*) {  }
+
+#else /* !defined (HAVE_OPENCL) */
+
+namespace cv
+{
+    namespace ocl
+    {
+        ///////////////////////////OpenCL kernel strings///////////////////////////
+        extern const char *pyrlk;
+
+    }
+}
+
+struct dim3
+{
+    unsigned int x, y, z;
+};
+
+struct float2
+{
+    float x, y;
+};
+
+struct int2
+{
+    int x, y;
+};
+
+void calcSharrDeriv_run(const oclMat& src, oclMat& dx_buf, oclMat& dy_buf, oclMat& dIdx, oclMat& dIdy, int cn)
+{
+    Context  *clCxt = src.clCxt;
+
+    string kernelName = "calcSharrDeriv_vertical";
+
+    size_t localThreads[3]  = { 32, 8, 1 };
+    size_t globalThreads[3] = { src.cols, src.rows, 1};
+
+    vector<pair<size_t , const void *> > args;
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src.step ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&cn ));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&dx_buf.data ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dx_buf.step ));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&dy_buf.data ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&dy_buf.step ));
+
+    openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, src.channels(), src.depth());
+
+	kernelName = "calcSharrDeriv_horizontal";
+
+    vector<pair<size_t , const void *> > args2;
+    args2.push_back( make_pair( sizeof(cl_int), (void *)&src.rows ));
+    args2.push_back( make_pair( sizeof(cl_int), (void *)&src.cols ));
+    args2.push_back( make_pair( sizeof(cl_int), (void *)&cn ));
+    args2.push_back( make_pair( sizeof(cl_mem), (void *)&dx_buf.data ));
+    args2.push_back( make_pair( sizeof(cl_int), (void *)&dx_buf.step ));
+    args2.push_back( make_pair( sizeof(cl_mem), (void *)&dy_buf.data ));
+    args2.push_back( make_pair( sizeof(cl_int), (void *)&dy_buf.step ));
+    args2.push_back( make_pair( sizeof(cl_mem), (void *)&dIdx.data ));
+    args2.push_back( make_pair( sizeof(cl_int), (void *)&dIdx.step ));
+    args2.push_back( make_pair( sizeof(cl_mem), (void *)&dIdy.data ));
+    args2.push_back( make_pair( sizeof(cl_int), (void *)&dIdy.step ));
+
+    openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args2, src.channels(), src.depth());
+}
+
+
+void cv::ocl::PyrLKOpticalFlow::calcSharrDeriv(const oclMat& src, oclMat& dIdx, oclMat& dIdy)
+{
+    CV_Assert(src.rows > 1 && src.cols > 1);
+    CV_Assert(src.depth() == CV_8U);
+
+    const int cn = src.channels();
+
+    ensureSizeIsEnough(src.size(), CV_MAKETYPE(CV_16S, cn), dx_calcBuf_);
+    ensureSizeIsEnough(src.size(), CV_MAKETYPE(CV_16S, cn), dy_calcBuf_);
+
+	calcSharrDeriv_run(src, dx_calcBuf_, dy_calcBuf_, dIdx, dIdy, cn);
+}
+
+void cv::ocl::PyrLKOpticalFlow::buildImagePyramid(const oclMat& img0, vector<oclMat>& pyr, bool withBorder)
+{
+    pyr.resize(maxLevel + 1);
+
+    Size sz = img0.size();
+
+	Mat img0Temp;
+	img0.download(img0Temp);
+	
+	Mat pyrTemp;
+	oclMat o;
+
+    for (int level = 0; level <= maxLevel; ++level)
+    {
+        oclMat temp;
+
+        if (withBorder)
+        {
+            temp.create(sz.height + winSize.height * 2, sz.width + winSize.width * 2, img0.type());
+        }
+        else
+        {
+            ensureSizeIsEnough(sz, img0.type(), pyr[level]);
+        }
+
+        if (level == 0)
+			pyr[level] = img0Temp;
+        else
+            pyrDown(pyr[level - 1], pyr[level]);
+
+        if (withBorder)
+            copyMakeBorder(pyr[level], temp, winSize.height, winSize.height, winSize.width, winSize.width, BORDER_REFLECT_101);
+
+        sz = Size((sz.width + 1) / 2, (sz.height + 1) / 2);
+
+        if (sz.width <= winSize.width || sz.height <= winSize.height)
+        {
+            maxLevel = level;
+            break;
+        }
+    }
+}
+
+namespace
+{
+    void calcPatchSize(cv::Size winSize, int cn, dim3& block, dim3& patch, bool isDeviceArch11)
+    {
+        winSize.width *= cn;
+
+        if (winSize.width > 32 && winSize.width > 2 * winSize.height)
+        {
+            block.x = isDeviceArch11 ? 16 : 32;
+            block.y = 8;
+        }
+        else
+        {
+            block.x = 16;
+            block.y = isDeviceArch11 ? 8 : 16;
+        }
+
+        patch.x = (winSize.width  + block.x - 1) / block.x;
+        patch.y = (winSize.height + block.y - 1) / block.y;
+
+        block.z = patch.z = 1;
+    }
+}
+
+struct MultiplyScalar
+{
+    MultiplyScalar(double val_, double scale_) : val(val_), scale(scale_) {}
+    double operator ()(double a) const
+    {
+        return (scale * a * val);
+    }
+    const double val;
+    const double scale;
+};
+
+void callF(const oclMat& src, oclMat& dst, MultiplyScalar op, int mask)
+{
+	Mat srcTemp;
+	Mat dstTemp;
+	src.download(srcTemp);
+	dst.download(dstTemp);
+
+	int i;
+	int j;
+	int k;
+	for(i = 0; i < srcTemp.rows; i++)
+	{
+		for(j = 0; j < srcTemp.cols; j++)
+		{
+			for(k = 0; k < srcTemp.channels(); k++)
+			{
+				((float*)dstTemp.data)[srcTemp.channels() * (i * srcTemp.rows + j) + k] = (float)op(((float*)srcTemp.data)[srcTemp.channels() * (i * srcTemp.rows + j) + k]);
+			}
+		}
+	}
+
+	dst = dstTemp;
+}
+
+static inline bool isAligned(const unsigned char* ptr, size_t size)
+{
+    return reinterpret_cast<size_t>(ptr) % size == 0;
+}
+
+static inline bool isAligned(size_t step, size_t size)
+{
+    return step % size == 0;
+}
+
+void callT(const oclMat& src, oclMat& dst, MultiplyScalar op, int mask)
+{
+    if (!isAligned(src.data, 4 * sizeof(double)) || !isAligned(src.step, 4 * sizeof(double)) || 
+        !isAligned(dst.data, 4 * sizeof(double)) || !isAligned(dst.step, 4 * sizeof(double)))
+    {
+        callF(src, dst, op, mask);
+        return;
+    }
+
+	Mat srcTemp;
+	Mat dstTemp;
+	src.download(srcTemp);
+	dst.download(dstTemp);
+
+	int x_shifted;
+
+	int i;
+	int j;
+	for(i = 0; i < srcTemp.rows; i++)
+	{
+		const double* srcRow = (const double*)srcTemp.data + i * srcTemp.rows;
+        double* dstRow = (double*)dstTemp.data + i * dstTemp.rows;;
+
+		for(j = 0; j < srcTemp.cols; j++)
+		{
+			x_shifted = j * 4;
+
+			if(x_shifted + 4 - 1 < srcTemp.cols)
+			{
+				dstRow[x_shifted    ] = op(srcRow[x_shifted    ]);
+				dstRow[x_shifted + 1] = op(srcRow[x_shifted + 1]);
+				dstRow[x_shifted + 2] = op(srcRow[x_shifted + 2]);
+				dstRow[x_shifted + 3] = op(srcRow[x_shifted + 3]);
+			}
+			else
+			{
+				for (int real_x = x_shifted; real_x < srcTemp.cols; ++real_x)
+				{
+					((float*)dstTemp.data)[i * srcTemp.rows + real_x] = op(((float*)srcTemp.data)[i * srcTemp.rows + real_x]);
+				}
+			}
+		}
+	}
+}
+
+void multiply(const oclMat& src1, double val, oclMat& dst, double scale = 1.0f);
+void multiply(const oclMat& src1, double val, oclMat& dst, double scale)
+{
+    MultiplyScalar op(val, scale);
+	//if(src1.channels() == 1 && dst.channels() == 1)
+	//{
+	//    callT(src1, dst, op, 0);
+	//}
+	//else
+	//{
+	    callF(src1, dst, op, 0);
+	//}
+}
+
+cl_mem bindTexture(const oclMat& mat, int depth, int channels)
+{
+	cl_mem texture;
+    cl_image_format format;
+    int err;
+	if(depth == 0)
+	{
+	    format.image_channel_data_type = CL_UNSIGNED_INT8;
+	}
+	else if(depth == 5)
+	{
+	    format.image_channel_data_type = CL_FLOAT;
+	}
+	if(channels == 1)
+	{
+	    format.image_channel_order     = CL_R;
+	}
+	else if(channels == 3)
+	{
+	    format.image_channel_order     = CL_RGB;
+	}
+	else if(channels == 4)
+	{
+	    format.image_channel_order     = CL_RGBA;
+	}
+#if CL_VERSION_1_2
+    cl_image_desc desc;
+    desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
+    desc.image_width      = mat.cols;
+    desc.image_height     = mat.rows;
+    desc.image_depth      = NULL;
+    desc.image_array_size = 1;
+    desc.image_row_pitch  = 0;
+    desc.image_slice_pitch= 0;
+    desc.buffer           = NULL;
+    desc.num_mip_levels   = 0;
+    desc.num_samples      = 0;
+	texture = clCreateImage(mat.clCxt->impl->clContext, CL_MEM_READ_WRITE, &format, &desc, NULL, &err); 
+#else
+    texture = clCreateImage2D(
+        mat.clCxt->impl->clContext, 
+        CL_MEM_READ_WRITE, 
+        &format, 
+        mat.cols, 
+        mat.rows, 
+        0, 
+        NULL, 
+        &err);
+#endif
+    size_t origin[] = { 0, 0, 0 }; 
+    size_t region[] = { mat.cols, mat.rows, 1 }; 
+	clEnqueueCopyBufferToImage(mat.clCxt->impl->clCmdQueue, (cl_mem)mat.data, texture, 0, origin, region, 0, NULL, 0);
+    openCLSafeCall(err);
+
+	return texture;
+}
+
+void lkSparse_run(oclMat& I, oclMat& J,
+    const oclMat& prevPts, oclMat& nextPts, oclMat& status, oclMat* err, bool GET_MIN_EIGENVALS, int ptcount, 
+    int level, dim3 block, dim3 patch, Size winSize, int iters)
+{
+    Context  *clCxt = I.clCxt;
+
+    string kernelName = "lkSparse";
+
+	size_t localThreads[3]  = { 16, 16, 1 };
+    size_t globalThreads[3] = { 16 * ptcount, 16, 1};
+
+	int cn = I.channels();
+
+	bool calcErr;
+    if (err)
+    {
+		calcErr = true;
+    }
+    else
+    {
+		calcErr = false;
+    }
+	calcErr = true;
+
+	cl_mem ITex = bindTexture(I, I.depth(), cn);
+	cl_mem JTex = bindTexture(J, J.depth(), cn);
+
+    vector<pair<size_t , const void *> > args;
+
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&ITex ));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&JTex ));
+
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&prevPts.data ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&prevPts.step ));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&nextPts.data ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&nextPts.step ));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&status.data ));
+    //args.push_back( make_pair( sizeof(cl_mem), (void *)&(err->data) ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&level ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&I.rows ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&I.cols ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&patch.x ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&patch.y ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&cn ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&winSize.width ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&winSize.height ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&iters ));
+    args.push_back( make_pair( sizeof(cl_char), (void *)&calcErr ));
+    args.push_back( make_pair( sizeof(cl_char), (void *)&GET_MIN_EIGENVALS ));
+
+    openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.channels(), I.depth());
+}
+
+void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat& prevImg, const oclMat& nextImg, const oclMat& prevPts, oclMat& nextPts, oclMat& status, oclMat* err)
+{
+    if (prevPts.empty())
+    {
+        nextPts.release();
+        status.release();
+        if (err) err->release();
+        return;
+    }
+
+    derivLambda = std::min(std::max(derivLambda, 0.0), 1.0);
+
+    iters = std::min(std::max(iters, 0), 100);
+
+    const int cn = prevImg.channels();
+
+    dim3 block, patch;
+    calcPatchSize(winSize, cn, block, patch, isDeviceArch11_);  
+
+    CV_Assert(derivLambda >= 0);
+    CV_Assert(maxLevel >= 0 && winSize.width > 2 && winSize.height > 2);
+    CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
+    CV_Assert(patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6);
+    CV_Assert(prevPts.rows == 1 && prevPts.type() == CV_32FC2);
+
+    if (useInitialFlow)
+        CV_Assert(nextPts.size() == prevPts.size() && nextPts.type() == CV_32FC2);
+    else
+        ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts);
+
+    oclMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
+    oclMat temp2 = nextPts.reshape(1);
+	//oclMat scalar(temp1.rows, temp1.cols, temp1.type(), Scalar(1.0f / (1 << maxLevel) / 2.0f));
+	//ocl::multiply(temp1, scalar, temp2);
+	::multiply(temp1, 1.0f / (1 << maxLevel) / 2.0f, temp2);
+
+    ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
+    status.setTo(Scalar::all(1));
+
+    if (err)
+        ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err);
+
+    // build the image pyramids.
+
+    prevPyr_.resize(maxLevel + 1);
+    nextPyr_.resize(maxLevel + 1);
+
+    if (cn == 1 || cn == 4)
+    {
+        prevImg.convertTo(prevPyr_[0], CV_32F);
+        nextImg.convertTo(nextPyr_[0], CV_32F);
+    }
+    else
+    {
+		oclMat buf_;
+        cvtColor(prevImg, buf_, COLOR_BGR2BGRA);
+        buf_.convertTo(prevPyr_[0], CV_32F);
+
+        cvtColor(nextImg, buf_, COLOR_BGR2BGRA);
+        buf_.convertTo(nextPyr_[0], CV_32F);
+    }
+
+    for (int level = 1; level <= maxLevel; ++level)
+    {
+        pyrDown(prevPyr_[level - 1], prevPyr_[level]);
+        pyrDown(nextPyr_[level - 1], nextPyr_[level]);
+    }
+
+    // dI/dx ~ Ix, dI/dy ~ Iy
+
+    for (int level = maxLevel; level >= 0; level--)
+    {
+		lkSparse_run(prevPyr_[level], nextPyr_[level], 
+			prevPts, nextPts, status, level == 0 && err ? err : 0, getMinEigenVals, prevPts.cols,
+			level, block, patch, winSize, iters);
+    }
+}
+
+void lkDense_run(oclMat& I, oclMat& J, oclMat& u, oclMat& v, 
+    oclMat& prevU, oclMat& prevV, oclMat* err, Size winSize, int iters)
+{
+    Context  *clCxt = I.clCxt;
+
+    string kernelName = "lkDense";
+
+    size_t localThreads[3]  = { 16, 16, 1 };
+    size_t globalThreads[3] = { I.cols, I.rows, 1};
+
+	int cn = I.channels();
+
+	bool calcErr;
+    if (err)
+    {
+		calcErr = true;
+    }
+    else
+    {
+		calcErr = false;
+    }
+
+	cl_mem ITex = bindTexture(I, I.depth(), cn);
+	cl_mem JTex = bindTexture(J, J.depth(), cn);
+
+	int2 halfWin = {(winSize.width - 1) / 2, (winSize.height - 1) / 2};
+    const int patchWidth  = 16 + 2 * halfWin.x;
+    const int patchHeight = 16 + 2 * halfWin.y;
+    size_t smem_size = 3 * patchWidth * patchHeight * sizeof(int);
+
+    vector<pair<size_t , const void *> > args;
+
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&ITex ));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&JTex ));
+
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&u.data ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&u.step ));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&v.data ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&v.step ));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&prevU.data ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&prevU.step ));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&prevV.data ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&prevV.step ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&I.rows ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&I.cols ));
+    //args.push_back( make_pair( sizeof(cl_mem), (void *)&(*err).data ));
+    //args.push_back( make_pair( sizeof(cl_int), (void *)&(*err).step ));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&winSize.width ));
+	args.push_back( make_pair( sizeof(cl_int), (void *)&winSize.height ));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&iters ));
+    args.push_back( make_pair( sizeof(cl_char), (void *)&calcErr ));
+
+    openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.channels(), I.depth());
+}
+
+void cv::ocl::PyrLKOpticalFlow::dense(const oclMat& prevImg, const oclMat& nextImg, oclMat& u, oclMat& v, oclMat* err)
+{
+    CV_Assert(prevImg.type() == CV_8UC1);
+    CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
+    CV_Assert(maxLevel >= 0);
+    CV_Assert(winSize.width > 2 && winSize.height > 2);
+
+    if (err)
+        err->create(prevImg.size(), CV_32FC1);
+
+    prevPyr_.resize(maxLevel + 1);
+    nextPyr_.resize(maxLevel + 1);
+
+    prevPyr_[0] = prevImg;
+    nextImg.convertTo(nextPyr_[0], CV_32F);
+
+    for (int level = 1; level <= maxLevel; ++level)
+    {
+        pyrDown(prevPyr_[level - 1], prevPyr_[level]);
+        pyrDown(nextPyr_[level - 1], nextPyr_[level]);
+    }
+
+    ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[0]);
+    ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[0]);
+    ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[1]);
+    ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[1]);
+    uPyr_[1].setTo(Scalar::all(0));
+    vPyr_[1].setTo(Scalar::all(0));
+
+	Size winSize2i(winSize.width, winSize.height);
+
+    int idx = 0;
+
+    for (int level = maxLevel; level >= 0; level--)
+    {
+        int idx2 = (idx + 1) & 1;
+
+        lkDense_run(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2],
+            level == 0 ? err : 0, winSize2i, iters);
+
+        if (level > 0)
+            idx = idx2;
+    }
+
+    uPyr_[idx].copyTo(u);
+    vPyr_[idx].copyTo(v);
+}
+
+#endif /* !defined (HAVE_CUDA) */
--- a/modules/ocl/test/test_pyrlk.cpp
+++ b/modules/ocl/test/test_pyrlk.cpp
@ -0,0 +1,167 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include <iomanip>
+
+#ifdef HAVE_OPENCL
+
+using namespace cv;
+using namespace cv::ocl;
+using namespace cvtest;
+using namespace testing;
+using namespace std;
+
+//#define DUMP
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// BroxOpticalFlow
+
+#define BROX_OPTICAL_FLOW_DUMP_FILE            "opticalflow/brox_optical_flow.bin"
+#define BROX_OPTICAL_FLOW_DUMP_FILE_CC20       "opticalflow/brox_optical_flow_cc20.bin"
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// PyrLKOpticalFlow
+
+//IMPLEMENT_PARAM_CLASS(UseGray, bool)
+
+PARAM_TEST_CASE(Sparse, bool, bool)
+{
+    bool useGray;
+    bool UseSmart;
+
+    virtual void SetUp()
+    {
+        UseSmart = GET_PARAM(0);
+		useGray = GET_PARAM(0);
+    }
+};
+
+TEST_P(Sparse, Mat)
+{
+    cv::Mat frame0 = readImage("../../../samples/gpu/rubberwhale1.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
+    ASSERT_FALSE(frame0.empty());
+
+    cv::Mat frame1 = readImage("../../../samples/gpu/rubberwhale2.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
+    ASSERT_FALSE(frame1.empty());
+
+    cv::Mat gray_frame;
+    if (useGray)
+        gray_frame = frame0;
+    else
+        cv::cvtColor(frame0, gray_frame, cv::COLOR_BGR2GRAY);
+
+    std::vector<cv::Point2f> pts;
+    cv::goodFeaturesToTrack(gray_frame, pts, 1000, 0.01, 0.0);
+
+    cv::ocl::oclMat d_pts;
+    cv::Mat pts_mat(1, (int)pts.size(), CV_32FC2, (void*)&pts[0]);
+    d_pts.upload(pts_mat);
+
+    cv::ocl::PyrLKOpticalFlow pyrLK;
+
+	cv::ocl::oclMat oclFrame0;
+	cv::ocl::oclMat oclFrame1;
+    cv::ocl::oclMat d_nextPts;
+    cv::ocl::oclMat d_status;
+    cv::ocl::oclMat d_err;
+
+	oclFrame0 = frame0;
+	oclFrame1 = frame1;
+
+    pyrLK.sparse(oclFrame0, oclFrame1, d_pts, d_nextPts, d_status, &d_err);
+
+    std::vector<cv::Point2f> nextPts(d_nextPts.cols);
+    cv::Mat nextPts_mat(1, d_nextPts.cols, CV_32FC2, (void*)&nextPts[0]);
+    d_nextPts.download(nextPts_mat);
+
+    std::vector<unsigned char> status(d_status.cols);
+    cv::Mat status_mat(1, d_status.cols, CV_8UC1, (void*)&status[0]);
+    d_status.download(status_mat);
+
+    std::vector<float> err(d_err.cols);
+    cv::Mat err_mat(1, d_err.cols, CV_32FC1, (void*)&err[0]);
+    d_err.download(err_mat);
+
+    std::vector<cv::Point2f> nextPts_gold;
+    std::vector<unsigned char> status_gold;
+    std::vector<float> err_gold;
+    cv::calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts_gold, status_gold, err_gold);
+
+    ASSERT_EQ(nextPts_gold.size(), nextPts.size());
+    ASSERT_EQ(status_gold.size(), status.size());
+
+    size_t mistmatch = 0;
+    for (size_t i = 0; i < nextPts.size(); ++i)
+    {
+        if (status[i] != status_gold[i])
+        {
+            ++mistmatch;
+            continue;
+        }
+
+        if (status[i])
+        {
+            cv::Point2i a = nextPts[i];
+            cv::Point2i b = nextPts_gold[i];
+
+            bool eq = std::abs(a.x - b.x) < 1 && std::abs(a.y - b.y) < 1;
+            //float errdiff = std::abs(err[i] - err_gold[i]);
+            float errdiff = 0.0f;
+
+            if (!eq || errdiff > 1e-1)
+                ++mistmatch;
+        }
+    }
+
+    double bad_ratio = static_cast<double>(mistmatch) / (nextPts.size() * 2);
+
+    ASSERT_LE(bad_ratio, 0.05f);
+	
+}
+
+INSTANTIATE_TEST_CASE_P(Video, Sparse, Combine(
+    Values(false, true),   
+	Values(false)));
+
+#endif // HAVE_OPENCL
+