add SURF and HOG to ocl module

12 years ago · 64e9cf5d75
parent a2df490914
commit 64e9cf5d75
6 changed files with 4597 additions and 0 deletions
--- a/modules/ocl/include/opencv2/ocl/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl/ocl.hpp
@ -12,6 +12,7 @@
 //
 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@ -924,6 +925,154 @@ namespace cv
 		const oclMat& src3, double beta, oclMat& dst, int flags = 0);
 #endif

+        //////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector //////////////
+        struct CV_EXPORTS HOGDescriptor
+        {
+            enum { DEFAULT_WIN_SIGMA = -1 };
+            enum { DEFAULT_NLEVELS = 64 };
+            enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL };
+
+            HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16),
+                          Size block_stride=Size(8, 8), Size cell_size=Size(8, 8),
+                          int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA,
+                          double threshold_L2hys=0.2, bool gamma_correction=true,
+                          int nlevels=DEFAULT_NLEVELS);
+
+            size_t getDescriptorSize() const;
+            size_t getBlockHistogramSize() const;
+
+            void setSVMDetector(const vector<float>& detector);
+
+            static vector<float> getDefaultPeopleDetector();
+            static vector<float> getPeopleDetector48x96();
+            static vector<float> getPeopleDetector64x128();
+
+            void detect(const oclMat& img, vector<Point>& found_locations,
+                        double hit_threshold=0, Size win_stride=Size(),
+                        Size padding=Size());
+
+            void detectMultiScale(const oclMat& img, vector<Rect>& found_locations,
+                                  double hit_threshold=0, Size win_stride=Size(),
+                                  Size padding=Size(), double scale0=1.05,
+                                  int group_threshold=2);
+
+            void getDescriptors(const oclMat& img, Size win_stride,
+                                oclMat& descriptors,
+                                int descr_format=DESCR_FORMAT_COL_BY_COL);
+
+            Size win_size;
+            Size block_size;
+            Size block_stride;
+            Size cell_size;
+            int nbins;
+            double win_sigma;
+            double threshold_L2hys;
+            bool gamma_correction;
+            int nlevels;
+
+        protected:
+            void computeBlockHistograms(const oclMat& img);
+            void computeGradient(const oclMat& img, oclMat& grad, oclMat& qangle);
+
+            double getWinSigma() const;
+            bool checkDetectorSize() const;
+
+            static int numPartsWithin(int size, int part_size, int stride);
+            static Size numPartsWithin(Size size, Size part_size, Size stride);
+
+            // Coefficients of the separating plane
+            float free_coef;
+            oclMat detector;
+
+            // Results of the last classification step
+            oclMat labels;
+            Mat labels_host;
+
+            // Results of the last histogram evaluation step
+            oclMat block_hists;
+
+            // Gradients conputation results
+            oclMat grad, qangle;
+
+            std::vector<oclMat> image_scales;
+        };
+
+        //! Speeded up robust features, port from GPU module.
+        ////////////////////////////////// SURF //////////////////////////////////////////
+        class CV_EXPORTS SURF_OCL
+        {
+        public:
+            enum KeypointLayout
+            {
+                X_ROW = 0,
+                Y_ROW,
+                LAPLACIAN_ROW,
+                OCTAVE_ROW,
+                SIZE_ROW,
+                ANGLE_ROW,
+                HESSIAN_ROW,
+                ROWS_COUNT
+            };
+
+            //! the default constructor
+            SURF_OCL();
+            //! the full constructor taking all the necessary parameters
+            explicit SURF_OCL(double _hessianThreshold, int _nOctaves=4,
+                int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f, bool _upright = false);
+
+            //! returns the descriptor size in float's (64 or 128)
+            int descriptorSize() const;
+            
+            //! upload host keypoints to device memory
+            void uploadKeypoints(const vector<cv::KeyPoint>& keypoints, oclMat& keypointsocl);
+            //! download keypoints from device to host memory
+            void downloadKeypoints(const oclMat& keypointsocl, vector<KeyPoint>& keypoints);
+
+            //! download descriptors from device to host memory
+            void downloadDescriptors(const oclMat& descriptorsocl, vector<float>& descriptors);
+
+            //! finds the keypoints using fast hessian detector used in SURF
+            //! supports CV_8UC1 images
+            //! keypoints will have nFeature cols and 6 rows
+            //! keypoints.ptr<float>(X_ROW)[i] will contain x coordinate of i'th feature
+            //! keypoints.ptr<float>(Y_ROW)[i] will contain y coordinate of i'th feature
+            //! keypoints.ptr<float>(LAPLACIAN_ROW)[i] will contain laplacian sign of i'th feature
+            //! keypoints.ptr<float>(OCTAVE_ROW)[i] will contain octave of i'th feature
+            //! keypoints.ptr<float>(SIZE_ROW)[i] will contain size of i'th feature
+            //! keypoints.ptr<float>(ANGLE_ROW)[i] will contain orientation of i'th feature
+            //! keypoints.ptr<float>(HESSIAN_ROW)[i] will contain response of i'th feature
+            void operator()(const oclMat& img, const oclMat& mask, oclMat& keypoints);
+            //! finds the keypoints and computes their descriptors.
+            //! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction
+            void operator()(const oclMat& img, const oclMat& mask, oclMat& keypoints, oclMat& descriptors,
+                bool useProvidedKeypoints = false);
+
+            void operator()(const oclMat& img, const oclMat& mask, std::vector<KeyPoint>& keypoints);
+            void operator()(const oclMat& img, const oclMat& mask, std::vector<KeyPoint>& keypoints, oclMat& descriptors,
+                bool useProvidedKeypoints = false);
+
+            void operator()(const oclMat& img, const oclMat& mask, std::vector<KeyPoint>& keypoints, std::vector<float>& descriptors,
+                bool useProvidedKeypoints = false);
+
+            void releaseMemory();
+
+            // SURF parameters
+            float hessianThreshold;
+            int nOctaves;
+            int nOctaveLayers;
+            bool extended;
+            bool upright;
+
+            //! max keypoints = min(keypointsRatio * img.size().area(), 65535)
+            float keypointsRatio;
+
+            oclMat sum, mask1, maskSum, intBuffer;
+
+            oclMat det, trace;
+
+            oclMat maxPosBuffer;
+
+        };
    }
 }
 #include "opencv2/ocl/matrix_operations.hpp"
--- a/modules/ocl/src/hog.cpp
+++ b/modules/ocl/src/hog.cpp
--- a/modules/ocl/src/kernels/nonfree_surf.cl
+++ b/modules/ocl/src/kernels/nonfree_surf.cl
--- a/modules/ocl/src/kernels/objdetect_hog.cl
+++ b/modules/ocl/src/kernels/objdetect_hog.cl
@ -0,0 +1,450 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Wenju He, wenju@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+#define CELL_WIDTH 8
+#define CELL_HEIGHT 8
+#define CELLS_PER_BLOCK_X 2
+#define CELLS_PER_BLOCK_Y 2
+#define NTHREADS 256
+#define CV_PI_F 3.1415926535897932384626433832795f
+
+//----------------------------------------------------------------------------
+// Histogram computation
+
+__kernel void compute_hists_kernel(const int width, const int cblock_stride_x, const int cblock_stride_y, 
+                                   const int cnbins, const int cblock_hist_size, const int img_block_width, 
+                                   const int grad_quadstep, const int qangle_step, 
+                                   __global const float* grad, __global const uchar* qangle, 
+                                   const float scale, __global float* block_hists, __local float* smem)
+{
+    const int lidX = get_local_id(0);
+    const int lidY = get_local_id(1);
+    const int gidX = get_group_id(0);
+    const int gidY = get_group_id(1);
+
+    const int cell_x = lidX / 16;
+    const int cell_y = lidY;
+    const int cell_thread_x = lidX & 0xF;
+
+    __local float* hists = smem;
+    __local float* final_hist = smem + cnbins * 48;
+
+    const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;
+    const int offset_y = gidY * cblock_stride_y + (cell_y << 2);
+
+    __global const float* grad_ptr = grad + offset_y * grad_quadstep + (offset_x << 1);
+    __global const uchar* qangle_ptr = qangle + offset_y * qangle_step + (offset_x << 1);
+
+    // 12 means that 12 pixels affect on block's cell (in one row)
+    if (cell_thread_x < 12)
+    {
+        __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) + cell_thread_x;
+        for (int bin_id = 0; bin_id < cnbins; ++bin_id)
+            hist[bin_id * 48] = 0.f;
+
+        const int dist_x = -4 + cell_thread_x - 4 * cell_x;
+
+        const int dist_y_begin = -4 - 4 * lidY;
+        for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
+        {
+            float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
+            uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]);
+
+            grad_ptr += grad_quadstep;
+            qangle_ptr += qangle_step;
+
+            int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
+            int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
+
+            float gaussian = exp(-(dist_center_y * dist_center_y + dist_center_x * dist_center_x) * scale);
+            float interp_weight = (8.f - fabs(dist_y + 0.5f)) * (8.f - fabs(dist_x + 0.5f)) / 64.f;
+
+            hist[bin.x * 48] += gaussian * interp_weight * vote.x;
+            hist[bin.y * 48] += gaussian * interp_weight * vote.y;
+        }
+
+        volatile __local float* hist_ = hist;
+        for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)
+        {
+            if (cell_thread_x < 6) hist_[0] += hist_[6];
+            if (cell_thread_x < 3) hist_[0] += hist_[3];
+            if (cell_thread_x == 0)
+                final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = hist_[0] + hist_[1] + hist_[2];
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    __global float* block_hist = block_hists + (gidY * img_block_width + gidX) * cblock_hist_size;
+
+    int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 16 + cell_thread_x;
+    if (tid < cblock_hist_size)
+        block_hist[tid] = final_hist[tid];
+}
+
+//-------------------------------------------------------------
+//  Normalization of histograms via L2Hys_norm
+//
+float reduce_smem(volatile __local float* smem, int size)
+{
+    unsigned int tid = get_local_id(0);
+    float sum = smem[tid];
+
+    if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; barrier(CLK_LOCAL_MEM_FENCE); }
+    if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; barrier(CLK_LOCAL_MEM_FENCE); }
+
+    if (tid < 32)
+    {
+        if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
+        if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
+        if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
+        if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
+        if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
+        if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    sum = smem[0];
+
+    return sum;
+}
+
+__kernel void normalize_hists_kernel(const int nthreads, const int block_hist_size, const int img_block_width,
+                                     __global float* block_hists, const float threshold, __local float *squares)
+{
+    const int tid = get_local_id(0);
+    const int gidX = get_group_id(0);
+    const int gidY = get_group_id(1);
+
+    __global float* hist = block_hists + (gidY * img_block_width + gidX) * block_hist_size + tid;
+
+    float elem = 0.f;
+    if (tid < block_hist_size)
+        elem = hist[0];
+
+    squares[tid] = elem * elem;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    float sum = reduce_smem(squares, nthreads);
+
+    float scale = 1.0f / (sqrt(sum) + 0.1f * block_hist_size);
+    elem = min(elem * scale, threshold);
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    squares[tid] = elem * elem;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    sum = reduce_smem(squares, nthreads);
+    scale = 1.0f / (sqrt(sum) + 1e-3f);
+
+    if (tid < block_hist_size)
+        hist[0] = elem * scale;
+}
+
+//---------------------------------------------------------------------
+//  Linear SVM based classification
+//
+__kernel void classify_hists_kernel(const int cblock_hist_size, const int cdescr_size, const int cdescr_width,
+                                    const int img_win_width, const int img_block_width,
+                                    const int win_block_stride_x, const int win_block_stride_y,
+                                    __global const float * block_hists, __global const float* coefs,
+                                    float free_coef, float threshold, __global uchar* labels)
+{
+    const int tid = get_local_id(0);
+    const int gidX = get_group_id(0);
+    const int gidY = get_group_id(1);
+
+    __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+
+    float product = 0.f;
+    for (int i = tid; i < cdescr_size; i += NTHREADS)
+    {
+        int offset_y = i / cdescr_width;
+        int offset_x = i - offset_y * cdescr_width;
+        product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
+    }
+
+    __local float products[NTHREADS];
+
+    products[tid] = product;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+ 
+    if (tid < 128) products[tid] = product = product + products[tid + 128];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    if (tid < 64) products[tid] = product = product + products[tid + 64];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 32)
+    {
+        volatile __local float* smem = products;
+        smem[tid] = product = product + smem[tid + 32];
+        smem[tid] = product = product + smem[tid + 16];
+        smem[tid] = product = product + smem[tid + 8];
+        smem[tid] = product = product + smem[tid + 4];
+        smem[tid] = product = product + smem[tid + 2];
+        smem[tid] = product = product + smem[tid + 1];
+    }
+
+    if (tid == 0)
+        labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
+}
+
+//----------------------------------------------------------------------------
+// Extract descriptors
+
+__kernel void extract_descrs_by_rows_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, const int cdescr_width,
+                                            const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
+									        __global const float* block_hists, __global float* descriptors)
+{
+    int tid = get_local_id(0);
+    int gidX = get_group_id(0);
+    int gidY = get_group_id(1);
+    
+    // Get left top corner of the window in src
+    __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+
+    // Get left top corner of the window in dst
+    __global float* descriptor = descriptors + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
+
+    // Copy elements from src to dst
+    for (int i = tid; i < cdescr_size; i += NTHREADS)
+    {
+        int offset_y = i / cdescr_width;
+        int offset_x = i - offset_y * cdescr_width;
+        descriptor[i] = hist[offset_y * img_block_width * cblock_hist_size + offset_x];
+    }
+}
+
+__kernel void extract_descrs_by_cols_kernel(const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size, 
+                                            const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width, const int win_block_stride_x,
+                                            const int win_block_stride_y, __global const float* block_hists, __global float* descriptors)
+{
+    int tid = get_local_id(0);
+    int gidX = get_group_id(0);
+    int gidY = get_group_id(1);
+
+    // Get left top corner of the window in src
+    __global const float* hist = block_hists + (gidY * win_block_stride_y * img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
+
+    // Get left top corner of the window in dst
+    __global float* descriptor = descriptors + (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
+
+    // Copy elements from src to dst
+    for (int i = tid; i < cdescr_size; i += NTHREADS)
+    {
+        int block_idx = i / cblock_hist_size;
+        int idx_in_block = i - block_idx * cblock_hist_size;
+
+        int y = block_idx / cnblocks_win_x;
+        int x = block_idx - y * cnblocks_win_x;
+
+        descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] = hist[(y * img_block_width  + x) * cblock_hist_size + idx_in_block];
+    }
+}
+
+//----------------------------------------------------------------------------
+// Gradients computation
+
+__kernel void compute_gradients_8UC4_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step, 
+                                            const __global uchar4 * img, __global float * grad, __global uchar * qangle, 
+                                            const float angle_scale, const char correct_gamma, const int cnbins)
+{
+    const int x = get_global_id(0);
+    const int tid = get_local_id(0);
+    const int gSizeX = get_local_size(0);
+    const int gidX = get_group_id(0);
+    const int gidY = get_group_id(1);
+
+    __global const uchar4* row = img + gidY * img_step;
+
+    __local float sh_row[(NTHREADS + 2) * 3];
+
+    uchar4 val;
+    if (x < width)
+        val = row[x];
+    else
+        val = row[width - 2];
+
+    sh_row[tid + 1] = val.x;
+    sh_row[tid + 1 + (NTHREADS + 2)] = val.y;
+    sh_row[tid + 1 + 2 * (NTHREADS + 2)] = val.z;
+
+    if (tid == 0)
+    {
+        val = row[max(x - 1, 1)];
+        sh_row[0] = val.x;
+        sh_row[(NTHREADS + 2)] = val.y;
+        sh_row[2 * (NTHREADS + 2)] = val.z;
+    }
+
+    if (tid == gSizeX - 1)
+    {
+        val = row[min(x + 1, width - 2)];
+        sh_row[gSizeX + 1] = val.x;
+        sh_row[gSizeX + 1 + (NTHREADS + 2)] = val.y;
+        sh_row[gSizeX + 1 + 2 * (NTHREADS + 2)] = val.z;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (x < width)
+    {
+        float3 a = (float3) (sh_row[tid], sh_row[tid + (NTHREADS + 2)], sh_row[tid + 2 * (NTHREADS + 2)]);
+        float3 b = (float3) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)], sh_row[tid + 2 + 2 * (NTHREADS + 2)]);
+
+        float3 dx;
+        if (correct_gamma == 1)
+            dx = sqrt(b) - sqrt(a);
+        else
+            dx = b - a;
+
+        float3 dy = (float3) 0.f;
+
+        if (gidY > 0 && gidY < height - 1)
+        {
+            a = convert_float3(img[(gidY - 1) * img_step + x].xyz);
+            b = convert_float3(img[(gidY + 1) * img_step + x].xyz);
+
+            if (correct_gamma == 1)
+                dy = sqrt(b) - sqrt(a);
+            else
+                dy = b - a;
+        }
+
+        float best_dx = dx.x;
+        float best_dy = dy.x;
+
+        float mag0 = dx.x * dx.x + dy.x * dy.x;
+        float mag1 = dx.y * dx.y + dy.y * dy.y;
+        if (mag0 < mag1)
+        {
+            best_dx = dx.y;
+            best_dy = dy.y;
+            mag0 = mag1;
+        }
+
+        mag1 = dx.z * dx.z + dy.z * dy.z;
+        if (mag0 < mag1)
+        {
+            best_dx = dx.z;
+            best_dy = dy.z;
+            mag0 = mag1;
+        }
+
+        mag0 = sqrt(mag0);
+
+        float ang = (atan2(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;
+        int hidx = (int)floor(ang);
+        ang -= hidx;
+        hidx = (hidx + cnbins) % cnbins;
+
+        qangle[(gidY * qangle_step + x) << 1] = hidx;
+        qangle[((gidY * qangle_step + x) << 1) + 1] = (hidx + 1) % cnbins;
+        grad[(gidY * grad_quadstep + x) << 1] = mag0 * (1.f - ang);
+        grad[((gidY * grad_quadstep + x) << 1) + 1] = mag0 * ang;
+    }
+}
+
+__kernel void compute_gradients_8UC1_kernel(const int height, const int width, const int img_step, const int grad_quadstep, const int qangle_step,
+                                            __global const uchar * img, __global float * grad, __global uchar * qangle, 
+                                            const float angle_scale, const char correct_gamma, const int cnbins)
+{
+    const int x = get_global_id(0);
+    const int tid = get_local_id(0);
+    const int gSizeX = get_local_size(0);
+    const int gidX = get_group_id(0);
+    const int gidY = get_group_id(1);
+
+    __global const uchar* row = img + gidY * img_step;
+
+    __local float sh_row[NTHREADS + 2];
+
+    if (x < width)
+        sh_row[tid + 1] = row[x];
+    else
+        sh_row[tid + 1] = row[width - 2];
+
+    if (tid == 0)
+        sh_row[0] = row[max(x - 1, 1)];
+
+    if (tid == gSizeX - 1)
+        sh_row[gSizeX + 1] = row[min(x + 1, width - 2)];
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (x < width)
+    {
+        float dx;
+
+        if (correct_gamma == 1)
+            dx = sqrt(sh_row[tid + 2]) - sqrt(sh_row[tid]);
+        else
+            dx = sh_row[tid + 2] - sh_row[tid];
+
+        float dy = 0.f;
+        if (gidY > 0 && gidY < height - 1)
+        {
+            float a = (float) img[ (gidY + 1) * img_step + x ];
+            float b = (float) img[ (gidY - 1) * img_step + x ];
+            if (correct_gamma == 1)
+                dy = sqrt(a) - sqrt(b);
+            else
+                dy = a - b;
+        }
+        float mag = sqrt(dx * dx + dy * dy);
+
+        float ang = (atan2(dy, dx) + CV_PI_F) * angle_scale - 0.5f;
+        int hidx = (int)floor(ang);
+        ang -= hidx;
+        hidx = (hidx + cnbins) % cnbins;
+
+        qangle[ (gidY * qangle_step + x) << 1 ]     = hidx;
+        qangle[ ((gidY * qangle_step + x) << 1) + 1 ] = (hidx + 1) % cnbins;
+        grad[ (gidY * grad_quadstep + x) << 1 ]       = mag * (1.f - ang);
+        grad[ ((gidY * grad_quadstep + x) << 1) + 1 ]   = mag * ang;
+    }
+}
--- a/modules/ocl/src/surf.cpp
+++ b/modules/ocl/src/surf.cpp
@ -0,0 +1,760 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include <iomanip>
+#include "precomp.hpp"
+
+
+using namespace cv;
+using namespace cv::ocl;
+using namespace std;
+
+#if !defined (HAVE_OPENCL)
+
+cv::ocl::SURF_OCL::SURF_OCL() { throw_nogpu(); }
+cv::ocl::SURF_OCL::SURF_OCL(double, int, int, bool, float, bool) { throw_nogpu(); }
+int cv::ocl::SURF_OCL::descriptorSize() const { throw_nogpu(); return 0;}
+void cv::ocl::SURF_OCL::uploadKeypoints(const vector<KeyPoint>&, oclMat&) { throw_nogpu(); }
+void cv::ocl::SURF_OCL::downloadKeypoints(const oclMat&, vector<KeyPoint>&) { throw_nogpu(); }
+void cv::ocl::SURF_OCL::downloadDescriptors(const oclMat&, vector<float>&) { throw_nogpu(); }
+void cv::ocl::SURF_OCL::operator()(const oclMat&, const oclMat&, oclMat&) { throw_nogpu(); }
+void cv::ocl::SURF_OCL::operator()(const oclMat&, const oclMat&, oclMat&, oclMat&, bool) { throw_nogpu(); }
+void cv::ocl::SURF_OCL::operator()(const oclMat&, const oclMat&, vector<KeyPoint>&) { throw_nogpu(); }
+void cv::ocl::SURF_OCL::operator()(const oclMat&, const oclMat&, vector<KeyPoint>&, oclMat&, bool) { throw_nogpu(); }
+void cv::ocl::SURF_OCL::operator()(const oclMat&, const oclMat&, vector<KeyPoint>&, vector<float>&, bool) { throw_nogpu(); }
+void cv::ocl::SURF_OCL::releaseMemory() { throw_nogpu(); }
+
+#else /* !defined (HAVE_OPENCL) */
+namespace cv { namespace ocl 
+{
+    ///////////////////////////OpenCL kernel strings///////////////////////////
+    extern const char * nonfree_surf;
+}}
+
+namespace 
+{
+    static inline int divUp(int total, int grain)
+    {
+        return (total + grain - 1) / grain;
+    }
+    static inline int calcSize(int octave, int layer)
+    {
+        /* Wavelet size at first layer of first octave. */
+        const int HAAR_SIZE0 = 9;
+
+        /* Wavelet size increment between layers. This should be an even number,
+        such that the wavelet sizes in an octave are either all even or all odd.
+        This ensures that when looking for the neighbours of a sample, the layers
+
+        above and below are aligned correctly. */
+        const int HAAR_SIZE_INC = 6;
+
+        return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
+    }
+
+    class SURF_OCL_Invoker
+    {
+    public:
+        // facilities
+        void bindImgTex(const oclMat& img);
+        void bindSumTex(const oclMat& sum);
+        void bindMaskSumTex(const oclMat& maskSum);
+
+        //void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
+        //void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
+
+        // kernel callers declearations
+        void icvCalcLayerDetAndTrace_gpu(oclMat& det, oclMat& trace, int octave, int nOctaveLayers, int layer_rows);
+
+        void icvFindMaximaInLayer_gpu(const oclMat& det, const oclMat& trace, oclMat& maxPosBuffer, oclMat& maxCounter, int counterOffset,
+            int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols);
+
+        void icvInterpolateKeypoint_gpu(const oclMat& det, const oclMat& maxPosBuffer, unsigned int maxCounter,
+            oclMat& keypoints, oclMat& counters, int octave, int layer_rows, int maxFeatures);
+
+        void icvCalcOrientation_gpu(const oclMat& keypoints, int nFeatures);
+
+        void compute_descriptors_gpu(const oclMat& descriptors, const oclMat& keypoints, int nFeatures);
+        // end of kernel callers declearations
+
+
+        SURF_OCL_Invoker(SURF_OCL& surf, const oclMat& img, const oclMat& mask) :
+        surf_(surf),
+            img_cols(img.cols), img_rows(img.rows),
+            use_mask(!mask.empty())
+        {
+            CV_Assert(!img.empty() && img.type() == CV_8UC1);
+            CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
+            CV_Assert(surf_.nOctaves > 0 && surf_.nOctaveLayers > 0);
+
+            const int min_size = calcSize(surf_.nOctaves - 1, 0);
+            CV_Assert(img_rows - min_size >= 0);
+            CV_Assert(img_cols - min_size >= 0);
+
+            const int layer_rows = img_rows >> (surf_.nOctaves - 1);
+            const int layer_cols = img_cols >> (surf_.nOctaves - 1);
+            const int min_margin = ((calcSize((surf_.nOctaves - 1), 2) >> 1) >> (surf_.nOctaves - 1)) + 1;
+            CV_Assert(layer_rows - 2 * min_margin > 0);
+            CV_Assert(layer_cols - 2 * min_margin > 0);
+
+            maxFeatures   = std::min(static_cast<int>(img.size().area() * surf.keypointsRatio), 65535);
+            maxCandidates = std::min(static_cast<int>(1.5 * maxFeatures), 65535);
+
+            CV_Assert(maxFeatures > 0);
+
+            counters.create(1, surf_.nOctaves + 1, CV_32SC1);
+            counters.setTo(Scalar::all(0));
+
+            //loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast<float>(surf_.hessianThreshold));
+
+            bindImgTex(img);
+            oclMat integral_sqsum;
+            integral(img, surf_.sum, integral_sqsum); // the two argumented integral version is incorrect
+
+            bindSumTex(surf_.sum);
+            maskSumTex = 0;
+
+            if (use_mask)
+            {
+                throw std::exception();
+                //!FIXME
+                // temp fix for missing min overload
+                oclMat temp(mask.size(), mask.type());
+                temp.setTo(Scalar::all(1.0));
+                //cv::ocl::min(mask, temp, surf_.mask1);           ///////// disable this 
+                integral(surf_.mask1, surf_.maskSum);
+                bindMaskSumTex(surf_.maskSum);
+            }
+        }
+
+        void detectKeypoints(oclMat& keypoints)
+        {
+            // create image pyramid buffers
+            // different layers have same sized buffers, but they are sampled from gaussin kernel.
+            surf_.det.create(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1);  
+            surf_.trace.create(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1);
+
+            surf_.maxPosBuffer.create(1, maxCandidates, CV_32SC4);
+            keypoints.create(SURF_OCL::ROWS_COUNT, maxFeatures, CV_32FC1);
+            keypoints.setTo(Scalar::all(0));
+
+            for (int octave = 0; octave < surf_.nOctaves; ++octave)
+            {
+                const int layer_rows = img_rows >> octave;
+                const int layer_cols = img_cols >> octave;
+
+                //loadOctaveConstants(octave, layer_rows, layer_cols);
+
+                icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, octave, surf_.nOctaveLayers, layer_rows);
+
+                icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer, counters, 1 + octave,
+                    octave, use_mask, surf_.nOctaveLayers, layer_rows, layer_cols);
+
+                unsigned int maxCounter = Mat(counters).at<unsigned int>(1 + octave);
+                maxCounter = std::min(maxCounter, static_cast<unsigned int>(maxCandidates));
+
+                if (maxCounter > 0)
+                {
+                    icvInterpolateKeypoint_gpu(surf_.det, surf_.maxPosBuffer, maxCounter,
+                        keypoints, counters, octave, layer_rows, maxFeatures);
+                }
+            }
+            unsigned int featureCounter = Mat(counters).at<unsigned int>(0);
+            featureCounter = std::min(featureCounter, static_cast<unsigned int>(maxFeatures));
+
+            keypoints.cols = featureCounter;
+
+            if (surf_.upright)
+                keypoints.row(SURF_OCL::ANGLE_ROW).setTo(Scalar::all(90.0));
+            else
+                findOrientation(keypoints);
+        }
+
+        void findOrientation(oclMat& keypoints)
+        {
+            const int nFeatures = keypoints.cols;
+            if (nFeatures > 0)
+            {
+                icvCalcOrientation_gpu(keypoints, nFeatures);
+            }
+        }
+
+        void computeDescriptors(const oclMat& keypoints, oclMat& descriptors, int descriptorSize)
+        {
+            const int nFeatures = keypoints.cols;
+            if (nFeatures > 0)
+            {
+                descriptors.create(nFeatures, descriptorSize, CV_32F);
+                compute_descriptors_gpu(descriptors, keypoints, nFeatures);
+            }
+        }
+
+        ~SURF_OCL_Invoker()
+        {
+            if(imgTex)
+                openCLFree(imgTex);
+            if(sumTex)
+                openCLFree(sumTex);
+            if(maskSumTex)
+                openCLFree(maskSumTex);
+            additioalParamBuffer.release();
+        }
+
+    private:
+        SURF_OCL& surf_;
+
+        int img_cols, img_rows;
+
+        bool use_mask;
+
+        int maxCandidates;
+        int maxFeatures;
+
+        oclMat counters;
+
+        // texture buffers
+        cl_mem imgTex;
+        cl_mem sumTex;
+        cl_mem maskSumTex;
+
+        oclMat additioalParamBuffer;
+    };
+}
+
+cv::ocl::SURF_OCL::SURF_OCL()
+{
+    hessianThreshold = 100.0f;
+    extended = true;
+    nOctaves = 4;
+    nOctaveLayers = 2;
+    keypointsRatio = 0.01f;
+    upright = false;
+}
+
+cv::ocl::SURF_OCL::SURF_OCL(double _threshold, int _nOctaves, int _nOctaveLayers, bool _extended, float _keypointsRatio, bool _upright)
+{
+    hessianThreshold = _threshold;
+    extended = _extended;
+    nOctaves = _nOctaves;
+    nOctaveLayers = _nOctaveLayers;
+    keypointsRatio = _keypointsRatio;
+    upright = _upright;
+}
+
+int cv::ocl::SURF_OCL::descriptorSize() const
+{
+    return extended ? 128 : 64;
+}
+
+void cv::ocl::SURF_OCL::uploadKeypoints(const vector<KeyPoint>& keypoints, oclMat& keypointsGPU)
+{
+    if (keypoints.empty())
+        keypointsGPU.release();
+    else
+    {
+        Mat keypointsCPU(SURF_OCL::ROWS_COUNT, static_cast<int>(keypoints.size()), CV_32FC1);
+
+        float* kp_x = keypointsCPU.ptr<float>(SURF_OCL::X_ROW);
+        float* kp_y = keypointsCPU.ptr<float>(SURF_OCL::Y_ROW);
+        int* kp_laplacian = keypointsCPU.ptr<int>(SURF_OCL::LAPLACIAN_ROW);
+        int* kp_octave = keypointsCPU.ptr<int>(SURF_OCL::OCTAVE_ROW);
+        float* kp_size = keypointsCPU.ptr<float>(SURF_OCL::SIZE_ROW);
+        float* kp_dir = keypointsCPU.ptr<float>(SURF_OCL::ANGLE_ROW);
+        float* kp_hessian = keypointsCPU.ptr<float>(SURF_OCL::HESSIAN_ROW);
+
+        for (size_t i = 0, size = keypoints.size(); i < size; ++i)
+        {
+            const KeyPoint& kp = keypoints[i];
+            kp_x[i] = kp.pt.x;
+            kp_y[i] = kp.pt.y;
+            kp_octave[i] = kp.octave;
+            kp_size[i] = kp.size;
+            kp_dir[i] = kp.angle;
+            kp_hessian[i] = kp.response;
+            kp_laplacian[i] = 1;
+        }
+
+        keypointsGPU.upload(keypointsCPU);
+    }
+}
+
+void cv::ocl::SURF_OCL::downloadKeypoints(const oclMat& keypointsGPU, vector<KeyPoint>& keypoints)
+{
+    const int nFeatures = keypointsGPU.cols;
+
+    if (nFeatures == 0)
+        keypoints.clear();
+    else
+    {
+        CV_Assert(keypointsGPU.type() == CV_32FC1 && keypointsGPU.rows == ROWS_COUNT);
+
+        Mat keypointsCPU(keypointsGPU);
+
+        keypoints.resize(nFeatures);
+
+        float* kp_x = keypointsCPU.ptr<float>(SURF_OCL::X_ROW);
+        float* kp_y = keypointsCPU.ptr<float>(SURF_OCL::Y_ROW);
+        int* kp_laplacian = keypointsCPU.ptr<int>(SURF_OCL::LAPLACIAN_ROW);
+        int* kp_octave = keypointsCPU.ptr<int>(SURF_OCL::OCTAVE_ROW);
+        float* kp_size = keypointsCPU.ptr<float>(SURF_OCL::SIZE_ROW);
+        float* kp_dir = keypointsCPU.ptr<float>(SURF_OCL::ANGLE_ROW);
+        float* kp_hessian = keypointsCPU.ptr<float>(SURF_OCL::HESSIAN_ROW);
+
+        for (int i = 0; i < nFeatures; ++i)
+        {
+            KeyPoint& kp = keypoints[i];
+            kp.pt.x = kp_x[i];
+            kp.pt.y = kp_y[i];
+            kp.class_id = kp_laplacian[i];
+            kp.octave = kp_octave[i];
+            kp.size = kp_size[i];
+            kp.angle = kp_dir[i];
+            kp.response = kp_hessian[i];
+        }
+    }
+}
+
+void cv::ocl::SURF_OCL::downloadDescriptors(const oclMat& descriptorsGPU, vector<float>& descriptors)
+{
+    if (descriptorsGPU.empty())
+        descriptors.clear();
+    else
+    {
+        CV_Assert(descriptorsGPU.type() == CV_32F);
+
+        descriptors.resize(descriptorsGPU.rows * descriptorsGPU.cols);
+        Mat descriptorsCPU(descriptorsGPU.size(), CV_32F, &descriptors[0]);
+        descriptorsGPU.download(descriptorsCPU);
+    }
+}
+
+void cv::ocl::SURF_OCL::operator()(const oclMat& img, const oclMat& mask, oclMat& keypoints)
+{
+    if (!img.empty())
+    {
+        SURF_OCL_Invoker surf(*this, img, mask);
+
+        surf.detectKeypoints(keypoints);
+    }
+}
+
+void cv::ocl::SURF_OCL::operator()(const oclMat& img, const oclMat& mask, oclMat& keypoints, oclMat& descriptors,
+    bool useProvidedKeypoints)
+{
+    if (!img.empty())
+    {
+        SURF_OCL_Invoker surf(*this, img, mask);
+
+        if (!useProvidedKeypoints)
+            surf.detectKeypoints(keypoints);
+        else if (!upright)
+        {
+            surf.findOrientation(keypoints);
+        }
+
+        surf.computeDescriptors(keypoints, descriptors, descriptorSize());
+    }
+}
+
+void cv::ocl::SURF_OCL::operator()(const oclMat& img, const oclMat& mask, vector<KeyPoint>& keypoints)
+{
+    oclMat keypointsGPU;
+
+    (*this)(img, mask, keypointsGPU);
+
+    downloadKeypoints(keypointsGPU, keypoints);
+}
+
+void cv::ocl::SURF_OCL::operator()(const oclMat& img, const oclMat& mask, vector<KeyPoint>& keypoints,
+    oclMat& descriptors, bool useProvidedKeypoints)
+{
+    oclMat keypointsGPU;
+
+    if (useProvidedKeypoints)
+        uploadKeypoints(keypoints, keypointsGPU);
+
+    (*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints);
+
+    downloadKeypoints(keypointsGPU, keypoints);
+}
+
+void cv::ocl::SURF_OCL::operator()(const oclMat& img, const oclMat& mask, vector<KeyPoint>& keypoints,
+    vector<float>& descriptors, bool useProvidedKeypoints)
+{
+    oclMat descriptorsGPU;
+
+    (*this)(img, mask, keypoints, descriptorsGPU, useProvidedKeypoints);
+
+    downloadDescriptors(descriptorsGPU, descriptors);
+}
+
+void cv::ocl::SURF_OCL::releaseMemory()
+{
+    sum.release();
+    mask1.release();
+    maskSum.release();
+    intBuffer.release();
+    det.release();
+    trace.release();
+    maxPosBuffer.release();
+}
+
+// Facilities
+
+//// load SURF constants into device memory
+//void SURF_OCL_Invoker::loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold)
+//{
+//	Mat tmp(1, 9, CV_32FC1);
+//	float * tmp_data = tmp.ptr<float>();
+//	*tmp_data        = maxCandidates;
+//	*(++tmp_data)    = maxFeatures;
+//	*(++tmp_data)    = img_rows;
+//	*(++tmp_data)    = img_cols;
+//	*(++tmp_data)    = nOctaveLayers;
+//	*(++tmp_data)    = hessianThreshold;
+//	additioalParamBuffer = tmp;
+//}
+//void SURF_OCL_Invoker::loadOctaveConstants(int octave, int layer_rows, int layer_cols)
+//{
+//	Mat tmp = additioalParamBuffer;
+//	float * tmp_data = tmp.ptr<float>();
+//	tmp_data += 6;
+//	*tmp_data        = octave;
+//	*(++tmp_data)    = layer_rows;
+//	*(++tmp_data)    = layer_cols;
+//	additioalParamBuffer = tmp;
+//}
+
+// create and bind source buffer to image oject.
+void SURF_OCL_Invoker::bindImgTex(const oclMat& img)
+{
+    Mat cpu_img(img); // time consuming
+    cl_image_format format;
+    int err;
+
+    format.image_channel_data_type = CL_UNSIGNED_INT8;
+    format.image_channel_order     = CL_R;
+
+#if CL_VERSION_1_2
+    cl_image_desc desc;
+    desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
+    desc.image_width      = cpu_img.cols;
+    desc.image_height     = cpu_img.rows;
+    desc.image_depth      = NULL;
+    desc.image_array_size = 1;
+    desc.image_row_pitch  = cpu_img.step;
+    desc.image_slice_pitch= 0;
+    desc.buffer           = NULL;
+    desc.num_mip_levels   = 0;
+    desc.num_samples      = 0;
+    imgTex = clCreateImage(img.clCxt->impl->clContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &format, &desc, cpu_img.data, &err);
+#else
+    imgTex = clCreateImage2D(
+        img.clCxt->impl->clContext, 
+        CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 
+        &format, 
+        cpu_img.cols, 
+        cpu_img.rows, 
+        cpu_img.step, 
+        cpu_img.data, 
+        &err);
+#endif
+    openCLSafeCall(err);
+}
+
+void SURF_OCL_Invoker::bindSumTex(const oclMat& sum)
+{
+    Mat cpu_img(sum); // time consuming
+    cl_image_format format;
+    int err;
+    format.image_channel_data_type = CL_UNSIGNED_INT32;
+    format.image_channel_order     = CL_R;
+#if CL_VERSION_1_2
+    cl_image_desc desc;
+    desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
+    desc.image_width      = cpu_img.cols;
+    desc.image_height     = cpu_img.rows;
+    desc.image_depth      = NULL;
+    desc.image_array_size = 1;
+    desc.image_row_pitch  = cpu_img.step;
+    desc.image_slice_pitch= 0;
+    desc.buffer           = NULL;
+    desc.num_mip_levels   = 0;
+    desc.num_samples      = 0;
+    sumTex = clCreateImage(sum.clCxt->impl->clContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &format, &desc, cpu_img.data, &err);
+#else
+    sumTex = clCreateImage2D(
+        sum.clCxt->impl->clContext, 
+        CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 
+        &format, 
+        cpu_img.cols, 
+        cpu_img.rows, 
+        cpu_img.step, 
+        cpu_img.data, 
+        &err);
+#endif
+    openCLSafeCall(err);
+}
+void SURF_OCL_Invoker::bindMaskSumTex(const oclMat& maskSum)
+{
+    Mat cpu_img(maskSum); // time consuming
+    cl_image_format format;
+    int err;
+    format.image_channel_data_type = CL_UNSIGNED_INT32;
+    format.image_channel_order     = CL_R;
+#if CL_VERSION_1_2
+    cl_image_desc desc;
+    desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
+    desc.image_width      = cpu_img.cols;
+    desc.image_height     = cpu_img.rows;
+    desc.image_depth      = NULL;
+    desc.image_array_size = 1;
+    desc.image_row_pitch  = cpu_img.step;
+    desc.image_slice_pitch= 0;
+    desc.buffer           = NULL;
+    desc.num_mip_levels   = 0;
+    desc.num_samples      = 0;
+    maskSumTex = clCreateImage(maskSum.clCxt->impl->clContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &format, &desc, cpu_img.data, &err);
+#else
+    maskSumTex = clCreateImage2D(
+        maskSum.clCxt->impl->clContext, 
+        CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 
+        &format, 
+        cpu_img.cols, 
+        cpu_img.rows, 
+        cpu_img.step, 
+        cpu_img.data, 
+        &err);
+#endif
+    openCLSafeCall(err);
+}
+
+////////////////////////////
+// kernel caller definitions
+void SURF_OCL_Invoker::icvCalcLayerDetAndTrace_gpu(oclMat& det, oclMat& trace, int octave, int nOctaveLayers, int c_layer_rows)
+{
+    const int min_size = calcSize(octave, 0);
+    const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
+    const int max_samples_j = 1 + ((img_cols - min_size) >> octave);
+
+    Context *clCxt = det.clCxt;
+    string kernelName = "icvCalcLayerDetAndTrace";
+    vector< pair<size_t, const void *> > args;
+
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&sumTex));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&trace.data));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&det.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&trace.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&nOctaveLayers));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&octave));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&c_layer_rows));
+
+    size_t localThreads[3]  = {16, 16, 1};
+    size_t globalThreads[3] = {
+        divUp(max_samples_j, localThreads[0]) * localThreads[0], 
+        divUp(max_samples_i, localThreads[1]) * localThreads[1] * (nOctaveLayers + 2), 
+        1};
+        openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+}
+
+void SURF_OCL_Invoker::icvFindMaximaInLayer_gpu(const oclMat& det, const oclMat& trace, oclMat& maxPosBuffer, oclMat& maxCounter, int counterOffset,
+    int octave, bool use_mask, int nLayers, int layer_rows, int layer_cols)
+{
+    const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;
+
+    Context *clCxt = det.clCxt;
+    string kernelName = use_mask ? "icvFindMaximaInLayer_withmask" : "icvFindMaximaInLayer";
+    vector< pair<size_t, const void *> > args;
+
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&trace.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&maxCounter.data));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&counterOffset));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&det.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&trace.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&nLayers));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&octave));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&layer_rows));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&layer_cols));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&maxCandidates));
+    args.push_back( make_pair( sizeof(cl_float), (void *)&surf_.hessianThreshold));
+
+    if(use_mask)
+    {
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&maskSumTex));
+    }
+
+    size_t localThreads[3]  = {16, 16, 1};
+    size_t globalThreads[3] = {divUp(layer_cols - 2 * min_margin, localThreads[0] - 2) * localThreads[0], 
+        divUp(layer_rows - 2 * min_margin, localThreads[1] - 2) * nLayers * localThreads[1], 
+        1};
+
+    openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+}
+
+void SURF_OCL_Invoker::icvInterpolateKeypoint_gpu(const oclMat& det, const oclMat& maxPosBuffer, unsigned int maxCounter,
+    oclMat& keypoints, oclMat& counters, int octave, int layer_rows, int maxFeatures)
+{
+    Context *clCxt = det.clCxt;
+    string kernelName = "icvInterpolateKeypoint";
+    vector< pair<size_t, const void *> > args;
+
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&det.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&maxPosBuffer.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&counters.data));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&det.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&octave));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&layer_rows));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&maxFeatures));
+
+    size_t localThreads[3]  = {3, 3, 3};
+    size_t globalThreads[3] = {maxCounter * localThreads[0], 1, 1};
+
+    openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+}
+
+void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat& keypoints, int nFeatures)
+{
+    Context * clCxt = counters.clCxt;
+    string kernelName = "icvCalcOrientation";
+
+    vector< pair<size_t, const void *> > args;
+
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&sumTex));
+    args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&img_rows));
+    args.push_back( make_pair( sizeof(cl_int), (void *)&img_cols));
+
+    size_t localThreads[3]  = {32, 4, 1};
+    size_t globalThreads[3] = {nFeatures * localThreads[0], localThreads[1], 1};
+
+    openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+}
+
+void SURF_OCL_Invoker::compute_descriptors_gpu(const oclMat& descriptors, const oclMat& keypoints, int nFeatures)
+{
+    // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
+    Context *clCxt = descriptors.clCxt;
+    string kernelName = "";
+    vector< pair<size_t, const void *> > args;
+    size_t localThreads[3]  = {1, 1, 1};
+    size_t globalThreads[3] = {1, 1, 1};
+
+    if(descriptors.cols == 64)
+    {
+        kernelName = "compute_descriptors64";
+
+        localThreads[0] = 6;
+        localThreads[1] = 6;
+
+        globalThreads[0] = nFeatures * localThreads[0];
+        globalThreads[1] = 16 * localThreads[1];
+
+        args.clear();
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&imgTex));
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
+        openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+
+        kernelName = "normalize_descriptors64";
+
+        localThreads[0] = 64;
+        localThreads[1] = 1;
+
+        globalThreads[0] = nFeatures * localThreads[0];
+        globalThreads[1] = localThreads[1];
+
+        args.clear();
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
+        openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+    }
+    else
+    {
+        kernelName = "compute_descriptors128";
+
+        localThreads[0] = 6;
+        localThreads[1] = 6;
+
+        globalThreads[0] = nFeatures * localThreads[0];
+        globalThreads[1] = 16 * localThreads[1];
+
+        args.clear();
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&imgTex));
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&keypoints.data));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&keypoints.step));
+        openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+
+        kernelName = "normalize_descriptors128";
+
+        localThreads[0] = 128;
+        localThreads[1] = 1;
+
+        globalThreads[0] = nFeatures * localThreads[0];
+        globalThreads[1] = localThreads[1];
+
+        args.clear();
+        args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
+        args.push_back( make_pair( sizeof(cl_int), (void *)&descriptors.step));
+        openCLExecuteKernel(clCxt, &nonfree_surf, kernelName, globalThreads, localThreads, args, -1, -1);
+    }
+}
+
+#endif // /* !defined (HAVE_OPENCL) */
+
--- a/modules/ocl/test/test_hog.cpp
+++ b/modules/ocl/test/test_hog.cpp
@ -0,0 +1,192 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//		Wenju He, wenju@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include "opencv2/core/core.hpp"
+using namespace std;
+#ifdef HAVE_OPENCL
+
+
+PARAM_TEST_CASE(HOG,cv::Size,int)
+{
+	cv::Size winSize;
+	int type;
+	vector<cv::ocl::Info> info;
+	virtual void SetUp()
+	{
+		winSize = GET_PARAM(0);
+		type = GET_PARAM(1);
+		cv::ocl::getDevice(info);
+	}
+};
+
+TEST_P(HOG, GetDescriptors)
+{
+    // Load image
+    cv::Mat img_rgb = readImage("../../../samples/gpu/road.png");
+    ASSERT_FALSE(img_rgb.empty());
+
+    // Convert image
+    cv::Mat img;
+    switch (type)
+    {
+    case CV_8UC1:
+        cv::cvtColor(img_rgb, img, CV_BGR2GRAY);
+        break;
+    case CV_8UC4:
+    default:
+        cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
+        break;
+    }
+    cv::ocl::oclMat d_img(img);
+
+    // HOGs
+    cv::ocl::HOGDescriptor ocl_hog;
+    ocl_hog.gamma_correction = true;
+    cv::HOGDescriptor hog;
+    hog.gammaCorrection = true;
+
+    // Compute descriptor
+    cv::ocl::oclMat d_descriptors;
+    ocl_hog.getDescriptors(d_img, ocl_hog.win_size, d_descriptors, ocl_hog.DESCR_FORMAT_COL_BY_COL);
+    cv::Mat down_descriptors;
+    d_descriptors.download(down_descriptors);
+    down_descriptors = down_descriptors.reshape(0, down_descriptors.cols * down_descriptors.rows);
+
+    hog.setSVMDetector(hog.getDefaultPeopleDetector());
+    std::vector<float> descriptors;
+    switch (type)
+    {
+    case CV_8UC1:
+        hog.compute(img, descriptors, ocl_hog.win_size);
+        break;
+    case CV_8UC4:
+    default:
+        hog.compute(img_rgb, descriptors, ocl_hog.win_size);
+        break;
+    }
+    cv::Mat cpu_descriptors(descriptors);
+
+    EXPECT_MAT_SIMILAR(down_descriptors, cpu_descriptors, 1e-2);
+}
+
+
+TEST_P(HOG, Detect)
+{
+    // Load image
+    cv::Mat img_rgb = readImage("../../../samples/gpu/road.png");
+    ASSERT_FALSE(img_rgb.empty());
+
+    // Convert image
+    cv::Mat img;
+    switch (type)
+    {
+    case CV_8UC1:
+        cv::cvtColor(img_rgb, img, CV_BGR2GRAY);
+        break;
+    case CV_8UC4:
+    default:
+        cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
+        break;
+    }
+    cv::ocl::oclMat d_img(img);
+
+    // HOGs
+    if ((winSize != cv::Size(48, 96)) && (winSize != cv::Size(64, 128)))
+        winSize = cv::Size(64, 128);
+    cv::ocl::HOGDescriptor ocl_hog(winSize);
+    ocl_hog.gamma_correction = true;
+
+    cv::HOGDescriptor hog;
+    hog.winSize = winSize;
+    hog.gammaCorrection = true;
+
+    if (winSize.width == 48 && winSize.height == 96)
+    {
+        // daimler's base
+        ocl_hog.setSVMDetector(ocl_hog.getPeopleDetector48x96());
+        hog.setSVMDetector(hog.getDaimlerPeopleDetector());
+    }
+    else if (winSize.width == 64 && winSize.height == 128)
+    {
+        ocl_hog.setSVMDetector(ocl_hog.getPeopleDetector64x128());
+        hog.setSVMDetector(hog.getDefaultPeopleDetector());
+    }
+    else
+    {
+        ocl_hog.setSVMDetector(ocl_hog.getDefaultPeopleDetector());
+        hog.setSVMDetector(hog.getDefaultPeopleDetector());
+    }
+
+    // OpenCL detection
+    std::vector<cv::Point> d_v_locations;
+    ocl_hog.detect(d_img, d_v_locations, 0);
+    cv::Mat d_locations(d_v_locations);
+    
+    // CPU detection
+    std::vector<cv::Point> v_locations;
+    switch (type)
+    {
+    case CV_8UC1:
+        hog.detect(img, v_locations, 0);
+        break;
+    case CV_8UC4:
+    default:
+        hog.detect(img_rgb, v_locations, 0);
+        break;
+    }
+    cv::Mat locations(v_locations);
+
+    char s[100]={0};
+    EXPECT_MAT_NEAR(d_locations, locations, 0, s);
+}
+
+
+INSTANTIATE_TEST_CASE_P(OCL_ObjDetect, HOG, testing::Combine(
+                        testing::Values(cv::Size(64, 128), cv::Size(48, 96)),
+                        testing::Values(MatType(CV_8UC1), MatType(CV_8UC4))));
+
+
+#endif //HAVE_OPENCL