diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index d67e43d078..92f544b221 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1454,12 +1454,14 @@ private:
     int subsetSize;
     int nodeStep;
 
-    // located on gpu
+    // gpu representation of classifier
     GpuMat stage_mat;
     GpuMat trees_mat;
     GpuMat nodes_mat;
     GpuMat leaves_mat;
     GpuMat subsets_mat;
+
+    // current integral image
     GpuMat integral;
 };
 
diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp
index b423d58f64..bd0058b914 100644
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@@ -59,7 +59,6 @@ struct Stage
 struct DTreeNode
 {
     int   featureIdx;
-    //float threshold; // for ordered features only
     int   left;
     int   right;
     DTreeNode(int f = 0, int l = 0, int r = 0) : featureIdx(f), left(l), right(r) {}
@@ -271,7 +270,8 @@ namespace cv { namespace gpu { namespace device
 {
     namespace lbp
     {
-        void CascadeClassify(DevMem2Db image, DevMem2Db objects, double scaleFactor = 1.2, int minNeighbors = 4, cudaStream_t stream = 0);
+        void cascadeClassify(const DevMem2Db stages, const DevMem2Di trees, const DevMem2Db nodes, const DevMem2Df leaves, const DevMem2Di subsets,
+            const DevMem2Db integral, int workWidth, int workHeight, int step, int subsetSize, DevMem2D_<int4> objects, int minNeighbors = 4, cudaStream_t stream = 0);
     }
 }}}
 
@@ -308,17 +308,8 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
         int step = (factor <= 2.) + 1;
         int stripCount = 1, stripSize = processingRectSize.height;
 
-        int y1 = 0;
-        int y2 = processingRectSize.height;
-
-        for (int y = y1; y < y2; y += step)
-            for (int x = 0; x < processingRectSize.width; x+=step)
-            {
-                //ToDO: classify
-                int result = 0;
-
-            }
-
+        cv::gpu::device::lbp::cascadeClassify(stage_mat, trees_mat, nodes_mat, leaves_mat, subsets_mat,
+         integral, processingRectSize.width, processingRectSize.height, step, subsetSize, objects, minNeighbors);
     }
     // TODO: reject levels
 
diff --git a/modules/gpu/src/cuda/lbp.cu b/modules/gpu/src/cuda/lbp.cu
index 929077c23d..61f67d4d2b 100644
--- a/modules/gpu/src/cuda/lbp.cu
+++ b/modules/gpu/src/cuda/lbp.cu
@@ -40,15 +40,51 @@
 //
 //M*/
 
-#include <opencv2/gpu/device/detail/lbp.hpp>
+#include <opencv2/gpu/device/lbp.hpp>
 
 namespace cv { namespace gpu { namespace device
 {
     namespace lbp
     {
-        void CascadeClassify(DevMem2Db image, DevMem2Db objects, double scaleFactor=1.2, int minNeighbors=4, cudaStream_t stream)
+        // Cascade skeleton: one thread per window, x = threadIdx.x and y = blockIdx.x (1D grid of 1D blocks).
+        // Feature evaluation is still a stub and no detection is written to `objects` yet.
+        __global__ void lbp_classify(const DevMem2D_< ::cv::gpu::device::Stage> stages, const DevMem2Di trees, const DevMem2Db nodes, const DevMem2Df leaves, const DevMem2Di subsets,
+            const DevMem2Db integral, float step, int subsetSize, DevMem2D_<int4> objects)
+        {
+            unsigned int x = threadIdx.x, y = blockIdx.x; // window coordinates, unused until the feature is computed
+            int nodeOfs = 0, leafOfs = 0;
+            ::cv::gpu::device::Feature feature;
+            for (int s = 0; s < stages.cols; s++ )
+            {
+                ::cv::gpu::device::Stage stage = stages(0, s);
+                float sum = 0.f; // leaf votes and stage.threshold are floats; an int accumulator truncated every vote
+                for (int w = 0; w < stage.ntrees; w++)
+                {
+                    ::cv::gpu::device::ClNode node = nodes(0, nodeOfs);
+                    // TODO: implement the feature; the cast keeps codes >= 128 from producing a negative c >> 5
+                    const int c = static_cast<unsigned char>(feature());
+                    const int subsetIdx = (nodeOfs * subsetSize);
+                    // bit (c & 31) of subset word (c >> 5) selects which of the node's two leaves votes
+                    int idx = (subsets(0, subsetIdx + (c >> 5)) & (1 << (c & 31))) ? leafOfs : leafOfs + 1;
+                    sum += leaves(0, idx);
+                    nodeOfs++;
+                    leafOfs += 2;
+                }
+                if (sum < stage.threshold)
+                    return; // nothing matched
+            }
+            // every stage passed: the window matched (the result is not stored anywhere yet)
+        }
+
+        // Host launcher for lbp_classify: one block per sampled row, one thread per sampled column (minNeighbors is unused here).
+        void cascadeClassify(const DevMem2Db bstages, const DevMem2Di trees, const DevMem2Db nodes, const DevMem2Df leaves, const DevMem2Di subsets,
+            const DevMem2Db integral, int workWidth, int workHeight, int step, int subsetSize, DevMem2D_<int4> objects, int minNeighbors, cudaStream_t stream)
         {
             printf("CascadeClassify");
+            // NOTE(review): `threads` is not clamped to the device's per-block thread limit, so very wide work areas will fail to launch
+            int blocks = ceilf(workHeight / (float)step), threads = ceilf(workWidth / (float)step);
+            DevMem2D_< ::cv::gpu::device::Stage> stages = DevMem2D_< ::cv::gpu::device::Stage>(bstages);
+            lbp_classify<<<blocks, threads, 0, stream>>>(stages, trees, nodes, leaves, subsets, integral, step, subsetSize, objects); // run on the caller's stream
         }
     }
 }}}
\ No newline at end of file
diff --git a/modules/gpu/src/opencv2/gpu/device/lbp.hpp b/modules/gpu/src/opencv2/gpu/device/lbp.hpp
index 3b104f6190..ede48bfe86 100644
--- a/modules/gpu/src/opencv2/gpu/device/lbp.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/lbp.hpp
@@ -43,6 +43,13 @@
 #ifndef __OPENCV_GPU_DEVICE_LBP_HPP_
 #define __OPENCV_GPU_DEVICE_LBP_HPP_
 
+#include "internal_shared.hpp"
+// #include "opencv2/gpu/device/border_interpolate.hpp"
+// #include "opencv2/gpu/device/vec_traits.hpp"
+// #include "opencv2/gpu/device/vec_math.hpp"
+// #include "opencv2/gpu/device/saturate_cast.hpp"
+// #include "opencv2/gpu/device/filters.hpp"
+
 // #define CALC_SUM_(p0, p1, p2, p3, offset) \
 //     ((p0)[offset] - (p1)[offset] - (p2)[offset] + (p3)[offset])
 
@@ -53,16 +60,34 @@
 
 namespace cv { namespace gpu { namespace device {
 
+    struct Stage // per-stage record read by the cascade kernel; both constructors are __device__-only
+    {
+        int    first;     // NOTE(review): presumably the index of the stage's first tree - not read in this patch, confirm
+        int    ntrees;    // number of trees the kernel evaluates for this stage
+        float  threshold; // a window is rejected when the stage's vote sum is below this value
+        __device__ __forceinline__ Stage(int firstTree = 0, int treeCount = 0, float stageThreshold = 0.f) : first(firstTree), ntrees(treeCount), threshold(stageThreshold) {}
+        __device__ __forceinline__ Stage(const Stage& src) : first(src.first), ntrees(src.ntrees), threshold(src.threshold) {}
+    };
+
+    struct ClNode // one tree node as loaded by the cascade kernel; both constructors are __device__-only
+    {
+        int   featureIdx; // NOTE(review): presumably selects the feature to evaluate - not read in this patch, confirm
+        int   left;       // not read anywhere in this patch
+        int   right;      // not read anywhere in this patch
+        __device__ __forceinline__  ClNode(int feature = 0, int leftChild = 0, int rightChild = 0) : featureIdx(feature), left(leftChild), right(rightChild) {}
+        __device__ __forceinline__  ClNode(const ClNode& src) : featureIdx(src.featureIdx), left(src.left), right(src.right) {}
+    };
+
     struct Feature
     {
         __device__ __forceinline__ Feature(const Feature& other) {(void)other;}
         __device__ __forceinline__ Feature() {}
-        __device__ __forceinline__ char operator() (volatile int* ptr, int offset)
+        __device__ __forceinline__ char operator() ()// stub: always returns code 0; the (volatile int* ptr, int offset) parameters are not taken yet
         {
             return char(0);
         }
 
-    }
-}// namespaces
+    };
+} } }// namespaces
 
 #endif
\ No newline at end of file