diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp
index 48a355d6a7..e272d65352 100644
--- a/modules/gpu/perf/perf_objdetect.cpp
+++ b/modules/gpu/perf/perf_objdetect.cpp
@@ -104,7 +104,7 @@ PERF_TEST_P(SoftCascade, detect, Values<pair_string>(make_pair("cv/cascadeandhog
         cv::gpu::SoftCascade cascade;
         ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GetParam().first)));
 
-        cv::gpu::GpuMat rois, objectBoxes(1, 1000, CV_8UC1);
+        cv::gpu::GpuMat rois, objectBoxes(1, 1000, CV_8UC4);
         cascade.detectMultiScale(colored, rois, objectBoxes);
 
         TEST_CYCLE()
@@ -117,7 +117,7 @@ PERF_TEST_P(SoftCascade, detect, Values<pair_string>(make_pair("cv/cascadeandhog
         ASSERT_FALSE(colored.empty());
 
         cv::SoftCascade cascade;
-        ASSERT_TRUE(cascade.load(GetParam().first));
+        ASSERT_TRUE(cascade.load(getDataPath(GetParam().first)));
 
         std::vector<cv::Rect> rois, objectBoxes;
         cascade.detectMultiScale(colored, rois, objectBoxes);
diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index c8dff34bdc..4bf410fc59 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -57,14 +57,6 @@
 namespace cv { namespace gpu { namespace device {
 namespace icf {
 
-//     enum {
-//         HOG_BINS = 6,
-//         HOG_LUV_BINS = 10,
-//         WIDTH = 640,
-//         HEIGHT = 480,
-//         GREY_OFFSET = HEIGHT * HOG_LUV_BINS
-//     };
-
     // ToDo: use textures or ancached load instruction.
     __global__ void magToHist(const uchar* __restrict__ mag,
                               const float* __restrict__ angle, const int angPitch,
@@ -94,13 +86,6 @@ namespace icf {
     }
 
     texture<int,  cudaTextureType2D, cudaReadModeElementType> thogluv;
-    // ToDo: do it in load time
-    // __device__ __forceinline__ float rescale(const Level& level, uchar4& scaledRect, const Node& node)
-    // {
-    //     scaledRect = node.rect;
-    //     return (float)(node.threshold & 0x0FFFFFFFU);
-    // }
-
     __device__ __forceinline__ float rescale(const Level& level, uchar4& scaledRect, const Node& node)
     {
         float relScale = level.relScale;
@@ -119,17 +104,12 @@ namespace icf {
 
         float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
 
-        float approx = 1.f;
-        // if (fabs(farea - 0.f) > FLT_EPSILON && fabs(farea - 0.f) > FLT_EPSILON)
-        {
-            const float expected_new_area = farea * relScale * relScale;
-            approx =  sarea / expected_new_area;
-        }
+        const float expected_new_area = farea * relScale * relScale;
+        float approx =  sarea / expected_new_area;
 
         dprintf("new rect: %d box %d %d %d %d  rel areas %f %f\n", (node.threshold >> 28),
         scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea);
 
-
         float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx;
         rootThreshold *= level.scaling[(node.threshold >> 28) > 6];
 
@@ -139,7 +119,7 @@ namespace icf {
         return rootThreshold;
     }
 
-    __device__ __forceinline__ int get(const int x, int y, int channel, uchar4 area)
+    __device__ __forceinline__ int get(const int x, int y, uchar4 area)
     {
 
         dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w);
@@ -149,9 +129,6 @@ namespace icf {
             x + area.x, y + area.w);
         dprintf("at point %d %d with offset %d\n", x, y, 0);
 
-        int offset = channel * 121;
-        y += offset;
-
         int a = tex2D(thogluv, x + area.x, y + area.y);
         int b = tex2D(thogluv, x + area.z, y + area.y);
         int c = tex2D(thogluv, x + area.z, y + area.w);
@@ -163,7 +140,7 @@ namespace icf {
     }
 
     __global__ void test_kernel(const Level* levels, const Octave* octaves, const float* stages,
-        const Node* nodes, const float* leaves, PtrStepSz<uchar4> objects)
+        const Node* nodes, const float* leaves, PtrStepSz<uchar4> objects, uint* ctr)
     {
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
         const int x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -179,7 +156,7 @@ namespace icf {
 
         float confidence = 0.f;
 
-// #pragma unroll 8
+// #pragma unroll 2
         for(; st < stEnd; ++st)
         {
             dprintf("\n\nstage: %d\n", st);
@@ -190,7 +167,7 @@ namespace icf {
                 node.threshold >> 28, node.threshold & 0x0FFFFFFFU);
 
             float threshold = rescale(level, node.rect, node);
-            int sum = get(x, y, (node.threshold >> 28), node.rect);
+            int sum = get(x, y + (node.threshold >> 28) * 121, node.rect);
 
             dprintf("Node: [%d %d %d %d] %f\n", node.rect.x, node.rect.y, node.rect.z,
                 node.rect.w, threshold);
@@ -200,29 +177,30 @@ namespace icf {
 
             node = nodes[nId + next];
             threshold = rescale(level, node.rect, node);
-            sum = get(x, y, (node.threshold >> 28), node.rect);
+            sum = get(x, y + (node.threshold >> 28) * 121, node.rect);
 
             const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
             float impact = leaves[st * 4 + lShift];
             confidence += impact;
 
-            if (confidence <= stages[st]) st = stEnd + 1;
+            if (confidence <= stages[st]) st = stEnd + 10;
             dprintf("decided: %d (%d >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact);
             dprintf("extracted stage: %f\n", stages[st]);
             dprintf("computed  score: %f\n\n", confidence);
         }
 
-        // if (st == stEnd)
-        //     printf("%d %d %d\n", x, y, st);
-
-        uchar4 val;
-        val.x = (int)confidence;
-        if (x == y) objects(0, threadIdx.x) = val;
-
+        if(st == stEnd)
+        {
+            int idx = atomicInc(ctr, objects.cols);
+            uchar4 val;
+            val.x = x * 4;
+            objects(0, idx) = val;
+        }
     }
 
     void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
-        const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects)
+                const PtrStepSzb& nodes,  const PtrStepSzf& leaves,  const PtrStepSzi& hogluv,
+                PtrStepSz<uchar4> objects, PtrStepSzi counter)
     {
         int fw = 160;
         int fh = 120;
@@ -235,11 +213,12 @@ namespace icf {
         const float* st = (const float*)stages.ptr();
         const Node* nd = (const Node*)nodes.ptr();
         const float* lf = (const float*)leaves.ptr();
+        uint* ctr = (uint*)counter.ptr();
 
         cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
         cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
 
-        test_kernel<<<grid, block>>>(l, oct, st, nd, lf, objects);
+        test_kernel<<<grid, block>>>(l, oct, st, nd, lf, objects, ctr);
 
         cudaSafeCall( cudaGetLastError());
         cudaSafeCall( cudaDeviceSynchronize());
diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp
index ffbf380c69..320fbb3437 100644
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@@ -60,7 +60,8 @@ namespace icf {
     void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
         const int fw, const int fh, const int bins);
     void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
-        const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects);
+        const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
+        PtrStepSzi counter);
 }
 }}}
 
@@ -75,6 +76,7 @@ struct cv::gpu::SoftCascade::Filds
         shrunk.create(FRAME_HEIGHT / 4 * HOG_LUV_BINS, FRAME_WIDTH / 4, CV_8UC1);
         integralBuffer.create(shrunk.rows + 1 * HOG_LUV_BINS, shrunk.cols + 1, CV_32SC1);
         hogluv.create((FRAME_HEIGHT / 4 + 1) * HOG_LUV_BINS, FRAME_WIDTH / 4 + 1, CV_32SC1);
+        detCounter.create(1,1, CV_32SC1);
     }
 
     // scales range
@@ -90,6 +92,8 @@ struct cv::gpu::SoftCascade::Filds
     GpuMat leaves;
     GpuMat levels;
 
+    GpuMat detCounter;
+
     // preallocated buffer 640x480x10 for hogluv + 640x480 got gray
     GpuMat plane;
 
@@ -127,7 +131,8 @@ struct cv::gpu::SoftCascade::Filds
     bool fill(const FileNode &root, const float mins, const float maxs);
     void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const
     {
-        device::icf::detect(levels, octaves, stages, nodes, leaves, hogluv, objects);
+        cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int));
+        device::icf::detect(levels, octaves, stages, nodes, leaves, hogluv, objects , detCounter);
     }
 
 private:
@@ -506,14 +511,13 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
         GpuMat sum(flds.hogluv, cv::Rect(0, (fh + 1) * i, fw + 1, fh + 1));
         cv::gpu::integralBuffered(channel, sum, flds.integralBuffer);
     }
-
 #endif
 
     cudaStream_t stream = StreamAccessor::getStream(s);
-    // detection
     flds.detect(objects, stream);
 
-//     // flds.storage.frame(colored, stream);
+        //     cv::Mat out(flds.detCounter);
+        // std::cout << out << std::endl;
 }
 
 #endif
\ No newline at end of file