diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu
index 89a74eeac4..5cde710708 100644
--- a/modules/gpu/src/cuda/isf-sc.cu
+++ b/modules/gpu/src/cuda/isf-sc.cu
@@ -41,25 +41,136 @@
 //M*/
 
 #include <icf.hpp>
+#include <opencv2/gpu/device/saturate_cast.hpp>
 
 namespace cv { namespace gpu {
 
 
  namespace device {
 
+enum {
+    HOG_BINS = 6,
+    HOG_LUV_BINS = 10,
+    WIDTH = 640,
+    HEIGHT = 480,
+    GREY_OFFSET = HEIGHT * HOG_LUV_BINS
+};
+
+/* Returns the nearest upper power of two, works only for
+the typical GPU thread count (pert block) values */
+int power_2up(unsigned int n)
+{
+    if (n < 1) return 1;
+    else if (n < 2) return 2;
+    else if (n < 4) return 4;
+    else if (n < 8) return 8;
+    else if (n < 16) return 16;
+    else if (n < 32) return 32;
+    else if (n < 64) return 64;
+    else if (n < 128) return 128;
+    else if (n < 256) return 256;
+    else if (n < 512) return 512;
+    else if (n < 1024) return 1024;
+    return -1; // Input is too big
+}
+
+
+__device__ __forceinline__ uchar grey(const uchar3 rgb)
+{
+    return saturate_cast<uchar>(rgb.x * 0.114f + rgb.y * 0.587f + rgb.z * 0.299f);
+}
+
+__device__ __forceinline__ void luv(const uchar3 rgb, uchar& l, uchar& u, uchar& v)
+{
+
+}
+
 __global__ void rgb2grayluv(const uchar3* __restrict__ rgb, uchar* __restrict__ hog,
                             const int rgbPitch, const int hogPitch)
 {
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+
+    const uchar3 color = rgb[rgbPitch * y + x];
+
+    uchar l, u, v;
+    luv(color, l, u, v);
+
+    hog[hogPitch *  y + x] = l;
+    hog[hogPitch * (y + HEIGHT) + x] = u;
+    hog[hogPitch * (y + 2 * HEIGHT) + x] = v;
+    hog[hogPitch * (y + 3 * HEIGHT) + x] = grey(color);
+}
+
+__device__ __forceinline__
+int qangle(const float &y, const float &x)
+{
+    int bin = 0;
+//     const float2 &bin_vector_zero = const_angle_bins_vectors[0];
+//     float max_dot_product = fabs(x*bin_vector_zero.x + y*bin_vector_zero.y);
+
+//     // let us hope this gets unrolled
+// #pragma unroll
+//     for(int i=1; i < num_angles_bin; i+=1)
+//     {
+//         const float2 &bin_vector_i = const_angle_bins_vectors[i];
+//         //const float2 bin_vector_i = const_angle_bins_vectors[i];
+//         //const float2 &bin_vector_i = angle_bins_vectors[i];
+//         const float dot_product = fabs(x*bin_vector_i.x + y*bin_vector_i.y);
+//         if(dot_product > max_dot_product)
+//         {
+//             max_dot_product = dot_product;
+//             index = i;
+//         }
+//     }
+
+    return bin;
 }
 
-__global__ void gray2hog(const uchar* __restrict__ gray, uchar* __restrict__ hog,
-                         const int pitch)
+// texture<uchar, 2, cudaReadModeElementType> tgray;
+__global__ void gray2hog(const uchar* __restrict__ gray, uchar* __restrict__ hog, const int pitch, const float norm)
 {
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // derivative
+    float dx = gray[y * pitch + x + 1];
+    dx -= gray[y * pitch + x - 1];
+
+    float dy = gray[(y + 1) * pitch + x];
+    dy -= gray[(y -1) * pitch + x - 1];
+
+    // mag and angle
+    const uchar mag =  saturate_cast<uchar>(sqrtf(dy * dy + dx * dx) * norm);
+    const int bin = qangle(dx, dy);
+
+}
+
+template <int FACTOR>
+__device__ __forceinline__ uchar shrink(const uchar* ptr, const int pitch, const int y, const int x)
+{
+    int out = 0;
+#pragma unroll
+    for(int dy = 0; dy < FACTOR; ++dy)
+#pragma unroll
+        for(int dx = 0; dx < FACTOR; ++dx)
+        {
+            out += ptr[dy * pitch + dx];
+        }
+
+    return saturate_cast<uchar>(out / FACTOR);
 }
 
+template<int FACTOR>
 __global__ void decimate(const uchar* __restrict__ hogluv, uchar* __restrict__ shrank,
                         const int inPitch, const int outPitch )
 {
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+
+    const uchar* ptr = hogluv + (FACTOR * y) * inPitch + (FACTOR * x);
+
+    shrank[ y * outPitch + x]= shrink<FACTOR>(ptr, inPitch, y, x);
 }
 
 __global__ void intRow(const uchar* __restrict__ hogluv, ushort* __restrict__ sum,
@@ -89,6 +200,11 @@ void __device icf::Cascade::detectAt() const
 void icf::Cascade::detect(const cv::gpu::PtrStepSzb& hogluv, cudaStream_t stream) const
 {
     // detection kernel
+    dim3 block(32, 8, 1);
+    dim3 grid(32 * ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 64);
+    device::detect<<<grid, block, 0, stream>>>(*this, hogluv, hogluv.step / sizeof(ushort));
+    if (!stream)
+        cudaSafeCall( cudaDeviceSynchronize() );
 
 }
 
@@ -99,12 +215,13 @@ void icf::ChannelStorage::frame(const cv::gpu::PtrStepSz<uchar3>& rgb, cudaStrea
     dim3 grid(FRAME_WIDTH / 32, FRAME_HEIGHT / 8);
 
     uchar * channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_BINS);
-    device::rgb2grayluv<<<grid, block, 0, stream>>>((uchar3*)rgb.ptr(), channels, rgb.step, dmem.step);
+    device::rgb2grayluv<<<grid, block, 0, stream>>>((uchar3*)rgb.ptr(), channels,
+                                                    rgb.step / sizeof(uchar3), dmem.step);
     cudaSafeCall( cudaGetLastError());
 
     // hog calculation kernel
     channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_LUV_BINS);
-    device::gray2hog<<<grid, block, 0, stream>>>(channels, (uchar*)dmem.ptr(), dmem.step);
+    device::gray2hog<<<grid, block, 0, stream>>>(channels, (uchar*)dmem.ptr(), dmem.step, magnitudeScaling);
     cudaSafeCall( cudaGetLastError() );
 
     const int shrWidth  = FRAME_WIDTH / shrinkage;
@@ -112,19 +229,20 @@ void icf::ChannelStorage::frame(const cv::gpu::PtrStepSz<uchar3>& rgb, cudaStrea
 
     // decimate kernel
     grid = dim3(shrWidth / 32, shrHeight / 8);
-    device::decimate<<<grid, block, 0, stream>>>((uchar*)dmem.ptr(), (uchar*)shrunk.ptr(), dmem.step, shrunk.step);
+    device::decimate<4><<<grid, block, 0, stream>>>((uchar*)dmem.ptr(), (uchar*)shrunk.ptr(), dmem.step, shrunk.step);
     cudaSafeCall( cudaGetLastError() );
 
     // integrate rows
     block = dim3(shrWidth, 1);
     grid = dim3(shrHeight * HOG_LUV_BINS, 1);
-    device::intRow<<<grid, block, 0, stream>>>((uchar*)shrunk.ptr(), (ushort*)hogluv.ptr(), shrunk.step, hogluv.step);
+    device::intRow<<<grid, block, 0, stream>>>((uchar*)shrunk.ptr(), (ushort*)hogluv.ptr(),
+        shrunk.step, hogluv.step / sizeof(ushort));
     cudaSafeCall( cudaGetLastError() );
 
     // integrate cols
     block = dim3(128, 1);
     grid = dim3(shrWidth * HOG_LUV_BINS, 1);
-    device::intCol<<<grid, block, 0, stream>>>((ushort*)hogluv.ptr(), hogluv.step);
+    device::intCol<<<grid, block, 0, stream>>>((ushort*)hogluv.ptr(), hogluv.step / hogluv.step / sizeof(ushort));
     cudaSafeCall( cudaGetLastError() );
 }
 
diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp
index 8b075beba6..69d21fdd9b 100644
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@@ -105,6 +105,7 @@ struct ChannelStorage
     };
 
     int shrinkage;
+    static const float magnitudeScaling = 1.f ;// / sqrt(2);
 };
 
 struct __align__(16) Octave