empty cascade

pull/158/head
marina.kolpakova 12 years ago
parent 4881205bae
commit 1917366528
  1. modules/gpu/src/cuda/isf-sc.cu (344 changed lines)
  2. modules/gpu/src/icf.hpp (248 changed lines)
  3. modules/gpu/src/softcascade.cpp (816 changed lines)

@@ -40,221 +40,221 @@
//
//M*/
#include <icf.hpp>
#include <opencv2/gpu/device/saturate_cast.hpp>
#include <stdio.h>
#include <float.h>
//#define LOG_CUDA_CASCADE
#if defined LOG_CUDA_CASCADE
# define dprintf(format, ...) \
do { printf(format, __VA_ARGS__); } while (0)
#else
# define dprintf(format, ...)
#endif
namespace cv { namespace gpu { namespace device {
namespace icf {
enum {
HOG_BINS = 6,
HOG_LUV_BINS = 10,
WIDTH = 640,
HEIGHT = 480,
GREY_OFFSET = HEIGHT * HOG_LUV_BINS
};
__global__ void magToHist(const uchar* __restrict__ mag,
const float* __restrict__ angle, const int angPitch,
uchar* __restrict__ hog, const int hogPitch)
{
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int bin = (int)(angle[y * angPitch + x]);
const uchar val = mag[y * hogPitch + x];
hog[((HEIGHT * bin) + y) * hogPitch + x] = val;
}
// #include <icf.hpp>
// #include <opencv2/gpu/device/saturate_cast.hpp>
// #include <stdio.h>
// #include <float.h>
// //#define LOG_CUDA_CASCADE
// #if defined LOG_CUDA_CASCADE
// # define dprintf(format, ...) \
// do { printf(format, __VA_ARGS__); } while (0)
// #else
// # define dprintf(format, ...)
// #endif
// namespace cv { namespace gpu { namespace device {
// namespace icf {
// enum {
// HOG_BINS = 6,
// HOG_LUV_BINS = 10,
// WIDTH = 640,
// HEIGHT = 480,
// GREY_OFFSET = HEIGHT * HOG_LUV_BINS
// };
// __global__ void magToHist(const uchar* __restrict__ mag,
// const float* __restrict__ angle, const int angPitch,
// uchar* __restrict__ hog, const int hogPitch)
// {
// const int y = blockIdx.y * blockDim.y + threadIdx.y;
// const int x = blockIdx.x * blockDim.x + threadIdx.x;
// const int bin = (int)(angle[y * angPitch + x]);
// const uchar val = mag[y * hogPitch + x];
// hog[((HEIGHT * bin) + y) * hogPitch + x] = val;
// }
void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle)
{
const uchar* mag = (const uchar*)hogluv.ptr(HEIGHT * HOG_BINS);
uchar* hog = (uchar*)hogluv.ptr();
const float* angle = (const float*)nangle.ptr();
// void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle)
// {
// const uchar* mag = (const uchar*)hogluv.ptr(HEIGHT * HOG_BINS);
// uchar* hog = (uchar*)hogluv.ptr();
// const float* angle = (const float*)nangle.ptr();
dim3 block(32, 8);
dim3 grid(WIDTH / 32, HEIGHT / 8);
// dim3 block(32, 8);
// dim3 grid(WIDTH / 32, HEIGHT / 8);
magToHist<<<grid, block>>>(mag, angle, nangle.step / sizeof(float), hog, hogluv.step);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
__global__ void detect(const cv::gpu::icf::Cascade cascade, const int* __restrict__ hogluv, const int pitch,
PtrStepSz<uchar4> objects)
{
cascade.detectAt(hogluv, pitch, objects);
}
// magToHist<<<grid, block>>>(mag, angle, nangle.step / sizeof(float), hog, hogluv.step);
// cudaSafeCall( cudaGetLastError() );
// cudaSafeCall( cudaDeviceSynchronize() );
// }
// }
// __global__ void detect(const cv::gpu::icf::Cascade cascade, const int* __restrict__ hogluv, const int pitch,
// PtrStepSz<uchar4> objects)
// {
// cascade.detectAt(hogluv, pitch, objects);
// }
}
// }
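The kernel above scatters each pixel's gradient magnitude into one of HOG_BINS planar histogram planes, choosing the plane from the pre-quantized angle. A minimal host-side sketch of the same scatter, assuming (as detectMultiScale does below) that the angle image has already been divided by 60 so its values fall in [0, HOG_BINS):

#include <vector>

typedef unsigned char uchar;

// CPU analogue of magToHist: hog is HOG_BINS stacked W x H planes.
void magToHistCpu(const std::vector<uchar>& mag, const std::vector<float>& angle,
                  std::vector<uchar>& hog, int W, int H)
{
    for (int y = 0; y < H; ++y)
        for (int x = 0; x < W; ++x)
        {
            const int bin = (int)angle[y * W + x];        // 0..HOG_BINS-1
            hog[(H * bin + y) * W + x] = mag[y * W + x];  // scatter magnitude
        }
}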
float __device icf::Cascade::rescale(const icf::Level& level, uchar4& scaledRect,
const int channel, const float threshold) const
{
dprintf("feature %d box %d %d %d %d\n", channel, scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w);
dprintf("rescale: %f [%f %f]\n",level.relScale, level.scaling[0], level.scaling[1]);
// float __device icf::Cascade::rescale(const icf::Level& level, uchar4& scaledRect,
// const int channel, const float threshold) const
// {
// dprintf("feature %d box %d %d %d %d\n", channel, scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w);
// dprintf("rescale: %f [%f %f]\n",level.relScale, level.scaling[0], level.scaling[1]);
float relScale = level.relScale;
float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
// float relScale = level.relScale;
// float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
// rescale
scaledRect.x = __float2int_rn(relScale * scaledRect.x);
scaledRect.y = __float2int_rn(relScale * scaledRect.y);
scaledRect.z = __float2int_rn(relScale * scaledRect.z);
scaledRect.w = __float2int_rn(relScale * scaledRect.w);
// // rescale
// scaledRect.x = __float2int_rn(relScale * scaledRect.x);
// scaledRect.y = __float2int_rn(relScale * scaledRect.y);
// scaledRect.z = __float2int_rn(relScale * scaledRect.z);
// scaledRect.w = __float2int_rn(relScale * scaledRect.w);
float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
// float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
float approx = 1.f;
if (fabs(farea - 0.f) > FLT_EPSILON && fabs(sarea - 0.f) > FLT_EPSILON)
{
const float expected_new_area = farea * relScale * relScale;
approx = expected_new_area / sarea;
}
// float approx = 1.f;
// if (fabs(farea - 0.f) > FLT_EPSILON && fabs(sarea - 0.f) > FLT_EPSILON)
// {
// const float expected_new_area = farea * relScale * relScale;
// approx = expected_new_area / sarea;
// }
dprintf("new rect: %d box %d %d %d %d rel areas %f %f\n", channel,
scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea);
// dprintf("new rect: %d box %d %d %d %d rel areas %f %f\n", channel,
// scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea);
// compensate for area rounding
float rootThreshold = threshold / approx;
// printf(" approx %f\n", rootThreshold);
rootThreshold *= level.scaling[(int)(channel > 6)];
// // compensate for area rounding
// float rootThreshold = threshold / approx;
// // printf(" approx %f\n", rootThreshold);
// rootThreshold *= level.scaling[(int)(channel > 6)];
dprintf("approximation %f %f -> %f %f\n", approx, threshold, rootThreshold, level.scaling[(int)(channel > 6)]);
// dprintf("approximation %f %f -> %f %f\n", approx, threshold, rootThreshold, level.scaling[(int)(channel > 6)]);
return rootThreshold;
}
// return rootThreshold;
// }
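A standalone sketch of the compensation above, with hypothetical numbers: shrinking a 9x9 feature rectangle by relScale = 0.6 produces an integer rectangle whose area (25) undershoots the expected 81 * 0.36 = 29.16, so the node threshold is divided by approx ≈ 1.166 before the per-channel scaling factor is applied:

#include <cmath>
#include <cstdio>

int main()
{
    const float relScale = 0.6f;                        // hypothetical level scale
    int x0 = 0, y0 = 0, x1 = 9, y1 = 9;                 // hypothetical feature rect
    const float farea = (float)((x1 - x0) * (y1 - y0));        // 81
    x1 = (int)std::floor(relScale * x1 + 0.5f);         // round-to-nearest,
    y1 = (int)std::floor(relScale * y1 + 0.5f);         // like __float2int_rn
    const float sarea = (float)((x1 - x0) * (y1 - y0));        // 25
    const float approx = farea * relScale * relScale / sarea;  // ~1.166
    std::printf("threshold correction: %f\n", approx);
}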
typedef unsigned char uchar;
float __device get(const int* __restrict__ hogluv, const int pitch,
const int x, const int y, int channel, uchar4 area)
{
dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w);
dprintf("get for channel %d\n", channel);
dprintf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",
x + area.x, y + area.y, x + area.z, y + area.y, x + area.z,y + area.w,
x + area.x, y + area.w);
dprintf("at point %d %d with offset %d\n", x, y, 0);
// typedef unsigned char uchar;
// float __device get(const int* __restrict__ hogluv, const int pitch,
// const int x, const int y, int channel, uchar4 area)
// {
// dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w);
// dprintf("get for channel %d\n", channel);
// dprintf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",
// x + area.x, y + area.y, x + area.z, y + area.y, x + area.z,y + area.w,
// x + area.x, y + area.w);
// dprintf("at point %d %d with offset %d\n", x, y, 0);
const int* curr = hogluv + ((channel * 121) + y) * pitch;
// const int* curr = hogluv + ((channel * 121) + y) * pitch;
int a = curr[area.y * pitch + x + area.x];
int b = curr[area.y * pitch + x + area.z];
int c = curr[area.w * pitch + x + area.z];
int d = curr[area.w * pitch + x + area.x];
// int a = curr[area.y * pitch + x + area.x];
// int b = curr[area.y * pitch + x + area.z];
// int c = curr[area.w * pitch + x + area.z];
// int d = curr[area.w * pitch + x + area.x];
dprintf(" retruved integral values: %d %d %d %d\n", a, b, c, d);
// dprintf(" retruved integral values: %d %d %d %d\n", a, b, c, d);
return (a - b + c - d);
}
// return (a - b + c - d);
// }
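get() reads the four corners of the (rescaled) feature rectangle from the per-channel integral image; a and c (top-left, bottom-right) enter with a plus, b and d (top-right, bottom-left) with a minus, which is the standard summed-area-table identity. A tiny host-side check on an all-ones image:

#include <cstdio>

int main()
{
    const int W = 4, H = 4;
    int I[H + 1][W + 1] = {};                 // (H+1)x(W+1) integral image
    for (int y = 0; y < H; ++y)
        for (int x = 0; x < W; ++x)           // integrate an all-ones image
            I[y + 1][x + 1] = 1 + I[y][x + 1] + I[y + 1][x] - I[y][x];

    const int x0 = 1, y0 = 1, x1 = 3, y1 = 3; // 2x2 region, expected sum 4
    const int a = I[y0][x0], b = I[y0][x1], c = I[y1][x1], d = I[y1][x0];
    std::printf("sum = %d\n", a - b + c - d); // prints 4
}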
void __device icf::Cascade::detectAt(const int* __restrict__ hogluv, const int pitch,
PtrStepSz<uchar4>& objects) const
{
const icf::Level* lls = (const icf::Level*)levels.ptr();
// void __device icf::Cascade::detectAt(const int* __restrict__ hogluv, const int pitch,
// PtrStepSz<uchar4>& objects) const
// {
// const icf::Level* lls = (const icf::Level*)levels.ptr();
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
// if (x > 0 || y > 0) return;
// const int y = blockIdx.y * blockDim.y + threadIdx.y;
// const int x = blockIdx.x * blockDim.x + threadIdx.x;
// // if (x > 0 || y > 0) return;
Level level = lls[blockIdx.z];
if (x >= level.workRect.x || y >= level.workRect.y) return;
// Level level = lls[blockIdx.z];
// if (x >= level.workRect.x || y >= level.workRect.y) return;
dprintf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale,
level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, level.objSize.y);
// dprintf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale,
// level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, level.objSize.y);
const Octave octave = ((const Octave*)octaves.ptr())[level.octave];
// printf("Octave: %d %d %d (%d %d) %f\n", octave.index, octave.stages,
// octave.shrinkage, octave.size.x, octave.size.y, octave.scale);
// const Octave octave = ((const Octave*)octaves.ptr())[level.octave];
// // printf("Octave: %d %d %d (%d %d) %f\n", octave.index, octave.stages,
// // octave.shrinkage, octave.size.x, octave.size.y, octave.scale);
const int stBegin = octave.index * octave.stages, stEnd = stBegin + octave.stages;
// const int stBegin = octave.index * octave.stages, stEnd = stBegin + octave.stages;
float detectionScore = 0.f;
// float detectionScore = 0.f;
int st = stBegin;
for(; st < stEnd; ++st)
{
const float stage = stages(0, st);
dprintf("Stage: %f\n", stage);
{
const int nId = st * 3;
// int st = stBegin;
// for(; st < stEnd; ++st)
// {
// const float stage = stages(0, st);
// dprintf("Stage: %f\n", stage);
// {
// const int nId = st * 3;
// work with root node
const Node node = ((const Node*)nodes.ptr())[nId];
// // work with root node
// const Node node = ((const Node*)nodes.ptr())[nId];
dprintf("Node: %d %f\n", node.feature, node.threshold);
// dprintf("Node: %d %f\n", node.feature, node.threshold);
const Feature feature = ((const Feature*)features.ptr())[node.feature];
// const Feature feature = ((const Feature*)features.ptr())[node.feature];
uchar4 scaledRect = feature.rect;
float threshold = rescale(level, scaledRect, feature.channel, node.threshold);
// uchar4 scaledRect = feature.rect;
// float threshold = rescale(level, scaledRect, feature.channel, node.threshold);
float sum = get(hogluv,pitch, x, y, feature.channel, scaledRect);
// float sum = get(hogluv,pitch, x, y, feature.channel, scaledRect);
dprintf("root feature %d %f\n",feature.channel, sum);
// dprintf("root feature %d %f\n",feature.channel, sum);
int next = 1 + (int)(sum >= threshold);
// int next = 1 + (int)(sum >= threshold);
dprintf("go: %d (%f >= %f)\n\n" ,next, sum, threshold);
// dprintf("go: %d (%f >= %f)\n\n" ,next, sum, threshold);
// leaves
const Node leaf = ((const Node*)nodes.ptr())[nId + next];
const Feature fLeaf = ((const Feature*)features.ptr())[leaf.feature];
// // leaves
// const Node leaf = ((const Node*)nodes.ptr())[nId + next];
// const Feature fLeaf = ((const Feature*)features.ptr())[leaf.feature];
scaledRect = fLeaf.rect;
threshold = rescale(level, scaledRect, fLeaf.channel, leaf.threshold);
sum = get(hogluv, pitch, x, y, fLeaf.channel, scaledRect);
// scaledRect = fLeaf.rect;
// threshold = rescale(level, scaledRect, fLeaf.channel, leaf.threshold);
// sum = get(hogluv, pitch, x, y, fLeaf.channel, scaledRect);
const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
float impact = leaves(0, (st * 4) + lShift);
// const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
// float impact = leaves(0, (st * 4) + lShift);
detectionScore += impact;
// detectionScore += impact;
dprintf("decided: %d (%f >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact);
dprintf("extracted stage:\n");
dprintf("ct %f\n", stage);
dprintf("computed score %f\n\n", detectionScore);
dprintf("\n\n");
}
// dprintf("decided: %d (%f >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact);
// dprintf("extracted stage:\n");
// dprintf("ct %f\n", stage);
// dprintf("computed score %f\n\n", detectionScore);
// dprintf("\n\n");
// }
if (detectionScore <= stage || st - stBegin == 100) break;
}
// if (detectionScore <= stage || st - stBegin == 100) break;
// }
dprintf("x %d y %d: %d\n", x, y, st - stBegin);
// dprintf("x %d y %d: %d\n", x, y, st - stBegin);
if (st == stEnd)
{
uchar4 a;
a.x = level.workRect.x;
a.y = level.workRect.y;
objects(0, threadIdx.x) = a;
}
}
// if (st == stEnd)
// {
// uchar4 a;
// a.x = level.workRect.x;
// a.y = level.workRect.y;
// objects(0, threadIdx.x) = a;
// }
// }
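Each stage of detectAt above is a depth-2 decision tree: three consecutive Nodes per stage (root at nId = st * 3, children at nId + 1 and nId + 2) and four leaf weights per stage. A standalone sketch of just the index arithmetic, with the two (sum >= threshold) comparisons replaced by hypothetical booleans:

#include <cstdio>

// rootPass / leafPass stand in for the (sum >= threshold) comparisons.
float evalStage(const float* leaves, int st, bool rootPass, bool leafPass)
{
    const int next   = 1 + (int)rootPass;              // visit node nId+1 or nId+2
    const int lShift = (next - 1) * 2 + (int)leafPass; // leaf index 0..3
    return leaves[st * 4 + lShift];                    // stage's contribution
}

int main()
{
    const float leaves[4] = { -1.f, -0.5f, 0.5f, 1.f }; // hypothetical stage 0
    std::printf("impact = %f\n", evalStage(leaves, 0, true, false)); // leaf 2: 0.5
}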
void icf::Cascade::detect(const cv::gpu::PtrStepSzi& hogluv, PtrStepSz<uchar4> objects, cudaStream_t stream) const
{
dim3 block(32, 8, 1);
dim3 grid(ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 47);
device::detect<<<grid, block, 0, stream>>>(*this, hogluv, hogluv.step / sizeof(int), objects);
cudaSafeCall( cudaGetLastError() );
if (!stream)
cudaSafeCall( cudaDeviceSynchronize() );
}
// void icf::Cascade::detect(const cv::gpu::PtrStepSzi& hogluv, PtrStepSz<uchar4> objects, cudaStream_t stream) const
// {
// dim3 block(32, 8, 1);
// dim3 grid(ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 47);
// device::detect<<<grid, block, 0, stream>>>(*this, hogluv, hogluv.step / sizeof(int), objects);
// cudaSafeCall( cudaGetLastError() );
// if (!stream)
// cudaSafeCall( cudaDeviceSynchronize() );
// }
}}
// }}

@@ -40,127 +40,127 @@
//
//M*/
#include <opencv2/gpu/device/common.hpp>
#ifndef __OPENCV_ICF_HPP__
#define __OPENCV_ICF_HPP__
#if defined __CUDACC__
# define __device __device__ __forceinline__
#else
# define __device
#endif
namespace cv { namespace gpu { namespace icf {
using cv::gpu::PtrStepSzb;
using cv::gpu::PtrStepSzf;
typedef unsigned char uchar;
struct __align__(16) Octave
{
ushort index;
ushort stages;
ushort shrinkage;
ushort2 size;
float scale;
Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc)
: index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {}
};
struct __align__(8) Level //is actually 24 bytes
{
int octave;
// float origScale; //not actually used
float relScale;
float shrScale; // used for marking detection
float scaling[2]; // calculated according to P. Dollár's paper
// for 640x480 we cannot get overflow
uchar2 workRect;
uchar2 objSize;
Level(int idx, const Octave& oct, const float scale, const int w, const int h)
: octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage)
{
workRect.x = round(w / (float)oct.shrinkage);
workRect.y = round(h / (float)oct.shrinkage);
objSize.x = round(oct.size.x * relScale);
objSize.y = round(oct.size.y * relScale);
}
};
struct Cascade
{
Cascade() {}
Cascade(const cv::gpu::PtrStepSzb& octs, const cv::gpu::PtrStepSzf& sts, const cv::gpu::PtrStepSzb& nds,
const cv::gpu::PtrStepSzf& lvs, const cv::gpu::PtrStepSzb& fts, const cv::gpu::PtrStepSzb& lls)
: octaves(octs), stages(sts), nodes(nds), leaves(lvs), features(fts), levels(lls) {}
void detect(const cv::gpu::PtrStepSzi& hogluv, cv::gpu::PtrStepSz<uchar4> objects, cudaStream_t stream) const;
void __device detectAt(const int* __restrict__ hogluv, const int pitch, PtrStepSz<uchar4>& objects) const;
float __device rescale(const icf::Level& level, uchar4& scaledRect,
const int channel, const float threshold) const;
PtrStepSzb octaves;
PtrStepSzf stages;
PtrStepSzb nodes;
PtrStepSzf leaves;
PtrStepSzb features;
PtrStepSzb levels;
};
struct ChannelStorage
{
ChannelStorage(){}
ChannelStorage(const cv::gpu::PtrStepSzb& buff, const cv::gpu::PtrStepSzb& shr,
const cv::gpu::PtrStepSzb& itg, const int s)
: dmem (buff), shrunk(shr), hogluv(itg), shrinkage(s) {}
void frame(const cv::gpu::PtrStepSz<uchar3>& rgb, cudaStream_t stream){}
PtrStepSzb dmem;
PtrStepSzb shrunk;
PtrStepSzb hogluv;
enum
{
FRAME_WIDTH = 640,
FRAME_HEIGHT = 480,
TOTAL_SCALES = 55,
CLASSIFIERS = 5,
ORIG_OBJECT_WIDTH = 64,
ORIG_OBJECT_HEIGHT = 128,
HOG_BINS = 6,
HOG_LUV_BINS = 10
};
int shrinkage;
static const float magnitudeScaling = 1.f; // / sqrt(2);
};
struct __align__(8) Node
{
int feature;
float threshold;
Node(const int f, const float t) : feature(f), threshold(t) {}
};
struct __align__(8) Feature
{
int channel;
uchar4 rect;
Feature(const int c, const uchar4 r) : channel(c), rect(r) {}
};
}}}
#endif
// #include <opencv2/gpu/device/common.hpp>
// #ifndef __OPENCV_ICF_HPP__
// #define __OPENCV_ICF_HPP__
// #if defined __CUDACC__
// # define __device __device__ __forceinline__
// #else
// # define __device
// #endif
// namespace cv { namespace gpu { namespace icf {
// using cv::gpu::PtrStepSzb;
// using cv::gpu::PtrStepSzf;
// typedef unsigned char uchar;
// struct __align__(16) Octave
// {
// ushort index;
// ushort stages;
// ushort shrinkage;
// ushort2 size;
// float scale;
// Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc)
// : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {}
// };
// struct __align__(8) Level //is actually 24 bytes
// {
// int octave;
// // float origScale; //not actually used
// float relScale;
// float shrScale; // used for marking detection
// float scaling[2]; // calculated according to P. Dollár's paper
// // for 640x480 we cannot get overflow
// uchar2 workRect;
// uchar2 objSize;
// Level(int idx, const Octave& oct, const float scale, const int w, const int h)
// : octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage)
// {
// workRect.x = round(w / (float)oct.shrinkage);
// workRect.y = round(h / (float)oct.shrinkage);
// objSize.x = round(oct.size.x * relScale);
// objSize.y = round(oct.size.y * relScale);
// }
// };
// struct Cascade
// {
// Cascade() {}
// Cascade(const cv::gpu::PtrStepSzb& octs, const cv::gpu::PtrStepSzf& sts, const cv::gpu::PtrStepSzb& nds,
// const cv::gpu::PtrStepSzf& lvs, const cv::gpu::PtrStepSzb& fts, const cv::gpu::PtrStepSzb& lls)
// : octaves(octs), stages(sts), nodes(nds), leaves(lvs), features(fts), levels(lls) {}
// void detect(const cv::gpu::PtrStepSzi& hogluv, cv::gpu::PtrStepSz<uchar4> objects, cudaStream_t stream) const;
// void __device detectAt(const int* __restrict__ hogluv, const int pitch, PtrStepSz<uchar4>& objects) const;
// float __device rescale(const icf::Level& level, uchar4& scaledRect,
// const int channel, const float threshold) const;
// PtrStepSzb octaves;
// PtrStepSzf stages;
// PtrStepSzb nodes;
// PtrStepSzf leaves;
// PtrStepSzb features;
// PtrStepSzb levels;
// };
// struct ChannelStorage
// {
// ChannelStorage(){}
// ChannelStorage(const cv::gpu::PtrStepSzb& buff, const cv::gpu::PtrStepSzb& shr,
// const cv::gpu::PtrStepSzb& itg, const int s)
// : dmem (buff), shrunk(shr), hogluv(itg), shrinkage(s) {}
// void frame(const cv::gpu::PtrStepSz<uchar3>& rgb, cudaStream_t stream){}
// PtrStepSzb dmem;
// PtrStepSzb shrunk;
// PtrStepSzb hogluv;
// enum
// {
// FRAME_WIDTH = 640,
// FRAME_HEIGHT = 480,
// TOTAL_SCALES = 55,
// CLASSIFIERS = 5,
// ORIG_OBJECT_WIDTH = 64,
// ORIG_OBJECT_HEIGHT = 128,
// HOG_BINS = 6,
// HOG_LUV_BINS = 10
// };
// int shrinkage;
// static const float magnitudeScaling = 1.f; // / sqrt(2);
// };
// struct __align__(8) Node
// {
// int feature;
// float threshold;
// Node(const int f, const float t) : feature(f), threshold(t) {}
// };
// struct __align__(8) Feature
// {
// int channel;
// uchar4 rect;
// Feature(const int c, const uchar4 r) : channel(c), rect(r) {}
// };
// }}}
// #endif
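A quick host-side sanity check of the layouts declared above, with the CUDA vector types replaced by stand-ins of the same size and alignment (a sketch only; the device code uses the real ushort2/uchar2):

#include <cstdint>

struct alignas(4) ushort2_ { std::uint16_t x, y; }; // stand-in for ushort2
struct alignas(2) uchar2_  { std::uint8_t  x, y; }; // stand-in for uchar2

struct alignas(16) Octave_
{
    std::uint16_t index, stages, shrinkage;
    ushort2_ size;
    float scale;
};

struct alignas(8) Level_
{
    int octave;
    float relScale, shrScale;
    float scaling[2];
    uchar2_ workRect, objSize;
};

static_assert(sizeof(Octave_) == 16, "Octave fits one 16-byte word");
static_assert(sizeof(Level_)  == 24, "Level 'is actually 24 bytes'");

int main() {}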

@@ -41,361 +41,365 @@
//M*/
#include <precomp.hpp>
#include "opencv2/highgui/highgui.hpp"
#include <opencv2/highgui/highgui.hpp>
#if !defined (HAVE_CUDA)
cv::gpu::SoftCascade::SoftCascade() : filds(0) { throw_nogpu(); }
cv::gpu::SoftCascade::SoftCascade( const string&, const float, const float) : filds(0) { throw_nogpu(); }
cv::gpu::SoftCascade::~SoftCascade() { throw_nogpu(); }
bool cv::gpu::SoftCascade::load( const string&, const float, const float) { throw_nogpu(); return false; }
void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat&, const int, Stream) { throw_nogpu(); }
#else
#include <icf.hpp>
// #include <icf.hpp>
namespace cv { namespace gpu { namespace device {
namespace icf {
void fillBins(cv::gpu::PtrStepSzb hogluv,const cv::gpu::PtrStepSzf& nangle);
}
}}}
// namespace cv { namespace gpu { namespace device {
// namespace icf {
// void fillBins(cv::gpu::PtrStepSzb hogluv,const cv::gpu::PtrStepSzf& nangle);
// }
// }}}
// namespace {
// char *itoa(long i, char* s, int /*dummy_radix*/)
// {
// sprintf(s, "%ld", i);
// return s;
// }
// }
struct cv::gpu::SoftCascade::Filds
{
// scales range
float minScale;
float maxScale;
int origObjWidth;
int origObjHeight;
GpuMat octaves;
GpuMat stages;
GpuMat nodes;
GpuMat leaves;
GpuMat features;
GpuMat levels;
// preallocated buffer 640x480x10 + 640x480
GpuMat dmem;
// 160x120x10
GpuMat shrunk;
// 161x121x10
GpuMat hogluv;
// will be removed in final version
// temporary mat for cvtColor
GpuMat luv;
// temporary mat for integral
GpuMat integralBuffer;
// temp matrix for sobel and cartToPolar
GpuMat dfdx, dfdy, angle, mag, nmag, nangle;
std::vector<float> scales;
icf::Cascade cascade;
icf::ChannelStorage storage;
enum { BOOST = 0 };
enum
{
FRAME_WIDTH = 640,
FRAME_HEIGHT = 480,
TOTAL_SCALES = 55,
CLASSIFIERS = 5,
ORIG_OBJECT_WIDTH = 64,
ORIG_OBJECT_HEIGHT = 128,
HOG_BINS = 6,
HOG_LUV_BINS = 10
};
bool fill(const FileNode &root, const float mins, const float maxs);
void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const
{
cascade.detect(hogluv, objects, stream);
}
private:
void calcLevels(const std::vector<icf::Octave>& octs,
int frameW, int frameH, int nscales);
typedef std::vector<icf::Octave>::const_iterator octIt_t;
int fitOctave(const std::vector<icf::Octave>& octs, const float& logFactor) const
{
float minAbsLog = FLT_MAX;
int res = 0;
for (int oct = 0; oct < (int)octs.size(); ++oct)
{
const icf::Octave& octave = octs[oct];
float logOctave = ::log(octave.scale);
float logAbsScale = ::fabs(logFactor - logOctave);
if(logAbsScale < minAbsLog)
{
res = oct;
minAbsLog = logAbsScale;
}
}
return res;
}
// // scales range
// float minScale;
// float maxScale;
// int origObjWidth;
// int origObjHeight;
// GpuMat octaves;
// GpuMat stages;
// GpuMat nodes;
// GpuMat leaves;
// GpuMat features;
// GpuMat levels;
// // preallocated buffer 640x480x10 + 640x480
// GpuMat dmem;
// // 160x120x10
// GpuMat shrunk;
// // 161x121x10
// GpuMat hogluv;
// // will be removed in final version
// // temporary mat for cvtColor
// GpuMat luv;
// // temporary mat for integral
// GpuMat integralBuffer;
// // temp matrix for sobel and cartToPolar
// GpuMat dfdx, dfdy, angle, mag, nmag, nangle;
// std::vector<float> scales;
// icf::Cascade cascade;
// icf::ChannelStorage storage;
// enum { BOOST = 0 };
// enum
// {
// FRAME_WIDTH = 640,
// FRAME_HEIGHT = 480,
// TOTAL_SCALES = 55,
// CLASSIFIERS = 5,
// ORIG_OBJECT_WIDTH = 64,
// ORIG_OBJECT_HEIGHT = 128,
// HOG_BINS = 6,
// HOG_LUV_BINS = 10
// };
// bool fill(const FileNode &root, const float mins, const float maxs);
// void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const
// {
// cascade.detect(hogluv, objects, stream);
// }
// private:
// void calcLevels(const std::vector<icf::Octave>& octs,
// int frameW, int frameH, int nscales);
// typedef std::vector<icf::Octave>::const_iterator octIt_t;
// int fitOctave(const std::vector<icf::Octave>& octs, const float& logFactor) const
// {
// float minAbsLog = FLT_MAX;
// int res = 0;
// for (int oct = 0; oct < (int)octs.size(); ++oct)
// {
// const icf::Octave& octave = octs[oct];
// float logOctave = ::log(octave.scale);
// float logAbsScale = ::fabs(logFactor - logOctave);
// if(logAbsScale < minAbsLog)
// {
// res = oct;
// minAbsLog = logAbsScale;
// }
// }
// return res;
// }
};
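fitOctave maps a requested detection scale onto the stored octave whose own scale is nearest in log space. The same selection over plain scale values, with hypothetical octave scales (one per doubling):

#include <cfloat>
#include <cmath>
#include <cstdio>
#include <vector>

int fitOctave(const std::vector<float>& octScales, float logFactor)
{
    float minAbsLog = FLT_MAX;
    int res = 0;
    for (int oct = 0; oct < (int)octScales.size(); ++oct)
    {
        const float d = std::fabs(logFactor - std::log(octScales[oct]));
        if (d < minAbsLog) { res = oct; minAbsLog = d; }
    }
    return res;
}

int main()
{
    const float octs[] = { 0.5f, 1.f, 2.f, 4.f };      // hypothetical scales
    const std::vector<float> v(octs, octs + 4);
    std::printf("%d\n", fitOctave(v, std::log(1.3f))); // nearest octave: 1
}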
inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, const float maxs)
{
minScale = mins;
maxScale = maxs;
// cascade properties
static const char *const SC_STAGE_TYPE = "stageType";
static const char *const SC_BOOST = "BOOST";
static const char *const SC_FEATURE_TYPE = "featureType";
static const char *const SC_ICF = "ICF";
static const char *const SC_ORIG_W = "width";
static const char *const SC_ORIG_H = "height";
static const char *const SC_OCTAVES = "octaves";
static const char *const SC_STAGES = "stages";
static const char *const SC_FEATURES = "features";
static const char *const SC_WEEK = "weakClassifiers";
static const char *const SC_INTERNAL = "internalNodes";
static const char *const SC_LEAF = "leafValues";
static const char *const SC_OCT_SCALE = "scale";
static const char *const SC_OCT_STAGES = "stageNum";
static const char *const SC_OCT_SHRINKAGE = "shrinkingFactor";
static const char *const SC_STAGE_THRESHOLD = "stageThreshold";
static const char * const SC_F_CHANNEL = "channel";
static const char * const SC_F_RECT = "rect";
// only AdaBoost is supported
std::string stageTypeStr = (string)root[SC_STAGE_TYPE];
CV_Assert(stageTypeStr == SC_BOOST);
// only HOG-like integral channel features are supported
string featureTypeStr = (string)root[SC_FEATURE_TYPE];
CV_Assert(featureTypeStr == SC_ICF);
origObjWidth = (int)root[SC_ORIG_W];
CV_Assert(origObjWidth == ORIG_OBJECT_WIDTH);
origObjHeight = (int)root[SC_ORIG_H];
CV_Assert(origObjHeight == ORIG_OBJECT_HEIGHT);
FileNode fn = root[SC_OCTAVES];
if (fn.empty()) return false;
std::vector<icf::Octave> voctaves;
std::vector<float> vstages;
std::vector<icf::Node> vnodes;
std::vector<float> vleaves;
std::vector<icf::Feature> vfeatures;
scales.clear();
// std::vector<Level> levels;
FileNodeIterator it = fn.begin(), it_end = fn.end();
int feature_offset = 0;
ushort octIndex = 0;
ushort shrinkage = 1;
for (; it != it_end; ++it)
{
FileNode fns = *it;
float scale = (float)fns[SC_OCT_SCALE];
scales.push_back(scale);
ushort nstages = saturate_cast<ushort>((int)fns[SC_OCT_STAGES]);
ushort2 size;
size.x = cvRound(ORIG_OBJECT_WIDTH * scale);
size.y = cvRound(ORIG_OBJECT_HEIGHT * scale);
shrinkage = saturate_cast<ushort>((int)fns[SC_OCT_SHRINKAGE]);
icf::Octave octave(octIndex, nstages, shrinkage, size, scale);
CV_Assert(octave.stages > 0);
voctaves.push_back(octave);
FileNode ffs = fns[SC_FEATURES];
if (ffs.empty()) return false;
fns = fns[SC_STAGES];
if (fns.empty()) return false;
// for each stage (~ decision tree with H = 2)
FileNodeIterator st = fns.begin(), st_end = fns.end();
for (; st != st_end; ++st )
{
fns = *st;
vstages.push_back((float)fns[SC_STAGE_THRESHOLD]);
fns = fns[SC_WEEK];
FileNodeIterator ftr = fns.begin(), ft_end = fns.end();
for (; ftr != ft_end; ++ftr)
{
fns = (*ftr)[SC_INTERNAL];
FileNodeIterator inIt = fns.begin(), inIt_end = fns.end();
for (; inIt != inIt_end;)
{
int feature = (int)(*(inIt +=2)++) + feature_offset;
float th = (float)(*(inIt++));
vnodes.push_back(icf::Node(feature, th));
}
fns = (*ftr)[SC_LEAF];
inIt = fns.begin(), inIt_end = fns.end();
for (; inIt != inIt_end; ++inIt)
vleaves.push_back((float)(*inIt));
}
}
st = ffs.begin(), st_end = ffs.end();
for (; st != st_end; ++st )
{
cv::FileNode rn = (*st)[SC_F_RECT];
cv::FileNodeIterator r_it = rn.begin();
uchar4 rect;
rect.x = saturate_cast<uchar>((int)*(r_it++));
rect.y = saturate_cast<uchar>((int)*(r_it++));
rect.z = saturate_cast<uchar>((int)*(r_it++));
rect.w = saturate_cast<uchar>((int)*(r_it++));
vfeatures.push_back(icf::Feature((int)(*st)[SC_F_CHANNEL], rect));
}
feature_offset += octave.stages * 3;
++octIndex;
}
// upload in gpu memory
octaves.upload(cv::Mat(1, voctaves.size() * sizeof(icf::Octave), CV_8UC1, (uchar*)&(voctaves[0]) ));
CV_Assert(!octaves.empty());
stages.upload(cv::Mat(vstages).reshape(1,1));
CV_Assert(!stages.empty());
nodes.upload(cv::Mat(1, vnodes.size() * sizeof(icf::Node), CV_8UC1, (uchar*)&(vnodes[0]) ));
CV_Assert(!nodes.empty());
leaves.upload(cv::Mat(vleaves).reshape(1,1));
CV_Assert(!leaves.empty());
features.upload(cv::Mat(1, vfeatures.size() * sizeof(icf::Feature), CV_8UC1, (uchar*)&(vfeatures[0]) ));
CV_Assert(!features.empty());
// compute levels
calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES);
CV_Assert(!levels.empty());
//init Cascade
cascade = icf::Cascade(octaves, stages, nodes, leaves, features, levels);
// allocate buffers
dmem.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1);
shrunk.create(FRAME_HEIGHT / shrinkage * HOG_LUV_BINS, FRAME_WIDTH / shrinkage, CV_8UC1);
// hogluv.create( (FRAME_HEIGHT / shrinkage + 1) * HOG_LUV_BINS, (FRAME_WIDTH / shrinkage + 1), CV_16UC1);
hogluv.create( (FRAME_HEIGHT / shrinkage + 1) * HOG_LUV_BINS, (FRAME_WIDTH / shrinkage + 1), CV_32SC1);
luv.create(FRAME_HEIGHT, FRAME_WIDTH, CV_8UC3);
integralBuffer.create(shrunk.rows + 1 * HOG_LUV_BINS, shrunk.cols + 1, CV_32SC1);
dfdx.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
dfdy.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
angle.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
mag.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
nmag.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
nangle.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
storage = icf::ChannelStorage(dmem, shrunk, hogluv, shrinkage);
return true;
}
namespace {
struct CascadeIntrinsics
{
static const float lambda = 1.099f, a = 0.89f;
static float getFor(int channel, float scaling)
{
CV_Assert(channel < 10);
if (fabs(scaling - 1.f) < FLT_EPSILON)
return 1.f;
// according to the papers of R. Benenson, M. Mathias, R. Timofte, L. Van Gool, and P. Dollár
static const float A[2][2] =
{ //channel <= 6, otherwise
{ 0.89f, 1.f}, // down
{ 1.00f, 1.f} // up
};
static const float B[2][2] =
{ //channel <= 6, otherwise
{ 1.099f / log(2), 2.f}, // down
{ 0.f, 2.f} // up
};
float a = A[(int)(scaling >= 1)][(int)(channel > 6)];
float b = B[(int)(scaling >= 1)][(int)(channel > 6)];
// printf("!!! scaling: %f %f %f -> %f\n", scaling, a, b, a * pow(scaling, b));
return a * pow(scaling, b);
}
};
}
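The tables above encode the power law r(s) = a * s^b, where the row picks the scaling direction (down vs. up) and the column the channel group (channels <= 6 vs. the LUV channels). For example, downscaling a HOG-like channel by 0.5 uses a = 0.89 and b = 1.099 / ln 2 ≈ 1.586, so the feature response is expected to retain only about 30% of its value:

#include <cmath>
#include <cstdio>

int main()
{
    const float a = 0.89f;                      // HOG-like channel, downscaling
    const float b = 1.099f / std::log(2.f);     // ~1.586
    std::printf("%f\n", a * std::pow(0.5f, b)); // ~0.2966
}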
inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector<icf::Octave>& octs,
int frameW, int frameH, int nscales)
{
CV_Assert(nscales > 1);
std::vector<icf::Level> vlevels;
float logFactor = (::log(maxScale) - ::log(minScale)) / (nscales -1);
float scale = minScale;
for (int sc = 0; sc < nscales; ++sc)
{
int width = ::std::max(0.0f, frameW - (origObjWidth * scale));
int height = ::std::max(0.0f, frameH - (origObjHeight * scale));
float logScale = ::log(scale);
int fit = fitOctave(octs, logScale);
icf::Level level(fit, octs[fit], scale, width, height);
level.scaling[0] = CascadeIntrinsics::getFor(0, level.relScale);
level.scaling[1] = CascadeIntrinsics::getFor(9, level.relScale);
if (!width || !height)
break;
else
vlevels.push_back(level);
if (::fabs(scale - maxScale) < FLT_EPSILON) break;
scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor));
// printf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale,
// level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, level.objSize.y);
// std::cout << "level " << sc
// << " octeve "
// << vlevels[sc].octave
// << " relScale "
// << vlevels[sc].relScale
// << " " << vlevels[sc].shrScale
// << " [" << (int)vlevels[sc].objSize.x
// << " " << (int)vlevels[sc].objSize.y << "] ["
// << (int)vlevels[sc].workRect.x << " " << (int)vlevels[sc].workRect.y << "]" << std::endl;
}
levels.upload(cv::Mat(1, vlevels.size() * sizeof(icf::Level), CV_8UC1, (uchar*)&(vlevels[0]) ));
}
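calcLevels walks the nscales detection scales geometrically, i.e. in equal steps of log(scale), clamped to maxScale. A minimal sketch of the progression with hypothetical bounds:

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstdio>

int main()
{
    const float minScale = 0.4f, maxScale = 5.f;   // hypothetical bounds
    const int nscales = 55;
    const float logFactor = (std::log(maxScale) - std::log(minScale)) / (nscales - 1);
    float scale = minScale;
    for (int sc = 0; sc < nscales; ++sc)
    {
        std::printf("level %d: scale %f\n", sc, scale);
        if (std::fabs(scale - maxScale) < FLT_EPSILON) break;
        scale = std::min(maxScale, std::exp(std::log(scale) + logFactor));
    }
}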
// inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, const float maxs)
// {
// minScale = mins;
// maxScale = maxs;
// // cascade properties
// static const char *const SC_STAGE_TYPE = "stageType";
// static const char *const SC_BOOST = "BOOST";
// static const char *const SC_FEATURE_TYPE = "featureType";
// static const char *const SC_ICF = "ICF";
// static const char *const SC_ORIG_W = "width";
// static const char *const SC_ORIG_H = "height";
// static const char *const SC_OCTAVES = "octaves";
// static const char *const SC_STAGES = "stages";
// static const char *const SC_FEATURES = "features";
// static const char *const SC_WEEK = "weakClassifiers";
// static const char *const SC_INTERNAL = "internalNodes";
// static const char *const SC_LEAF = "leafValues";
// static const char *const SC_OCT_SCALE = "scale";
// static const char *const SC_OCT_STAGES = "stageNum";
// static const char *const SC_OCT_SHRINKAGE = "shrinkingFactor";
// static const char *const SC_STAGE_THRESHOLD = "stageThreshold";
// static const char * const SC_F_CHANNEL = "channel";
// static const char * const SC_F_RECT = "rect";
// // only AdaBoost is supported
// std::string stageTypeStr = (string)root[SC_STAGE_TYPE];
// CV_Assert(stageTypeStr == SC_BOOST);
// // only HOG-like integral channel features are supported
// string featureTypeStr = (string)root[SC_FEATURE_TYPE];
// CV_Assert(featureTypeStr == SC_ICF);
// origObjWidth = (int)root[SC_ORIG_W];
// CV_Assert(origObjWidth == ORIG_OBJECT_WIDTH);
// origObjHeight = (int)root[SC_ORIG_H];
// CV_Assert(origObjHeight == ORIG_OBJECT_HEIGHT);
// FileNode fn = root[SC_OCTAVES];
// if (fn.empty()) return false;
// std::vector<icf::Octave> voctaves;
// std::vector<float> vstages;
// std::vector<icf::Node> vnodes;
// std::vector<float> vleaves;
// std::vector<icf::Feature> vfeatures;
// scales.clear();
// // std::vector<Level> levels;
// FileNodeIterator it = fn.begin(), it_end = fn.end();
// int feature_offset = 0;
// ushort octIndex = 0;
// ushort shrinkage = 1;
// for (; it != it_end; ++it)
// {
// FileNode fns = *it;
// float scale = (float)fns[SC_OCT_SCALE];
// scales.push_back(scale);
// ushort nstages = saturate_cast<ushort>((int)fns[SC_OCT_STAGES]);
// ushort2 size;
// size.x = cvRound(ORIG_OBJECT_WIDTH * scale);
// size.y = cvRound(ORIG_OBJECT_HEIGHT * scale);
// shrinkage = saturate_cast<ushort>((int)fns[SC_OCT_SHRINKAGE]);
// icf::Octave octave(octIndex, nstages, shrinkage, size, scale);
// CV_Assert(octave.stages > 0);
// voctaves.push_back(octave);
// FileNode ffs = fns[SC_FEATURES];
// if (ffs.empty()) return false;
// fns = fns[SC_STAGES];
// if (fns.empty()) return false;
// // for each stage (~ decision tree with H = 2)
// FileNodeIterator st = fns.begin(), st_end = fns.end();
// for (; st != st_end; ++st )
// {
// fns = *st;
// vstages.push_back((float)fns[SC_STAGE_THRESHOLD]);
// fns = fns[SC_WEEK];
// FileNodeIterator ftr = fns.begin(), ft_end = fns.end();
// for (; ftr != ft_end; ++ftr)
// {
// fns = (*ftr)[SC_INTERNAL];
// FileNodeIterator inIt = fns.begin(), inIt_end = fns.end();
// for (; inIt != inIt_end;)
// {
// int feature = (int)(*(inIt +=2)++) + feature_offset;
// float th = (float)(*(inIt++));
// vnodes.push_back(icf::Node(feature, th));
// }
// fns = (*ftr)[SC_LEAF];
// inIt = fns.begin(), inIt_end = fns.end();
// for (; inIt != inIt_end; ++inIt)
// vleaves.push_back((float)(*inIt));
// }
// }
// st = ffs.begin(), st_end = ffs.end();
// for (; st != st_end; ++st )
// {
// cv::FileNode rn = (*st)[SC_F_RECT];
// cv::FileNodeIterator r_it = rn.begin();
// uchar4 rect;
// rect.x = saturate_cast<uchar>((int)*(r_it++));
// rect.y = saturate_cast<uchar>((int)*(r_it++));
// rect.z = saturate_cast<uchar>((int)*(r_it++));
// rect.w = saturate_cast<uchar>((int)*(r_it++));
// vfeatures.push_back(icf::Feature((int)(*st)[SC_F_CHANNEL], rect));
// }
// feature_offset += octave.stages * 3;
// ++octIndex;
// }
// // upload in gpu memory
// octaves.upload(cv::Mat(1, voctaves.size() * sizeof(icf::Octave), CV_8UC1, (uchar*)&(voctaves[0]) ));
// CV_Assert(!octaves.empty());
// stages.upload(cv::Mat(vstages).reshape(1,1));
// CV_Assert(!stages.empty());
// nodes.upload(cv::Mat(1, vnodes.size() * sizeof(icf::Node), CV_8UC1, (uchar*)&(vnodes[0]) ));
// CV_Assert(!nodes.empty());
// leaves.upload(cv::Mat(vleaves).reshape(1,1));
// CV_Assert(!leaves.empty());
// features.upload(cv::Mat(1, vfeatures.size() * sizeof(icf::Feature), CV_8UC1, (uchar*)&(vfeatures[0]) ));
// CV_Assert(!features.empty());
// // compute levels
// calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES);
// CV_Assert(!levels.empty());
// //init Cascade
// cascade = icf::Cascade(octaves, stages, nodes, leaves, features, levels);
// // allocate buffers
// dmem.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1);
// shrunk.create(FRAME_HEIGHT / shrinkage * HOG_LUV_BINS, FRAME_WIDTH / shrinkage, CV_8UC1);
// // hogluv.create( (FRAME_HEIGHT / shrinkage + 1) * HOG_LUV_BINS, (FRAME_WIDTH / shrinkage + 1), CV_16UC1);
// hogluv.create( (FRAME_HEIGHT / shrinkage + 1) * HOG_LUV_BINS, (FRAME_WIDTH / shrinkage + 1), CV_32SC1);
// luv.create(FRAME_HEIGHT, FRAME_WIDTH, CV_8UC3);
// integralBuffer.create(shrunk.rows + 1 * HOG_LUV_BINS, shrunk.cols + 1, CV_32SC1);
// dfdx.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
// dfdy.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
// angle.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
// mag.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
// nmag.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
// nangle.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1);
// storage = icf::ChannelStorage(dmem, shrunk, hogluv, shrinkage);
// return true;
// }
// namespace {
// struct CascadeIntrinsics
// {
// static const float lambda = 1.099f, a = 0.89f;
// static float getFor(int channel, float scaling)
// {
// CV_Assert(channel < 10);
// if (fabs(scaling - 1.f) < FLT_EPSILON)
// return 1.f;
// // according to the papers of R. Benenson, M. Mathias, R. Timofte, L. Van Gool, and P. Dollár
// static const float A[2][2] =
// { //channel <= 6, otherwise
// { 0.89f, 1.f}, // down
// { 1.00f, 1.f} // up
// };
// static const float B[2][2] =
// { //channel <= 6, otherwise
// { 1.099f / log(2), 2.f}, // down
// { 0.f, 2.f} // up
// };
// float a = A[(int)(scaling >= 1)][(int)(channel > 6)];
// float b = B[(int)(scaling >= 1)][(int)(channel > 6)];
// // printf("!!! scaling: %f %f %f -> %f\n", scaling, a, b, a * pow(scaling, b));
// return a * pow(scaling, b);
// }
// };
// }
// inline void cv::gpu::SoftCascade::Filds::calcLevels(const std::vector<icf::Octave>& octs,
// int frameW, int frameH, int nscales)
// {
// CV_Assert(nscales > 1);
// std::vector<icf::Level> vlevels;
// float logFactor = (::log(maxScale) - ::log(minScale)) / (nscales -1);
// float scale = minScale;
// for (int sc = 0; sc < nscales; ++sc)
// {
// int width = ::std::max(0.0f, frameW - (origObjWidth * scale));
// int height = ::std::max(0.0f, frameH - (origObjHeight * scale));
// float logScale = ::log(scale);
// int fit = fitOctave(octs, logScale);
// icf::Level level(fit, octs[fit], scale, width, height);
// level.scaling[0] = CascadeIntrinsics::getFor(0, level.relScale);
// level.scaling[1] = CascadeIntrinsics::getFor(9, level.relScale);
// if (!width || !height)
// break;
// else
// vlevels.push_back(level);
// if (::fabs(scale - maxScale) < FLT_EPSILON) break;
// scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor));
// // printf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale,
// // level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, level.objSize.y);
// // std::cout << "level " << sc
// // << " octeve "
// // << vlevels[sc].octave
// // << " relScale "
// // << vlevels[sc].relScale
// // << " " << vlevels[sc].shrScale
// // << " [" << (int)vlevels[sc].objSize.x
// // << " " << (int)vlevels[sc].objSize.y << "] ["
// // << (int)vlevels[sc].workRect.x << " " << (int)vlevels[sc].workRect.y << "]" << std::endl;
// }
// levels.upload(cv::Mat(1, vlevels.size() * sizeof(icf::Level), CV_8UC1, (uchar*)&(vlevels[0]) ));
// }
cv::gpu::SoftCascade::SoftCascade() : filds(0) {}
@@ -419,97 +423,89 @@ bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, c
if (!fs.isOpened()) return false;
filds = new Filds;
Filds& flds = *filds;
if (!flds.fill(fs.getFirstTopLevelNode(), minScale, maxScale)) return false;
// Filds& flds = *filds;
// if (!flds.fill(fs.getFirstTopLevelNode(), minScale, maxScale)) return false;
return true;
}
namespace {
char *itoa(long i, char* s, int /*dummy_radix*/)
{
sprintf(s, "%ld", i);
return s;
}
}
#define USE_REFERENCE_VALUES
// #define USE_REFERENCE_VALUES
void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& /*rois*/,
GpuMat& objects, const int /*rejectfactor*/, Stream s)
{
// only color images are supported
CV_Assert(colored.type() == CV_8UC3);
// // only this window size allowed
CV_Assert(colored.cols == 640 && colored.rows == 480);
Filds& flds = *filds;
#if defined USE_REFERENCE_VALUES
cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows);
cv::FileStorage imgs("/home/kellan/testInts.xml", cv::FileStorage::READ);
char buff[33];
for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
{
cv::Mat channel;
imgs[std::string("channel") + itoa(i, buff, 10)] >> channel;
GpuMat gchannel(flds.hogluv, cv::Rect(0, 121 * i, 161, 121));
gchannel.upload(channel);
}
#else
GpuMat& dmem = flds.dmem;
cudaMemset(dmem.data, 0, dmem.step * dmem.rows);
GpuMat& shrunk = flds.shrunk;
int w = shrunk.cols;
int h = colored.rows / flds.storage.shrinkage;
// // only color images are supported
// CV_Assert(colored.type() == CV_8UC3);
std::vector<GpuMat> splited;
for(int i = 0; i < 3; ++i)
{
splited.push_back(GpuMat(dmem, cv::Rect(0, colored.rows * (7 + i), colored.cols, colored.rows)));
}
// // // only this window size allowed
// CV_Assert(colored.cols == 640 && colored.rows == 480);
GpuMat gray(dmem, cv::Rect(0, colored.rows * 10, colored.cols, colored.rows) );
// Filds& flds = *filds;
cv::gpu::cvtColor(colored, gray, CV_RGB2GRAY);
// #if defined USE_REFERENCE_VALUES
// cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows);
// cv::FileStorage imgs("/home/kellan/testInts.xml", cv::FileStorage::READ);
// char buff[33];
//create hog
cv::gpu::Sobel(gray, flds.dfdx, CV_32F, 1, 0, 3, 0.25);
cv::gpu::Sobel(gray, flds.dfdy, CV_32F, 0, 1, 3, 0.25);
// for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
// {
// cv::Mat channel;
// imgs[std::string("channel") + itoa(i, buff, 10)] >> channel;
// GpuMat gchannel(flds.hogluv, cv::Rect(0, 121 * i, 161, 121));
// gchannel.upload(channel);
// }
// #else
// GpuMat& dmem = flds.dmem;
// cudaMemset(dmem.data, 0, dmem.step * dmem.rows);
// GpuMat& shrunk = flds.shrunk;
// int w = shrunk.cols;
// int h = colored.rows / flds.storage.shrinkage;
cv::gpu::cartToPolar(flds.dfdx, flds.dfdy, flds.mag, flds.angle, true);
// std::vector<GpuMat> splited;
// for(int i = 0; i < 3; ++i)
// {
// splited.push_back(GpuMat(dmem, cv::Rect(0, colored.rows * (7 + i), colored.cols, colored.rows)));
// }
cv::gpu::multiply(flds.mag, cv::Scalar::all(1.0 / ::log(2)), flds.nmag);
cv::gpu::multiply(flds.angle, cv::Scalar::all(1.0 / 60.0), flds.nangle);
// GpuMat gray(dmem, cv::Rect(0, colored.rows * 10, colored.cols, colored.rows) );
GpuMat magCannel(dmem, cv::Rect(0, colored.rows * 6, colored.cols, colored.rows));
flds.nmag.convertTo(magCannel, CV_8UC1);
device::icf::fillBins(dmem, flds.nangle);
// cv::gpu::cvtColor(colored, gray, CV_RGB2GRAY);
// create luv
cv::gpu::cvtColor(colored, flds.luv, CV_BGR2Luv);
cv::gpu::split(flds.luv, splited);
// //create hog
// cv::gpu::Sobel(gray, flds.dfdx, CV_32F, 1, 0, 3, 0.25);
// cv::gpu::Sobel(gray, flds.dfdy, CV_32F, 0, 1, 3, 0.25);
GpuMat plane(dmem, cv::Rect(0, 0, colored.cols, colored.rows * Filds::HOG_LUV_BINS));
cv::gpu::resize(plane, flds.shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
// cv::gpu::cartToPolar(flds.dfdx, flds.dfdy, flds.mag, flds.angle, true);
// for debug purposes
// cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows);
// cv::gpu::multiply(flds.mag, cv::Scalar::all(1.0 / ::log(2)), flds.nmag);
// cv::gpu::multiply(flds.angle, cv::Scalar::all(1.0 / 60.0), flds.nangle);
for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
{
GpuMat channel(shrunk, cv::Rect(0, h * i, w, h ));
GpuMat sum(flds.hogluv, cv::Rect(0, (h + 1) * i, w + 1, h + 1));
cv::gpu::integralBuffered(channel, sum, flds.integralBuffer);
}
// GpuMat magCannel(dmem, cv::Rect(0, colored.rows * 6, colored.cols, colored.rows));
// flds.nmag.convertTo(magCannel, CV_8UC1);
// device::icf::fillBins(dmem, flds.nangle);
#endif
// // create luv
// cv::gpu::cvtColor(colored, flds.luv, CV_BGR2Luv);
// cv::gpu::split(flds.luv, splited);
// GpuMat plane(dmem, cv::Rect(0, 0, colored.cols, colored.rows * Filds::HOG_LUV_BINS));
// cv::gpu::resize(plane, flds.shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA);
// // for debug purposes
// // cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows);
// for(int i = 0; i < Filds::HOG_LUV_BINS; ++i)
// {
// GpuMat channel(shrunk, cv::Rect(0, h * i, w, h ));
// GpuMat sum(flds.hogluv, cv::Rect(0, (h + 1) * i, w + 1, h + 1));
// cv::gpu::integralBuffered(channel, sum, flds.integralBuffer);
// }
// #endif
cudaStream_t stream = StreamAccessor::getStream(s);
// detection
flds.detect(objects, stream);
// cudaStream_t stream = StreamAccessor::getStream(s);
// // detection
// flds.detect(objects, stream);
// flds.storage.frame(colored, stream);
// // flds.storage.frame(colored, stream);
}
#endif
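For reference, detectMultiScale lays its channels out as stacked 640x480 planes inside dmem: planes 0..5 receive the six HOG bins (via fillBins), plane 6 the gradient magnitude, planes 7..9 the L, U and V components, and plane 10 serves as a grayscale scratch plane. A trivial helper for the plane offsets (a sketch using the constants of this file):

#include <cstdio>

int channelRow(int channel, int frameHeight) { return channel * frameHeight; }

int main()
{
    const int H = 480;                                                // FRAME_HEIGHT
    std::printf("magnitude plane at row %d\n", channelRow(6, H));     // 2880
    std::printf("gray scratch plane at row %d\n", channelRow(10, H)); // 4800
}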