Merge cuda-geek/soft-cascade-gpu into cuda-dev

12 years ago · 209f16455d
parent 1712d0930c 05cd88ae42
commit 209f16455d
16 changed files with 2199 additions and 17 deletions
--- a/modules/gpu/doc/object_detection.rst
+++ b/modules/gpu/doc/object_detection.rst
@ -199,6 +199,121 @@ Returns block descriptors computed for the whole image.
 The function is mainly used to learn the classifier.
 Soft Cascade Classifier
 ==========================
 Soft Cascade Classifier for Object Detection
 ----------------------------------------------------------
 Cascade detectors have been shown to operate extremely rapidly, with high accuracy, and have important applications in different spheres. The initial goal for this cascade implementation was a fast and accurate pedestrian detector, but it is also useful in general. Soft cascade is trained with AdaBoost. But instead of training a sequence of stages, the soft cascade is trained as one long stage of T weak classifiers. Soft cascade is formulated as follows:
 .. math::
    \texttt{H}(x) = \sum _{\texttt{t}=1..\texttt{T}} {\texttt{s}_t(x)}
 where :math:`\texttt{s}_t(x) = \alpha_t\texttt{h}_t(x)` are the set of thresholded weak classifiers selected during AdaBoost training scaled by the associated weights. Let
 .. math::
    \texttt{H}_t(x) = \sum _{\texttt{i}=1..\texttt{t}} {\texttt{s}_i(x)}
 be the partial sum of sample responses up to and including the :math:`t`-th weak classifier. The function :math:`\texttt{H}_t(x)` of :math:`t` for sample :math:`x` is called the *sample trace*.
 After each weak classifier evaluation, the sample trace at the point :math:`t` is compared with the rejection threshold :math:`r_t`. The sequence of :math:`r_t` is called the *rejection trace*.
 The sample is rejected if its trace falls below the rejection threshold. So the stageless cascade allows rejecting a non-object sample as soon as possible. Another meaning of the sample trace is the confidence with which the sample is recognized as the desired object. At each :math:`t` that confidence depends on all previous weak classifiers. This feature of the soft cascade results in more accurate detection. The original formulation of soft cascade can be found in [BJ05]_.
 .. [BJ05] Lubomir Bourdev and Jonathan Brandt. Robust Object Detection Via Soft Cascade. IEEE CVPR, 2005.
 .. [BMTG12] Rodrigo Benenson, Markus Mathias, Radu Timofte and Luc Van Gool. Pedestrian detection at 100 frames per second. IEEE CVPR, 2012.
 SCascade
 ----------------
 .. ocv:class:: SCascade : public Algorithm
 Implementation of soft (stageless) cascaded detector. ::
    class CV_EXPORTS SCascade : public Algorithm
    {
        struct CV_EXPORTS Detection
        {
              ushort x;
              ushort y;
              ushort w;
              ushort h;
              float confidence;
              int kind;
              enum {PEDESTRIAN = 0};
        };
        SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, const int rejfactor = 1);
        virtual ~SCascade();
        virtual bool load(const FileNode& fn);
        virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const;
        virtual void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const;
    };
 SCascade::SCascade
 --------------------------
 An empty cascade will be created.
 .. ocv:function:: SCascade::SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, const int rejfactor = 1)
    :param minScale: a minimum scale relative to the original size of the image on which cascade will be applied.
    :param maxScale: a maximum scale relative to the original size of the image on which cascade will be applied.
    :param scales: a number of scales from minScale to maxScale.
    :param rejfactor: used for non maximum suppression.
 SCascade::~SCascade
 ---------------------------
 Destructor for SCascade.
 .. ocv:function:: SCascade::~SCascade()
 SCascade::load
 --------------------------
 Load cascade from FileNode.
 .. ocv:function:: bool SCascade::load(const FileNode& fn)
    :param fn: File node from which the soft cascade is read.
 SCascade::detect
 --------------------------
 Apply cascade to an input frame and return the vector of Detection objects.
 .. ocv:function:: void SCascade::detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const
    :param image: a frame on which detector will be applied.
    :param rois: a mask of regions of interest generated by genRoi. Only the objects that fall into one of the regions will be returned.
    :param objects: an output array of Detections represented as GpuMat of detections (SCascade::Detection). The first element of the matrix is  actually a count of detections.
    :param stream: a high-level CUDA stream abstraction used for asynchronous execution.
 SCascade::genRoi
 --------------------------
 Convert an ROI matrix into the form suitable for the detect method.
 .. ocv:function:: void SCascade::genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const
    :param roi: an input matrix of the same size as the image. A non-zero value means that the detector should be executed at that point.
    :param mask: an output mask
    :param stream: a high-level CUDA stream abstraction used for asynchronous execution.
 gpu::CascadeClassifier_GPU
 --------------------------
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@ -1532,6 +1532,76 @@ public:
    int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, Size maxObjectSize, Size minSize = Size(), double scaleFactor = 1.1, int minNeighbors = 4);
 };
 // ======================== GPU version for soft cascade ===================== //
 // Implementation of soft (stageless) cascaded detector.
 class CV_EXPORTS SCascade : public Algorithm
 {
 public:
    // Representation of a single detector result.
    struct CV_EXPORTS Detection
    {
        // presumably the origin (x, y) and size (w, h) of the detected box - TODO confirm units
        ushort x;
        ushort y;
        ushort w;
        ushort h;
        // Detector response for this box.
        float confidence;
        // Object class tag; PEDESTRIAN is the only value declared here.
        int kind;
        enum {PEDESTRIAN = 0};
    };
    // Rejection criteria constants; DEFAULT is an alias of NO_REJECT (PASCAL is commented out).
    enum { NO_REJECT = 1, DOLLAR = 2, /*PASCAL = 4,*/ DEFAULT = NO_REJECT};
    // An empty cascade will be created.
    // Param minScale is a minimum scale relative to the original size of the image on which cascade will be applied.
    // Param maxScale is a maximum scale relative to the original size of the image on which cascade will be applied.
    // Param scales is a number of scales from minScale to maxScale.
    // Param rejCriteria is used for NMS. The default value 1 equals NO_REJECT;
    //    presumably one of the enum values above is expected - TODO confirm.
    SCascade(const double minScale = 0.4, const double maxScale = 5., const int scales = 55, const int rejCriteria = 1);
    virtual ~SCascade();
    // Algorithm meta-information accessor.
    cv::AlgorithmInfo* info() const;
    // Load cascade from FileNode.
    // Param fn is a root node for cascade. Should be <cascade>.
    virtual bool load(const FileNode& fn);
    // Load cascade config.
    virtual void read(const FileNode& fn);
    // Return the matrix of detected objects.
    // Param image is a frame on which detector will be applied.
    // Param rois is a regions of interests mask generated by genRoi.
    //    Only the objects that fall into one of the regions will be returned.
    // Param objects is an output array of Detections represented as GpuMat of detections (SCascade::Detection)
    //    The first element of the matrix is  actually a count of detections.
    // Param stream is a high-level CUDA stream abstraction used for asynchronous execution
    virtual void detect(InputArray image, InputArray rois, OutputArray objects, Stream& stream = Stream::Null()) const;
    // Convert ROI matrix into the form suitable for the detect method.
    // Param roi is an input matrix of the same size as the image.
    //    A non-zero value means that the detector should be executed at that point.
    // Param mask is an output mask
    // Param stream is a high-level CUDA stream abstraction used for asynchronous execution
    virtual void genRoi(InputArray roi, OutputArray mask, Stream& stream = Stream::Null()) const;
 private:
    // Opaque implementation state (defined in the source file).
    struct Fields;
    Fields* fields;
    // Copies of the constructor arguments.
    double minScale;
    double maxScale;
    int scales;
    int rejCriteria;
 };
 CV_EXPORTS bool initModule_gpu(void);
 ////////////////////////////////// SURF //////////////////////////////////////////
 class CV_EXPORTS SURF_GPU
--- a/modules/gpu/perf/perf_objdetect.cpp
+++ b/modules/gpu/perf/perf_objdetect.cpp
@ -89,7 +89,6 @@ PERF_TEST_P(HOG, CalTech, Values<string>("gpu/caltech/image_00000009_0.png", "gp
    SANITY_CHECK(found_locations);
 }
 ///////////////////////////////////////////////////////////////
 // HaarClassifier
@ -181,4 +180,4 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier,
    }
 }
-} // namespace
+} // namespace
--- a/modules/gpu/perf/perf_softcascade.cpp
+++ b/modules/gpu/perf/perf_softcascade.cpp
@ -0,0 +1,292 @@
 #include "perf_precomp.hpp"
 // Declares a value-parameterized perf test class `fixture_name` derived from `fixture`.
 // The generated TEST_P first runs PerfTestBody() (whose body follows the macro invocation)
 // and then dispatches to __gpu() or __cpu() depending on PERF_RUN_GPU().
 // The two hooks are defined with RUN_GPU / RUN_CPU / NO_CPU below.
 #define GPU_PERF_TEST_P(fixture, name, params)  \
    class fixture##_##name : public fixture {\
     public:\
      fixture##_##name() {}\
     protected:\
        virtual void __cpu();\
        virtual void __gpu();\
      virtual void PerfTestBody();\
    };\
    TEST_P(fixture##_##name, name /*perf*/){ RunPerfTestBody(); if (PERF_RUN_GPU()) __gpu(); else __cpu();}\
    INSTANTIATE_TEST_CASE_P(/*none*/, fixture##_##name, params);\
    void fixture##_##name::PerfTestBody()
 // Opens the definition of the CPU hook of a test declared with GPU_PERF_TEST_P.
 #define RUN_CPU(fixture, name)\
    void fixture##_##name::__cpu()
 // Opens the definition of the GPU hook of a test declared with GPU_PERF_TEST_P.
 #define RUN_GPU(fixture, name)\
    void fixture##_##name::__gpu()
 // Defines a CPU hook that fails the test: used when only a GPU implementation exists.
 #define NO_CPU(fixture, name)\
 void fixture##_##name::__cpu() { FAIL() << "No such CPU implementation analogy";}
 namespace {
    // Orders detections lexicographically by (x, y, w, h); confidence and kind are not compared.
    struct DetectionLess
    {
        bool operator()(const cv::gpu::SCascade::Detection& lhs,
            const cv::gpu::SCascade::Detection& rhs) const
        {
            if (lhs.x != rhs.x)
                return lhs.x < rhs.x;
            if (lhs.y != rhs.y)
                return lhs.y < rhs.y;
            if (lhs.w != rhs.w)
                return lhs.w < rhs.w;
            return lhs.h < rhs.h;
        }
    };
    // Copies the device buffer to the host and sorts it in place, treating the first row
    // as a sequence of Detection records, so that the result does not depend on the order
    // in which the GPU emitted the detections.
    cv::Mat sortDetections(cv::gpu::GpuMat& objects)
    {
        typedef cv::gpu::SCascade::Detection Detection;
        cv::Mat host(objects);
        char* const rowStart = host.ptr<char>(0);
        Detection* const first = reinterpret_cast<Detection*>(rowStart);
        Detection* const last = reinterpret_cast<Detection*>(rowStart + host.cols);
        std::sort(first, last, DetectionLess());
        return host;
    }
 }
 // Test parameters: (cascade XML path, input image path).
 typedef std::tr1::tuple<std::string, std::string> fixture_t;
 typedef perf::TestBaseWithParam<fixture_t> SCascadeTest;
 // Full-frame detection benchmark; the common PerfTestBody is intentionally empty,
 // all work happens in the GPU hook below.
 GPU_PERF_TEST_P(SCascadeTest, detect,
    testing::Combine(
        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
        testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png"))))
 { }
 RUN_GPU(SCascadeTest, detect)
 {
    // Load the input frame and upload it to the device.
    cv::Mat cpu = readImage (GET_PARAM(1));
    ASSERT_FALSE(cpu.empty());
    cv::gpu::GpuMat colored(cpu);
    // Load the cascade description.
    cv::gpu::SCascade cascade;
    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
    ASSERT_TRUE(fs.isOpened());
    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
    // Output is a single-row byte buffer sized for 10000 Detection records;
    // the ROI mask enables every pixel of the frame.
    cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1), trois;
    rois.setTo(1);
    cascade.genRoi(rois, trois);
    // One call outside the timed loop (warm-up), then the measured calls.
    cascade.detect(colored, trois, objectBoxes);
    TEST_CYCLE()
    {
        cascade.detect(colored, trois, objectBoxes);
    }
    // Detections are sorted before the sanity check to make the comparison order-independent.
    SANITY_CHECK(sortDetections(objectBoxes));
 }
 NO_CPU(SCascadeTest, detect)
 // Returns one of ten predefined regions of interest.
 // idx is used unchecked and must lie in [0, 9].
 static cv::Rect getFromTable(int idx)
 {
    // {x, y, width, height} before the common factor of 4 is applied.
    static const int table[][4] =
    {
        { 65, 20, 35, 80},
        { 95, 35, 45, 40},
        { 45, 35, 45, 40},
        { 25, 27, 50, 45},
        {100, 50, 45, 40},
        { 60, 30, 45, 40},
        { 40, 55, 50, 40},
        { 48, 37, 72, 80},
        { 48, 32, 85, 58},
        { 48,  0, 32, 27}
    };
    const int factor = 4;
    const int* const entry = table[idx];
    return cv::Rect(entry[0] * factor, entry[1] * factor, entry[2] * factor, entry[3] * factor);
 }
 // Test parameters: (cascade XML path, input image path, integer ROI parameter).
 typedef std::tr1::tuple<std::string, std::string, int> roi_fixture_t;
 typedef perf::TestBaseWithParam<roi_fixture_t> SCascadeTestRoi;
 // Detection restricted to a union of randomly chosen table ROIs;
 // the third parameter (0..4) is the number of ROIs to enable.
 GPU_PERF_TEST_P(SCascadeTestRoi, detectInRoi,
    testing::Combine(
        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
        testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")),
        testing::Range(0, 5)))
 {}
 RUN_GPU(SCascadeTestRoi, detectInRoi)
 {
    // Load the input frame and upload it to the device.
    cv::Mat cpu = readImage (GET_PARAM(1));
    ASSERT_FALSE(cpu.empty());
    cv::gpu::GpuMat colored(cpu);
    // Load the cascade description.
    cv::gpu::SCascade cascade;
    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
    ASSERT_TRUE(fs.isOpened());
    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
    // Output byte buffer of 16384 * 20 bytes; the ROI mask starts fully disabled.
    cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1);
    rois.setTo(0);
    int nroi = GET_PARAM(2);
    // NOTE(review): default-constructed cv::RNG - presumably a fixed seed, so the
    // selection is reproducible between runs; confirm.
    cv::RNG rng;
    for (int i = 0; i < nroi; ++i)
    {
        // Enable a randomly picked rectangle from the table (index in [0, 10)).
        cv::Rect r = getFromTable(rng(10));
        cv::gpu::GpuMat sub(rois, r);
        sub.setTo(1);
    }
    cv::gpu::GpuMat trois;
    cascade.genRoi(rois, trois);
    // One call outside the timed loop (warm-up), then the measured calls.
    cascade.detect(colored, trois, objectBoxes);
    TEST_CYCLE()
    {
        cascade.detect(colored, trois, objectBoxes);
    }
    SANITY_CHECK(sortDetections(objectBoxes));
 }
 NO_CPU(SCascadeTestRoi, detectInRoi)
 // Detection restricted to exactly one table ROI; the third parameter (0..9)
 // is the index of the rectangle returned by getFromTable.
 GPU_PERF_TEST_P(SCascadeTestRoi, detectEachRoi,
    testing::Combine(
        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
        testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png")),
        testing::Range(0, 10)))
 {}
 RUN_GPU(SCascadeTestRoi, detectEachRoi)
 {
    // Load the input frame and upload it to the device.
    cv::Mat cpu = readImage (GET_PARAM(1));
    ASSERT_FALSE(cpu.empty());
    cv::gpu::GpuMat colored(cpu);
    // Load the cascade description.
    cv::gpu::SCascade cascade;
    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
    ASSERT_TRUE(fs.isOpened());
    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
    // Output byte buffer of 16384 * 20 bytes; the ROI mask starts fully disabled.
    cv::gpu::GpuMat objectBoxes(1, 16384 * 20, CV_8UC1), rois(colored.size(), CV_8UC1);
    rois.setTo(0);
    // Enable only the rectangle selected by the test parameter.
    int idx = GET_PARAM(2);
    cv::Rect r = getFromTable(idx);
    cv::gpu::GpuMat sub(rois, r);
    sub.setTo(1);
    cv::gpu::GpuMat trois;
    cascade.genRoi(rois, trois);
    // One call outside the timed loop (warm-up), then the measured calls.
    cascade.detect(colored, trois, objectBoxes);
    TEST_CYCLE()
    {
        cascade.detect(colored, trois, objectBoxes);
    }
    SANITY_CHECK(sortDetections(objectBoxes));
 }
 NO_CPU(SCascadeTestRoi, detectEachRoi)
 // Detection benchmark that feeds channel data read from an XML file
 // (second parameter) instead of an image; the common PerfTestBody is empty.
 GPU_PERF_TEST_P(SCascadeTest, detectOnIntegral,
    testing::Combine(
        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
        testing::Values(std::string("cv/cascadeandhog/integrals.xml"))))
 { }
    // Formats a long as its decimal string representation.
    static std::string itoa(long i)
    {
        // The buffer is an automatic variable (not a function-local static), so the helper
        // is reentrant; 65 bytes is more than enough for any "%ld" rendering of a long.
        char s[65];
        sprintf(s, "%ld", i);
        return std::string(s);
    }
 RUN_GPU(SCascadeTest, detectOnIntegral)
 {
    cv::FileStorage fsi(perf::TestBase::getDataPath(GET_PARAM(1)), cv::FileStorage::READ);
    ASSERT_TRUE(fsi.isOpened());
    // Ten 161x121 CV_32SC1 channels stacked vertically in one device matrix.
    cv::gpu::GpuMat hogluv(121 * 10, 161, CV_32SC1);
    for (int i = 0; i < 10; ++i)
    {
        // Read node "channel<i>" and upload it into the i-th horizontal band.
        cv::Mat channel;
        fsi[std::string("channel") + itoa(i)] >> channel;
        cv::gpu::GpuMat gchannel(hogluv, cv::Rect(0, 121 * i, 161, 121));
        gchannel.upload(channel);
    }
    // Load the cascade description.
    cv::gpu::SCascade cascade;
    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
    ASSERT_TRUE(fs.isOpened());
    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
    // Output buffer sized for 10000 Detection records; the 640x480 ROI mask enables every pixel.
    cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(cv::Size(640, 480), CV_8UC1), trois;
    rois.setTo(1);
    cascade.genRoi(rois, trois);
    // One call outside the timed loop (warm-up), then the measured calls.
    cascade.detect(hogluv, trois, objectBoxes);
    TEST_CYCLE()
    {
        cascade.detect(hogluv, trois, objectBoxes);
    }
    SANITY_CHECK(sortDetections(objectBoxes));
 }
 NO_CPU(SCascadeTest, detectOnIntegral)
 // Full-frame detection benchmark issued through an explicit gpu::Stream;
 // the common PerfTestBody is empty, all work happens in the GPU hook below.
 GPU_PERF_TEST_P(SCascadeTest, detectStream,
    testing::Combine(
        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
        testing::Values(std::string("cv/cascadeandhog/bahnhof/image_00000000_0.png"))))
 { }
 RUN_GPU(SCascadeTest, detectStream)
 {
    // Load the input frame and upload it to the device.
    cv::Mat cpu = readImage (GET_PARAM(1));
    ASSERT_FALSE(cpu.empty());
    cv::gpu::GpuMat colored(cpu);
    // Load the cascade description.
    cv::gpu::SCascade cascade;
    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(0)), cv::FileStorage::READ);
    ASSERT_TRUE(fs.isOpened());
    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));
    // Output buffer sized for 10000 Detection records; the ROI mask enables every pixel.
    cv::gpu::GpuMat objectBoxes(1, 10000 * sizeof(cv::gpu::SCascade::Detection), CV_8UC1), rois(colored.size(), CV_8UC1), trois;
    rois.setTo(1);
    cv::gpu::Stream s;
    cascade.genRoi(rois, trois, s);
    // One call outside the timed loop (warm-up), then the measured calls.
    cascade.detect(colored, trois, objectBoxes, s);
    TEST_CYCLE()
    {
        cascade.detect(colored, trois, objectBoxes, s);
    }
    // Wait for the work queued on the stream through the gpu::Stream wrapper rather than
    // a raw CUDA runtime call, so this file needs no CUDA runtime header.
    s.waitForCompletion();
    SANITY_CHECK(sortDetections(objectBoxes));
 }
 NO_CPU(SCascadeTest, detectStream)
--- a/modules/gpu/src/cuda/bf_knnmatch.cu
+++ b/modules/gpu/src/cuda/bf_knnmatch.cu
@ -1034,7 +1034,7 @@ namespace cv { namespace gpu { namespace device
                cudaSafeCall( cudaDeviceSynchronize() );
        }
-        void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream)
+        void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int /*cc*/, cudaStream_t stream)
        {
            findKnnMatch<256>(k, static_cast<PtrStepSzi>(trainIdx), static_cast<PtrStepSzf>(distance), allDist, stream);
        }
--- a/modules/gpu/src/cuda/icf-sc.cu
+++ b/modules/gpu/src/cuda/icf-sc.cu
@ -0,0 +1,370 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other materials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #include <opencv2/gpu/device/common.hpp>
 #include <icf.hpp>
 #include <float.h>
 #include <stdio.h>
 namespace cv { namespace gpu { namespace device {
 namespace icf {
    // ToDo: use textures or uncached load instruction.
    // Kernel: one thread per pixel (x, y) derived from the block/thread indices.
    // Reads the value stored in `angle` at (y, x), truncates it to an integer bin index and
    // copies the magnitude at (y, x) into row (fh * bin + y) of `hog`.
    //   mag, hog  - byte planes addressed with the same pitch hogPitch
    //   angle     - float plane addressed with pitch angPitch (in elements)
    //   fh        - row distance between consecutive bin planes in `hog`
    // No bounds checks are performed: the launch grid must not exceed the image.
    __global__ void magToHist(const uchar* __restrict__ mag,
                              const float* __restrict__ angle, const int angPitch,
                                    uchar* __restrict__ hog,   const int hogPitch, const int fh)
    {
        const int y = blockIdx.y * blockDim.y + threadIdx.y;
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int bin = (int)(angle[y * angPitch + x]);
        const uchar val = mag[y * hogPitch + x];
        hog[((fh * bin) + y) * hogPitch + x] = val;
    }
    // Host launcher for magToHist.
    //   hogluv - byte matrix; the magnitude plane is taken from row (fh * bins) and the
    //            bin planes are written starting at row 0
    //   nangle - float matrix with the per-pixel bin values
    //   fw, fh - frame width and height; the grid is (fw / 32, fh / 8) blocks of 32x8
    //            threads, so any remainder columns/rows are not processed
    //   bins   - number of bin planes preceding the magnitude plane
    //   stream - CUDA stream for the launch; the call blocks only for the null stream
    void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
                  const int fw,  const int fh, const int bins, cudaStream_t stream )
    {
        const uchar* mag = (const uchar*)hogluv.ptr(fh * bins);
        uchar* hog = (uchar*)hogluv.ptr();
        const float* angle = (const float*)nangle.ptr();
        dim3 block(32, 8);
        dim3 grid(fw / 32, fh / 8);
        magToHist<<<grid, block, 0, stream>>>(mag, angle, nangle.step / sizeof(float), hog, hogluv.step, fh);
        // Launch errors (e.g. an invalid configuration) are reported immediately,
        // so check them for asynchronous launches as well.
        cudaSafeCall( cudaGetLastError() );
        if (!stream)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
    // Area of the intersection of two detection boxes given as (x, y, w, h);
    // returns 0 when the computed intersection width or height is negative.
    __device__ __forceinline__ float overlapArea(const Detection &a, const Detection &b)
    {
        const int dx = ::min(a.x + a.w, b.x + b.w) - ::max(a.x, b.x);
        const int dy = ::min(a.y + a.h, b.y + b.h) - ::max(a.y, b.y);
        if (dx < 0 || dy < 0)
            return 0.f;
        return (float)(dx * dy);
    }
    // Detections viewed as a row of uint4 texels (one texel is reinterpreted as one Detection).
    texture<uint4,  cudaTextureType2D, cudaReadModeElementType> tdetections;
    // Kernel: marks detections that lose a pairwise overlap comparison.
    //   n        - device pointer to the number of texels to consider
    //   overlaps - output flags; overlaps[k] is set to 1 when texel k is suppressed
    // Each thread handles texels idx + 1, idx + 1 + 192, ... (stride 192 matches the block
    // size used by suppress()); texel 0 is never visited.
    __global__ void overlap(const uint* n, uchar* overlaps)
    {
        const int idx = threadIdx.x;
        const int total = *n;
        for (int i = idx + 1; i < total; i += 192)
        {
            const uint4 _a = tex2D(tdetections, i, 0);
            const Detection& a = *((Detection*)(&_a));
            bool excluded = false;
            // Compare with every later detection.
            for (int j = i + 1; j < total; ++j)
            {
                const uint4 _b = tex2D(tdetections, j, 0);
                const Detection& b = *((Detection*)(&_b));
                // Intersection area relative to the smaller of the two box areas.
                float ovl = overlapArea(a, b) / ::min(a.w * a.h, b.w * b.h);
                if (ovl > 0.65f)
                {
                    // The detection with the lower (or equal, for a) confidence is suppressed.
                    int suppessed = (a.confidence > b.confidence)? j : i;
                    overlaps[suppessed] = 1;
                    excluded = excluded || (suppessed == i);
                }
                // Warp vote: stop scanning once every participating thread's own
                // detection has already been suppressed.
                if (__all(excluded)) break;
            }
        }
    }
    // Kernel: copies detections whose flag in `overlaps` is clear into `suppressed`.
    //   n          - device pointer to the number of entries to scan
    //   overlaps   - flags written by the overlap kernel
    //   ctr        - output counter advanced with atomicInc
    //   suppressed - output array of uint4 records
    // Threads stride by 192 (the block size used by suppress()).
    __global__ void collect(const uint* n, uchar* overlaps, uint* ctr, uint4* suppressed)
    {
        const int idx = threadIdx.x;
        const int total = *n;
        for (int i = idx; i < total; i += 192)
        {
            // NOTE(review): the flag is read at index i while the detection is fetched from
            // texel i + 1, whereas overlap() writes flags at the texel index itself -
            // confirm that this offset is intended.
            if (!overlaps[i])
            {
                // atomicInc wraps the counter back to 0 once it reaches 50, so at most
                // 51 output slots are ever addressed.
                int oidx = atomicInc(ctr, 50);
                suppressed[oidx] = tex2D(tdetections, i + 1, 0);
            }
        }
    }
    // Host launcher for the two-pass suppression (overlap, then collect).
    //   objects     - detections buffer, bound to the tdetections texture as uint4 texels
    //   overlaps    - scratch flag buffer written by overlap() and read by collect()
    //   ndetections - device location of the detection count
    //   suppressed  - output buffer: counter at the start, records from the second uint4 slot
    //   stream      - CUDA stream for both launches; the call blocks only for the null stream
    // A single block of 192 threads is used, matching the stride hard-coded in the kernels.
    void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections,
        PtrStepSzb suppressed, cudaStream_t stream)
    {
        int block = 192;
        int grid = 1;
        cudaChannelFormatDesc desc = cudaCreateChannelDesc<uint4>();
        size_t offset;
        cudaSafeCall( cudaBindTexture2D(&offset, tdetections, objects.data, desc, objects.cols / sizeof(uint4), objects.rows, objects.step));
        // Launch on the caller's stream so the kernels are ordered with the work the caller
        // queued there, and check launch errors for asynchronous launches as well.
        overlap<<<grid, block, 0, stream>>>((uint*)ndetections.ptr(0), (uchar*)overlaps.ptr(0));
        cudaSafeCall( cudaGetLastError());
        collect<<<grid, block, 0, stream>>>((uint*)ndetections.ptr(0), (uchar*)overlaps.ptr(0), (uint*)suppressed.ptr(0), ((uint4*)suppressed.ptr(0)) + 1);
        cudaSafeCall( cudaGetLastError());
        if (!stream)
            cudaSafeCall( cudaDeviceSynchronize());
    }
    // Inclusive prefix sum of `impact` across the threads that share threadIdx.y
    // (indexed by threadIdx.x). On return each thread holds the sum of the values of
    // threads 0..threadIdx.x. Policy supplies WARP, STA_X and STA_Y.
    // `__device` is not defined in this file - presumably a macro from icf.hpp.
    template<typename Policy>
    struct PrefixSum
    {
    __device static void apply(float& impact)
        {
    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
    #pragma unroll
            // scan on shuffl functions
            for (int i = 1; i < Policy::WARP; i *= 2)
            {
                const float n = __shfl_up(impact, i, Policy::WARP);
                if (threadIdx.x >= i)
                    impact += n;
            }
    #else
            // Shared-memory fallback: offsets 1, 2, 4, 8, 16 cover 32 threads per row.
            // There are no barriers between the steps - presumably relies on the row being
            // executed by a single warp in lock-step together with `volatile`; confirm.
            __shared__ volatile float ptr[Policy::STA_X * Policy::STA_Y];
            const int idx = threadIdx.y * Policy::STA_X + threadIdx.x;
            ptr[idx] = impact;
            if ( threadIdx.x >=  1) ptr [idx ] = (ptr [idx -  1] + ptr [idx]);
            if ( threadIdx.x >=  2) ptr [idx ] = (ptr [idx -  2] + ptr [idx]);
            if ( threadIdx.x >=  4) ptr [idx ] = (ptr [idx -  4] + ptr [idx]);
            if ( threadIdx.x >=  8) ptr [idx ] = (ptr [idx -  8] + ptr [idx]);
            if ( threadIdx.x >= 16) ptr [idx ] = (ptr [idx - 16] + ptr [idx]);
            impact = ptr[idx];
    #endif
        }
    };
    // Integer channel data sampled by get<>().
    texture<int,  cudaTextureType2D, cudaReadModeElementType> thogluv;
    // Scales node.rect in place by level.relScale and returns the node threshold adjusted
    // for the rounding of the rectangle. Generic variant: rect is treated as two corners,
    // (x, y) and (z, w), so its area is (z - x) * (w - y).
    //   node.threshold - low 28 bits hold the threshold value; the top 4 bits choose the
    //                    scaling factor: level.scaling[1] when they exceed 6, else scaling[0]
    template<bool isUp>
    __device__ __forceinline__ float rescale(const Level& level, Node& node)
    {
        uchar4& scaledRect = node.rect;
        float relScale = level.relScale;
        // Area before scaling; must be computed before the rectangle is overwritten.
        float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
        // rescale
        scaledRect.x = __float2int_rn(relScale * scaledRect.x);
        scaledRect.y = __float2int_rn(relScale * scaledRect.y);
        scaledRect.z = __float2int_rn(relScale * scaledRect.z);
        scaledRect.w = __float2int_rn(relScale * scaledRect.w);
        // Area actually obtained after rounding versus the ideally scaled area.
        float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y);
        const float expected_new_area = farea * relScale * relScale;
        // A degenerate (zero-area) scaled rectangle leaves the threshold uncorrected.
        float approx = (sarea == 0)? 1: __fdividef(sarea, expected_new_area);
        float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx * level.scaling[(node.threshold >> 28) > 6];
        return rootThreshold;
    }
    // Specialization for isUp == true: same contract as the generic rescale, but rect is
    // treated as origin (x, y) plus extent (z, w), so its area is z * w.
    // NOTE(review): unlike the generic variant there is no guard for a zero area before
    // the division - confirm that rectangles here can never be degenerate.
    template<>
    __device__ __forceinline__ float rescale<true>(const Level& level, Node& node)
    {
        uchar4& scaledRect = node.rect;
        float relScale = level.relScale;
        // Area before scaling; must be computed before the rectangle is overwritten.
        float farea = scaledRect.z * scaledRect.w;
        // rescale
        scaledRect.x = __float2int_rn(relScale * scaledRect.x);
        scaledRect.y = __float2int_rn(relScale * scaledRect.y);
        scaledRect.z = __float2int_rn(relScale * scaledRect.z);
        scaledRect.w = __float2int_rn(relScale * scaledRect.w);
        // Area actually obtained after rounding versus the ideally scaled area.
        float sarea = scaledRect.z * scaledRect.w;
        const float expected_new_area = farea * relScale * relScale;
        float approx = __fdividef(sarea, expected_new_area);
        float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx * level.scaling[(node.threshold >> 28) > 6];
        return rootThreshold;
    }
    // Four-sample rectangle sum from thogluv. Generic variant: `area` holds two corners,
    // (area.x, area.y) and (area.z, area.w), both relative to (x, y).
    template<bool isUp>
    __device__ __forceinline__ int get(int x, int y, uchar4 area)
    {
        const int atXY = tex2D(thogluv, x + area.x, y + area.y);
        const int atZY = tex2D(thogluv, x + area.z, y + area.y);
        const int atZW = tex2D(thogluv, x + area.z, y + area.w);
        const int atXW = tex2D(thogluv, x + area.x, y + area.w);
        return (atXY - atZY + atZW - atXW);
    }
    // Specialization for isUp == true: `area` holds an origin (area.x, area.y) relative to
    // (x, y) and an extent (area.z, area.w) relative to that origin.
    template<>
    __device__ __forceinline__ int get<true>(int x, int y, uchar4 area)
    {
        const int x0 = x + area.x;
        const int y0 = y + area.y;
        const int x1 = x0 + area.z;
        const int y1 = y0 + area.w;
        const int at00 = tex2D(thogluv, x0, y0);
        const int at10 = tex2D(thogluv, x1, y0);
        const int at11 = tex2D(thogluv, x1, y1);
        const int at01 = tex2D(thogluv, x0, y1);
        return (at00 - at10 + at11 - at01);
    }
    // ROI mask sampled as float2 texels (8 bytes of per-row flags per fetch).
    texture<float2,  cudaTextureType2D, cudaReadModeElementType> troi;
 // Device-side evaluation of the soft cascade for one window position and one level.
 //   objects     - output array of detections
 //   ndetections - wrap limit passed to atomicInc for the output counter
 //   ctr         - output counter
 //   downscales  - index of the first level handled by this launch (blockIdx.z is added)
 // x comes from blockIdx.x, y from blockIdx.y/threadIdx.y; threadIdx.x enumerates the
 // weak classifiers evaluated in parallel within one step.
 template<typename Policy>
 template<bool isUp>
 __device void CascadeInvoker<Policy>::detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const
 {
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const int x = blockIdx.x;
    // level descriptor shared by the whole block
    __shared__ Level level;
    // ROI flags for the rows covered by this block, one byte per threadIdx.y
    __shared__ volatile char roiCache[Policy::STA_Y];
    // A single thread fetches one float2 (8 bytes) of flags - assumes Policy::STA_Y >= 8; TODO confirm.
    if (!threadIdx.y && !threadIdx.x)
        ((float2*)roiCache)[threadIdx.x] = tex2D(troi, blockIdx.y, x);
    __syncthreads();
    if (!roiCache[threadIdx.y]) return;
    // NOTE(review): `level` is written by the threads with threadIdx.x == 0 and read by the
    // other threads right below without a barrier in between - confirm this is safe.
    if (!threadIdx.x)
        level = levels[downscales + blockIdx.z];
    if(x >= level.workRect.x || y >= level.workRect.y) return;
    // Range of weak classifiers belonging to this level's octave.
    int st = level.octave * level.step;
    const int stEnd = st + level.step;
    // Row distance between channel planes in thogluv.
    const int hogluvStep = gridDim.y * Policy::STA_Y;
    float confidence = 0.f;
    for(; st < stEnd; st += Policy::WARP)
    {
        // Each weak classifier owns three consecutive nodes: the root, then its two children.
        const int nId = (st + threadIdx.x) * 3;
        Node node = nodes[nId];
        float threshold = rescale<isUp>(level, node);
        // The top 4 bits of node.threshold select the channel plane.
        int sum = get<isUp>(x, y + (node.threshold >> 28) * hogluvStep, node.rect);
        // Pick child 1 or 2 depending on the root test.
        int next = 1 + (int)(sum >= threshold);
        node = nodes[nId + next];
        threshold = rescale<isUp>(level, node);
        sum = get<isUp>(x, y + (node.threshold >> 28) * hogluvStep, node.rect);
        // Four leaves per weak classifier, addressed by the two test outcomes.
        const int lShift = (next - 1) * 2 + (int)(sum >= threshold);
        float impact = leaves[(st + threadIdx.x) * 4 + lShift];
        // Running total across the classifiers evaluated in this step.
        PrefixSum<Policy>::apply(impact);
        confidence += impact;
        // If any thread's trace is at or below its rejection threshold, jump far ahead so
        // the loop ends and the `st == stEnd` test below fails (sample rejected).
        if(__any((confidence <= stages[(st + threadIdx.x)]))) st += 2048;
    }
    // One thread per row reports a detection that passed all classifiers with a
    // confidence of at least FLT_EPSILON.
    if(!threadIdx.x && st == stEnd &&  ((confidence - FLT_EPSILON) >= 0))
    {
        // atomicInc wraps the counter to 0 after it reaches ndetections.
        int idx = atomicInc(ctr, ndetections);
        objects[idx] = Detection(__float2int_rn(x * Policy::SHRINKAGE),
            __float2int_rn(y * Policy::SHRINKAGE), level.objSize.x, level.objSize.y, confidence);
    }
 }
 // Kernel entry point: forwards to CascadeInvoker<Policy>::detect<isUp>.
 // The invoker is passed by value, so it is copied into kernel parameter space.
 //   objects - output array of detections
 //   n       - wrap limit for the output counter
 //   ctr     - output counter
 //   downs   - index of the first level handled by this launch
 template<typename Policy, bool isUp>
 __global__ void soft_cascade(const CascadeInvoker<Policy> invoker, Detection* objects, const uint n, uint* ctr, const int downs)
 {
    invoker.template detect<isUp>(objects, n, ctr, downs);
 }
 // Host launcher of the soft cascade.
 //   roi        - ROI mask bound to the troi texture. detect() samples it with the
 //                coordinates (blockIdx.y, blockIdx.x), so it appears to be stored
 //                transposed relative to the frame - confirm against genRoi.
 //   hogluv     - integer channel data bound to the thogluv texture
 //   objects    - output buffer: the counter occupies the first Detection-sized slot,
 //                detections start at the second slot
 //   downscales - number of levels evaluated with the <false> kernel; the remaining
 //                (scales - downscales) levels use the <true> kernel
 //   stream     - CUDA stream for the launches; the call blocks only for the null stream
 template<typename Policy>
 void CascadeInvoker<Policy>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
    PtrStepSz<uchar4> objects, const int downscales, const cudaStream_t& stream) const
 {
    int fw = roi.rows;
    int fh = roi.cols;
    uint* ctr = (uint*)(objects.ptr(0));
    Detection* det = ((Detection*)objects.ptr(0)) + 1;
    uint max_det = objects.cols / sizeof(Detection);
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
    cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
    cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<typename Policy::roi_type>();
    cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / Policy::STA_Y, roi.rows, roi.step));
    const CascadeInvoker<Policy> inv = *this;
    // A grid with a zero z-dimension is an invalid launch configuration, so a scale
    // group without levels is skipped instead of being launched.
    if (downscales > 0)
    {
        dim3 grid(fw, fh / Policy::STA_Y, downscales);
        soft_cascade<Policy, false><<<grid, Policy::block(), 0, stream>>>(inv, det, max_det, ctr, 0);
        cudaSafeCall( cudaGetLastError());
    }
    const int upscales = scales - downscales;
    if (upscales > 0)
    {
        dim3 grid(fw, fh / Policy::STA_Y, upscales);
        soft_cascade<Policy, true><<<grid, Policy::block(), 0, stream>>>(inv, det, max_det, ctr, downscales);
        // Checked for asynchronous launches too, like the first launch above.
        cudaSafeCall( cudaGetLastError());
    }
    if (!stream)
        cudaSafeCall( cudaDeviceSynchronize());
 }
 template void CascadeInvoker<GK107PolicyX4>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
    PtrStepSz<uchar4> objects, const int downscales, const cudaStream_t& stream) const;
 }
 }}}
--- a/modules/gpu/src/cuda/integral_image.cu
+++ b/modules/gpu/src/cuda/integral_image.cu
@ -383,6 +383,88 @@ namespace cv { namespace gpu { namespace device
            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
        // Vertical pass of the integral image: accumulates the values of `buffer` down
        // each column and stores the running sums into `integral`, shifted by one row and
        // one column (element (y, x) of buffer contributes to integral(y + 1, x + 1)).
        // The body is compiled only for __CUDA_ARCH__ >= 300 because it relies on
        // __shfl_up; on older targets the kernel is empty.
        // The index arithmetic below (x / 8, y * 4, 8 lanes per column) matches the
        // 32x8 thread block this file launches it with.
        __global__ void shfl_integral_vertical(PtrStepSz<unsigned int> buffer, PtrStepSz<unsigned int> integral)
        {
        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
            // one extra element per row (9 instead of 8)
            // NOTE(review): presumably padding against shared-memory bank conflicts - confirm
            __shared__ unsigned int sums[32][9];
            const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
            const int lane_id = tidx % 8;
            // NOTE(review): threads past the last column leave before the barriers below;
            // verify this is safe when integral.cols is not a multiple of blockDim.x.
            if (tidx >= integral.cols)
                return;
            sums[threadIdx.x][threadIdx.y] = 0;
            __syncthreads();
            // running total of this thread's column over all batches processed so far
            unsigned int stepSum = 0;
            // each iteration handles a batch of blockDim.y consecutive rows
            for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
            {
                unsigned int* p = buffer.ptr(y) + tidx;
                unsigned int* dst = integral.ptr(y + 1) + tidx + 1;
                unsigned int sum = *p;
                // stage the value in shared memory as sums[column][row within batch]
                sums[threadIdx.x][threadIdx.y] = sum;
                __syncthreads();
                // Re-map the threads so that 8 neighbouring lanes hold the 8 rows of one
                // column (k = column, j = row within the batch), scan those 8 values with
                // warp shuffles, and write the inclusive prefix sums back in place.
                const int j = threadIdx.x % 8;
                const int k = threadIdx.x / 8 + threadIdx.y * 4;
                int partial_sum = sums[k][j];
                for (int i = 1; i <= 8; i *= 2)
                {
                    int n = __shfl_up(partial_sum, i, 32);
                    // only take values that come from the same group of 8 lanes
                    if (lane_id >= i)
                        partial_sum += n;
                }
                sums[k][j] = partial_sum;
                __syncthreads();
                // add the prefix of the rows above within this batch ...
                if (threadIdx.y > 0)
                    sum += sums[threadIdx.x][threadIdx.y - 1];
                // ... and the total carried over from the previous batches
                sum += stepSum;
                stepSum += sums[threadIdx.x][blockDim.y - 1];
                __syncthreads();
                *dst = sum;
            }
        #endif
        }
        // Buffered integral image used for frame preprocessing before Soft Cascade
        // evaluation. Both passes are only enqueued on `stream`; no synchronization is
        // performed here.
        //   img       - source image, reinterpreted as uint4 elements for the row pass
        //   buffer    - intermediate storage written by the row pass
        //   integral  - destination of the column pass
        //   blockStep - number of threads per block for the row pass
        void shfl_integral_gpu_buffered(PtrStepSzb img, PtrStepSz<uint4> buffer, PtrStepSz<unsigned int> integral,
            int blockStep, cudaStream_t stream)
        {
            // pass 1: one block per image row
            const int rowThreads = blockStep;
            const int rowBlocks = img.rows;
            cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
            shfl_integral_horizontal<<<rowBlocks, rowThreads, 0, stream>>>((PtrStepSz<uint4>) img, buffer);
            cudaSafeCall( cudaGetLastError() );

            // pass 2: 32x8 blocks covering the columns of the result
            const dim3 colThreads(32, 8);
            const dim3 colBlocks(divUp(integral.cols, colThreads.x), 1);
            shfl_integral_vertical<<<colBlocks, colThreads, 0, stream>>>((PtrStepSz<uint>)buffer, integral);
            cudaSafeCall( cudaGetLastError() );
        }
    }
 }}}
--- a/modules/gpu/src/cuda/texture_binder.hpp
+++ b/modules/gpu/src/cuda/texture_binder.hpp
@ -85,7 +85,7 @@ namespace cv
  namespace device
  {
-      using pcl::gpu::TextureBinder;
+      using cv::gpu::TextureBinder;
  }
 }
--- a/modules/gpu/src/gpu_init.cpp
+++ b/modules/gpu/src/gpu_init.cpp
@ -0,0 +1,60 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 // By downloading, copying, installing or using the software you agree to this license.
 // If you do not agree to this license, do not download, install,
 // copy or use the software.
 //
 //
 // License Agreement
 // For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 // * Redistribution's of source code must retain the above copyright notice,
 // this list of conditions and the following disclaimer.
 //
 // * Redistribution's in binary form must reproduce the above copyright notice,
 // this list of conditions and the following disclaimer in the documentation
 // and/or other materials provided with the distribution.
 //
 // * The name of the copyright holders may not be used to endorse or promote products
 // derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #include <precomp.hpp>
 namespace cv { namespace gpu
 {
 // Registers SCascade under the name "CascadeDetector.SCascade" and exposes its
 // minScale, maxScale, scales and rejCriteria members as named parameters.
 CV_INIT_ALGORITHM(SCascade, "CascadeDetector.SCascade",
                  obj.info()->addParam(obj, "minScale",    obj.minScale);
                  obj.info()->addParam(obj, "maxScale",    obj.maxScale);
                  obj.info()->addParam(obj, "scales",      obj.scales);
                  obj.info()->addParam(obj, "rejCriteria", obj.rejCriteria));
 bool initModule_gpu(void)
 {
    Ptr<Algorithm> sc = createSCascade();
    return sc->info() != 0;
 }
 } }
--- a/modules/gpu/src/icf.hpp
+++ b/modules/gpu/src/icf.hpp
@ -0,0 +1,153 @@
 //M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other materials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M
 #ifndef __OPENCV_ICF_HPP__
 #define __OPENCV_ICF_HPP__
 #include <opencv2/gpu/device/common.hpp>
 #if defined __CUDACC__
 # define __device __device__ __forceinline__
 #else
 # define __device
 #endif
 namespace cv { namespace gpu { namespace device {
 namespace icf {
 // Description of one cascade octave. The layout is fixed by the member order and
 // the 16-byte alignment; do not reorder the fields.
 struct __align__(16) Octave
 {
    ushort index;      // ordinal of the octave
    ushort stages;     // number of stages in the octave
    ushort shrinkage;  // shrinking factor of the octave
    ushort2 size;      // object size for this octave
    float scale;       // scale of the octave
    Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc)
    : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {}
 };
 // One scale level evaluated by the detector; references an octave by index.
 struct __align__(8) Level // is actually 24 bytes
 {
    int octave;        // index of the octave used for this level
    int step;
    float relScale;    // level scale relative to the scale of its octave
    float scaling[2]; // calculated according to Dollar's paper (see the constructor)
    // stored as bytes: for 640x480 frames the values cannot overflow
    uchar2 workRect;
    uchar2 objSize;
    Level(int idx, const Octave& oct, const float scale, const int w, const int h);
    // device-side default constructor, leaves every field uninitialized
    __device Level(){}
 };
 // Internal node of a weak classifier: a feature rectangle plus a packed
 // channel/threshold word.
 struct __align__(8) Node
 {
    uchar4 rect;
    // ushort channel;
    // packed word: channel index in the bits above bit 27, threshold in the low 28 bits
    uint threshold;
    // mask selecting the threshold part of `threshold`
    enum { THRESHOLD_MASK = 0x0FFFFFFF };
    // NOTE(review): t is added unmasked, so a threshold wider than 28 bits would spill
    // into the channel bits.
    Node(const uchar4 r, const uint ch, const uint t) : rect(r), threshold(t + (ch << 28)) {}
 };
 // A single detection record as written by the cascade kernel.
 struct __align__(16) Detection
 {
    ushort x;          // rectangle origin
    ushort y;
    ushort w;          // rectangle size
    ushort h;
    float confidence;
    int kind;
    // leaves every field uninitialized
    Detection(){}
    // Width and height are taken as uchar, so they are limited to 255; kind is set to 0.
    __device Detection(int _x, int _y, uchar _w, uchar _h, float c)
    : x(_x), y(_y), w(_w), h(_h), confidence(c), kind(0) {};
 };
 // Compile-time launch policy: thread block shape (STA_X x STA_Y), shrinkage
 // factor and the element type used to read the ROI mask.
 struct GK107PolicyX4
 {
    enum
    {
        WARP      = 32,
        STA_X     = WARP,
        STA_Y     = 8,
        SHRINKAGE = 4
    };
    typedef float2 roi_type;
    // thread block dimensions used for the cascade kernels
    static const dim3 block()
    {
        const dim3 threads(STA_X, STA_Y);
        return threads;
    }
 };
 // Bundle of raw pointers to the cascade tables plus the number of levels.
 // It is passed by value to the kernels, so it only holds pointers and an int;
 // it does not own the memory it points to.
 template<typename Policy>
 struct CascadeInvoker
 {
    // empty invoker: all pointers null, zero levels
    CascadeInvoker(): levels(0), stages(0), nodes(0), leaves(0), scales(0) {}
    // Takes the first row of each table; `_levels` is a byte buffer, so the number
    // of levels is its width in bytes divided by sizeof(Level).
    CascadeInvoker(const PtrStepSzb& _levels, const PtrStepSzf& _stages,
                   const PtrStepSzb& _nodes,  const PtrStepSzf& _leaves)
    : levels((const Level*)_levels.ptr()),
      stages((const float*)_stages.ptr()),
      nodes((const Node*)_nodes.ptr()), leaves((const float*)_leaves.ptr()),
      scales(_levels.cols / sizeof(Level))
    {}
    const Level*  levels;
    const float*  stages;
    const Node*   nodes;
    const float*  leaves;
    // number of entries in `levels`
    int scales;
    // host-side launcher; `downscales` is the number of leading levels handled by
    // the first kernel launch
    void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
        const int downscales, const cudaStream_t& stream = 0) const;
    // device-side body of the cascade evaluation
    template<bool isUp>
    __device void detect(Detection* objects, const uint ndetections, uint* ctr, const int downscales) const;
 };
 }
 }}}
 #endif
--- a/modules/gpu/src/nvidia/core/NCV.hpp
+++ b/modules/gpu/src/nvidia/core/NCV.hpp
@ -288,7 +288,7 @@ NCV_EXPORTS void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func);
    do \
    { \
        cudaError_t res = cudacall; \
-        ncvAssertPrintReturn(cudaSuccess==res, "cudaError_t=" << res, errCode); \
+        ncvAssertPrintReturn(cudaSuccess==res, "cudaError_t=" << (int)res, errCode); \
    } while (0)
--- a/modules/gpu/src/softcascade.cpp
+++ b/modules/gpu/src/softcascade.cpp
@ -0,0 +1,603 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other materials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #include <precomp.hpp>
 #include <opencv2/highgui/highgui.hpp>
 #if !defined (HAVE_CUDA)
 // Stubs used when the library is built without CUDA: every entry point except
 // read() calls throw_nogpu(); read() only forwards to Algorithm::read().
 cv::gpu::SCascade::SCascade(const double, const double, const int, const int) { throw_nogpu(); }
 cv::gpu::SCascade::~SCascade() { throw_nogpu(); }
 bool cv::gpu::SCascade::load(const FileNode&) { throw_nogpu(); return false;}
 void cv::gpu::SCascade::detect(InputArray, InputArray, OutputArray, Stream&) const { throw_nogpu(); }
 void cv::gpu::SCascade::genRoi(InputArray, OutputArray, Stream&) const { throw_nogpu(); }
 void cv::gpu::SCascade::read(const FileNode& fn) { Algorithm::read(fn); }
 #else
 #include <icf.hpp>
 // Builds the level for `scale` on top of octave `oct` (index `idx`).
 // `w` and `h` are divided by the octave shrinkage to obtain the work rectangle;
 // the object size is the octave size scaled by relScale, saturated to uchar.
 cv::gpu::device::icf::Level::Level(int idx, const Octave& oct, const float scale, const int w, const int h)
 :  octave(idx),
    step(oct.stages),
    relScale(scale / oct.scale)
 {
    workRect.x = round(w / (float)oct.shrinkage);
    workRect.y = round(h / (float)oct.shrinkage);

    objSize.x  = cv::saturate_cast<uchar>(oct.size.x * relScale);
    objSize.y  = cv::saturate_cast<uchar>(oct.size.y * relScale);

    // feature rescaling factors, following R. Benenson, M. Mathias, R. Timofte and
    // L. Van Gool's and Dollar's papers
    const bool sameScale = fabs(relScale - 1.f) < FLT_EPSILON;
    if (sameScale)
    {
        scaling[1] = 1.f;
        scaling[0] = 1.f;
    }
    else
    {
        if (relScale < 1.f)
            scaling[0] = 0.89f * ::pow(relScale, 1.099f / ::log(2));
        else
            scaling[0] = 1.f;
        scaling[1] = relScale * relScale;
    }
 }
 // Forward declarations of the device-side entry points used by this file.
 namespace cv { namespace gpu { namespace device {
 namespace icf {
    // fills the orientation bins of `hogluv` from the normalized angle plane
    void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
        const int fw, const int fh, const int bins, cudaStream_t stream);
    // suppression of overlapping detections; writes the result to `suppressed`
    void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections,
        PtrStepSzb suppressed, cudaStream_t stream);
 }
 namespace imgproc {
    // buffered integral image (see integral_image.cu)
    void shfl_integral_gpu_buffered(PtrStepSzb, PtrStepSz<uint4>, PtrStepSz<unsigned int>, int, cudaStream_t);
    template <typename T>
    void resize_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy,
                    PtrStepSzb dst, int interpolation, cudaStream_t stream);
 }
 }}}
 struct cv::gpu::SCascade::Fields
 {
    // Parses a trained soft cascade from `root` and builds the Fields object for it.
    //   mins, maxs, totals - scale range and number of scales, forwarded to Fields.
    // Returns 0 when the octaves, an octave's features or an octave's stages are
    // missing; throws through CV_Assert when the stage/feature type is unsupported
    // or the cascade contains no data. The caller owns the returned object.
    static Fields* parseCascade(const FileNode &root, const float mins, const float maxs, const int totals)
    {
        static const char *const SC_STAGE_TYPE          = "stageType";
        static const char *const SC_BOOST               = "BOOST";
        static const char *const SC_FEATURE_TYPE        = "featureType";
        static const char *const SC_ICF                 = "ICF";
        // only Ada Boost supported
        std::string stageTypeStr = (string)root[SC_STAGE_TYPE];
        CV_Assert(stageTypeStr == SC_BOOST);
        // only HOG-like integral channel features supported
        string featureTypeStr = (string)root[SC_FEATURE_TYPE];
        CV_Assert(featureTypeStr == SC_ICF);
        static const char *const SC_ORIG_W              = "width";
        static const char *const SC_ORIG_H              = "height";
        int origWidth  = (int)root[SC_ORIG_W];
        int origHeight = (int)root[SC_ORIG_H];
        static const char *const SC_OCTAVES             = "octaves";
        static const char *const SC_STAGES              = "stages";
        static const char *const SC_FEATURES            = "features";
        static const char *const SC_WEEK                = "weakClassifiers";
        static const char *const SC_INTERNAL            = "internalNodes";
        static const char *const SC_LEAF                = "leafValues";
        static const char *const SC_OCT_SCALE           = "scale";
        static const char *const SC_OCT_STAGES          = "stageNum";
        static const char *const SC_OCT_SHRINKAGE       = "shrinkingFactor";
        static const char *const SC_STAGE_THRESHOLD     = "stageThreshold";
        static const char * const SC_F_CHANNEL          = "channel";
        static const char * const SC_F_RECT             = "rect";
        FileNode fn = root[SC_OCTAVES];
        if (fn.empty()) return 0;
        using namespace device::icf;
        std::vector<Octave>  voctaves;
        std::vector<float>   vstages;
        std::vector<Node>    vnodes;
        std::vector<float>   vleaves;
        FileNodeIterator it = fn.begin(), it_end = fn.end();
        ushort octIndex = 0;
        ushort shrinkage = 1;
        for (; it != it_end; ++it)
        {
            FileNode fns = *it;
            float scale = (float)fns[SC_OCT_SCALE];
            bool isUPOctave = scale >= 1;
            ushort nstages = saturate_cast<ushort>((int)fns[SC_OCT_STAGES]);
            ushort2 size;
            size.x = cvRound(origWidth * scale);
            size.y = cvRound(origHeight * scale);
            shrinkage = saturate_cast<ushort>((int)fns[SC_OCT_SHRINKAGE]);
            Octave octave(octIndex, nstages, shrinkage, size, scale);
            CV_Assert(octave.stages > 0);
            voctaves.push_back(octave);
            FileNode ffs = fns[SC_FEATURES];
            if (ffs.empty()) return 0;
            FileNodeIterator ftrs = ffs.begin();
            fns = fns[SC_STAGES];
            // test the stages node of this octave (the octave list itself was
            // already checked above)
            if (fns.empty()) return 0;
            // for each stage (~ decision tree with H = 2)
            FileNodeIterator st = fns.begin(), st_end = fns.end();
            for (; st != st_end; ++st )
            {
                fns = *st;
                vstages.push_back((float)fns[SC_STAGE_THRESHOLD]);
                fns = fns[SC_WEEK];
                FileNodeIterator ftr = fns.begin(), ft_end = fns.end();
                for (; ftr != ft_end; ++ftr)
                {
                    fns = (*ftr)[SC_INTERNAL];
                    FileNodeIterator inIt = fns.begin(), inIt_end = fns.end();
                    for (; inIt != inIt_end;)
                    {
                        // each internal node is stored as four values; the first
                        // three are skipped and the fourth is the threshold
                        inIt +=3;
                        // extract feature, Todo:check it
                        uint th = saturate_cast<uint>((float)(*(inIt++)));
                        cv::FileNode ftn = (*ftrs)[SC_F_RECT];
                        cv::FileNodeIterator r_it = ftn.begin();
                        uchar4 rect;
                        rect.x = saturate_cast<uchar>((int)*(r_it++));
                        rect.y = saturate_cast<uchar>((int)*(r_it++));
                        rect.z = saturate_cast<uchar>((int)*(r_it++));
                        rect.w = saturate_cast<uchar>((int)*(r_it++));
                        // for octaves with scale >= 1 the last two components are
                        // stored relative to the first two
                        if (isUPOctave)
                        {
                            rect.z -= rect.x;
                            rect.w -= rect.y;
                        }
                        uint channel = saturate_cast<uint>((int)(*ftrs)[SC_F_CHANNEL]);
                        vnodes.push_back(Node(rect, channel, th));
                        // features are consumed in the same order as the nodes
                        ++ftrs;
                    }
                    fns = (*ftr)[SC_LEAF];
                    inIt = fns.begin(), inIt_end = fns.end();
                    for (; inIt != inIt_end; ++inIt)
                        vleaves.push_back((float)(*inIt));
                }
            }
            ++octIndex;
        }
        // validate before taking &v[0]: indexing an empty vector is undefined
        CV_Assert(!voctaves.empty());
        CV_Assert(!vstages.empty());
        CV_Assert(!vnodes.empty());
        CV_Assert(!vleaves.empty());
        // host-side headers over the vectors; they are only used for the uploads
        // performed by the Fields constructor
        cv::Mat hoctaves(1, voctaves.size() * sizeof(Octave), CV_8UC1, (uchar*)&(voctaves[0]));
        cv::Mat hstages(cv::Mat(vstages).reshape(1,1));
        cv::Mat hnodes(1, vnodes.size() * sizeof(Node), CV_8UC1, (uchar*)&(vnodes[0]) );
        cv::Mat hleaves(cv::Mat(vleaves).reshape(1,1));
        Fields* fields = new Fields(mins, maxs, totals, origWidth, origHeight, shrinkage, 0,
            hoctaves, hstages, hnodes, hleaves);
        fields->voctaves = voctaves;
        fields->createLevels(FRAME_HEIGHT, FRAME_WIDTH);
        return fields;
    }
    // Stores the requested scale range and number of scales in the fields and
    // returns the flag callers use to decide whether the levels must be rebuilt.
    bool check(float mins,float  maxs, int scales)
    {
        // NOTE(review): the predicate is kept equivalent to the historical one, which
        // compared with == and assigned `totals` inside the condition, i.e. it is true
        // whenever scales != 0. Tightening it to "any value changed" is only safe
        // together with a review of the callers' rebuild condition.
        const bool updated = (minScale == mins) || (maxScale == maxs) || (scales != 0);
        minScale = mins;
        maxScale = maxs; // used to be a self-assignment, so maxScale was never updated
        totals   = scales;
        return updated;
    }
    int createLevels(const int fh, const int fw)
    {
        using namespace device::icf;
        std::vector<Level> vlevels;
        float logFactor = (::log(maxScale) - ::log(minScale)) / (totals -1);
        float scale = minScale;
        int dcs = 0;
        for (int sc = 0; sc < totals; ++sc)
        {
            int width  = ::std::max(0.0f, fw - (origObjWidth  * scale));
            int height = ::std::max(0.0f, fh - (origObjHeight * scale));
            float logScale = ::log(scale);
            int fit = fitOctave(voctaves, logScale);
            Level level(fit, voctaves[fit], scale, width, height);
            if (!width || !height)
                break;
            else
            {
                vlevels.push_back(level);
                if (voctaves[fit].scale < 1) ++dcs;
            }
            if (::fabs(scale - maxScale) < FLT_EPSILON) break;
            scale = ::std::min(maxScale, ::expf(::log(scale) + logFactor));
        }
        cv::Mat hlevels = cv::Mat(1, vlevels.size() * sizeof(Level), CV_8UC1, (uchar*)&(vlevels[0]) );
        CV_Assert(!hlevels.empty());
        levels.upload(hlevels);
        downscales = dcs;
        return dcs;
    }
    // (Re)allocates the working buffers for a frame of fh x fw pixels shrunk by `shr`.
    // Returns false without touching anything when `luv` already has that size,
    // true after the buffers have been recreated.
    bool update(int fh, int fw, int shr)
    {
        const bool sameSize = (fh == luv.rows) && (fw == luv.cols);
        if (sameSize)
            return false;

        // full-resolution planes
        plane.create(fh * (HOG_LUV_BINS + 1), fw, CV_8UC1);
        fplane.create(fh * HOG_BINS, fw, CV_32FC1);
        luv.create(fh, fw, CV_8UC3);

        // shrunk channels and their integral (one extra row and column)
        const int shrunkRows = fh / shr * HOG_LUV_BINS;
        const int shrunkCols = fw / shr;
        shrunk.create(shrunkRows, shrunkCols, CV_8UC1);
        integralBuffer.create(shrunk.rows, shrunk.cols, CV_32SC1);
        hogluv.create(shrunkRows + 1, shrunkCols + 1, CV_32SC1);
        hogluv.setTo(cv::Scalar::all(0));

        // suppression buffers
        overlaps.create(1, 5000, CV_8UC1);
        suppressed.create(1, sizeof(Detection) * 51, CV_8UC1);
        return true;
    }
    // Stores the scale range and object geometry, allocates the working buffers for
    // the default FRAME_HEIGHT x FRAME_WIDTH frame and uploads the cascade tables.
    Fields( const float mins, const float maxs, const int tts, const int ow, const int oh, const int shr, const int ds,
        cv::Mat hoctaves, cv::Mat hstages, cv::Mat hnodes, cv::Mat hleaves)
    : minScale(mins),
      maxScale(maxs),
      totals(tts),
      origObjWidth(ow),
      origObjHeight(oh),
      shrinkage(shr),
      downscales(ds)
    {
        update(FRAME_HEIGHT, FRAME_WIDTH, shr);

        // cascade tables: host -> device
        octaves.upload(hoctaves);
        stages.upload(hstages);
        nodes.upload(hnodes);
        leaves.upload(hleaves);
    }
    // Clears the detection buffer (the whole buffer on a stream, only the leading
    // Detection-sized slot otherwise) and runs the cascade over `roi`.
    void detect(const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, Stream& s) const
    {
        typedef device::icf::CascadeInvoker<device::icf::GK107PolicyX4> Invoker;

        if (s)
            s.enqueueMemSet(objects, 0);
        else
            cudaMemset(objects.data, 0, sizeof(Detection));
        cudaSafeCall( cudaGetLastError());

        Invoker invoker = Invoker(levels, stages, nodes, leaves);
        cudaStream_t stream = StreamAccessor::getStream(s);
        invoker(roi, hogluv, objects, downscales, stream);
    }
    // Computes the channel planes for `colored` and integrates them into `hogluv`:
    // clears `plane`, converts the frame to gray into the slot that follows the
    // HOG_LUV_BINS channel slots, fills the HOG and LUV channels, then integrates.
    void preprocess(const cv::gpu::GpuMat& colored, Stream& s)
    {
        if (s)
            s.enqueueMemSet(plane, 0);
        else
            // report a failed clear instead of silently working on stale data
            cudaSafeCall( cudaMemset(plane.data, 0, plane.step * plane.rows) );
        const int fw = colored.cols;
        const int fh = colored.rows;
        GpuMat gray(plane, cv::Rect(0, fh * Fields::HOG_LUV_BINS, fw, fh));
        cv::gpu::cvtColor(colored, gray, CV_BGR2GRAY, s);
        createHogBins(gray ,s);
        createLuvBins(colored, s);
        integrate(fh, fw, s);
    }
    // Runs the suppression kernel over `objects`; the result is written to the
    // `suppressed` member. `overlaps` is grown to the size of `objects` if needed and
    // both scratch buffers are zeroed first.
    void suppress(GpuMat& objects, Stream& s)
    {
        // leading Detection-sized slot of the buffer, passed on as `ndetections`
        GpuMat counterSlot = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1));
        ensureSizeIsEnough(objects.rows, objects.cols, CV_8UC1, overlaps);

        if (s)
        {
            s.enqueueMemSet(overlaps, 0);
            s.enqueueMemSet(suppressed, 0);
        }
        else
        {
            overlaps.setTo(0);
            suppressed.setTo(0);
        }

        cudaStream_t stream = StreamAccessor::getStream(s);
        device::icf::suppress(objects, overlaps, counterSlot, suppressed, stream);
    }
 private:
    typedef std::vector<device::icf::Octave>::const_iterator  octIt_t;
    // Returns the index of the octave whose log-scale is closest to `logFactor`
    // (the first one wins on ties; 0 for an empty list).
    static int fitOctave(const std::vector<device::icf::Octave>& octs, const float& logFactor)
    {
        int best = 0;
        float bestDistance = FLT_MAX;
        const int count = (int)octs.size();
        for (int i = 0; i < count; ++i)
        {
            float octaveLog = ::log(octs[i].scale);
            float distance = ::fabs(logFactor - octaveLog);
            if (distance < bestDistance)
            {
                bestDistance = distance;
                best = i;
            }
        }
        return best;
    }
    // Fills the gradient-orientation channels and the magnitude channel of `plane`
    // from the gray image. `fplane` is used as scratch space, one frame-sized slot
    // per intermediate (dfdx, dfdy, magnitude, angle, normalized magnitude,
    // normalized angle).
    void createHogBins(const cv::gpu::GpuMat& gray, Stream& s)
    {
        // must be re-read on every call: as function-level statics these were
        // frozen at the size of the very first frame
        const int fw = gray.cols;
        const int fh = gray.rows;
        GpuMat dfdx(fplane, cv::Rect(0,  0, fw, fh));
        GpuMat dfdy(fplane, cv::Rect(0, fh, fw, fh));
        cv::gpu::Sobel(gray, dfdx, CV_32F, 1, 0, sobelBuf, 3, 1, BORDER_DEFAULT, -1, s);
        cv::gpu::Sobel(gray, dfdy, CV_32F, 0, 1, sobelBuf, 3, 1, BORDER_DEFAULT, -1, s);
        GpuMat mag(fplane, cv::Rect(0, 2 * fh, fw, fh));
        GpuMat ang(fplane, cv::Rect(0, 3 * fh, fw, fh));
        cv::gpu::cartToPolar(dfdx, dfdy, mag, ang, true, s);
        // normalize magnitude to the uchar interval and angles to 6 bins
        GpuMat nmag(fplane, cv::Rect(0, 4 * fh, fw, fh));
        GpuMat nang(fplane, cv::Rect(0, 5 * fh, fw, fh));
        cv::gpu::multiply(mag, cv::Scalar::all(1.f / (8 *::log(2))), nmag, 1, -1, s);
        cv::gpu::multiply(ang, cv::Scalar::all(1.f / 60.f),     nang, 1, -1, s);
        // create uchar magnitude in the slot right after the orientation bins
        GpuMat cmag(plane, cv::Rect(0, fh * Fields::HOG_BINS, fw, fh));
        if (s)
            s.enqueueConvert(nmag, cmag, CV_8UC1);
        else
            nmag.convertTo(cmag, CV_8UC1);
        cudaStream_t stream = StreamAccessor::getStream(s);
        device::icf::fillBins(plane, nang, fw, fh, Fields::HOG_BINS, stream);
    }
    // Converts the frame to Luv and splits the three components into the channel
    // slots 7, 8 and 9 of `plane`.
    void createLuvBins(const cv::gpu::GpuMat& colored, Stream& s)
    {
        // must be re-read on every call: as function-level statics these were
        // frozen at the size of the very first frame
        const int fw = colored.cols;
        const int fh = colored.rows;
        cv::gpu::cvtColor(colored, luv, CV_BGR2Luv, s);
        std::vector<GpuMat> splited;
        for(int i = 0; i < Fields::LUV_BINS; ++i)
        {
            splited.push_back(GpuMat(plane, cv::Rect(0, fh * (7 + i), fw, fh)));
        }
        cv::gpu::split(luv, splited, s);
    }
    void integrate(const int fh, const int fw, Stream& s)
    {
        GpuMat channels(plane, cv::Rect(0, 0, fw, fh * Fields::HOG_LUV_BINS));
        cv::gpu::resize(channels, shrunk, cv::Size(), 1.f / shrinkage, 1.f / shrinkage, CV_INTER_AREA, s);
        if (info.majorVersion() < 3)
            cv::gpu::integralBuffered(shrunk, hogluv, integralBuffer, s);
        else
        {
            cudaStream_t stream = StreamAccessor::getStream(s);
            device::imgproc::shfl_integral_gpu_buffered(shrunk, integralBuffer, hogluv, 12, stream);
        }
    }
 public:
    // scales range
    float minScale;
    float maxScale;
    // requested number of scales
    int totals;
    // object size read from the cascade file
    int origObjWidth;
    int origObjHeight;
    const int shrinkage;
    // number of levels whose octave scale is below 1
    int downscales;
    // preallocated buffer 640x480x10 for hogluv + 640x480 for gray
    GpuMat plane;
    // preallocated buffer for floating point operations
    GpuMat fplane;
    // temporary mat for cvtColor
    GpuMat luv;
    // 160x120x10
    GpuMat shrunk;
    // temporary mat for the integral
    GpuMat integralBuffer;
    // 161x121x10
    GpuMat hogluv;
    // used for area overlap computing during suppression
    GpuMat overlaps;
    // used for suppression
    GpuMat suppressed;
    // Cascade from xml
    GpuMat octaves;
    GpuMat stages;
    GpuMat nodes;
    GpuMat leaves;
    GpuMat levels;
    // scratch buffer passed to cv::gpu::Sobel
    GpuMat sobelBuf;
    GpuMat collected;
    // host copy of the octaves, used when levels are rebuilt
    std::vector<device::icf::Octave> voctaves;
    DeviceInfo info;
    enum { BOOST = 0 };
    enum
    {
        // default frame size the buffers are preallocated for
        FRAME_WIDTH        = 640,
        FRAME_HEIGHT       = 480,
        HOG_BINS           = 6,
        LUV_BINS           = 3,
        HOG_LUV_BINS       = 10
    };
 };
 // Stores the detection parameters; no cascade is loaded yet (fields == 0).
 cv::gpu::SCascade::SCascade(const double mins, const double maxs, const int sc, const int rjf)
 : fields(0),
   minScale(mins),
   maxScale(maxs),
   scales(sc),
   rejCriteria(rjf)
 {
 }
 // Releases the loaded cascade, if any (deleting a null pointer is a no-op).
 cv::gpu::SCascade::~SCascade() { delete fields; }
 // Replaces the current cascade with the one parsed from `fn`.
 // Returns true when parsing produced a cascade. Exceptions thrown by the parser
 // propagate; in that case the detector is left without a cascade.
 bool cv::gpu::SCascade::load(const FileNode& fn)
 {
    // Reset the pointer before parsing: if parseCascade throws, the destructor
    // must not delete the old object a second time.
    delete fields;
    fields = 0;
    fields = Fields::parseCascade(fn, minScale, maxScale, scales);
    return fields != 0;
 }
 // Runs the detector.
 //   image    - either a CV_8UC3 frame, which is preprocessed here, or a CV_32SC1
 //              matrix that is copied into the internal hogluv buffer as is
 //   _rois    - mask passed on to the cascade kernel
 //   _objects - output detection buffer
 //   s        - stream the work is enqueued on
 // Requires a cascade to be loaded.
 void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _objects, Stream& s) const
 {
    CV_Assert(fields);
    const GpuMat colored = image.getGpuMat();
    // only color images (or ready-made CV_32SC1 data) are supported
    CV_Assert(colored.type() == CV_8UC3 || colored.type() == CV_32SC1);
    GpuMat rois = _rois.getGpuMat(), objects = _objects.getGpuMat();
    Fields& flds = *fields;
    if (colored.type() == CV_8UC3)
    {
        // Evaluate both unconditionally: update() reallocates the buffers for a new
        // frame size, check() pushes the current scale parameters into the fields.
        // The previous `!update(...) || check(...)` skipped check() whenever the
        // frame size was unchanged, so parameter changes never reached the fields.
        const bool resized  = flds.update(colored.rows, colored.cols, flds.shrinkage);
        const bool rescaled = flds.check(minScale, maxScale, scales);
        if (resized || rescaled)
            flds.createLevels(colored.rows, colored.cols);
        flds.preprocess(colored, s);
    }
    else
    {
        if (s)
            s.enqueueCopy(colored, flds.hogluv);
        else
            colored.copyTo(flds.hogluv);
    }
    flds.detect(rois, objects, s);
    if (rejCriteria != NO_REJECT)
    {
        GpuMat spr(objects, cv::Rect(0, 0, flds.suppressed.cols, flds.suppressed.rows));
        flds.suppress(objects, s);
        // copy the suppression result back on the same stream the kernel was
        // enqueued on, mirroring how the hogluv copy above is dispatched
        if (s)
            s.enqueueCopy(flds.suppressed, spr);
        else
            flds.suppressed.copyTo(spr);
    }
 }
 // Builds the detector mask from `_roi`: the input is shrunk by the cascade's
 // shrinkage factor and then transposed into `_mask` (hence the swapped
 // rows/cols in the allocation). Requires a cascade to be loaded.
 void cv::gpu::SCascade::genRoi(InputArray _roi, OutputArray _mask, Stream& stream) const
 {
    CV_Assert(fields);
    const int shr = fields->shrinkage;
    const GpuMat roi = _roi.getGpuMat();

    _mask.create( roi.cols / shr, roi.rows / shr, roi.type() );
    GpuMat mask = _mask.getGpuMat();

    cv::gpu::GpuMat shrunkRoi;
    cv::gpu::resize(roi, shrunkRoi, cv::Size(), 1.f / shr, 1.f / shr, CV_INTER_AREA, stream);
    cv::gpu::transpose(shrunkRoi, mask, stream);
 }
 // Reads the algorithm parameters from `fn`; simply forwards to Algorithm::read().
 void cv::gpu::SCascade::read(const FileNode& fn)
 {
    Algorithm::read(fn);
 }
 #endif
--- a/modules/gpu/test/test_softcascade.cpp
+++ b/modules/gpu/test/test_softcascade.cpp
@ -0,0 +1,332 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2008-2012, Willow Garage Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other materials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #include <test_precomp.hpp>
 #include <time.h>
 #ifdef HAVE_CUDA
 using cv::gpu::GpuMat;
 // Define SHOW_DETECTIONS to display detection results with cv::imshow.
 // #define SHOW_DETECTIONS
 #if defined SHOW_DETECTIONS
 // Displays the image passed as `res` in a window named after the argument's
 // spelling and waits for a key press. The macro shows its own argument rather
 // than relying on a variable called `result` in the caller's scope, and it
 // expands to a single statement so it is safe inside unbraced if/else.
 # define SHOW(res)               \
    do {                          \
        cv::imshow(#res, (res));  \
        cv::waitKey(0);           \
    } while (0)
 #else
 // Display disabled: expands to nothing, the argument is not evaluated.
 # define SHOW(res)
 #endif
 // Declares and instantiates a value-parameterized test in one go.
 // Expands to a class fixture_name derived from `fixture` with a protected
 // virtual body(), a TEST_P that simply calls body(), an
 // INSTANTIATE_TEST_CASE_P over `params`, and finally the opening of the
 // body() definition: the braces that follow the macro invocation become
 // the test body.
 #define GPU_TEST_P(fixture, name, params)                         \
    class fixture##_##name : public fixture {                     \
     public:                                                      \
      fixture##_##name() {}                                       \
     protected:                                                   \
      virtual void body();                                        \
    };                                                            \
    TEST_P(fixture##_##name, name /*none*/){ body();}             \
    INSTANTIATE_TEST_CASE_P(/*none*/, fixture##_##name, params);  \
    void fixture##_##name::body()
 // File-local helpers shared by the soft cascade tests.
 namespace {
    typedef cv::gpu::SCascade::Detection Detection;

    // Returns one of ten fixed regions of interest.
    // idx is not range-checked: callers must pass a value in [0, 9].
    static cv::Rect getFromTable(int idx)
    {
        static const cv::Rect rois[] =
        {
            cv::Rect( 65 * 4,  20 * 4,  35 * 4, 80 * 4),
            cv::Rect( 95 * 4,  35 * 4,  45 * 4, 40 * 4),
            cv::Rect( 45 * 4,  35 * 4,  45 * 4, 40 * 4),
            cv::Rect( 25 * 4,  27 * 4,  50 * 4, 45 * 4),
            cv::Rect(100 * 4,  50 * 4,  45 * 4, 40 * 4),
            cv::Rect( 60 * 4,  30 * 4,  45 * 4, 40 * 4),
            cv::Rect( 40 * 4,  55 * 4,  50 * 4, 40 * 4),
            cv::Rect( 48 * 4,  37 * 4,  72 * 4, 80 * 4),
            cv::Rect( 48 * 4,  32 * 4,  85 * 4, 58 * 4),
            cv::Rect( 48 * 4,   0 * 4,  32 * 4, 27 * 4)
        };
        return rois[idx];
    }

    // Returns the decimal representation of i.
    static std::string itoa(long i)
    {
        // A plain local buffer is enough (65 chars covers any long) and keeps
        // the helper free of shared static state, hence reentrant.
        char s[65];
        sprintf(s, "%ld", i);
        return std::string(s);
    }

    // Prints position, size and confidence of one detection.
    // Does nothing unless SHOW_DETECTIONS is defined.
    static void print(std::ostream &out, const Detection& d)
    {
 #if defined SHOW_DETECTIONS
        out << "\x1b[32m[ detection]\x1b[0m ("
            << std::setw(4)  << d.x
            << " "
            << std::setw(4)  << d.y
            << ") ("
            << std::setw(4)  << d.w
            << " "
            << std::setw(4)  << d.h
            << ") "
            << std::setw(12) << d.confidence
            <<  std::endl;
 #else
        (void)out; (void)d;
 #endif
    }

    // Prints the total number of detections, computed as detbytes / sizeof(Detection).
    // Does nothing unless SHOW_DETECTIONS is defined.
    // NOTE(review): the caller in this file passes the value it also uses as
    // the detection count, not a byte size - confirm which one is intended.
    static void printTotal(std::ostream &out, int detbytes)
    {
 #if defined SHOW_DETECTIONS
        out << "\x1b[32m[          ]\x1b[0m Total detections " << (detbytes / sizeof(Detection)) << std::endl;
 #else
        (void)out; (void)detbytes;
 #endif
    }

 #if defined SHOW_DETECTIONS
    // Builds a file name from the level and the current local time.
    static std::string getImageName(int level)
    {
        time_t rawtime;
        struct tm * timeinfo;
        char buffer [80];
        time ( &rawtime );
        timeinfo = localtime ( &rawtime );
        strftime (buffer,80,"%Y-%m-%d--%H-%M-%S",timeinfo);
        return "gpu_rec_level_" + itoa(level)+ "_" + std::string(buffer) + ".png";
    }

    // Writes the image to a temporary file and reports the path on stdout.
    static void writeResult(const cv::Mat& result, const int level)
    {
        std::string path = cv::tempfile(getImageName(level).c_str());
        cv::imwrite(path, result);
        std::cout << "\x1b[32m" << "[          ]" << std::endl << "[ stored in]"<< "\x1b[0m" << path << std::endl;
    }
 #endif
 }
 // Smoke test: runs the detector with 0..4 randomly picked table ROIs enabled
 // and walks the reported detections. No result values are asserted.
 typedef ::testing::TestWithParam<std::tr1::tuple<cv::gpu::DeviceInfo, std::string, std::string, int> > SCascadeTestRoi;
 GPU_TEST_P(SCascadeTestRoi, detect,
    testing::Combine(
        ALL_DEVICES,
        testing::Values(std::string("cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml")),
        testing::Values(std::string("../cv/cascadeandhog/bahnhof/image_00000000_0.png")),
        testing::Range(0, 5)))
 {
    cv::gpu::setDevice(GET_PARAM(0).deviceID());

    // input frame
    cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path() + GET_PARAM(2));
    ASSERT_FALSE(coloredCpu.empty());

    // cascade description
    cv::gpu::SCascade cascade;
    cv::FileStorage fs(perf::TestBase::getDataPath(GET_PARAM(1)), cv::FileStorage::READ);
    ASSERT_TRUE(fs.isOpened());
    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));

    GpuMat colored(coloredCpu);
    GpuMat objectBoxes(1, 16384, CV_8UC1);
    GpuMat rois(colored.size(), CV_8UC1);
    GpuMat trois;
    rois.setTo(0);

    const int roiCount = GET_PARAM(3);

    // shares pixels with coloredCpu; only used for visualisation
    cv::Mat result(coloredCpu);

    cv::RNG rng;
    for (int k = 0; k < roiCount; ++k)
    {
        const cv::Rect area = getFromTable(rng(10));

        GpuMat enabled(rois, area);
        enabled.setTo(1);

        cv::rectangle(result, area, cv::Scalar(0, 0, 255, 255), 1);
    }

    objectBoxes.setTo(0);
    cascade.genRoi(rois, trois);
    cascade.detect(colored, trois, objectBoxes);

    // download: the counter is read from the first int, detections start
    // one Detection-sized slot into the buffer
    cv::Mat dt(objectBoxes);
    typedef cv::gpu::SCascade::Detection Detection;
    const int total = *dt.ptr<int>(0);
    const Detection* first = ((Detection*)dt.data) + 1;

    printTotal(std::cout, total);

    for (int k = 0; k < total; ++k)
    {
        const Detection& d = first[k];
        print(std::cout, d);
        cv::rectangle(result, cv::Rect(d.x, d.y, d.w, d.h), cv::Scalar(255, 0, 0, 255), 1);
    }

    SHOW(result);
 }
 // Checks that the icf-template.xml cascade description opens and loads.
 TEST(SCascadeTest, readCascade)
 {
    const std::string xml = cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/icf-template.xml";

    cv::FileStorage fs(xml, cv::FileStorage::READ);
    ASSERT_TRUE(fs.isOpened());

    cv::gpu::SCascade cascade;
    const bool loaded = cascade.load(fs.getFirstTopLevelNode());
    ASSERT_TRUE(loaded);
 }
 typedef ::testing::TestWithParam<cv::gpu::DeviceInfo > SCascadeTestAll;
 // Runs the detector on the reference frame with only the central half of the
 // image enabled in the ROI mask and checks the value read back from the
 // first int of the output buffer against the expected 2460.
 GPU_TEST_P(SCascadeTestAll, detect,
        ALL_DEVICES
        )
 {
    cv::gpu::setDevice(GetParam().deviceID());
    std::string xml =  cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml";
    cv::gpu::SCascade cascade;
    cv::FileStorage fs(xml, cv::FileStorage::READ);
    ASSERT_TRUE(fs.isOpened());
    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));

    cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path()
        + "../cv/cascadeandhog/bahnhof/image_00000000_0.png");
    ASSERT_FALSE(coloredCpu.empty());

    GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois(colored.size(), CV_8UC1);

    // enable the centered rectangle covering half of each dimension
    rois.setTo(0);
    GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2));
    sub.setTo(cv::Scalar::all(1));

    cv::gpu::GpuMat trois;
    cascade.genRoi(rois, trois);

    // the output buffer is cleared before detection
    objectBoxes.setTo(0);
    cascade.detect(colored, trois, objectBoxes);

    cv::Mat detections(objectBoxes);
    int a = *(detections.ptr<int>(0));
    // expected value first, so a failure report labels the values correctly
    ASSERT_EQ(2460, a);
 }
 // Feeds the detector with channel data read from integrals.xml (CV_32SC1
 // input instead of a color frame) and checks the value read back from the
 // first int of the output buffer against the expected 1024.
 GPU_TEST_P(SCascadeTestAll, detectOnIntegral,
        ALL_DEVICES
        )
 {
    cv::gpu::setDevice(GetParam().deviceID());
    std::string xml =  cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml";
    cv::gpu::SCascade cascade;
    cv::FileStorage fs(xml, cv::FileStorage::READ);
    ASSERT_TRUE(fs.isOpened());
    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));

    std::string intPath = cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/integrals.xml";
    cv::FileStorage fsi(intPath, cv::FileStorage::READ);
    ASSERT_TRUE(fsi.isOpened());

    // ten channels of 161 x 121 elements each, stacked vertically in one matrix
    const int channelCount = 10;
    const int channelCols  = 161;
    const int channelRows  = 121;

    GpuMat hogluv(channelRows * channelCount, channelCols, CV_32SC1);
    for (int i = 0; i < channelCount; ++i)
    {
        cv::Mat channel;
        fsi[std::string("channel") + itoa(i)] >> channel;
        // fail with a clear message if the node is missing from the file
        ASSERT_FALSE(channel.empty());

        GpuMat gchannel(hogluv, cv::Rect(0, channelRows * i, channelCols, channelRows));
        gchannel.upload(channel);
    }

    // every pixel of a 640 x 480 ROI image is enabled
    GpuMat objectBoxes(1, 100000, CV_8UC1), rois(cv::Size(640, 480), CV_8UC1);
    rois.setTo(1);

    cv::gpu::GpuMat trois;
    cascade.genRoi(rois, trois);

    // the output buffer is cleared before detection
    objectBoxes.setTo(0);
    cascade.detect(hogluv, trois, objectBoxes);

    cv::Mat detections(objectBoxes);
    int a = *(detections.ptr<int>(0));
    // expected value first, so a failure report labels the values correctly
    ASSERT_EQ(1024, a);
 }
 // Same scenario as the plain detect test, but genRoi() and detect() are
 // given an explicit cv::gpu::Stream; the result is read after the stream
 // has completed and compared against the expected 2460.
 GPU_TEST_P(SCascadeTestAll, detectStream,
        ALL_DEVICES
        )
 {
    cv::gpu::setDevice(GetParam().deviceID());
    std::string xml =  cvtest::TS::ptr()->get_data_path() + "../cv/cascadeandhog/sc_cvpr_2012_to_opencv.xml";
    cv::gpu::SCascade cascade;
    cv::FileStorage fs(xml, cv::FileStorage::READ);
    ASSERT_TRUE(fs.isOpened());
    ASSERT_TRUE(cascade.load(fs.getFirstTopLevelNode()));

    cv::Mat coloredCpu = cv::imread(cvtest::TS::ptr()->get_data_path()
        + "../cv/cascadeandhog/bahnhof/image_00000000_0.png");
    ASSERT_FALSE(coloredCpu.empty());

    GpuMat colored(coloredCpu), objectBoxes(1, 100000, CV_8UC1), rois(colored.size(), CV_8UC1);

    // enable the centered rectangle covering half of each dimension
    rois.setTo(0);
    GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2));
    sub.setTo(cv::Scalar::all(1));

    cv::gpu::Stream s;

    cv::gpu::GpuMat trois;
    cascade.genRoi(rois, trois, s);

    // the output buffer is cleared before detection
    objectBoxes.setTo(0);
    cascade.detect(colored, trois, objectBoxes, s);

    // wait for the work queued on the stream through the Stream API rather
    // than a raw CUDA runtime call whose return status would go unchecked
    s.waitForCompletion();

    cv::Mat detections(objectBoxes);
    int a = *(detections.ptr<int>(0));
    // expected value first, so a failure report labels the values correctly
    ASSERT_EQ(2460, a);
 }
 #endif
--- a/samples/gpu/cascadeclassifier_nvidia_api.cpp
+++ b/samples/gpu/cascadeclassifier_nvidia_api.cpp
@ -30,7 +30,7 @@ const Size2i preferredVideoFrameSize(640, 480);
 const string wndTitle = "NVIDIA Computer Vision :: Haar Classifiers Cascade";
-void matPrint(Mat &img, int lineOffsY, Scalar fontColor, const string &ss)
+static void matPrint(Mat &img, int lineOffsY, Scalar fontColor, const string &ss)
 {
    int fontFace = FONT_HERSHEY_DUPLEX;
    double fontScale = 0.8;
@ -45,7 +45,7 @@ void matPrint(Mat &img, int lineOffsY, Scalar fontColor, const string &ss)
 }
-void displayState(Mat &canvas, bool bHelp, bool bGpu, bool bLargestFace, bool bFilter, double fps)
+static void displayState(Mat &canvas, bool bHelp, bool bGpu, bool bLargestFace, bool bFilter, double fps)
 {
    Scalar fontColorRed = CV_RGB(255,0,0);
    Scalar fontColorNV  = CV_RGB(118,185,0);
@ -74,7 +74,7 @@ void displayState(Mat &canvas, bool bHelp, bool bGpu, bool bLargestFace, bool bF
 }
-NCVStatus process(Mat *srcdst,
+static NCVStatus process(Mat *srcdst,
                  Ncv32u width, Ncv32u height,
                  NcvBool bFilterRects, NcvBool bLargestFace,
                  HaarClassifierCascadeDescriptor &haar,
@ -281,7 +281,7 @@ int main(int argc, const char** argv)
    //==============================================================================
    namedWindow(wndTitle, 1);
-    Mat gray, frameDisp;
+    Mat frameDisp;
    do
    {
--- a/samples/gpu/opticalflow_nvidia_api.cpp
+++ b/samples/gpu/opticalflow_nvidia_api.cpp
@ -59,7 +59,7 @@ public:
 class RgbToR
 {
 public:
-    float operator ()(unsigned char b, unsigned char g, unsigned char r)
+    float operator ()(unsigned char /*b*/, unsigned char /*g*/, unsigned char r)
    {
        return static_cast<float>(r)/255.0f;
    }
@ -69,7 +69,7 @@ public:
 class RgbToG
 {
 public:
-    float operator ()(unsigned char b, unsigned char g, unsigned char r)
+    float operator ()(unsigned char /*b*/, unsigned char g, unsigned char /*r*/)
    {
        return static_cast<float>(g)/255.0f;
    }
@ -78,7 +78,7 @@ public:
 class RgbToB
 {
 public:
-    float operator ()(unsigned char b, unsigned char g, unsigned char r)
+    float operator ()(unsigned char b, unsigned char /*g*/, unsigned char /*r*/)
    {
        return static_cast<float>(b)/255.0f;
    }
@ -135,7 +135,7 @@ NCVStatus CopyData(const IplImage *image, const NCVMatrixAlloc<Ncv32f> &dst)
    return NCV_SUCCESS;
 }
-NCVStatus LoadImages (const char *frame0Name,
+static NCVStatus LoadImages (const char *frame0Name,
                      const char *frame1Name,
                      int &width,
                      int &height,
@ -186,7 +186,7 @@ inline T MapValue (T x, T a, T b, T c, T d)
    return c + (d - c) * (x - a) / (b - a);
 }
-NCVStatus ShowFlow (NCVMatrixAlloc<Ncv32f> &u, NCVMatrixAlloc<Ncv32f> &v, const char *name)
+static NCVStatus ShowFlow (NCVMatrixAlloc<Ncv32f> &u, NCVMatrixAlloc<Ncv32f> &v, const char *name)
 {
    IplImage *flowField;
@ -246,7 +246,7 @@ NCVStatus ShowFlow (NCVMatrixAlloc<Ncv32f> &u, NCVMatrixAlloc<Ncv32f> &v, const
    return NCV_SUCCESS;
 }
-IplImage *CreateImage (NCVMatrixAlloc<Ncv32f> &h_r, NCVMatrixAlloc<Ncv32f> &h_g, NCVMatrixAlloc<Ncv32f> &h_b)
+static IplImage *CreateImage (NCVMatrixAlloc<Ncv32f> &h_r, NCVMatrixAlloc<Ncv32f> &h_g, NCVMatrixAlloc<Ncv32f> &h_b)
 {
    CvSize imageSize = cvSize (h_r.width (), h_r.height ());
    IplImage *image  = cvCreateImage (imageSize, IPL_DEPTH_8U, 4);
@ -270,7 +270,7 @@ IplImage *CreateImage (NCVMatrixAlloc<Ncv32f> &h_r, NCVMatrixAlloc<Ncv32f> &h_g,
    return image;
 }
-void PrintHelp ()
+static void PrintHelp ()
 {
    std::cout << "Usage help:\n";
    std::cout << std::setiosflags(std::ios::left);
@ -286,7 +286,7 @@ void PrintHelp ()
    std::cout << "\t" << std::setw(15) << PARAM_HELP << " - display this help message\n";
 }
-int ProcessCommandLine(int argc, char **argv,
+static int ProcessCommandLine(int argc, char **argv,
                       Ncv32f &timeStep,
                       char *&frame0Name,
                       char *&frame1Name,
--- a/samples/gpu/softcascade.cpp
+++ b/samples/gpu/softcascade.cpp
@ -0,0 +1,106 @@
 #include <opencv2/gpu/gpu.hpp>
 #include <opencv2/highgui/highgui.hpp>
 #include <iostream>
 int main(int argc, char** argv)
 {
    const std::string keys =
        "{help h usage ?    |     | print this message }"
        "{cascade c         |     | path to configuration xml }"
        "{frames f          |     | path to configuration xml }"
        "{min_scale         |0.4f | path to configuration xml }"
        "{max_scale         |5.0f | path to configuration xml }"
        "{total_scales      |55   | path to configuration xml }"
        "{device d          |0    | path to configuration xml }"
    ;
    cv::CommandLineParser parser(argc, argv, keys);
    parser.about("Soft cascade training application.");
    if (parser.has("help"))
    {
        parser.printMessage();
        return 0;
    }
    if (!parser.check())
    {
        parser.printErrors();
        return 1;
    }
    cv::gpu::setDevice(parser.get<int>("device"));
    std::string cascadePath = parser.get<std::string>("cascade");
    cv::FileStorage fs(cascadePath, cv::FileStorage::READ);
    if(!fs.isOpened())
    {
        std::cout << "Soft Cascade file " << cascadePath << " can't be opened." << std::endl << std::flush;
        return 1;
    }
    std::cout << "Read cascade from file " << cascadePath << std::endl;
    float minScale =  parser.get<float>("min_scale");
    float maxScale =  parser.get<float>("max_scale");
    int scales     =  parser.get<int>("total_scales");
    using cv::gpu::SCascade;
    SCascade cascade(minScale, maxScale, scales);
    if (!cascade.load(fs.getFirstTopLevelNode()))
    {
        std::cout << "Soft Cascade can't be parsed." << std::endl << std::flush;
        return 1;
    }
    std::string frames = parser.get<std::string>("frames");
    cv::VideoCapture capture(frames);
    if(!capture.isOpened())
    {
        std::cout << "Frame source " << frames << " can't be opened." << std::endl << std::flush;
        return 1;
    }
    cv::gpu::GpuMat objects(1, sizeof(SCascade::Detection) * 10000, CV_8UC1);
    cv::gpu::printShortCudaDeviceInfo(parser.get<int>("device"));
    for (;;)
    {
        cv::Mat frame;
        if (!capture.read(frame))
        {
            std::cout << "Nothing to read. " << std::endl << std::flush;
            return 0;
        }
        cv::gpu::GpuMat dframe(frame), roi(frame.rows, frame.cols, CV_8UC1), trois;
        roi.setTo(cv::Scalar::all(1));
        cascade.genRoi(roi, trois);
        cascade.detect(dframe, trois, objects);
        cv::Mat dt(objects);
        typedef cv::gpu::SCascade::Detection Detection;
        Detection* dts = ((Detection*)dt.data) + 1;
        int* count = dt.ptr<int>(0);
        std::cout << *count << std::endl;
        cv::Mat result;
        frame.copyTo(result);
        for (int i = 0; i < *count; ++i)
        {
            Detection d = dts[i];
            cv::rectangle(result, cv::Rect(d.x, d.y, d.w, d.h), cv::Scalar(255, 0, 0, 255), 1);
        }
        std::cout << "working..." << std::endl;
        cv::imshow("Soft Cascade demo", result);
        cv::waitKey(10);
    }
    return 0;
 }