refactor CUDA HOG algorithm: use abstract interface with hidden implementation
pull/3600/head
Vladislav Vinogradov 10 years ago
parent 0af7597d36
commit 8257dc3c1e
  1. modules/cudaobjdetect/include/opencv2/cudaobjdetect.hpp (181 changed lines)
  2. modules/cudaobjdetect/perf/perf_objdetect.cpp (10 changed lines)
  3. modules/cudaobjdetect/src/hog.cpp (596 changed lines)
  4. modules/cudaobjdetect/test/test_objdetect.cpp (135 changed lines)
  5. samples/gpu/hog.cpp (35 changed lines)

@@ -65,19 +65,8 @@ namespace cv { namespace cuda {
// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector
//
struct CV_EXPORTS HOGConfidence
{
double scale;
std::vector<Point> locations;
std::vector<double> confidences;
std::vector<double> part_scores[4];
};
/** @brief The class implements the Histogram of Oriented Gradients (@cite Dalal2005) object detector.
Interfaces of all methods are kept as similar to the CPU HOG descriptor and detector analogues as
possible.
@note
- An example applying the HOG descriptor for people detection can be found at
opencv_source_code/samples/cpp/peopledetect.cpp
@@ -86,11 +75,14 @@ as possible.
- (Python) An example applying the HOG descriptor for people detection can be found at
opencv_source_code/samples/python2/peopledetect.py
*/
struct CV_EXPORTS HOGDescriptor
class CV_EXPORTS HOG : public cv::Algorithm
{
enum { DEFAULT_WIN_SIGMA = -1 };
enum { DEFAULT_NLEVELS = 64 };
enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL };
public:
enum
{
DESCR_FORMAT_ROW_BY_ROW,
DESCR_FORMAT_COL_BY_COL
};
/** @brief Creates the HOG descriptor and detector.
@@ -99,132 +91,105 @@ struct CV_EXPORTS HOGDescriptor
@param block_stride Block stride. It must be a multiple of cell size.
@param cell_size Cell size. Only (8, 8) is supported for now.
@param nbins Number of bins. Only 9 bins per cell are supported for now.
@param win_sigma Gaussian smoothing window parameter.
@param threshold_L2hys L2-Hys normalization method shrinkage.
@param gamma_correction Flag to specify whether the gamma correction preprocessing is required or
not.
@param nlevels Maximum number of detection window increases.
*/
HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16),
Size block_stride=Size(8, 8), Size cell_size=Size(8, 8),
int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA,
double threshold_L2hys=0.2, bool gamma_correction=true,
int nlevels=DEFAULT_NLEVELS);
static Ptr<HOG> create(Size win_size = Size(64, 128),
Size block_size = Size(16, 16),
Size block_stride = Size(8, 8),
Size cell_size = Size(8, 8),
int nbins = 9);
//! Gaussian smoothing window parameter.
virtual void setWinSigma(double win_sigma) = 0;
virtual double getWinSigma() const = 0;
//! L2-Hys normalization method shrinkage.
virtual void setL2HysThreshold(double threshold_L2hys) = 0;
virtual double getL2HysThreshold() const = 0;
//! Flag to specify whether the gamma correction preprocessing is required or not.
virtual void setGammaCorrection(bool gamma_correction) = 0;
virtual bool getGammaCorrection() const = 0;
//! Maximum number of detection window increases.
virtual void setNumLevels(int nlevels) = 0;
virtual int getNumLevels() const = 0;
//! Threshold for the distance between features and SVM classifying plane.
//! Usually it is 0 and should be specified in the detector coefficients (as the last free
//! coefficient). But if the free coefficient is omitted (which is allowed), you can specify it
//! manually here.
virtual void setHitThreshold(double hit_threshold) = 0;
virtual double getHitThreshold() const = 0;
//! Window stride. It must be a multiple of block stride.
virtual void setWinStride(Size win_stride) = 0;
virtual Size getWinStride() const = 0;
//! Coefficient of the detection window increase.
virtual void setScaleFactor(double scale0) = 0;
virtual double getScaleFactor() const = 0;
//! Coefficient to regulate the similarity threshold. When detected, some
//! objects can be covered by many rectangles. 0 means not to perform grouping.
//! See groupRectangles.
virtual void setGroupThreshold(int group_threshold) = 0;
virtual int getGroupThreshold() const = 0;
//! Descriptor storage format:
//! - **DESCR_FORMAT_ROW_BY_ROW** - Row-major order.
//! - **DESCR_FORMAT_COL_BY_COL** - Column-major order.
virtual void setDescriptorFormat(int descr_format) = 0;
virtual int getDescriptorFormat() const = 0;
/** @brief Returns the number of coefficients required for the classification.
*/
size_t getDescriptorSize() const;
virtual size_t getDescriptorSize() const = 0;
/** @brief Returns the block histogram size.
*/
size_t getBlockHistogramSize() const;
virtual size_t getBlockHistogramSize() const = 0;
/** @brief Sets coefficients for the linear SVM classifier.
*/
void setSVMDetector(const std::vector<float>& detector);
virtual void setSVMDetector(InputArray detector) = 0;
/** @brief Returns coefficients of the classifier trained for people detection (for default window size).
*/
static std::vector<float> getDefaultPeopleDetector();
/** @brief Returns coefficients of the classifier trained for people detection (for 48x96 windows).
*/
static std::vector<float> getPeopleDetector48x96();
/** @brief Returns coefficients of the classifier trained for people detection (for 64x128 windows).
/** @brief Returns coefficients of the classifier trained for people detection.
*/
static std::vector<float> getPeopleDetector64x128();
virtual Mat getDefaultPeopleDetector() const = 0;
/** @brief Performs object detection without a multi-scale window.
@param img Source image. CV_8UC1 and CV_8UC4 types are supported for now.
@param found_locations Left-top corner points of detected objects boundaries.
@param hit_threshold Threshold for the distance between features and SVM classifying plane.
Usually it is 0 and should be specified in the detector coefficients (as the last free
coefficient). But if the free coefficient is omitted (which is allowed), you can specify it
manually here.
@param win_stride Window stride. It must be a multiple of block stride.
@param padding Mock parameter to keep the CPU interface compatibility. It must be (0,0).
@param confidences Optional output array for confidences.
*/
void detect(const GpuMat& img, std::vector<Point>& found_locations,
double hit_threshold=0, Size win_stride=Size(),
Size padding=Size());
virtual void detect(InputArray img,
std::vector<Point>& found_locations,
std::vector<double>* confidences = NULL) = 0;
/** @brief Performs object detection with a multi-scale window.
@param img Source image. See cuda::HOGDescriptor::detect for type limitations.
@param found_locations Detected objects boundaries.
@param confidences Optional output array for confidences.
@param hit_threshold Threshold for the distance between features and SVM classifying plane. See
cuda::HOGDescriptor::detect for details.
@param win_stride Window stride. It must be a multiple of block stride.
@param padding Mock parameter to keep the CPU interface compatibility. It must be (0,0).
@param scale0 Coefficient of the detection window increase.
@param group_threshold Coefficient to regulate the similarity threshold. When detected, some
objects can be covered by many rectangles. 0 means not to perform grouping. See groupRectangles.
*/
void detectMultiScale(const GpuMat& img, std::vector<Rect>& found_locations,
double hit_threshold=0, Size win_stride=Size(),
Size padding=Size(), double scale0=1.05,
int group_threshold=2);
void computeConfidence(const GpuMat& img, std::vector<Point>& hits, double hit_threshold,
Size win_stride, Size padding, std::vector<Point>& locations, std::vector<double>& confidences);
void computeConfidenceMultiScale(const GpuMat& img, std::vector<Rect>& found_locations,
double hit_threshold, Size win_stride, Size padding,
std::vector<HOGConfidence> &conf_out, int group_threshold);
virtual void detectMultiScale(InputArray img,
std::vector<Rect>& found_locations,
std::vector<double>* confidences = NULL) = 0;
/** @brief Returns block descriptors computed for the whole image.
@param img Source image. See cuda::HOGDescriptor::detect for type limitations.
@param win_stride Window stride. It must be a multiple of block stride.
@param descriptors 2D array of descriptors.
@param descr_format Descriptor storage format:
- **DESCR_FORMAT_ROW_BY_ROW** - Row-major order.
- **DESCR_FORMAT_COL_BY_COL** - Column-major order.
The function is mainly used to learn the classifier.
@param stream CUDA stream.
*/
void getDescriptors(const GpuMat& img, Size win_stride,
GpuMat& descriptors,
int descr_format=DESCR_FORMAT_COL_BY_COL);
Size win_size;
Size block_size;
Size block_stride;
Size cell_size;
int nbins;
double win_sigma;
double threshold_L2hys;
bool gamma_correction;
int nlevels;
protected:
void computeBlockHistograms(const GpuMat& img);
void computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle);
double getWinSigma() const;
bool checkDetectorSize() const;
static int numPartsWithin(int size, int part_size, int stride);
static Size numPartsWithin(Size size, Size part_size, Size stride);
// Coefficients of the separating plane
float free_coef;
GpuMat detector;
// Results of the last classification step
GpuMat labels, labels_buf;
Mat labels_host;
// Results of the last histogram evaluation step
GpuMat block_hists, block_hists_buf;
// Gradients computation results
GpuMat grad, qangle, grad_buf, qangle_buf;
// Returns a sub-buffer of the required size, reallocating the buffer if necessary.
static GpuMat getBuffer(const Size& sz, int type, GpuMat& buf);
static GpuMat getBuffer(int rows, int cols, int type, GpuMat& buf);
std::vector<GpuMat> image_scales;
virtual void compute(InputArray img,
OutputArray descriptors,
Stream& stream = Stream::Null()) = 0;
};
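Taken together, the refactored class is used through the factory and property setters rather than public fields. A minimal usage sketch of the new interface (the input file name and the color conversion are illustrative; the factory defaults and method names come from the declaration above):

#include <vector>
#include <opencv2/core.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/cudaobjdetect.hpp>

int main()
{
    cv::Mat frame = cv::imread("pedestrians.png");      // illustrative input
    cv::cvtColor(frame, frame, cv::COLOR_BGR2BGRA);     // detect() expects CV_8UC1 or CV_8UC4
    cv::cuda::GpuMat d_frame(frame);

    // Defaults: 64x128 window, 16x16 blocks, 8x8 block stride, 8x8 cells, 9 bins.
    cv::Ptr<cv::cuda::HOG> hog = cv::cuda::HOG::create();
    hog->setSVMDetector(hog->getDefaultPeopleDetector());

    // Detection parameters are now properties of the object, not call arguments.
    hog->setScaleFactor(1.05);
    hog->setGroupThreshold(2);

    std::vector<cv::Rect> people;
    hog->detectMultiScale(d_frame, people);
    return 0;
}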
//

@@ -71,10 +71,10 @@ PERF_TEST_P(Image, ObjDetect_HOG,
const cv::cuda::GpuMat d_img(img);
std::vector<cv::Rect> gpu_found_locations;
cv::cuda::HOGDescriptor d_hog;
d_hog.setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
cv::Ptr<cv::cuda::HOG> d_hog = cv::cuda::HOG::create();
d_hog->setSVMDetector(d_hog->getDefaultPeopleDetector());
TEST_CYCLE() d_hog.detectMultiScale(d_img, gpu_found_locations);
TEST_CYCLE() d_hog->detectMultiScale(d_img, gpu_found_locations);
SANITY_CHECK(gpu_found_locations);
}
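In short, the GPU side of the perf test now goes through the factory and the smart pointer; the migrated body boils down to:

const cv::cuda::GpuMat d_img(img);
std::vector<cv::Rect> gpu_found_locations;

cv::Ptr<cv::cuda::HOG> d_hog = cv::cuda::HOG::create();
d_hog->setSVMDetector(d_hog->getDefaultPeopleDetector());

TEST_CYCLE() d_hog->detectMultiScale(d_img, gpu_found_locations);

SANITY_CHECK(gpu_found_locations);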
@@ -82,8 +82,10 @@ PERF_TEST_P(Image, ObjDetect_HOG,
{
std::vector<cv::Rect> cpu_found_locations;
cv::Ptr<cv::cuda::HOG> d_hog = cv::cuda::HOG::create();
cv::HOGDescriptor hog;
hog.setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
hog.setSVMDetector(d_hog->getDefaultPeopleDetector());
TEST_CYCLE() hog.detectMultiScale(img, cpu_found_locations);

@@ -42,23 +42,12 @@
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
cv::cuda::HOGDescriptor::HOGDescriptor(Size, Size, Size, Size, int, double, double, bool, int) { throw_no_cuda(); }
size_t cv::cuda::HOGDescriptor::getDescriptorSize() const { throw_no_cuda(); return 0; }
size_t cv::cuda::HOGDescriptor::getBlockHistogramSize() const { throw_no_cuda(); return 0; }
double cv::cuda::HOGDescriptor::getWinSigma() const { throw_no_cuda(); return 0; }
bool cv::cuda::HOGDescriptor::checkDetectorSize() const { throw_no_cuda(); return false; }
void cv::cuda::HOGDescriptor::setSVMDetector(const std::vector<float>&) { throw_no_cuda(); }
void cv::cuda::HOGDescriptor::detect(const GpuMat&, std::vector<Point>&, double, Size, Size) { throw_no_cuda(); }
void cv::cuda::HOGDescriptor::detectMultiScale(const GpuMat&, std::vector<Rect>&, double, Size, Size, double, int) { throw_no_cuda(); }
void cv::cuda::HOGDescriptor::computeBlockHistograms(const GpuMat&) { throw_no_cuda(); }
void cv::cuda::HOGDescriptor::getDescriptors(const GpuMat&, Size, GpuMat&, int) { throw_no_cuda(); }
std::vector<float> cv::cuda::HOGDescriptor::getDefaultPeopleDetector() { throw_no_cuda(); return std::vector<float>(); }
std::vector<float> cv::cuda::HOGDescriptor::getPeopleDetector48x96() { throw_no_cuda(); return std::vector<float>(); }
std::vector<float> cv::cuda::HOGDescriptor::getPeopleDetector64x128() { throw_no_cuda(); return std::vector<float>(); }
void cv::cuda::HOGDescriptor::computeConfidence(const GpuMat&, std::vector<Point>&, double, Size, Size, std::vector<Point>&, std::vector<double>&) { throw_no_cuda(); }
void cv::cuda::HOGDescriptor::computeConfidenceMultiScale(const GpuMat&, std::vector<Rect>&, double, Size, Size, std::vector<HOGConfidence>&, int) { throw_no_cuda(); }
Ptr<cuda::HOG> cv::cuda::HOG::create(Size, Size, Size, Size, int) { throw_no_cuda(); return Ptr<cuda::HOG>(); }
#else
@@ -102,244 +91,323 @@ namespace cv { namespace cuda { namespace device
}
}}}
using namespace ::cv::cuda::device;
cv::cuda::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size block_stride_, Size cell_size_,
int nbins_, double win_sigma_, double threshold_L2hys_, bool gamma_correction_, int nlevels_)
: win_size(win_size_),
block_size(block_size_),
block_stride(block_stride_),
cell_size(cell_size_),
nbins(nbins_),
win_sigma(win_sigma_),
threshold_L2hys(threshold_L2hys_),
gamma_correction(gamma_correction_),
nlevels(nlevels_)
using namespace cv::cuda::device;
namespace
{
class HOG_Impl : public cv::cuda::HOG
{
public:
HOG_Impl(Size win_size,
Size block_size,
Size block_stride,
Size cell_size,
int nbins);
virtual void setWinSigma(double win_sigma) { win_sigma_ = win_sigma; }
virtual double getWinSigma() const;
virtual void setL2HysThreshold(double threshold_L2hys) { threshold_L2hys_ = threshold_L2hys; }
virtual double getL2HysThreshold() const { return threshold_L2hys_; }
virtual void setGammaCorrection(bool gamma_correction) { gamma_correction_ = gamma_correction; }
virtual bool getGammaCorrection() const { return gamma_correction_; }
virtual void setNumLevels(int nlevels) { nlevels_ = nlevels; }
virtual int getNumLevels() const { return nlevels_; }
virtual void setHitThreshold(double hit_threshold) { hit_threshold_ = hit_threshold; }
virtual double getHitThreshold() const { return hit_threshold_; }
virtual void setWinStride(Size win_stride) { win_stride_ = win_stride; }
virtual Size getWinStride() const { return win_stride_; }
virtual void setScaleFactor(double scale0) { scale0_ = scale0; }
virtual double getScaleFactor() const { return scale0_; }
virtual void setGroupThreshold(int group_threshold) { group_threshold_ = group_threshold; }
virtual int getGroupThreshold() const { return group_threshold_; }
virtual void setDescriptorFormat(int descr_format) { descr_format_ = descr_format; }
virtual int getDescriptorFormat() const { return descr_format_; }
virtual size_t getDescriptorSize() const;
virtual size_t getBlockHistogramSize() const;
virtual void setSVMDetector(InputArray detector);
virtual Mat getDefaultPeopleDetector() const;
virtual void detect(InputArray img,
std::vector<Point>& found_locations,
std::vector<double>* confidences);
virtual void detectMultiScale(InputArray img,
std::vector<Rect>& found_locations,
std::vector<double>* confidences);
virtual void compute(InputArray img,
OutputArray descriptors,
Stream& stream);
private:
Size win_size_;
Size block_size_;
Size block_stride_;
Size cell_size_;
int nbins_;
double win_sigma_;
double threshold_L2hys_;
bool gamma_correction_;
int nlevels_;
double hit_threshold_;
Size win_stride_;
double scale0_;
int group_threshold_;
int descr_format_;
private:
int getTotalHistSize(Size img_size) const;
void computeBlockHistograms(const GpuMat& img, GpuMat& block_hists);
void computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle);
// Coefficients of the separating plane
float free_coef_;
GpuMat detector_;
};
HOG_Impl::HOG_Impl(Size win_size,
Size block_size,
Size block_stride,
Size cell_size,
int nbins) :
win_size_(win_size),
block_size_(block_size),
block_stride_(block_stride),
cell_size_(cell_size),
nbins_(nbins),
win_sigma_(-1.0),
threshold_L2hys_(0.2),
gamma_correction_(true),
nlevels_(64),
hit_threshold_(0.0),
win_stride_(block_stride),
scale0_(1.05),
group_threshold_(2),
descr_format_(DESCR_FORMAT_COL_BY_COL)
{
CV_Assert((win_size.width - block_size.width ) % block_stride.width == 0 &&
(win_size.height - block_size.height) % block_stride.height == 0);
CV_Assert(block_size.width % cell_size.width == 0 && block_size.height % cell_size.height == 0);
CV_Assert(block_size.width % cell_size.width == 0 &&
block_size.height % cell_size.height == 0);
CV_Assert(block_stride == cell_size);
CV_Assert(cell_size == Size(8, 8));
Size cells_per_block = Size(block_size.width / cell_size.width, block_size.height / cell_size.height);
Size cells_per_block(block_size.width / cell_size.width, block_size.height / cell_size.height);
CV_Assert(cells_per_block == Size(2, 2));
}
size_t cv::cuda::HOGDescriptor::getDescriptorSize() const
static int numPartsWithin(int size, int part_size, int stride)
{
return numPartsWithin(win_size, block_size, block_stride).area() * getBlockHistogramSize();
return (size - part_size + stride) / stride;
}
size_t cv::cuda::HOGDescriptor::getBlockHistogramSize() const
static Size numPartsWithin(Size size, Size part_size, Size stride)
{
Size cells_per_block = Size(block_size.width / cell_size.width, block_size.height / cell_size.height);
return (size_t)(nbins * cells_per_block.area());
return Size(numPartsWithin(size.width, part_size.width, stride.width),
numPartsWithin(size.height, part_size.height, stride.height));
}
double cv::cuda::HOGDescriptor::getWinSigma() const
size_t HOG_Impl::getDescriptorSize() const
{
return win_sigma >= 0 ? win_sigma : (block_size.width + block_size.height) / 8.0;
return numPartsWithin(win_size_, block_size_, block_stride_).area() * getBlockHistogramSize();
}
bool cv::cuda::HOGDescriptor::checkDetectorSize() const
size_t HOG_Impl::getBlockHistogramSize() const
{
size_t detector_size = detector.rows * detector.cols;
size_t descriptor_size = getDescriptorSize();
return detector_size == 0 || detector_size == descriptor_size || detector_size == descriptor_size + 1;
Size cells_per_block(block_size_.width / cell_size_.width, block_size_.height / cell_size_.height);
return nbins_ * cells_per_block.area();
}
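For the default create() parameters (64x128 window, 16x16 blocks, 8x8 block stride, 8x8 cells, 9 bins) these two sizes work out to the familiar HOG dimensions; a quick sanity check:

// blocks per window: ((64 - 16 + 8) / 8) x ((128 - 16 + 8) / 8) = 7 x 15 = 105 blocks
// block histogram:   9 bins * (2 x 2 cells per block)           = 36 floats
// descriptor:        105 blocks * 36 floats                     = 3780 floats per window
cv::Ptr<cv::cuda::HOG> hog = cv::cuda::HOG::create();
CV_Assert(hog->getBlockHistogramSize() == 36);
CV_Assert(hog->getDescriptorSize() == 3780);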
void cv::cuda::HOGDescriptor::setSVMDetector(const std::vector<float>& _detector)
double HOG_Impl::getWinSigma() const
{
std::vector<float> detector_reordered(_detector.size());
size_t block_hist_size = getBlockHistogramSize();
cv::Size blocks_per_img = numPartsWithin(win_size, block_size, block_stride);
return win_sigma_ >= 0 ? win_sigma_ : (block_size_.width + block_size_.height) / 8.0;
}
for (int i = 0; i < blocks_per_img.height; ++i)
for (int j = 0; j < blocks_per_img.width; ++j)
void HOG_Impl::setSVMDetector(InputArray _detector)
{
const float* src = &_detector[0] + (j * blocks_per_img.height + i) * block_hist_size;
float* dst = &detector_reordered[0] + (i * blocks_per_img.width + j) * block_hist_size;
for (size_t k = 0; k < block_hist_size; ++k)
dst[k] = src[k];
}
const int descriptor_size = static_cast<int>(getDescriptorSize());
this->detector.upload(Mat(detector_reordered).reshape(1, 1));
const Mat detector = _detector.getMat();
size_t descriptor_size = getDescriptorSize();
free_coef = _detector.size() > descriptor_size ? _detector[descriptor_size] : 0;
CV_Assert( detector.type() == CV_32FC1 );
CV_Assert( detector.rows == 1 );
CV_Assert( detector.cols == descriptor_size || detector.cols == descriptor_size + 1 );
CV_Assert(checkDetectorSize());
}
std::vector<float> detector_reordered(detector.ptr<float>(), detector.ptr<float>() + detector.cols);
cv::cuda::GpuMat cv::cuda::HOGDescriptor::getBuffer(const Size& sz, int type, GpuMat& buf)
{
if (buf.empty() || buf.type() != type)
buf.create(sz, type);
else
if (buf.cols < sz.width || buf.rows < sz.height)
buf.create(std::max(buf.rows, sz.height), std::max(buf.cols, sz.width), type);
size_t block_hist_size = getBlockHistogramSize();
Size blocks_per_win = numPartsWithin(win_size_, block_size_, block_stride_);
return buf(Rect(Point(0,0), sz));
for (int i = 0; i < blocks_per_win.height; ++i)
{
for (int j = 0; j < blocks_per_win.width; ++j)
{
const float* src = detector.ptr<float>() + (j * blocks_per_win.height + i) * block_hist_size;
float* dst = &detector_reordered[0] + (i * blocks_per_win.width + j) * block_hist_size;
for (size_t k = 0; k < block_hist_size; ++k)
dst[k] = src[k];
}
}
cv::cuda::GpuMat cv::cuda::HOGDescriptor::getBuffer(int rows, int cols, int type, GpuMat& buf)
{
return getBuffer(Size(cols, rows), type, buf);
detector_.upload(Mat(detector_reordered).reshape(1, 1));
free_coef_ = detector.cols > descriptor_size ? detector.at<float>(0, descriptor_size) : 0;
}
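The nested loop above permutes whole block histograms, not individual floats: the supplied coefficients are laid out column by column over the window's block grid, while the GPU classifier consumes them row by row. A standalone sketch of the same index mapping (the helper name and use of std::vector are illustrative):

#include <algorithm>
#include <vector>
#include <opencv2/core.hpp>

// Permute per-block chunks of SVM coefficients from column-major to row-major block order.
static std::vector<float> reorderBlockHistograms(const std::vector<float>& src,
                                                 cv::Size blocks_per_win,
                                                 size_t block_hist_size)
{
    std::vector<float> dst(src);   // keeps a trailing free coefficient, if one is present
    for (int i = 0; i < blocks_per_win.height; ++i)
        for (int j = 0; j < blocks_per_win.width; ++j)
        {
            const size_t from = (j * blocks_per_win.height + i) * block_hist_size; // column-major
            const size_t to   = (i * blocks_per_win.width  + j) * block_hist_size; // row-major
            std::copy(src.begin() + from, src.begin() + from + block_hist_size,
                      dst.begin() + to);
        }
    return dst;
}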
static Mat getPeopleDetector64x128();
static Mat getPeopleDetector48x96();
void cv::cuda::HOGDescriptor::computeGradient(const GpuMat& img, GpuMat& _grad, GpuMat& _qangle)
Mat HOG_Impl::getDefaultPeopleDetector() const
{
CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
// grad.create(img.size(), CV_32FC2);
_grad = getBuffer(img.size(), CV_32FC2, grad_buf);
// qangle.create(img.size(), CV_8UC2);
_qangle = getBuffer(img.size(), CV_8UC2, qangle_buf);
CV_Assert( win_size_ == Size(64, 128) || win_size_ == Size(48, 96) );
float angleScale = (float)(nbins / CV_PI);
switch (img.type())
{
case CV_8UC1:
hog::compute_gradients_8UC1(nbins, img.rows, img.cols, img, angleScale, _grad, _qangle, gamma_correction);
break;
case CV_8UC4:
hog::compute_gradients_8UC4(nbins, img.rows, img.cols, img, angleScale, _grad, _qangle, gamma_correction);
break;
}
if (win_size_ == Size(64, 128))
return getPeopleDetector64x128();
else
return getPeopleDetector48x96();
}
void cv::cuda::HOGDescriptor::computeBlockHistograms(const GpuMat& img)
void HOG_Impl::detect(InputArray _img, std::vector<Point>& hits, std::vector<double>* confidences)
{
cv::Size blocks_per_win = numPartsWithin(win_size, block_size, block_stride);
hog::set_up_constants(nbins, block_stride.width, block_stride.height, blocks_per_win.width, blocks_per_win.height);
const GpuMat img = _img.getGpuMat();
computeGradient(img, grad, qangle);
size_t block_hist_size = getBlockHistogramSize();
Size blocks_per_img = numPartsWithin(img.size(), block_size, block_stride);
CV_Assert( img.type() == CV_8UC1 || img.type() == CV_8UC4 );
CV_Assert( win_stride_.width % block_stride_.width == 0 && win_stride_.height % block_stride_.height == 0 );
// block_hists.create(1, block_hist_size * blocks_per_img.area(), CV_32F);
block_hists = getBuffer(1, static_cast<int>(block_hist_size * blocks_per_img.area()), CV_32F, block_hists_buf);
hits.clear();
if (detector_.empty())
return;
hog::compute_hists(nbins, block_stride.width, block_stride.height, img.rows, img.cols,
grad, qangle, (float)getWinSigma(), block_hists.ptr<float>());
BufferPool pool(Stream::Null());
hog::normalize_hists(nbins, block_stride.width, block_stride.height, img.rows, img.cols,
block_hists.ptr<float>(), (float)threshold_L2hys);
}
GpuMat block_hists = pool.getBuffer(1, getTotalHistSize(img.size()), CV_32FC1);
computeBlockHistograms(img, block_hists);
Size wins_per_img = numPartsWithin(img.size(), win_size_, win_stride_);
void cv::cuda::HOGDescriptor::getDescriptors(const GpuMat& img, Size win_stride, GpuMat& descriptors, int descr_format)
if (confidences == NULL)
{
CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
computeBlockHistograms(img);
const size_t block_hist_size = getBlockHistogramSize();
Size blocks_per_win = numPartsWithin(win_size, block_size, block_stride);
Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride);
descriptors.create(wins_per_img.area(), static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32F);
GpuMat labels = pool.getBuffer(1, wins_per_img.area(), CV_8UC1);
hog::classify_hists(win_size_.height, win_size_.width,
block_stride_.height, block_stride_.width,
win_stride_.height, win_stride_.width,
img.rows, img.cols,
block_hists.ptr<float>(),
detector_.ptr<float>(),
(float)free_coef_,
(float)hit_threshold_,
labels.ptr());
Mat labels_host;
labels.download(labels_host);
unsigned char* vec = labels_host.ptr();
switch (descr_format)
for (int i = 0; i < wins_per_img.area(); i++)
{
case DESCR_FORMAT_ROW_BY_ROW:
hog::extract_descrs_by_rows(win_size.height, win_size.width, block_stride.height, block_stride.width,
win_stride.height, win_stride.width, img.rows, img.cols, block_hists.ptr<float>(), descriptors);
break;
case DESCR_FORMAT_COL_BY_COL:
hog::extract_descrs_by_cols(win_size.height, win_size.width, block_stride.height, block_stride.width,
win_stride.height, win_stride.width, img.rows, img.cols, block_hists.ptr<float>(), descriptors);
break;
default:
CV_Error(cv::Error::StsBadArg, "Unknown descriptor format");
int y = i / wins_per_img.width;
int x = i - wins_per_img.width * y;
if (vec[i])
hits.push_back(Point(x * win_stride_.width, y * win_stride_.height));
}
}
void cv::cuda::HOGDescriptor::computeConfidence(const GpuMat& img, std::vector<Point>& hits, double hit_threshold,
Size win_stride, Size padding, std::vector<Point>& locations, std::vector<double>& confidences)
{
CV_Assert(padding == Size(0, 0));
hits.clear();
if (detector.empty())
return;
computeBlockHistograms(img);
if (win_stride == Size())
win_stride = block_stride;
else
CV_Assert(win_stride.width % block_stride.width == 0 &&
win_stride.height % block_stride.height == 0);
Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride);
labels.create(1, wins_per_img.area(), CV_32F);
hog::compute_confidence_hists(win_size.height, win_size.width, block_stride.height, block_stride.width,
win_stride.height, win_stride.width, img.rows, img.cols, block_hists.ptr<float>(),
detector.ptr<float>(), (float)free_coef, (float)hit_threshold, labels.ptr<float>());
{
GpuMat labels = pool.getBuffer(1, wins_per_img.area(), CV_32FC1);
hog::compute_confidence_hists(win_size_.height, win_size_.width,
block_stride_.height, block_stride_.width,
win_stride_.height, win_stride_.width,
img.rows, img.cols,
block_hists.ptr<float>(),
detector_.ptr<float>(),
(float)free_coef_,
(float)hit_threshold_,
labels.ptr<float>());
Mat labels_host;
labels.download(labels_host);
float* vec = labels_host.ptr<float>();
// ROI is not supported for now.
locations.clear();
confidences.clear();
confidences->clear();
for (int i = 0; i < wins_per_img.area(); i++)
{
int y = i / wins_per_img.width;
int x = i - wins_per_img.width * y;
if (vec[i] >= hit_threshold)
hits.push_back(Point(x * win_stride.width, y * win_stride.height));
Point pt(win_stride.width * x, win_stride.height * y);
locations.push_back(pt);
confidences.push_back((double)vec[i]);
if (vec[i] >= hit_threshold_)
{
hits.push_back(Point(x * win_stride_.width, y * win_stride_.height));
confidences->push_back((double)vec[i]);
}
}
}
}
void cv::cuda::HOGDescriptor::computeConfidenceMultiScale(const GpuMat& img, std::vector<Rect>& found_locations,
double hit_threshold, Size win_stride, Size padding,
std::vector<HOGConfidence> &conf_out, int group_threshold)
void HOG_Impl::detectMultiScale(InputArray _img,
std::vector<Rect>& found_locations,
std::vector<double>* confidences)
{
const GpuMat img = _img.getGpuMat();
CV_Assert( img.type() == CV_8UC1 || img.type() == CV_8UC4 );
CV_Assert( confidences == NULL || group_threshold_ == 0 );
std::vector<double> level_scale;
double scale = 1.;
double scale = 1.0;
int levels = 0;
for (levels = 0; levels < (int)conf_out.size(); levels++)
for (levels = 0; levels < nlevels_; levels++)
{
scale = conf_out[levels].scale;
level_scale.push_back(scale);
if (cvRound(img.cols/scale) < win_size.width || cvRound(img.rows/scale) < win_size.height)
if (cvRound(img.cols / scale) < win_size_.width ||
cvRound(img.rows / scale) < win_size_.height ||
scale0_ <= 1)
{
break;
}
scale *= scale0_;
}
levels = std::max(levels, 1);
level_scale.resize(levels);
std::vector<Rect> all_candidates;
std::vector<Point> locations;
std::vector<Point> level_hits;
std::vector<double> level_confidences;
BufferPool pool(Stream::Null());
found_locations.clear();
for (size_t i = 0; i < level_scale.size(); i++)
{
scale = level_scale[i];
Size sz(cvRound(img.cols / scale), cvRound(img.rows / scale));
GpuMat smaller_img;
if (sz == img.size())
{
smaller_img = img;
}
else
{
smaller_img.create(sz, img.type());
smaller_img = pool.getBuffer(sz, img.type());
switch (img.type())
{
case CV_8UC1: hog::resize_8UC1(img, smaller_img); break;
@@ -347,127 +415,137 @@ void cv::cuda::HOGDescriptor::computeConfidenceMultiScale(const GpuMat& img, std
}
}
computeConfidence(smaller_img, locations, hit_threshold, win_stride, padding, conf_out[i].locations, conf_out[i].confidences);
detect(smaller_img, level_hits,
confidences ? &level_confidences : NULL);
Size scaled_win_size(cvRound(win_size.width * scale), cvRound(win_size.height * scale));
for (size_t j = 0; j < locations.size(); j++)
all_candidates.push_back(Rect(Point2d(locations[j]) * scale, scaled_win_size));
}
Size scaled_win_size(cvRound(win_size_.width * scale),
cvRound(win_size_.height * scale));
found_locations.assign(all_candidates.begin(), all_candidates.end());
groupRectangles(found_locations, group_threshold, 0.2/*magic number copied from CPU version*/);
for (size_t j = 0; j < level_hits.size(); j++)
{
found_locations.push_back(Rect(Point2d(level_hits[j]) * scale, scaled_win_size));
if (confidences)
confidences->push_back(level_confidences[j]);
}
}
if (group_threshold_ > 0)
{
groupRectangles(found_locations, group_threshold_, 0.2/*magic number copied from CPU version*/);
}
}
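detectMultiScale builds the whole scale pyramid up front: starting from scale 1.0 it multiplies by the scale factor until the rescaled image can no longer hold one detection window, or nlevels is exhausted. A standalone sketch of that loop with illustrative sizes (640x480 input, default 64x128 window, scale factor 1.05, 64 levels):

#include <algorithm>
#include <vector>
#include <opencv2/core.hpp>

int main()
{
    const cv::Size img_size(640, 480);   // illustrative input size
    const cv::Size win_size(64, 128);    // default detection window
    const double scale0 = 1.05;          // default scale factor
    const int nlevels = 64;              // default level cap

    std::vector<double> level_scale;
    double scale = 1.0;
    int levels = 0;
    for (levels = 0; levels < nlevels; levels++)
    {
        level_scale.push_back(scale);
        if (cvRound(img_size.width / scale)  < win_size.width ||
            cvRound(img_size.height / scale) < win_size.height ||
            scale0 <= 1)
            break;
        scale *= scale0;
    }
    levels = std::max(levels, 1);
    level_scale.resize(levels);          // drops the last scale if it no longer fits the window
    // Each kept level resizes the image, runs detect(), and scales the hits back up by 'scale'.
    return 0;
}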
void cv::cuda::HOGDescriptor::detect(const GpuMat& img, std::vector<Point>& hits, double hit_threshold, Size win_stride, Size padding)
void HOG_Impl::compute(InputArray _img,
OutputArray _descriptors,
Stream& stream)
{
CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
CV_Assert(padding == Size(0, 0));
const GpuMat img = _img.getGpuMat();
hits.clear();
if (detector.empty())
return;
CV_Assert( img.type() == CV_8UC1 || img.type() == CV_8UC4 );
CV_Assert( win_stride_.width % block_stride_.width == 0 && win_stride_.height % block_stride_.height == 0 );
CV_Assert( !stream );
computeBlockHistograms(img);
BufferPool pool(stream);
if (win_stride == Size())
win_stride = block_stride;
else
CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);
GpuMat block_hists = pool.getBuffer(1, getTotalHistSize(img.size()), CV_32FC1);
computeBlockHistograms(img, block_hists);
Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride);
// labels.create(1, wins_per_img.area(), CV_8U);
labels = getBuffer(1, wins_per_img.area(), CV_8U, labels_buf);
const size_t block_hist_size = getBlockHistogramSize();
Size blocks_per_win = numPartsWithin(win_size_, block_size_, block_stride_);
Size wins_per_img = numPartsWithin(img.size(), win_size_, win_stride_);
hog::classify_hists(win_size.height, win_size.width, block_stride.height, block_stride.width,
win_stride.height, win_stride.width, img.rows, img.cols, block_hists.ptr<float>(),
detector.ptr<float>(), (float)free_coef, (float)hit_threshold, labels.ptr());
_descriptors.create(wins_per_img.area(), static_cast<int>(blocks_per_win.area() * block_hist_size), CV_32FC1);
GpuMat descriptors = _descriptors.getGpuMat();
labels.download(labels_host);
unsigned char* vec = labels_host.ptr();
for (int i = 0; i < wins_per_img.area(); i++)
switch (descr_format_)
{
int y = i / wins_per_img.width;
int x = i - wins_per_img.width * y;
if (vec[i])
hits.push_back(Point(x * win_stride.width, y * win_stride.height));
case DESCR_FORMAT_ROW_BY_ROW:
hog::extract_descrs_by_rows(win_size_.height, win_size_.width,
block_stride_.height, block_stride_.width,
win_stride_.height, win_stride_.width,
img.rows, img.cols,
block_hists.ptr<float>(),
descriptors);
break;
case DESCR_FORMAT_COL_BY_COL:
hog::extract_descrs_by_cols(win_size_.height, win_size_.width,
block_stride_.height, block_stride_.width,
win_stride_.height, win_stride_.width,
img.rows, img.cols,
block_hists.ptr<float>(),
descriptors);
break;
default:
CV_Error(cv::Error::StsBadArg, "Unknown descriptor format");
}
}
int HOG_Impl::getTotalHistSize(Size img_size) const
{
size_t block_hist_size = getBlockHistogramSize();
Size blocks_per_img = numPartsWithin(img_size, block_size_, block_stride_);
return static_cast<int>(block_hist_size * blocks_per_img.area());
}
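Scratch memory that previously lived in member buffers (grad_buf, qangle_buf, block_hists_buf and the getBuffer helpers) is now leased from cv::cuda::BufferPool, so the implementation holds no per-call state. A minimal sketch of that pattern, assuming buffer-pool usage has been enabled for the process:

#include <opencv2/core/cuda.hpp>

int main()
{
    // Must be called before the first Stream is created; otherwise getBuffer
    // simply falls back to the default GPU allocator.
    cv::cuda::setBufferPoolUsage(true);

    cv::cuda::Stream stream;
    cv::cuda::BufferPool pool(stream);

    // Temporaries come from the per-stream stack allocator and are released
    // in LIFO order when the GpuMat headers go out of scope.
    cv::cuda::GpuMat grad   = pool.getBuffer(480, 640, CV_32FC2);
    cv::cuda::GpuMat qangle = pool.getBuffer(480, 640, CV_8UC2);
    return 0;
}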
void cv::cuda::HOGDescriptor::detectMultiScale(const GpuMat& img, std::vector<Rect>& found_locations, double hit_threshold,
Size win_stride, Size padding, double scale0, int group_threshold)
void HOG_Impl::computeBlockHistograms(const GpuMat& img, GpuMat& block_hists)
{
cv::Size blocks_per_win = numPartsWithin(win_size_, block_size_, block_stride_);
hog::set_up_constants(nbins_, block_stride_.width, block_stride_.height, blocks_per_win.width, blocks_per_win.height);
CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
BufferPool pool(Stream::Null());
std::vector<double> level_scale;
double scale = 1.;
int levels = 0;
GpuMat grad = pool.getBuffer(img.size(), CV_32FC2);
GpuMat qangle = pool.getBuffer(img.size(), CV_8UC2);
computeGradient(img, grad, qangle);
for (levels = 0; levels < nlevels; levels++)
{
level_scale.push_back(scale);
if (cvRound(img.cols/scale) < win_size.width ||
cvRound(img.rows/scale) < win_size.height || scale0 <= 1)
break;
scale *= scale0;
}
levels = std::max(levels, 1);
level_scale.resize(levels);
image_scales.resize(levels);
block_hists.create(1, getTotalHistSize(img.size()), CV_32FC1);
std::vector<Rect> all_candidates;
std::vector<Point> locations;
hog::compute_hists(nbins_,
block_stride_.width, block_stride_.height,
img.rows, img.cols,
grad, qangle,
(float)getWinSigma(),
block_hists.ptr<float>());
for (size_t i = 0; i < level_scale.size(); i++)
{
scale = level_scale[i];
Size sz(cvRound(img.cols / scale), cvRound(img.rows / scale));
GpuMat smaller_img;
hog::normalize_hists(nbins_,
block_stride_.width, block_stride_.height,
img.rows, img.cols,
block_hists.ptr<float>(),
(float)threshold_L2hys_);
}
if (sz == img.size())
smaller_img = img;
else
void HOG_Impl::computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle)
{
image_scales[i].create(sz, img.type());
grad.create(img.size(), CV_32FC2);
qangle.create(img.size(), CV_8UC2);
float angleScale = (float)(nbins_ / CV_PI);
switch (img.type())
{
case CV_8UC1: hog::resize_8UC1(img, image_scales[i]); break;
case CV_8UC4: hog::resize_8UC4(img, image_scales[i]); break;
}
smaller_img = image_scales[i];
case CV_8UC1:
hog::compute_gradients_8UC1(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_);
break;
case CV_8UC4:
hog::compute_gradients_8UC4(nbins_, img.rows, img.cols, img, angleScale, grad, qangle, gamma_correction_);
break;
}
detect(smaller_img, locations, hit_threshold, win_stride, padding);
Size scaled_win_size(cvRound(win_size.width * scale), cvRound(win_size.height * scale));
for (size_t j = 0; j < locations.size(); j++)
all_candidates.push_back(Rect(Point2d(locations[j]) * scale, scaled_win_size));
}
found_locations.assign(all_candidates.begin(), all_candidates.end());
groupRectangles(found_locations, group_threshold, 0.2/*magic number copied from CPU version*/);
}
int cv::cuda::HOGDescriptor::numPartsWithin(int size, int part_size, int stride)
Ptr<cuda::HOG> cv::cuda::HOG::create(Size win_size,
Size block_size,
Size block_stride,
Size cell_size,
int nbins)
{
return (size - part_size + stride) / stride;
return makePtr<HOG_Impl>(win_size, block_size, block_stride, cell_size, nbins);
}
cv::Size cv::cuda::HOGDescriptor::numPartsWithin(cv::Size size, cv::Size part_size, cv::Size stride)
namespace
{
return Size(numPartsWithin(size.width, part_size.width, stride.width), numPartsWithin(size.height, part_size.height, stride.height));
}
std::vector<float> cv::cuda::HOGDescriptor::getDefaultPeopleDetector()
static Mat getPeopleDetector48x96()
{
return getPeopleDetector64x128();
}
std::vector<float> cv::cuda::HOGDescriptor::getPeopleDetector48x96()
{
static const float detector[] = {
static float detector[] = {
0.294350f, -0.098796f, -0.129522f, 0.078753f, 0.387527f, 0.261529f,
0.145939f, 0.061520f, 0.328699f, 0.227148f, -0.066467f, -0.086723f,
0.047559f, 0.106714f, 0.037897f, 0.111461f, -0.024406f, 0.304769f,
@@ -799,15 +877,13 @@ std::vector<float> cv::cuda::HOGDescriptor::getPeopleDetector48x96()
-0.119002f, 0.026722f, 0.034853f, -0.060934f, -0.025054f, -0.093026f,
-0.035372f, -0.233209f, -0.049869f, -0.039151f, -0.022279f, -0.065380f,
-9.063785f };
return std::vector<float>(detector, detector + sizeof(detector)/sizeof(detector[0]));
}
return Mat(1, static_cast<int>(sizeof(detector)/sizeof(detector[0])), CV_32FC1, detector);
}
std::vector<float> cv::cuda::HOGDescriptor::getPeopleDetector64x128()
Mat getPeopleDetector64x128()
{
static const float detector[] = {
static float detector[] = {
0.05359386f, -0.14721455f, -0.05532170f, 0.05077307f,
0.11547081f, -0.04268804f, 0.04635834f, -0.05468199f, 0.08232084f,
0.10424068f, -0.02294518f, 0.01108519f, 0.01378693f, 0.11193510f,
@@ -1613,7 +1689,9 @@ std::vector<float> cv::cuda::HOGDescriptor::getPeopleDetector64x128()
-0.01612278f, -1.46097376e-003f, 0.14013411f, -8.96181818e-003f,
-0.03250246f, 3.38630192e-003f, 2.64779478e-003f, 0.03359732f,
-0.02411991f, -0.04229729f, 0.10666174f, -6.66579151f };
return std::vector<float>(detector, detector + sizeof(detector)/sizeof(detector[0]));
return Mat(1, static_cast<int>(sizeof(detector)/sizeof(detector[0])), CV_32FC1, detector);
}
}
#endif

@@ -48,9 +48,10 @@ using namespace cvtest;
//#define DUMP
struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>, cv::cuda::HOGDescriptor
struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>
{
cv::cuda::DeviceInfo devInfo;
cv::Ptr<cv::cuda::HOG> hog;
#ifdef DUMP
std::ofstream f;
@@ -69,23 +70,13 @@ struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>, cv::cuda::HOGDescript
devInfo = GetParam();
cv::cuda::setDevice(devInfo.deviceID());
hog = cv::cuda::HOG::create();
}
#ifdef DUMP
void dump(const cv::Mat& blockHists, const std::vector<cv::Point>& locations)
{
f.write((char*)&blockHists.rows, sizeof(blockHists.rows));
f.write((char*)&blockHists.cols, sizeof(blockHists.cols));
for (int i = 0; i < blockHists.rows; ++i)
void dump(const std::vector<cv::Point>& locations)
{
for (int j = 0; j < blockHists.cols; ++j)
{
float val = blockHists.at<float>(i, j);
f.write((char*)&val, sizeof(val));
}
}
int nlocations = locations.size();
f.write((char*)&nlocations, sizeof(nlocations));
@@ -93,21 +84,18 @@ struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>, cv::cuda::HOGDescript
f.write((char*)&locations[i], sizeof(locations[i]));
}
#else
void compare(const cv::Mat& blockHists, const std::vector<cv::Point>& locations)
void compare(const std::vector<cv::Point>& locations)
{
// skip block_hists check
int rows, cols;
f.read((char*)&rows, sizeof(rows));
f.read((char*)&cols, sizeof(cols));
ASSERT_EQ(rows, blockHists.rows);
ASSERT_EQ(cols, blockHists.cols);
for (int i = 0; i < blockHists.rows; ++i)
for (int i = 0; i < rows; ++i)
{
for (int j = 0; j < blockHists.cols; ++j)
for (int j = 0; j < cols; ++j)
{
float val;
f.read((char*)&val, sizeof(val));
ASSERT_NEAR(val, blockHists.at<float>(i, j), 1e-3);
}
}
@@ -126,54 +114,41 @@ struct HOG : testing::TestWithParam<cv::cuda::DeviceInfo>, cv::cuda::HOGDescript
void testDetect(const cv::Mat& img)
{
gamma_correction = false;
setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
hog->setGammaCorrection(false);
hog->setSVMDetector(hog->getDefaultPeopleDetector());
std::vector<cv::Point> locations;
// Test detect
detect(loadMat(img), locations, 0);
hog->detect(loadMat(img), locations);
#ifdef DUMP
dump(cv::Mat(block_hists), locations);
dump(locations);
#else
compare(cv::Mat(block_hists), locations);
compare(locations);
#endif
// Test detect on smaller image
cv::Mat img2;
cv::resize(img, img2, cv::Size(img.cols / 2, img.rows / 2));
detect(loadMat(img2), locations, 0);
hog->detect(loadMat(img2), locations);
#ifdef DUMP
dump(cv::Mat(block_hists), locations);
dump(locations);
#else
compare(cv::Mat(block_hists), locations);
compare(locations);
#endif
// Test detect on greater image
cv::resize(img, img2, cv::Size(img.cols * 2, img.rows * 2));
detect(loadMat(img2), locations, 0);
hog->detect(loadMat(img2), locations);
#ifdef DUMP
dump(cv::Mat(block_hists), locations);
dump(locations);
#else
compare(cv::Mat(block_hists), locations);
compare(locations);
#endif
}
// Does not compare border values, as interpolation introduces a delta
void compare_inner_parts(cv::Mat d1, cv::Mat d2)
{
for (int i = 1; i < blocks_per_win_y - 1; ++i)
for (int j = 1; j < blocks_per_win_x - 1; ++j)
for (int k = 0; k < block_hist_size; ++k)
{
float a = d1.at<float>(0, (i * blocks_per_win_x + j) * block_hist_size);
float b = d2.at<float>(0, (i * blocks_per_win_x + j) * block_hist_size);
ASSERT_FLOAT_EQ(a, b);
}
}
};
// disabled until resize is fixed
@@ -182,13 +157,8 @@ CUDA_TEST_P(HOG, DISABLED_Detect)
cv::Mat img_rgb = readImage("hog/road.png");
ASSERT_FALSE(img_rgb.empty());
#ifdef DUMP
f.open((std::string(cvtest::TS::ptr()->get_data_path()) + "hog/expected_output.bin").c_str(), std::ios_base::binary);
ASSERT_TRUE(f.is_open());
#else
f.open((std::string(cvtest::TS::ptr()->get_data_path()) + "hog/expected_output.bin").c_str(), std::ios_base::binary);
ASSERT_TRUE(f.is_open());
#endif
// Test on color image
cv::Mat img;
@@ -198,8 +168,6 @@ CUDA_TEST_P(HOG, DISABLED_Detect)
// Test on gray image
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2GRAY);
testDetect(img);
f.close();
}
CUDA_TEST_P(HOG, GetDescriptors)
@@ -216,8 +184,14 @@ CUDA_TEST_P(HOG, GetDescriptors)
// Convert train images into feature vectors (train table)
cv::cuda::GpuMat descriptors, descriptors_by_cols;
getDescriptors(d_img, win_size, descriptors, DESCR_FORMAT_ROW_BY_ROW);
getDescriptors(d_img, win_size, descriptors_by_cols, DESCR_FORMAT_COL_BY_COL);
hog->setWinStride(Size(64, 128));
hog->setDescriptorFormat(cv::cuda::HOG::DESCR_FORMAT_ROW_BY_ROW);
hog->compute(d_img, descriptors);
hog->setDescriptorFormat(cv::cuda::HOG::DESCR_FORMAT_COL_BY_COL);
hog->compute(d_img, descriptors_by_cols);
// Check size of the result train table
wins_per_img_x = 3;
@@ -242,48 +216,6 @@ CUDA_TEST_P(HOG, GetDescriptors)
ASSERT_EQ(l[(y * blocks_per_win_x + x) * block_hist_size + k],
r[(x * blocks_per_win_y + y) * block_hist_size + k]);
}
/* Now we want to extract the same feature vectors, but from single images. NOTE: the results will
be different due to border value interpolation. Using many small images is slower, but we
won't call getDescriptors and will use computeBlockHistograms instead. computeBlockHistograms
works well, as can be checked in the gpu_hog sample */
img_rgb = readImage("hog/positive1.png");
ASSERT_TRUE(!img_rgb.empty());
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
computeBlockHistograms(cv::cuda::GpuMat(img));
// Everything is fine with interpolation for left top subimage
ASSERT_EQ(0.0, cv::norm((cv::Mat)block_hists, (cv::Mat)descriptors.rowRange(0, 1)));
img_rgb = readImage("hog/positive2.png");
ASSERT_TRUE(!img_rgb.empty());
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
computeBlockHistograms(cv::cuda::GpuMat(img));
compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(1, 2)));
img_rgb = readImage("hog/negative1.png");
ASSERT_TRUE(!img_rgb.empty());
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
computeBlockHistograms(cv::cuda::GpuMat(img));
compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(2, 3)));
img_rgb = readImage("hog/negative2.png");
ASSERT_TRUE(!img_rgb.empty());
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
computeBlockHistograms(cv::cuda::GpuMat(img));
compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(3, 4)));
img_rgb = readImage("hog/positive3.png");
ASSERT_TRUE(!img_rgb.empty());
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
computeBlockHistograms(cv::cuda::GpuMat(img));
compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(4, 5)));
img_rgb = readImage("hog/negative3.png");
ASSERT_TRUE(!img_rgb.empty());
cv::cvtColor(img_rgb, img, cv::COLOR_BGR2BGRA);
computeBlockHistograms(cv::cuda::GpuMat(img));
compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(5, 6)));
}
INSTANTIATE_TEST_CASE_P(CUDA_ObjDetect, HOG, ALL_DEVICES);
@@ -310,12 +242,12 @@ CUDA_TEST_P(CalTech, HOG)
cv::cuda::GpuMat d_img(img);
cv::Mat markedImage(img.clone());
cv::cuda::HOGDescriptor d_hog;
d_hog.setSVMDetector(cv::cuda::HOGDescriptor::getDefaultPeopleDetector());
d_hog.nlevels = d_hog.nlevels + 32;
cv::Ptr<cv::cuda::HOG> d_hog = cv::cuda::HOG::create();
d_hog->setSVMDetector(d_hog->getDefaultPeopleDetector());
d_hog->setNumLevels(d_hog->getNumLevels() + 32);
std::vector<cv::Rect> found_locations;
d_hog.detectMultiScale(d_img, found_locations);
d_hog->detectMultiScale(d_img, found_locations);
#if defined (LOG_CASCADE_STATISTIC)
for (int i = 0; i < (int)found_locations.size(); i++)
@@ -326,7 +258,8 @@ CUDA_TEST_P(CalTech, HOG)
cv::rectangle(markedImage, r , CV_RGB(255, 0, 0));
}
cv::imshow("Res", markedImage); cv::waitKey();
cv::imshow("Res", markedImage);
cv::waitKey();
#endif
}

@@ -244,19 +244,13 @@ void App::run()
Size win_size(args.win_width, args.win_width * 2); //(64, 128) or (48, 96)
Size win_stride(args.win_stride_width, args.win_stride_height);
cv::Ptr<cv::cuda::HOG> gpu_hog = cv::cuda::HOG::create(win_size);
cv::HOGDescriptor cpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9);
// Create HOG descriptors and detectors here
vector<float> detector;
if (win_size == Size(64, 128))
detector = cv::cuda::HOGDescriptor::getPeopleDetector64x128();
else
detector = cv::cuda::HOGDescriptor::getPeopleDetector48x96();
cv::cuda::HOGDescriptor gpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9,
cv::cuda::HOGDescriptor::DEFAULT_WIN_SIGMA, 0.2, gamma_corr,
cv::cuda::HOGDescriptor::DEFAULT_NLEVELS);
cv::HOGDescriptor cpu_hog(win_size, Size(16, 16), Size(8, 8), Size(8, 8), 9, 1, -1,
HOGDescriptor::L2Hys, 0.2, gamma_corr, cv::HOGDescriptor::DEFAULT_NLEVELS);
gpu_hog.setSVMDetector(detector);
Mat detector = gpu_hog->getDefaultPeopleDetector();
gpu_hog->setSVMDetector(detector);
cpu_hog.setSVMDetector(detector);
while (running)
@@ -307,9 +301,6 @@ void App::run()
else img = img_aux;
img_to_show = img;
gpu_hog.nlevels = nlevels;
cpu_hog.nlevels = nlevels;
vector<Rect> found;
// Perform HOG classification
@@ -317,11 +308,19 @@ void App::run()
if (use_gpu)
{
gpu_img.upload(img);
gpu_hog.detectMultiScale(gpu_img, found, hit_threshold, win_stride,
Size(0, 0), scale, gr_threshold);
gpu_hog->setNumLevels(nlevels);
gpu_hog->setHitThreshold(hit_threshold);
gpu_hog->setWinStride(win_stride);
gpu_hog->setScaleFactor(scale);
gpu_hog->setGroupThreshold(gr_threshold);
gpu_hog->detectMultiScale(gpu_img, found);
}
else cpu_hog.detectMultiScale(img, found, hit_threshold, win_stride,
else
{
cpu_hog.nlevels = nlevels;
cpu_hog.detectMultiScale(img, found, hit_threshold, win_stride,
Size(0, 0), scale, gr_threshold);
}
hogWorkEnd();
// Draw positive classified windows
