kernel policy

pull/158/head
marina.kolpakova 12 years ago
parent 916ba4c0ea
commit 0898c3c651
  1. 62
      modules/gpu/src/cuda/isf-sc.cu
  2. 27
      modules/gpu/src/icf.hpp
  3. 44
      modules/gpu/src/softcascade.cpp

@ -303,21 +303,16 @@ namespace icf {
}
#endif
void detect(const PtrStepSzb& roi, const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv,
PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales)
template<>
void CascadeInvoker<CascadePolicy>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const int scale) const
{
int fw = 160;
int fh = 120;
dim3 block(32, 8);
dim3 grid(fw, fh / 8, downscales);
dim3 grid(fw, fh / 8, (scale == -1) ? downscales : 1);
const Level* l = (const Level*)levels.ptr();
const Octave* oct = ((const Octave*)octaves.ptr());
const float* st = (const float*)stages.ptr();
const Node* nd = (const Node*)nodes.ptr();
const float* lf = (const float*)leaves.ptr();
uint* ctr = (uint*)counter.ptr();
Detection* det = (Detection*)objects.ptr();
uint max_det = objects.cols / sizeof(Detection);
@ -328,44 +323,21 @@ namespace icf {
cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<float2>();
cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step));
test_kernel_warp<false><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, 0);
cudaSafeCall( cudaGetLastError());
grid = dim3(fw, fh / 8, 47 - downscales);
test_kernel_warp<true><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, downscales);
cudaSafeCall( cudaGetLastError());
cudaSafeCall( cudaDeviceSynchronize());
}
void detectAtScale(const int scale, const PtrStepSzb& roi, const PtrStepSzb& levels, const PtrStepSzb& octaves,
const PtrStepSzf& stages, const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv,
PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales)
{
int fw = 160;
int fh = 120;
dim3 block(32, 8);
dim3 grid(fw, fh / 8, 1);
const Level* l = (const Level*)levels.ptr();
const Octave* oct = ((const Octave*)octaves.ptr());
const float* st = (const float*)stages.ptr();
const Node* nd = (const Node*)nodes.ptr();
const float* lf = (const float*)leaves.ptr();
uint* ctr = (uint*)counter.ptr();
Detection* det = (Detection*)objects.ptr();
uint max_det = objects.cols / sizeof(Detection);
cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<float2>();
cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step));
if (scale == -1)
{
test_kernel_warp<false><<<grid, block>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, 0);
cudaSafeCall( cudaGetLastError());
if (scale >= downscales)
test_kernel_warp<true><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, scale);
grid = dim3(fw, fh / 8, 47 - downscales);
test_kernel_warp<true><<<grid, block>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, downscales);
}
else
test_kernel_warp<false><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, scale);
{
if (scale >= downscales)
test_kernel_warp<true><<<grid, block>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale);
else
test_kernel_warp<false><<<grid, block>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale);
}
cudaSafeCall( cudaGetLastError());
cudaSafeCall( cudaDeviceSynchronize());

@ -124,6 +124,33 @@ struct __align__(16) Detection
: x(_x), y(_y), w(_w), h(_h), confidence(c), kind(0) {};
};
// Compile-time launch policy consumed by CascadeInvoker<Policy>.
// STA_X/STA_Y give the CUDA thread-block geometry (32 x 8 threads,
// matching the dim3 block(32, 8) used by the kernel launches).
struct CascadePolicy
{
enum {STA_X = 32, STA_Y = 8};
};
// Host-side functor that caches raw device pointers to the soft-cascade
// data (levels/octaves/stages/nodes/leaves) and launches the detection
// kernel. Policy supplies the launch geometry (e.g. CascadePolicy).
// NOTE(review): the referenced device buffers must outlive this invoker;
// it stores raw pointers, not owning handles.
template<typename Policy>
struct CascadeInvoker
{
    // Default-constructed invoker holds null pointers; it must be
    // reassigned from a fully-initialized invoker before use.
    CascadeInvoker(): levels(0), octaves(0), stages(0), nodes(0), leaves(0) {}

    // Reinterprets the packed GpuMat buffers as typed cascade tables.
    CascadeInvoker(const PtrStepSzb& _levels, const PtrStepSzb& _octaves, const PtrStepSzf& _stages,
                   const PtrStepSzb& _nodes, const PtrStepSzf& _leaves)
    : levels((const Level*)_levels.ptr()), octaves((const Octave*)_octaves.ptr()), stages((const float*)_stages.ptr()),
      nodes((const Node*)_nodes.ptr()), leaves((const float*)_leaves.ptr())
    {}

    const Level*  levels;   // per-scale level descriptors
    const Octave* octaves;  // octave descriptors
    const float*  stages;   // stage thresholds
    const Node*   nodes;    // decision-tree nodes
    const float*  leaves;   // leaf responses

    // Runs detection into `objects`, bumping `counter` per detection.
    // scale == -1 (the default) processes all scales in one launch;
    // any other value restricts the launch to that single scale.
    // Fixed: parameter was misspelled "csale" in the declaration while the
    // out-of-line definition names it "scale"; renamed for consistency
    // (parameter names in a declaration do not affect callers).
    void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
                    PtrStepSzi counter, const int downscales, const int scale = -1) const;
};
}
}}}

@ -69,29 +69,6 @@ namespace cv { namespace gpu { namespace device {
namespace icf {
void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
const int fw, const int fh, const int bins);
void detect(const PtrStepSzb& rois,
const PtrStepSzb& levels,
const PtrStepSzb& octaves,
const PtrStepSzf& stages,
const PtrStepSzb& nodes,
const PtrStepSzf& leaves,
const PtrStepSzi& hogluv,
PtrStepSz<uchar4> objects,
PtrStepSzi counter,
const int downscales);
void detectAtScale(const int scale,
const PtrStepSzb& rois,
const PtrStepSzb& levels,
const PtrStepSzb& octaves,
const PtrStepSzf& stages,
const PtrStepSzb& nodes,
const PtrStepSzf& leaves,
const PtrStepSzi& hogluv,
PtrStepSz<uchar4> objects,
PtrStepSzi counter,
const int downscales);
}
namespace imgproc
{
@ -150,6 +127,8 @@ struct cv::gpu::SoftCascade::Filds
std::vector<float> scales;
device::icf::CascadeInvoker<device::icf::CascadePolicy> invoker;
static const int shrinkage = 4;
enum { BOOST = 0 };
@ -166,17 +145,11 @@ struct cv::gpu::SoftCascade::Filds
};
bool fill(const FileNode &root, const float mins, const float maxs);
void detect(const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, cudaStream_t stream) const
{
cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int));
device::icf::detect(roi, levels, octaves, stages, nodes, leaves, hogluv, objects , detCounter, downscales);
}
void detectAtScale(int scale, const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, cudaStream_t stream) const
void detect(int scale, const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, cudaStream_t stream) const
{
cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int));
device::icf::detectAtScale(scale, roi, levels, octaves, stages, nodes, leaves, hogluv, objects,
detCounter, downscales);
// device::icf::CascadeInvoker<device::icf::CascadePolicy> invoker(levels, octaves, stages, nodes, leaves);
invoker(roi, hogluv, objects, detCounter, downscales, scale);
}
void preprocess(const cv::gpu::GpuMat& colored)
@ -439,6 +412,8 @@ bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, c
calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES);
CV_Assert(!levels.empty());
invoker = device::icf::CascadeInvoker<device::icf::CascadePolicy>(levels, octaves, stages, nodes, leaves);
return true;
}
@ -569,10 +544,7 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
flds.preprocess(colored);
if (specificScale == -1)
flds.detect(rois,objects, 0);
else
flds.detectAtScale(specificScale, rois, objects, 0);
flds.detect(specificScale, rois, objects, 0);
cv::Mat out(flds.detCounter);
int ndetections = *(out.data);

Loading…
Cancel
Save