Optimized mog and mog2; both now have much better performance.

12 years ago · 1e8194fd3c
parent 1bcd1fd3a2
commit 1e8194fd3c
3 changed files with 107 additions and 103 deletions
--- a/modules/ocl/src/bgfg_mog.cpp
+++ b/modules/ocl/src/bgfg_mog.cpp
@ -254,7 +254,7 @@ static void mog_withoutLearning(const oclMat& frame, int cn, oclMat& fgmask, ocl
 }
-static void mog_withLearning(const oclMat& frame, int cn, oclMat& fgmask, oclMat& weight, oclMat& sortKey, oclMat& mean, oclMat& var,
+static void mog_withLearning(const oclMat& frame, int cn, oclMat& fgmask_raw, oclMat& weight, oclMat& sortKey, oclMat& mean, oclMat& var,
    int nmixtures, float varThreshold, float backgroundRatio, float learningRate, float minVar)
 {
    Context* clCxt = Context::getContext();
@ -262,6 +262,8 @@ static void mog_withLearning(const oclMat& frame, int cn, oclMat& fgmask, oclMat
    size_t local_thread[] = {32, 8, 1};
    size_t global_thread[] = {frame.cols, frame.rows, 1};
    oclMat fgmask(fgmask_raw.size(), CV_32SC1);
    int frame_step = (int)(frame.step/frame.elemSize());
    int fgmask_step = (int)(fgmask.step/fgmask.elemSize());
    int weight_step = (int)(weight.step/weight.elemSize());
@ -318,6 +320,8 @@ static void mog_withLearning(const oclMat& frame, int cn, oclMat& fgmask, oclMat
    args.push_back(make_pair(sizeof(cl_int), (void*)&frame_offset_y));
    openCLExecuteKernel(clCxt, &bgfg_mog, kernel_name, global_thread, local_thread, args, -1, -1, build_option);
    fgmask.convertTo(fgmask, CV_8U);
    fgmask.copyTo(fgmask_raw);
 }
 void cv::ocl::device::mog::mog_ocl(const oclMat& frame, int cn, oclMat& fgmask, oclMat& weight, oclMat& sortKey, oclMat& mean, oclMat& var,
@ -392,9 +396,11 @@ void cv::ocl::device::mog::loadConstants(float Tb, float TB, float Tg, float var
        (void *)constants, sizeof(_contant_struct));
 }
-void cv::ocl::device::mog::mog2_ocl(const oclMat& frame, int cn, oclMat& fgmask, oclMat& modesUsed, oclMat& weight, oclMat& variance, 
+void cv::ocl::device::mog::mog2_ocl(const oclMat& frame, int cn, oclMat& fgmaskRaw, oclMat& modesUsed, oclMat& weight, oclMat& variance, 
                                oclMat& mean, float alphaT, float prune, bool detectShadows, int nmixtures)
 {
    oclMat fgmask(fgmaskRaw.size(), CV_32SC1);
    Context* clCxt = Context::getContext();
    const float alpha1 = 1.0f - alphaT;
@ -464,6 +470,9 @@ void cv::ocl::device::mog::mog2_ocl(const oclMat& frame, int cn, oclMat& fgmask,
    args.push_back(make_pair(sizeof(cl_mem), (void*)&cl_constants));
    openCLExecuteKernel(clCxt, &bgfg_mog, kernel_name, global_thread, local_thread, args, -1, -1, build_option);
    fgmask.convertTo(fgmask, CV_8U);
    fgmask.copyTo(fgmaskRaw);
 }
 void cv::ocl::device::mog::getBackgroundImage2_ocl(int cn, const oclMat& modesUsed, const oclMat& weight, const oclMat& mean, oclMat& dst, int nmixtures)
@ -580,7 +589,7 @@ void cv::ocl::MOG2::initialize(cv::Size frameSize, int frameType)
    mean_.setTo(Scalar::all(0));
    //make the array for keeping track of the used modes per pixel - all zeros at start
-    bgmodelUsedModes_.create(frameSize_, CV_8UC1);
+    bgmodelUsedModes_.create(frameSize_, CV_32FC1);
    bgmodelUsedModes_.setTo(cv::Scalar::all(0));
    loadConstants(varThreshold, backgroundRatio, varThresholdGen, fVarInit, fVarMin, fVarMax, fTau, nShadowDetection);
--- a/modules/ocl/src/opencl/bgfg_mog.cl
+++ b/modules/ocl/src/opencl/bgfg_mog.cl
@ -188,7 +188,7 @@ __kernel void mog_withoutLearning_kernel(__global T_FRAME* frame, __global uchar
    }
 }
-__kernel void mog_withLearning_kernel(__global T_FRAME* frame, __global uchar* fgmask,
+__kernel void mog_withLearning_kernel(__global T_FRAME* frame, __global int* fgmask,
    __global float* weight, __global float* sortKey, __global T_MEAN_VAR* mean, 
    __global T_MEAN_VAR* var, int frame_row, int frame_col, int frame_step, int fgmask_step,
    int weight_step, int sortKey_step, int mean_step, int var_step,
@ -202,130 +202,125 @@ __kernel void mog_withLearning_kernel(__global T_FRAME* frame, __global uchar* f
    int x = get_global_id(0);
    int y = get_global_id(1);
-    if(x < frame_col && y < frame_row)
+    if(x >= frame_col || y >= frame_row) return;
-    {
+    float wsum = 0.0f;
-
+    int kHit = -1;
-        float wsum = 0.0f;
+    int kForeground = -1;
-        int kHit = -1;
+    int k = 0;
        int kForeground = -1;
        int k = 0;
-        T_MEAN_VAR pix = cvt(frame[(y + frame_offset_y) * frame_step + (x + frame_offset_x)]);
+    T_MEAN_VAR pix = cvt(frame[(y + frame_offset_y) * frame_step + (x + frame_offset_x)]);
        for (; k < (NMIXTURES); ++k)
        {
            float w = weight[(k * frame_row + y) * weight_step + x];
            wsum += w;
-            if (w < 1.192092896e-07f)
+    for (; k < (NMIXTURES); ++k)
-                break;
+    {
        float w = weight[(k * frame_row + y) * weight_step + x];
        wsum += w;
-            T_MEAN_VAR mu = mean[(k * frame_row + y) * mean_step + x];
+        if (w < 1.192092896e-07f)
-            T_MEAN_VAR _var = var[(k * frame_row + y) * var_step + x];
+            break;
-            T_MEAN_VAR diff = pix - mu;
+        T_MEAN_VAR mu = mean[(k * frame_row + y) * mean_step + x];
        T_MEAN_VAR _var = var[(k * frame_row + y) * var_step + x];
-            if (sqr(diff) < varThreshold * sum(_var))
+        float sortKey_prev, weight_prev;
-            {
+        T_MEAN_VAR mean_prev, var_prev;
-                wsum -= w;
+        if (sqr(pix - mu) < varThreshold * sum(_var))
-                float dw = learningRate * (1.0f - w);
+        {
-
+            wsum -= w;
-                _var = clamp1(_var, learningRate, diff, minVar);
+            float dw = learningRate * (1.0f - w);
-                float sortKey_prev = w / sqr(sum(_var));
+            _var = clamp1(_var, learningRate, pix - mu, minVar);
                sortKey[(k * frame_row + y) * sortKey_step + x] = sortKey_prev;
-                float weight_prev = w + dw;
+            sortKey_prev = w / sqr(sum(_var));
-                weight[(k * frame_row + y) * weight_step + x] = weight_prev;
+            sortKey[(k * frame_row + y) * sortKey_step + x] = sortKey_prev;
-                T_MEAN_VAR mean_prev = mu + learningRate * diff;
+            weight_prev = w + dw;
-                mean[(k * frame_row + y) * mean_step + x] = mean_prev;
+            weight[(k * frame_row + y) * weight_step + x] = weight_prev;
-                T_MEAN_VAR var_prev = _var;
+            mean_prev = mu + learningRate * (pix - mu);
-                var[(k * frame_row + y) * var_step + x] = var_prev;
+            mean[(k * frame_row + y) * mean_step + x] = mean_prev;
-                int k1 = k - 1;
+            var_prev = _var;
            var[(k * frame_row + y) * var_step + x] = var_prev;
        }
-                if (k1 >= 0)
+        int k1 = k - 1;
                {
                    float sortKey_next = sortKey[(k1 * frame_row + y) * sortKey_step + x];
                    float weight_next = weight[(k1 * frame_row + y) * weight_step + x];
                    T_MEAN_VAR mean_next = mean[(k1 * frame_row + y) * mean_step + x];
                    T_MEAN_VAR var_next = var[(k1 * frame_row + y) * var_step + x];
-                    for (; sortKey_next < sortKey_prev && k1 >= 0; --k1)
+        if (k1 >= 0 && sqr(pix - mu) < varThreshold * sum(_var))
-                    {
+        {
-                        sortKey[(k1 * frame_row + y) * sortKey_step + x] = sortKey_prev;
+            float sortKey_next = sortKey[(k1 * frame_row + y) * sortKey_step + x];
-                        sortKey[((k1 + 1) * frame_row + y) * sortKey_step + x] = sortKey_next;
+            float weight_next = weight[(k1 * frame_row + y) * weight_step + x];
            T_MEAN_VAR mean_next = mean[(k1 * frame_row + y) * mean_step + x];
            T_MEAN_VAR var_next = var[(k1 * frame_row + y) * var_step + x];
-                        weight[(k1 * frame_row + y) * weight_step + x] = weight_prev;
+            for (; sortKey_next < sortKey_prev && k1 >= 0; --k1)
-                        weight[((k1 + 1) * frame_row + y) * weight_step + x] = weight_next;
+            {
                sortKey[(k1 * frame_row + y) * sortKey_step + x] = sortKey_prev;
                sortKey[((k1 + 1) * frame_row + y) * sortKey_step + x] = sortKey_next;
-                        mean[(k1 * frame_row + y) * mean_step + x] = mean_prev;
+                weight[(k1 * frame_row + y) * weight_step + x] = weight_prev;
-                        mean[((k1 + 1) * frame_row + y) * mean_step + x] = mean_next;
+                weight[((k1 + 1) * frame_row + y) * weight_step + x] = weight_next;
-                        var[(k1 * frame_row + y) * var_step + x] = var_prev;
+                mean[(k1 * frame_row + y) * mean_step + x] = mean_prev;
-                        var[((k1 + 1) * frame_row + y) * var_step + x] = var_next;
+                mean[((k1 + 1) * frame_row + y) * mean_step + x] = mean_next;
-                        sortKey_prev = sortKey_next;
+                var[(k1 * frame_row + y) * var_step + x] = var_prev;
-                        sortKey_next = k1 > 0 ? sortKey[((k1 - 1) * frame_row + y) * sortKey_step + x] : 0.0f;
+                var[((k1 + 1) * frame_row + y) * var_step + x] = var_next;
-                        weight_prev = weight_next;
+                sortKey_prev = sortKey_next;
-                        weight_next = k1 > 0 ? weight[((k1 - 1) * frame_row + y) * weight_step + x] : 0.0f;
+                sortKey_next = k1 > 0 ? sortKey[((k1 - 1) * frame_row + y) * sortKey_step + x] : 0.0f;
-                        mean_prev = mean_next;
+                weight_prev = weight_next;
-                        mean_next = k1 > 0 ? mean[((k1 - 1) * frame_row + y) * mean_step + x] : (T_MEAN_VAR)F_ZERO;
+                weight_next = k1 > 0 ? weight[((k1 - 1) * frame_row + y) * weight_step + x] : 0.0f;
-                        var_prev = var_next;
+                mean_prev = mean_next;
-                        var_next = k1 > 0 ? var[((k1 - 1) * frame_row + y) * var_step + x] : (T_MEAN_VAR)F_ZERO;
+                mean_next = k1 > 0 ? mean[((k1 - 1) * frame_row + y) * mean_step + x] : (T_MEAN_VAR)F_ZERO;
                    }
                }
-                kHit = k1 + 1;
+                var_prev = var_next;
-                break;
+                var_next = k1 > 0 ? var[((k1 - 1) * frame_row + y) * var_step + x] : (T_MEAN_VAR)F_ZERO;
            }
        }
-        if (kHit < 0)
+        kHit = k1 + 1;
-        {
+        break;
-            kHit = k = k < ((NMIXTURES) - 1) ? k : ((NMIXTURES) - 1);
+    }
            wsum += w0 - weight[(k * frame_row + y) * weight_step + x];
            weight[(k * frame_row + y) * weight_step + x] = w0;
            mean[(k * frame_row + y) * mean_step + x] = pix;
            #if defined (CN1)
            var[(k * frame_row + y) * var_step + x] = (T_MEAN_VAR)(var0);
            #else
            var[(k * frame_row + y) * var_step + x] = (T_MEAN_VAR)(var0, var0, var0, var0);
            #endif
            sortKey[(k * frame_row + y) * sortKey_step + x] = sk0;
        }
        else
        {
            for( ; k < (NMIXTURES); k++)
                wsum += weight[(k * frame_row + y) * weight_step + x];
        }
-        float wscale = 1.0f / wsum;
+    if (kHit < 0)
-        wsum = 0;
+    {
-        for (k = 0; k < (NMIXTURES); ++k)
+        kHit = k = k < ((NMIXTURES) - 1) ? k : ((NMIXTURES) - 1);
-        {
+        wsum += w0 - weight[(k * frame_row + y) * weight_step + x];
-            float w = weight[(k * frame_row + y) * weight_step + x];
+
-            wsum += w *= wscale;
+        weight[(k * frame_row + y) * weight_step + x] = w0;
        mean[(k * frame_row + y) * mean_step + x] = pix;
 #if defined (CN1)
        var[(k * frame_row + y) * var_step + x] = (T_MEAN_VAR)(var0);
 #else
        var[(k * frame_row + y) * var_step + x] = (T_MEAN_VAR)(var0, var0, var0, var0);
 #endif
        sortKey[(k * frame_row + y) * sortKey_step + x] = sk0;
    }
    else
    {
        for( ; k < (NMIXTURES); k++)
            wsum += weight[(k * frame_row + y) * weight_step + x];
    }
-            weight[(k * frame_row + y) * weight_step + x] = w;
+    float wscale = 1.0f / wsum;
-            sortKey[(k * frame_row + y) * sortKey_step + x] *= wscale;
+    wsum = 0;
    for (k = 0; k < (NMIXTURES); ++k)
    {
        float w = weight[(k * frame_row + y) * weight_step + x];
        w *= wscale;
        wsum += w;
-            if (wsum > backgroundRatio && kForeground < 0)
+        weight[(k * frame_row + y) * weight_step + x] = w;
-                kForeground = k + 1;
+        sortKey[(k * frame_row + y) * sortKey_step + x] *= wscale;
-        }
+
-        if(kHit >= kForeground)
+        kForeground = select(kForeground, k + 1, wsum > backgroundRatio && kForeground < 0);
            fgmask[(y + fgmask_offset_y) * fgmask_step + (x + fgmask_offset_x)] = (uchar)(-1);
        else
            fgmask[(y + fgmask_offset_y) * fgmask_step + (x + fgmask_offset_x)] = (uchar)(0);
    }
    fgmask[(y + fgmask_offset_y) * fgmask_step + (x + fgmask_offset_x)] = (uchar)(-(kHit >= kForeground));
 }
 __kernel void getBackgroundImage_kernel(__global float* weight, __global T_MEAN_VAR* mean, __global T_FRAME* dst,
    int dst_row, int dst_col, int weight_step, int mean_step, int dst_step, 
    float backgroundRatio)
@ -355,8 +350,8 @@ __kernel void getBackgroundImage_kernel(__global float* weight, __global T_MEAN_
    }
 }
-__kernel void mog2_kernel(__global T_FRAME * frame, __global uchar* fgmask, __global float* weight, __global T_MEAN_VAR * mean,
+__kernel void mog2_kernel(__global T_FRAME * frame, __global int* fgmask, __global float* weight, __global T_MEAN_VAR * mean,
-        __global uchar* modesUsed, __global float* variance, int frame_row, int frame_col, int frame_step, 
+        __global int* modesUsed, __global float* variance, int frame_row, int frame_col, int frame_step, 
        int fgmask_step, int weight_step, int mean_step, int modesUsed_step, int var_step, float alphaT, float alpha1, float prune, 
        int detectShadows_flag, int fgmask_offset_x, int fgmask_offset_y, int frame_offset_x, int frame_offset_y, __constant con_srtuct_t* constants)
 {
@ -509,7 +504,7 @@ __kernel void mog2_kernel(__global T_FRAME * frame, __global uchar* fgmask, __gl
    }
 }
-__kernel void getBackgroundImage2_kernel(__global uchar* modesUsed, __global float* weight, __global T_MEAN_VAR* mean,
+__kernel void getBackgroundImage2_kernel(__global int* modesUsed, __global float* weight, __global T_MEAN_VAR* mean,
    __global T_FRAME* dst, float c_TB, int modesUsed_row, int modesUsed_col, int modesUsed_step, int weight_step, 
    int mean_step, int dst_step, int dst_x, int dst_y)
 {
--- a/modules/ocl/test/test_bgfg.cpp
+++ b/modules/ocl/test/test_bgfg.cpp
@ -191,7 +191,7 @@ TEST_P(mog2, getBackgroundImage)
    if (useGray)
        return;
-    std::string inputFile = string(cvtest::TS::ptr()->get_data_path()) + "gpu/768x576.avi";
+    std::string inputFile = string(cvtest::TS::ptr()->get_data_path()) + "video/768x576.avi";
    cv::VideoCapture cap(inputFile);
    ASSERT_TRUE(cap.isOpened());