Merge pull request #16090 from jeffeDurand:cuda_mog2_issue_5296

* cuda_mog2_issue_5296
5 years ago · 5bf7345743
parent 4342657762
commit 5bf7345743
3 changed files with 493 additions and 479 deletions
--- a/modules/cudabgsegm/src/cuda/mog2.cu
+++ b/modules/cudabgsegm/src/cuda/mog2.cu
@ -47,393 +47,372 @@
 #include "opencv2/core/cuda/vec_math.hpp"
 #include "opencv2/core/cuda/limits.hpp"
-namespace cv { namespace cuda { namespace device
+#include "mog2.hpp"
 {
    namespace mog2
    {
        ///////////////////////////////////////////////////////////////
        // Utility
        __device__ __forceinline__ float cvt(uchar val)
        {
            return val;
        }
        __device__ __forceinline__ float3 cvt(const uchar3& val)
        {
            return make_float3(val.x, val.y, val.z);
        }
        __device__ __forceinline__ float4 cvt(const uchar4& val)
        {
            return make_float4(val.x, val.y, val.z, val.w);
        }
        __device__ __forceinline__ float sqr(float val)
        {
            return val * val;
        }
        __device__ __forceinline__ float sqr(const float3& val)
        {
            return val.x * val.x + val.y * val.y + val.z * val.z;
        }
        __device__ __forceinline__ float sqr(const float4& val)
        {
            return val.x * val.x + val.y * val.y + val.z * val.z;
        }
-        __device__ __forceinline__ float sum(float val)
+namespace cv
-        {
+{
-            return val;
+namespace cuda
-        }
+{
-        __device__ __forceinline__ float sum(const float3& val)
+namespace device
-        {
+{
-            return val.x + val.y + val.z;
+namespace mog2
-        }
+{
-        __device__ __forceinline__ float sum(const float4& val)
+///////////////////////////////////////////////////////////////
-        {
+// Utility
            return val.x + val.y + val.z;
        }
        template <class Ptr2D>
        __device__ __forceinline__ void swap(Ptr2D& ptr, int x, int y, int k, int rows)
        {
            typename Ptr2D::elem_type val = ptr(k * rows + y, x);
            ptr(k * rows + y, x) = ptr((k + 1) * rows + y, x);
            ptr((k + 1) * rows + y, x) = val;
        }
        ///////////////////////////////////////////////////////////////
        // MOG2
-        __constant__ int           c_nmixtures;
+__device__ __forceinline__ float cvt(uchar val)
-        __constant__ float         c_Tb;
+{
-        __constant__ float         c_TB;
+    return val;
-        __constant__ float         c_Tg;
+}
-        __constant__ float         c_varInit;
+__device__ __forceinline__ float3 cvt(const uchar3 &val)
-        __constant__ float         c_varMin;
+{
-        __constant__ float         c_varMax;
+    return make_float3(val.x, val.y, val.z);
-        __constant__ float         c_tau;
+}
-        __constant__ unsigned char c_shadowVal;
+__device__ __forceinline__ float4 cvt(const uchar4 &val)
 {
    return make_float4(val.x, val.y, val.z, val.w);
 }
-        void loadConstants(int nmixtures, float Tb, float TB, float Tg, float varInit, float varMin, float varMax, float tau, unsigned char shadowVal)
+__device__ __forceinline__ float sqr(float val)
-        {
+{
-            varMin = ::fminf(varMin, varMax);
+    return val * val;
-            varMax = ::fmaxf(varMin, varMax);
+}
-
+__device__ __forceinline__ float sqr(const float3 &val)
-            cudaSafeCall( cudaMemcpyToSymbol(c_nmixtures, &nmixtures, sizeof(int)) );
+{
-            cudaSafeCall( cudaMemcpyToSymbol(c_Tb, &Tb, sizeof(float)) );
+    return val.x * val.x + val.y * val.y + val.z * val.z;
-            cudaSafeCall( cudaMemcpyToSymbol(c_TB, &TB, sizeof(float)) );
+}
-            cudaSafeCall( cudaMemcpyToSymbol(c_Tg, &Tg, sizeof(float)) );
+__device__ __forceinline__ float sqr(const float4 &val)
-            cudaSafeCall( cudaMemcpyToSymbol(c_varInit, &varInit, sizeof(float)) );
+{
-            cudaSafeCall( cudaMemcpyToSymbol(c_varMin, &varMin, sizeof(float)) );
+    return val.x * val.x + val.y * val.y + val.z * val.z;
-            cudaSafeCall( cudaMemcpyToSymbol(c_varMax, &varMax, sizeof(float)) );
+}
            cudaSafeCall( cudaMemcpyToSymbol(c_tau, &tau, sizeof(float)) );
            cudaSafeCall( cudaMemcpyToSymbol(c_shadowVal, &shadowVal, sizeof(unsigned char)) );
        }
-        template <bool detectShadows, typename SrcT, typename WorkT>
+__device__ __forceinline__ float sum(float val)
-        __global__ void mog2(const PtrStepSz<SrcT> frame, PtrStepb fgmask, PtrStepb modesUsed,
+{
-                             PtrStepf gmm_weight, PtrStepf gmm_variance, PtrStep<WorkT> gmm_mean,
+    return val;
-                             const float alphaT, const float alpha1, const float prune)
+}
-        {
+__device__ __forceinline__ float sum(const float3 &val)
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+{
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+    return val.x + val.y + val.z;
 }
 __device__ __forceinline__ float sum(const float4 &val)
 {
    return val.x + val.y + val.z;
 }
-            if (x >= frame.cols || y >= frame.rows)
+template <class Ptr2D>
-                return;
+__device__ __forceinline__ void swap(Ptr2D &ptr, int x, int y, int k, int rows)
 {
    typename Ptr2D::elem_type val = ptr(k * rows + y, x);
    ptr(k * rows + y, x) = ptr((k + 1) * rows + y, x);
    ptr((k + 1) * rows + y, x) = val;
 }
 ///////////////////////////////////////////////////////////////
 // MOG2
 template <bool detectShadows, typename SrcT, typename WorkT>
 __global__ void mog2(const PtrStepSz<SrcT> frame, PtrStepb fgmask, PtrStepb modesUsed,
                     PtrStepf gmm_weight, PtrStepf gmm_variance, PtrStep<WorkT> gmm_mean,
                     const float alphaT, const float alpha1, const float prune, const Constants *const constants)
 {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-            WorkT pix = cvt(frame(y, x));
+    if (x < frame.cols && y < frame.rows)
    {
        WorkT pix = cvt(frame(y, x));
-            //calculate distances to the modes (+ sort)
+        //calculate distances to the modes (+ sort)
-            //here we need to go in descending order!!!
+        //here we need to go in descending order!!!
-            bool background = false; // true - the pixel classified as background
+        bool background = false; // true - the pixel classified as background
-            //internal:
+        //internal:
-            bool fitsPDF = false; //if it remains zero a new GMM mode will be added
+        bool fitsPDF = false; //if it remains zero a new GMM mode will be added
-            int nmodes = modesUsed(y, x);
+        int nmodes = modesUsed(y, x);
-            int nNewModes = nmodes; //current number of modes in GMM
+        const int nNewModes = nmodes; //current number of modes in GMM
-            float totalWeight = 0.0f;
+        float totalWeight = 0.0f;
-            //go through all modes
+        //go through all modes
-            for (int mode = 0; mode < nmodes; ++mode)
+        for (int mode = 0; mode < nmodes; ++mode)
        {
            //need only weight if fit is found
            float weight = alpha1 * gmm_weight(mode * frame.rows + y, x) + prune;
            int swap_count = 0;
            //fit not found yet
            if (!fitsPDF)
            {
-                //need only weight if fit is found
+                //check if it belongs to some of the remaining modes
-                float weight = alpha1 * gmm_weight(mode * frame.rows + y, x) + prune;
+                const float var = gmm_variance(mode * frame.rows + y, x);
                int swap_count = 0;
                //fit not found yet
                if (!fitsPDF)
                {
                    //check if it belongs to some of the remaining modes
                    float var = gmm_variance(mode * frame.rows + y, x);
                    WorkT mean = gmm_mean(mode * frame.rows + y, x);
-                    //calculate difference and distance
+                const WorkT mean = gmm_mean(mode * frame.rows + y, x);
                    WorkT diff = mean - pix;
                    float dist2 = sqr(diff);
-                    //background? - Tb - usually larger than Tg
+                //calculate difference and distance
-                    if (totalWeight < c_TB && dist2 < c_Tb * var)
+                const WorkT diff = mean - pix;
-                        background = true;
+                const float dist2 = sqr(diff);
-                    //check fit
+                //background? - Tb - usually larger than Tg
-                    if (dist2 < c_Tg * var)
+                if (totalWeight < constants->TB_ && dist2 < constants->Tb_ * var)
-                    {
+                    background = true;
                        //belongs to the mode
                        fitsPDF = true;
-                        //update distribution
+                //check fit
                if (dist2 < constants->Tg_ * var)
                {
                    //belongs to the mode
                    fitsPDF = true;
-                        //update weight
+                    //update distribution
                        weight += alphaT;
                        float k = alphaT / weight;
-                        //update mean
+                    //update weight
-                        gmm_mean(mode * frame.rows + y, x) = mean - k * diff;
+                    weight += alphaT;
                    float k = alphaT / weight;
-                        //update variance
+                    //update mean
-                        float varnew = var + k * (dist2 - var);
+                    gmm_mean(mode * frame.rows + y, x) = mean - k * diff;
-                        //limit the variance
+                    //update variance
-                        varnew = ::fmaxf(varnew, c_varMin);
+                    float varnew = var + k * (dist2 - var);
                        varnew = ::fminf(varnew, c_varMax);
-                        gmm_variance(mode * frame.rows + y, x) = varnew;
+                    //limit the variance
                    varnew = ::fmaxf(varnew, constants->varMin_);
                    varnew = ::fminf(varnew, constants->varMax_);
-                        //sort
+                    gmm_variance(mode * frame.rows + y, x) = varnew;
                        //all other weights are at the same place and
                        //only the matched (iModes) is higher -> just find the new place for it
-                        for (int i = mode; i > 0; --i)
+                    //sort
-                        {
+                    //all other weights are at the same place and
-                            //check one up
+                    //only the matched (iModes) is higher -> just find the new place for it
                            if (weight < gmm_weight((i - 1) * frame.rows + y, x))
                                break;
-                            swap_count++;
+                    for (int i = mode; i > 0; --i)
-                            //swap one up
+                    {
-                            swap(gmm_weight, x, y, i - 1, frame.rows);
+                        //check one up
-                            swap(gmm_variance, x, y, i - 1, frame.rows);
+                        if (weight < gmm_weight((i - 1) * frame.rows + y, x))
-                            swap(gmm_mean, x, y, i - 1, frame.rows);
+                            break;
                        }
-                        //belongs to the mode - bFitsPDF becomes 1
+                        swap_count++;
                        //swap one up
                        swap(gmm_weight, x, y, i - 1, frame.rows);
                        swap(gmm_variance, x, y, i - 1, frame.rows);
                        swap(gmm_mean, x, y, i - 1, frame.rows);
                    }
                } // !fitsPDF
-                //check prune
+                    //belongs to the mode - bFitsPDF becomes 1
                if (weight < -prune)
                {
                    weight = 0.0f;
                    nmodes--;
                }
            } // !fitsPDF
-                gmm_weight((mode - swap_count) * frame.rows + y, x) = weight; //update weight by the calculated value
+            //check prune
-                totalWeight += weight;
+            if (weight < -prune)
            {
                weight = 0.0f;
                nmodes--;
            }
-            //renormalize weights
+            gmm_weight((mode - swap_count) * frame.rows + y, x) = weight; //update weight by the calculated value
            totalWeight += weight;
        }
-            totalWeight = 1.f / totalWeight;
+        //renormalize weights
            for (int mode = 0; mode < nmodes; ++mode)
                gmm_weight(mode * frame.rows + y, x) *= totalWeight;
-            nmodes = nNewModes;
+        totalWeight = 1.f / totalWeight;
        for (int mode = 0; mode < nmodes; ++mode)
            gmm_weight(mode * frame.rows + y, x) *= totalWeight;
-            //make new mode if needed and exit
+        nmodes = nNewModes;
-            if (!fitsPDF)
+        //make new mode if needed and exit
            {
                // replace the weakest or add a new one
                int mode = nmodes == c_nmixtures ? c_nmixtures - 1 : nmodes++;
-                if (nmodes == 1)
+        if (!fitsPDF)
-                    gmm_weight(mode * frame.rows + y, x) = 1.f;
+        {
-                else
+            // replace the weakest or add a new one
-                {
+            const int mode = nmodes == constants->nmixtures_ ? constants->nmixtures_ - 1 : nmodes++;
                    gmm_weight(mode * frame.rows + y, x) = alphaT;
-                    // renormalize all other weights
+            if (nmodes == 1)
                gmm_weight(mode * frame.rows + y, x) = 1.f;
            else
            {
                gmm_weight(mode * frame.rows + y, x) = alphaT;
-                    for (int i = 0; i < nmodes - 1; ++i)
+                // renormalize all other weights
                        gmm_weight(i * frame.rows + y, x) *= alpha1;
                }
-                // init
+                for (int i = 0; i < nmodes - 1; ++i)
                    gmm_weight(i * frame.rows + y, x) *= alpha1;
            }
-                gmm_mean(mode * frame.rows + y, x) = pix;
+            // init
                gmm_variance(mode * frame.rows + y, x) = c_varInit;
-                //sort
+            gmm_mean(mode * frame.rows + y, x) = pix;
-                //find the new place for it
+            gmm_variance(mode * frame.rows + y, x) = constants->varInit_;
-                for (int i = nmodes - 1; i > 0; --i)
+            //sort
-                {
+            //find the new place for it
                    // check one up
                    if (alphaT < gmm_weight((i - 1) * frame.rows + y, x))
                        break;
-                    //swap one up
+            for (int i = nmodes - 1; i > 0; --i)
-                    swap(gmm_weight, x, y, i - 1, frame.rows);
+            {
-                    swap(gmm_variance, x, y, i - 1, frame.rows);
+                // check one up
-                    swap(gmm_mean, x, y, i - 1, frame.rows);
+                if (alphaT < gmm_weight((i - 1) * frame.rows + y, x))
-                }
+                    break;
                //swap one up
                swap(gmm_weight, x, y, i - 1, frame.rows);
                swap(gmm_variance, x, y, i - 1, frame.rows);
                swap(gmm_mean, x, y, i - 1, frame.rows);
            }
        }
-            //set the number of modes
+        //set the number of modes
-            modesUsed(y, x) = nmodes;
+        modesUsed(y, x) = nmodes;
-            bool isShadow = false;
+        bool isShadow = false;
-            if (detectShadows && !background)
+        if (detectShadows && !background)
-            {
+        {
-                float tWeight = 0.0f;
+            float tWeight = 0.0f;
-                // check all the components  marked as background:
+            // check all the components  marked as background:
-                for (int mode = 0; mode < nmodes; ++mode)
+            for (int mode = 0; mode < nmodes; ++mode)
-                {
+            {
-                    WorkT mean = gmm_mean(mode * frame.rows + y, x);
+                const WorkT mean = gmm_mean(mode * frame.rows + y, x);
-                    WorkT pix_mean = pix * mean;
+                const WorkT pix_mean = pix * mean;
-                    float numerator = sum(pix_mean);
+                const float numerator = sum(pix_mean);
-                    float denominator = sqr(mean);
+                const float denominator = sqr(mean);
-                    // no division by zero allowed
+                // no division by zero allowed
-                    if (denominator == 0)
+                if (denominator == 0)
-                        break;
+                    break;
                    // if tau < a < 1 then also check the color distortion
                    if (numerator <= denominator && numerator >= c_tau * denominator)
                    {
                        float a = numerator / denominator;
-                        WorkT dD = a * mean - pix;
+                // if tau < a < 1 then also check the color distortion
                else if (numerator <= denominator && numerator >= constants->tau_ * denominator)
                {
                    const float a = numerator / denominator;
-                        if (sqr(dD) < c_Tb * gmm_variance(mode * frame.rows + y, x) * a * a)
+                    WorkT dD = a * mean - pix;
                        {
                            isShadow = true;
                            break;
                        }
                    };
-                    tWeight += gmm_weight(mode * frame.rows + y, x);
+                    if (sqr(dD) < constants->Tb_ * gmm_variance(mode * frame.rows + y, x) * a * a)
-                    if (tWeight > c_TB)
+                    {
                        isShadow = true;
                        break;
-                }
+                    }
-            }
+                };
-            fgmask(y, x) = background ? 0 : isShadow ? c_shadowVal : 255;
+                tWeight += gmm_weight(mode * frame.rows + y, x);
                if (tWeight > constants->TB_)
                    break;
            }
        }
-        template <typename SrcT, typename WorkT>
+        fgmask(y, x) = background ? 0 : isShadow ? constants->shadowVal_ : 255;
-        void mog2_caller(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean,
+    }
-                         float alphaT, float prune, bool detectShadows, cudaStream_t stream)
+}
        {
            dim3 block(32, 8);
            dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));
            const float alpha1 = 1.0f - alphaT;
-            if (detectShadows)
+template <typename SrcT, typename WorkT>
-            {
+void mog2_caller(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean,
-                cudaSafeCall( cudaFuncSetCacheConfig(mog2<true, SrcT, WorkT>, cudaFuncCachePreferL1) );
+                 float alphaT, float prune, bool detectShadows, const Constants *const constants, cudaStream_t stream)
 {
    dim3 block(32, 8);
    dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));
-                mog2<true, SrcT, WorkT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, modesUsed,
+    const float alpha1 = 1.0f - alphaT;
                                                                    weight, variance, (PtrStepSz<WorkT>) mean,
                                                                    alphaT, alpha1, prune);
            }
            else
            {
                cudaSafeCall( cudaFuncSetCacheConfig(mog2<false, SrcT, WorkT>, cudaFuncCachePreferL1) );
-                mog2<false, SrcT, WorkT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, modesUsed,
+    if (detectShadows)
-                                                                    weight, variance, (PtrStepSz<WorkT>) mean,
+    {
-                                                                    alphaT, alpha1, prune);
+        cudaSafeCall(cudaFuncSetCacheConfig(mog2<true, SrcT, WorkT>, cudaFuncCachePreferL1));
            }
-            cudaSafeCall( cudaGetLastError() );
+        mog2<true, SrcT, WorkT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>)frame, fgmask, modesUsed,
                                                            weight, variance, (PtrStepSz<WorkT>)mean,
                                                            alphaT, alpha1, prune, constants);
    }
    else
    {
        cudaSafeCall(cudaFuncSetCacheConfig(mog2<false, SrcT, WorkT>, cudaFuncCachePreferL1));
-            if (stream == 0)
+        mog2<false, SrcT, WorkT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>)frame, fgmask, modesUsed,
-                cudaSafeCall( cudaDeviceSynchronize() );
+                                                             weight, variance, (PtrStepSz<WorkT>)mean,
-        }
+                                                             alphaT, alpha1, prune, constants);
    }
-        void mog2_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean,
+    cudaSafeCall(cudaGetLastError());
                      float alphaT, float prune, bool detectShadows, cudaStream_t stream)
        {
            typedef void (*func_t)(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean, float alphaT, float prune, bool detectShadows, cudaStream_t stream);
-            static const func_t funcs[] =
+    if (stream == 0)
-            {
+        cudaSafeCall(cudaDeviceSynchronize());
-                0, mog2_caller<uchar, float>, 0, mog2_caller<uchar3, float3>, mog2_caller<uchar4, float4>
+}
            };
-            funcs[cn](frame, fgmask, modesUsed, weight, variance, mean, alphaT, prune, detectShadows, stream);
+void mog2_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean,
-        }
+              float alphaT, float prune, bool detectShadows, const Constants *const constants, cudaStream_t stream)
 {
    typedef void (*func_t)(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean, float alphaT, float prune, bool detectShadows, const Constants *const constants, cudaStream_t stream);
-        template <typename WorkT, typename OutT>
+    static const func_t funcs[] =
        __global__ void getBackgroundImage2(const PtrStepSzb modesUsed, const PtrStepf gmm_weight, const PtrStep<WorkT> gmm_mean, PtrStep<OutT> dst)
        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            0, mog2_caller<uchar, float>, 0, mog2_caller<uchar3, float3>, mog2_caller<uchar4, float4>};
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-            if (x >= modesUsed.cols || y >= modesUsed.rows)
+    funcs[cn](frame, fgmask, modesUsed, weight, variance, mean, alphaT, prune, detectShadows, constants, stream);
-                return;
+}
-            int nmodes = modesUsed(y, x);
+template <typename WorkT, typename OutT>
 __global__ void getBackgroundImage2(const PtrStepSzb modesUsed, const PtrStepf gmm_weight, const PtrStep<WorkT> gmm_mean, PtrStep<OutT> dst, const Constants *const constants)
 {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-            WorkT meanVal = VecTraits<WorkT>::all(0.0f);
+    if (x >= modesUsed.cols || y >= modesUsed.rows)
-            float totalWeight = 0.0f;
+        return;
-            for (int mode = 0; mode < nmodes; ++mode)
+    int nmodes = modesUsed(y, x);
            {
                float weight = gmm_weight(mode * modesUsed.rows + y, x);
-                WorkT mean = gmm_mean(mode * modesUsed.rows + y, x);
+    WorkT meanVal = VecTraits<WorkT>::all(0.0f);
-                meanVal = meanVal + weight * mean;
+    float totalWeight = 0.0f;
-                totalWeight += weight;
+    for (int mode = 0; mode < nmodes; ++mode)
    {
        float weight = gmm_weight(mode * modesUsed.rows + y, x);
-                if(totalWeight > c_TB)
+        WorkT mean = gmm_mean(mode * modesUsed.rows + y, x);
-                    break;
+        meanVal = meanVal + weight * mean;
            }
-            meanVal = meanVal * (1.f / totalWeight);
+        totalWeight += weight;
-            dst(y, x) = saturate_cast<OutT>(meanVal);
+        if (totalWeight > constants->TB_)
-        }
+            break;
    }
-        template <typename WorkT, typename OutT>
+    meanVal = meanVal * (1.f / totalWeight);
        void getBackgroundImage2_caller(PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(modesUsed.cols, block.x), divUp(modesUsed.rows, block.y));
-            cudaSafeCall( cudaFuncSetCacheConfig(getBackgroundImage2<WorkT, OutT>, cudaFuncCachePreferL1) );
+    dst(y, x) = saturate_cast<OutT>(meanVal);
 }
-            getBackgroundImage2<WorkT, OutT><<<grid, block, 0, stream>>>(modesUsed, weight, (PtrStepSz<WorkT>) mean, (PtrStepSz<OutT>) dst);
+template <typename WorkT, typename OutT>
-            cudaSafeCall( cudaGetLastError() );
+void getBackgroundImage2_caller(PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, const Constants *const constants, cudaStream_t stream)
 {
    dim3 block(32, 8);
    dim3 grid(divUp(modesUsed.cols, block.x), divUp(modesUsed.rows, block.y));
-            if (stream == 0)
+    cudaSafeCall(cudaFuncSetCacheConfig(getBackgroundImage2<WorkT, OutT>, cudaFuncCachePreferL1));
                cudaSafeCall( cudaDeviceSynchronize() );
        }
-        void getBackgroundImage2_gpu(int cn, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, cudaStream_t stream)
+    getBackgroundImage2<WorkT, OutT><<<grid, block, 0, stream>>>(modesUsed, weight, (PtrStepSz<WorkT>)mean, (PtrStepSz<OutT>)dst, constants);
-        {
+    cudaSafeCall(cudaGetLastError());
            typedef void (*func_t)(PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, cudaStream_t stream);
-            static const func_t funcs[] =
+    if (stream == 0)
-            {
+        cudaSafeCall(cudaDeviceSynchronize());
-                0, getBackgroundImage2_caller<float, uchar>, 0, getBackgroundImage2_caller<float3, uchar3>, getBackgroundImage2_caller<float4, uchar4>
+}
            };
-            funcs[cn](modesUsed, weight, mean, dst, stream);
+void getBackgroundImage2_gpu(int cn, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, const Constants *const constants, cudaStream_t stream)
-        }
+{
-    }
+    typedef void (*func_t)(PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, const Constants *const constants, cudaStream_t stream);
 }}}
    static const func_t funcs[] =
        {
            0, getBackgroundImage2_caller<float, uchar>, 0, getBackgroundImage2_caller<float3, uchar3>, getBackgroundImage2_caller<float4, uchar4>};
    funcs[cn](modesUsed, weight, mean, dst, constants, stream);
 }
 } // namespace mog2
 } // namespace device
 } // namespace cuda
 } // namespace cv
 #endif /* CUDA_DISABLER */
--- a/modules/cudabgsegm/src/cuda/mog2.hpp
+++ b/modules/cudabgsegm/src/cuda/mog2.hpp
@ -0,0 +1,37 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 #ifndef OPENCV_CUDA_MOG2_H
 #define OPENCV_CUDA_MOG2_H
 #include "opencv2/core/cuda.hpp"
 struct CUstream_st;
 typedef struct CUstream_st *cudaStream_t;
 namespace cv { namespace cuda {
 class Stream;
 namespace device { namespace mog2 {
 typedef struct
 {
    float Tb_;
    float TB_;
    float Tg_;
    float varInit_;
    float varMin_;
    float varMax_;
    float tau_;
    int nmixtures_;
    unsigned char shadowVal_;
 } Constants;
 void mog2_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean, float alphaT, float prune, bool detectShadows, const Constants *const constants, cudaStream_t stream);
 void getBackgroundImage2_gpu(int cn, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, const Constants *const constants, cudaStream_t stream);
 } } } }
 #endif /* OPENCV_CUDA_MOG2_H */
--- a/modules/cudabgsegm/src/mog2.cpp
+++ b/modules/cudabgsegm/src/mog2.cpp
@ -41,209 +41,207 @@
 //M*/
 #include "precomp.hpp"
 #include "cuda/mog2.hpp"
 using namespace cv;
 using namespace cv::cuda;
 using namespace cv::cuda::device::mog2;
 #if !defined HAVE_CUDA || defined(CUDA_DISABLER)
-Ptr<cuda::BackgroundSubtractorMOG2> cv::cuda::createBackgroundSubtractorMOG2(int, double, bool) { throw_no_cuda(); return Ptr<cuda::BackgroundSubtractorMOG2>(); }
+Ptr<cuda::BackgroundSubtractorMOG2> cv::cuda::createBackgroundSubtractorMOG2(int, double, bool)
 {
    throw_no_cuda();
    return Ptr<cuda::BackgroundSubtractorMOG2>();
 }
 #else
-namespace cv { namespace cuda { namespace device
+namespace
 {
-    namespace mog2
+// default parameters of gaussian background detection algorithm
-    {
+const int defaultHistory = 500; // Learning rate; alpha = 1/defaultHistory2
-        void loadConstants(int nmixtures, float Tb, float TB, float Tg, float varInit, float varMin, float varMax, float tau, unsigned char shadowVal);
+const float defaultVarThreshold = 4.0f * 4.0f;
-        void mog2_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean, float alphaT, float prune, bool detectShadows, cudaStream_t stream);
+const int defaultNMixtures = 5;            // maximal number of Gaussians in mixture
-        void getBackgroundImage2_gpu(int cn, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, cudaStream_t stream);
+const float defaultBackgroundRatio = 0.9f; // threshold sum of weights for background test
-    }
+const float defaultVarThresholdGen = 3.0f * 3.0f;
-}}}
+const float defaultVarInit = 15.0f; // initial variance for new components
 const float defaultVarMax = 5.0f * defaultVarInit;
 const float defaultVarMin = 4.0f;
 // additional parameters
 const float defaultCT = 0.05f;                // complexity reduction prior constant 0 - no reduction of number of components
 const unsigned char defaultShadowValue = 127; // value to use in the segmentation mask for shadows, set 0 not to do shadow detection
 const float defaultShadowThreshold = 0.5f;    // Tau - shadow threshold, see the paper for explanation
 class MOG2Impl CV_FINAL : public cuda::BackgroundSubtractorMOG2
 {
 public:
    MOG2Impl(int history, double varThreshold, bool detectShadows);
    ~MOG2Impl();
-namespace
+    void apply(InputArray image, OutputArray fgmask, double learningRate = -1) CV_OVERRIDE;
    void apply(InputArray image, OutputArray fgmask, double learningRate, Stream &stream) CV_OVERRIDE;
    void getBackgroundImage(OutputArray backgroundImage) const CV_OVERRIDE;
    void getBackgroundImage(OutputArray backgroundImage, Stream &stream) const CV_OVERRIDE;
    int getHistory() const CV_OVERRIDE { return history_; }
    void setHistory(int history) CV_OVERRIDE { history_ = history; }
    int getNMixtures() const CV_OVERRIDE { return constantsHost_.nmixtures_; }
    void setNMixtures(int nmixtures) CV_OVERRIDE { constantsHost_.nmixtures_ = nmixtures; }
    double getBackgroundRatio() const CV_OVERRIDE { return constantsHost_.TB_; }
    void setBackgroundRatio(double ratio) CV_OVERRIDE { constantsHost_.TB_ = (float)ratio; }
    double getVarThreshold() const CV_OVERRIDE { return constantsHost_.Tb_; }
    void setVarThreshold(double varThreshold) CV_OVERRIDE { constantsHost_.Tb_ = (float)varThreshold; }
    double getVarThresholdGen() const CV_OVERRIDE { return constantsHost_.Tg_; }
    void setVarThresholdGen(double varThresholdGen) CV_OVERRIDE { constantsHost_.Tg_ = (float)varThresholdGen; }
    double getVarInit() const CV_OVERRIDE { return constantsHost_.varInit_; }
    void setVarInit(double varInit) CV_OVERRIDE { constantsHost_.varInit_ = (float)varInit; }
    double getVarMin() const CV_OVERRIDE { return constantsHost_.varMin_; }
    void setVarMin(double varMin) CV_OVERRIDE { constantsHost_.varMin_ = ::fminf((float)varMin, constantsHost_.varMax_); }
    double getVarMax() const CV_OVERRIDE { return constantsHost_.varMax_; }
    void setVarMax(double varMax) CV_OVERRIDE { constantsHost_.varMax_ = ::fmaxf(constantsHost_.varMin_, (float)varMax); }
    double getComplexityReductionThreshold() const CV_OVERRIDE { return ct_; }
    void setComplexityReductionThreshold(double ct) CV_OVERRIDE { ct_ = (float)ct; }
    bool getDetectShadows() const CV_OVERRIDE { return detectShadows_; }
    void setDetectShadows(bool detectShadows) CV_OVERRIDE { detectShadows_ = detectShadows; }
    int getShadowValue() const CV_OVERRIDE { return constantsHost_.shadowVal_; }
    void setShadowValue(int value) CV_OVERRIDE { constantsHost_.shadowVal_ = (uchar)value; }
    double getShadowThreshold() const CV_OVERRIDE { return constantsHost_.tau_; }
    void setShadowThreshold(double threshold) CV_OVERRIDE { constantsHost_.tau_ = (float)threshold; }
 private:
    void initialize(Size frameSize, int frameType, Stream &stream);
    Constants constantsHost_;
    Constants *constantsDevice_;
    int history_;
    float ct_;
    bool detectShadows_;
    Size frameSize_;
    int frameType_;
    int nframes_;
    GpuMat weight_;
    GpuMat variance_;
    GpuMat mean_;
    //keep track of number of modes per pixel
    GpuMat bgmodelUsedModes_;
 };
 MOG2Impl::MOG2Impl(int history, double varThreshold, bool detectShadows) : frameSize_(0, 0), frameType_(0), nframes_(0)
 {
    history_ = history > 0 ? history : defaultHistory;
    detectShadows_ = detectShadows;
    ct_ = defaultCT;
    setNMixtures(defaultNMixtures);
    setBackgroundRatio(defaultBackgroundRatio);
    setVarInit(defaultVarInit);
    setVarMin(defaultVarMin);
    setVarMax(defaultVarMax);
    setVarThreshold(varThreshold > 0 ? (float)varThreshold : defaultVarThreshold);
    setVarThresholdGen(defaultVarThresholdGen);
    setShadowValue(defaultShadowValue);
    setShadowThreshold(defaultShadowThreshold);
    cudaSafeCall(cudaMalloc((void **)&constantsDevice_, sizeof(Constants)));
 }
 MOG2Impl::~MOG2Impl()
 {
    cudaFree(constantsDevice_);
 }
 void MOG2Impl::apply(InputArray image, OutputArray fgmask, double learningRate)
 {
    apply(image, fgmask, learningRate, Stream::Null());
 }
 void MOG2Impl::apply(InputArray _frame, OutputArray _fgmask, double learningRate, Stream &stream)
 {
-    // default parameters of gaussian background detection algorithm
+    using namespace cv::cuda::device::mog2;
-    const int defaultHistory = 500; // Learning rate; alpha = 1/defaultHistory2
+
-    const float defaultVarThreshold = 4.0f * 4.0f;
+    GpuMat frame = _frame.getGpuMat();
    const int defaultNMixtures = 5; // maximal number of Gaussians in mixture
    const float defaultBackgroundRatio = 0.9f; // threshold sum of weights for background test
    const float defaultVarThresholdGen = 3.0f * 3.0f;
    const float defaultVarInit = 15.0f; // initial variance for new components
    const float defaultVarMax = 5.0f * defaultVarInit;
    const float defaultVarMin = 4.0f;
    // additional parameters
    const float defaultCT = 0.05f; // complexity reduction prior constant 0 - no reduction of number of components
    const unsigned char defaultShadowValue = 127; // value to use in the segmentation mask for shadows, set 0 not to do shadow detection
    const float defaultShadowThreshold = 0.5f; // Tau - shadow threshold, see the paper for explanation
    class MOG2Impl CV_FINAL : public cuda::BackgroundSubtractorMOG2
    {
    public:
        MOG2Impl(int history, double varThreshold, bool detectShadows);
        void apply(InputArray image, OutputArray fgmask, double learningRate=-1) CV_OVERRIDE;
        void apply(InputArray image, OutputArray fgmask, double learningRate, Stream& stream) CV_OVERRIDE;
        void getBackgroundImage(OutputArray backgroundImage) const CV_OVERRIDE;
        void getBackgroundImage(OutputArray backgroundImage, Stream& stream) const CV_OVERRIDE;
        int getHistory() const CV_OVERRIDE { return history_; }
        void setHistory(int history) CV_OVERRIDE { history_ = history; }
        int getNMixtures() const CV_OVERRIDE { return nmixtures_; }
        void setNMixtures(int nmixtures) CV_OVERRIDE { nmixtures_ = nmixtures; }
        double getBackgroundRatio() const CV_OVERRIDE { return backgroundRatio_; }
        void setBackgroundRatio(double ratio) CV_OVERRIDE { backgroundRatio_ = (float) ratio; }
        double getVarThreshold() const CV_OVERRIDE { return varThreshold_; }
        void setVarThreshold(double varThreshold) CV_OVERRIDE { varThreshold_ = (float) varThreshold; }
        double getVarThresholdGen() const CV_OVERRIDE { return varThresholdGen_; }
        void setVarThresholdGen(double varThresholdGen) CV_OVERRIDE { varThresholdGen_ = (float) varThresholdGen; }
        double getVarInit() const CV_OVERRIDE { return varInit_; }
        void setVarInit(double varInit) CV_OVERRIDE { varInit_ = (float) varInit; }
        double getVarMin() const CV_OVERRIDE { return varMin_; }
        void setVarMin(double varMin) CV_OVERRIDE { varMin_ = (float) varMin; }
        double getVarMax() const CV_OVERRIDE { return varMax_; }
        void setVarMax(double varMax) CV_OVERRIDE { varMax_ = (float) varMax; }
        double getComplexityReductionThreshold() const CV_OVERRIDE { return ct_; }
        void setComplexityReductionThreshold(double ct) CV_OVERRIDE { ct_ = (float) ct; }
        bool getDetectShadows() const CV_OVERRIDE { return detectShadows_; }
        void setDetectShadows(bool detectShadows) CV_OVERRIDE { detectShadows_ = detectShadows; }
        int getShadowValue() const CV_OVERRIDE { return shadowValue_; }
        void setShadowValue(int value) CV_OVERRIDE { shadowValue_ = (uchar) value; }
-        double getShadowThreshold() const CV_OVERRIDE { return shadowThreshold_; }
+    int ch = frame.channels();
-        void setShadowThreshold(double threshold) CV_OVERRIDE { shadowThreshold_ = (float) threshold; }
+    int work_ch = ch;
    private:
        void initialize(Size frameSize, int frameType);
        int history_;
        int nmixtures_;
        float backgroundRatio_;
        float varThreshold_;
        float varThresholdGen_;
        float varInit_;
        float varMin_;
        float varMax_;
        float ct_;
        bool detectShadows_;
        uchar shadowValue_;
        float shadowThreshold_;
        Size frameSize_;
        int frameType_;
        int nframes_;
        GpuMat weight_;
        GpuMat variance_;
        GpuMat mean_;
        //keep track of number of modes per pixel
        GpuMat bgmodelUsedModes_;
    };
    MOG2Impl::MOG2Impl(int history, double varThreshold, bool detectShadows) :
        frameSize_(0, 0), frameType_(0), nframes_(0)
    {
        history_ = history > 0 ? history : defaultHistory;
        varThreshold_ = varThreshold > 0 ? (float) varThreshold : defaultVarThreshold;
        detectShadows_ = detectShadows;
        nmixtures_ = defaultNMixtures;
        backgroundRatio_ = defaultBackgroundRatio;
        varInit_ = defaultVarInit;
        varMax_ = defaultVarMax;
        varMin_ = defaultVarMin;
        varThresholdGen_ = defaultVarThresholdGen;
        ct_ = defaultCT;
        shadowValue_ =  defaultShadowValue;
        shadowThreshold_ = defaultShadowThreshold;
    }
-    void MOG2Impl::apply(InputArray image, OutputArray fgmask, double learningRate)
+    if (nframes_ == 0 || learningRate >= 1.0 || frame.size() != frameSize_ || work_ch != mean_.channels())
-    {
+        initialize(frame.size(), frame.type(), stream);
        apply(image, fgmask, learningRate, Stream::Null());
    }
-    void MOG2Impl::apply(InputArray _frame, OutputArray _fgmask, double learningRate, Stream& stream)
+    _fgmask.create(frameSize_, CV_8UC1);
-    {
+    GpuMat fgmask = _fgmask.getGpuMat();
        using namespace cv::cuda::device::mog2;
-        GpuMat frame = _frame.getGpuMat();
+    fgmask.setTo(Scalar::all(0), stream);
-        int ch = frame.channels();
+    ++nframes_;
-        int work_ch = ch;
+    learningRate = learningRate >= 0 && nframes_ > 1 ? learningRate : 1.0 / std::min(2 * nframes_, history_);
    CV_Assert(learningRate >= 0);
-        if (nframes_ == 0 || learningRate >= 1.0 || frame.size() != frameSize_ || work_ch != mean_.channels())
+    mog2_gpu(frame, frame.channels(), fgmask, bgmodelUsedModes_, weight_, variance_, mean_,
-            initialize(frame.size(), frame.type());
+             (float)learningRate, static_cast<float>(-learningRate * ct_), detectShadows_, constantsDevice_, StreamAccessor::getStream(stream));
 }
 void MOG2Impl::getBackgroundImage(OutputArray backgroundImage) const
 {
    getBackgroundImage(backgroundImage, Stream::Null());
 }
 void MOG2Impl::getBackgroundImage(OutputArray _backgroundImage, Stream &stream) const
 {
    using namespace cv::cuda::device::mog2;
    _backgroundImage.create(frameSize_, frameType_);
    GpuMat backgroundImage = _backgroundImage.getGpuMat();
    getBackgroundImage2_gpu(backgroundImage.channels(), bgmodelUsedModes_, weight_, mean_, backgroundImage, constantsDevice_, StreamAccessor::getStream(stream));
 }
 void MOG2Impl::initialize(cv::Size frameSize, int frameType, Stream &stream)
 {
    using namespace cv::cuda::device::mog2;
-        _fgmask.create(frameSize_, CV_8UC1);
+    CV_Assert(frameType == CV_8UC1 || frameType == CV_8UC3 || frameType == CV_8UC4);
        GpuMat fgmask = _fgmask.getGpuMat();
-        fgmask.setTo(Scalar::all(0), stream);
+    frameSize_ = frameSize;
    frameType_ = frameType;
    nframes_ = 0;
-        ++nframes_;
+    const int ch = CV_MAT_CN(frameType);
-        learningRate = learningRate >= 0 && nframes_ > 1 ? learningRate : 1.0 / std::min(2 * nframes_, history_);
+    const int work_ch = ch;
        CV_Assert( learningRate >= 0 );
-        mog2_gpu(frame, frame.channels(), fgmask, bgmodelUsedModes_, weight_, variance_, mean_,
+    // for each gaussian mixture of each pixel bg model we store ...
-                 (float) learningRate, static_cast<float>(-learningRate * ct_), detectShadows_, StreamAccessor::getStream(stream));
+    // the mixture weight (w),
-    }
+    // the mean (nchannels values) and
    // the covariance
    weight_.create(frameSize.height * getNMixtures(), frameSize_.width, CV_32FC1);
    variance_.create(frameSize.height * getNMixtures(), frameSize_.width, CV_32FC1);
    mean_.create(frameSize.height * getNMixtures(), frameSize_.width, CV_32FC(work_ch));
-    void MOG2Impl::getBackgroundImage(OutputArray backgroundImage) const
+    //make the array for keeping track of the used modes per pixel - all zeros at start
-    {
+    bgmodelUsedModes_.create(frameSize_, CV_8UC1);
-        getBackgroundImage(backgroundImage, Stream::Null());
+    bgmodelUsedModes_.setTo(Scalar::all(0));
    }
    void MOG2Impl::getBackgroundImage(OutputArray _backgroundImage, Stream& stream) const
    {
        using namespace cv::cuda::device::mog2;
        _backgroundImage.create(frameSize_, frameType_);
        GpuMat backgroundImage = _backgroundImage.getGpuMat();
        getBackgroundImage2_gpu(backgroundImage.channels(), bgmodelUsedModes_, weight_, mean_, backgroundImage, StreamAccessor::getStream(stream));
    }
    void MOG2Impl::initialize(cv::Size frameSize, int frameType)
    {
        using namespace cv::cuda::device::mog2;
        CV_Assert( frameType == CV_8UC1 || frameType == CV_8UC3 || frameType == CV_8UC4 );
        frameSize_ = frameSize;
        frameType_ = frameType;
        nframes_ = 0;
        int ch = CV_MAT_CN(frameType);
        int work_ch = ch;
        // for each gaussian mixture of each pixel bg model we store ...
        // the mixture weight (w),
        // the mean (nchannels values) and
        // the covariance
        weight_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC1);
        variance_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC1);
        mean_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC(work_ch));
        //make the array for keeping track of the used modes per pixel - all zeros at start
        bgmodelUsedModes_.create(frameSize_, CV_8UC1);
        bgmodelUsedModes_.setTo(Scalar::all(0));
-        loadConstants(nmixtures_, varThreshold_, backgroundRatio_, varThresholdGen_, varInit_, varMin_, varMax_, shadowThreshold_, shadowValue_);
+    cudaSafeCall(cudaMemcpyAsync(constantsDevice_, &constantsHost_, sizeof(Constants), cudaMemcpyHostToDevice, StreamAccessor::getStream(stream)));
    }
 }
 } // namespace
 Ptr<cuda::BackgroundSubtractorMOG2> cv::cuda::createBackgroundSubtractorMOG2(int history, double varThreshold, bool detectShadows)
 {