diff --git a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
index e0c766a3b6..2c6f65727e 100644
--- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
+++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
@@ -59,6 +59,7 @@
 #include
 #include "NCV.hpp"
+#include "NCVAlg.hpp"
 #include "NPP_staging/NPP_staging.hpp"
 #include "NCVRuntimeTemplates.hpp"
 #include "NCVHaarObjectDetection.hpp"

@@ -84,11 +85,6 @@ inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
     pos += K_WARP_SIZE;
     s_Data[pos] = idata;

-    //for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)
-    //{
-    //    s_Data[pos] += s_Data[pos - offset];
-    //}
-
     s_Data[pos] += s_Data[pos - 1];
     s_Data[pos] += s_Data[pos - 2];
     s_Data[pos] += s_Data[pos - 4];
@@ -234,60 +230,6 @@ __device__ Ncv32u getElemIImg(Ncv32u x, Ncv32u *d_IImg)
 }


-__device__ Ncv32f reduceSpecialization(Ncv32f partialSum)
-{
-    __shared__ volatile Ncv32f reductor[NUM_THREADS_CLASSIFIERPARALLEL];
-    reductor[threadIdx.x] = partialSum;
-    __syncthreads();
-
-#if defined CPU_FP_COMPLIANCE
-    if (!threadIdx.x)
-    {
-        Ncv32f sum = 0.0f;
-        for (int i=0; i<NUM_THREADS_CLASSIFIERPARALLEL; i++)
-        {
-            sum += reductor[i];
-        }
-        reductor[0] = sum;
-    }
-#else
-#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 8
-    if (threadIdx.x < 128)
-    {
-        reductor[threadIdx.x] += reductor[threadIdx.x + 128];
-    }
-    __syncthreads();
-#endif
-#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 7
-    if (threadIdx.x < 64)
-    {
-        reductor[threadIdx.x] += reductor[threadIdx.x + 64];
-    }
-    __syncthreads();
-#endif
-
-    if (threadIdx.x < 32)
-    {
-#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 6
-        reductor[threadIdx.x] += reductor[threadIdx.x + 32];
-#endif
-#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 5
-        reductor[threadIdx.x] += reductor[threadIdx.x + 16];
-#endif
-        reductor[threadIdx.x] += reductor[threadIdx.x + 8];
-        reductor[threadIdx.x] += reductor[threadIdx.x + 4];
-        reductor[threadIdx.x] += reductor[threadIdx.x + 2];
-        reductor[threadIdx.x] += reductor[threadIdx.x + 1];
-    }
-#endif
-
-    __syncthreads();
-
-    return reductor[0];
-}
-
-
 __device__ Ncv32u d_outMaskPosition;
@@ -623,7 +565,14 @@ __global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IIm
         curRootNodeOffset += NUM_THREADS_CLASSIFIERPARALLEL;
     }

-    Ncv32f finalStageSum = reduceSpecialization(curStageSum);
+    struct functorAddValues
+    {
+        __device__ void reduce(Ncv32f &in1out, Ncv32f &in2)
+        {
+            in1out += in2;
+        }
+    };
+    Ncv32f finalStageSum = subReduce<Ncv32f, functorAddValues, NUM_THREADS_CLASSIFIERPARALLEL>(curStageSum);

     if (finalStageSum < stageThreshold)
     {
diff --git a/modules/gpu/src/nvidia/core/NCVAlg.hpp b/modules/gpu/src/nvidia/core/NCVAlg.hpp
new file mode 100644
index 0000000000..0dda7c1dcc
--- /dev/null
+++ b/modules/gpu/src/nvidia/core/NCVAlg.hpp
@@ -0,0 +1,97 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef _ncv_alg_hpp_
+#define _ncv_alg_hpp_
+
+#include "NCV.hpp"
+
+
+template <class T>
+static T divUp(T a, T b)
+{
+    return (a + b - 1) / b;
+}
+
+
+template <typename Tdata, class Tfunc, Ncv32u nThreads>
+static __device__ Tdata subReduce(Tdata threadElem)
+{
+    Tfunc functor;
+
+    __shared__ Tdata reduceArr[nThreads];
+    reduceArr[threadIdx.x] = threadElem;
+    __syncthreads();
+
+    if (nThreads >= 256 && threadIdx.x < 128)
+    {
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 128]);
+    }
+    __syncthreads();
+
+    if (nThreads >= 128 && threadIdx.x < 64)
+    {
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 64]);
+    }
+    __syncthreads();
+
+    if (threadIdx.x < 32)
+    {
+        if (nThreads >= 64)
+        {
+            functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 32]);
+        }
+        if (nThreads >= 32)
+        {
+            functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 16]);
+        }
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 8]);
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 4]);
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 2]);
+        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 1]);
+    }
+
+    __syncthreads();
+    return reduceArr[0];
+}
+
+
+#endif //_ncv_alg_hpp_
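
Usage note: a minimal sketch of how the new subReduce helper is invoked from a kernel. The kernel name, functor name, and the 256-thread block size below are illustrative assumptions; the patch's own call site uses functorAddValues with NUM_THREADS_CLASSIFIERPARALLEL inside applyHaarClassifierClassifierParallel.

// Illustrative only: block-wide sum of one float per thread via subReduce
// from NCVAlg.hpp. Assumes the kernel is launched with 256 threads per block.
#include "NCVAlg.hpp"

struct functorAdd
{
    __device__ void reduce(Ncv32f &in1out, Ncv32f &in2)
    {
        in1out += in2;   // combine the partner thread's partial sum
    }
};

__global__ void blockSumKernel(const Ncv32f *d_in, Ncv32f *d_out)
{
    // Each thread contributes one element; subReduce folds them in shared memory.
    Ncv32f myVal = d_in[blockIdx.x * blockDim.x + threadIdx.x];
    Ncv32f blockSum = subReduce<Ncv32f, functorAdd, 256>(myVal);

    if (threadIdx.x == 0)
    {
        d_out[blockIdx.x] = blockSum;   // thread 0 writes the block's total
    }
}

The third template argument must match the launch block size, since subReduce allocates shared memory for nThreads lanes and synchronizes the whole block.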