diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp index 6bef7fbe68..0af5fa252f 100644 --- a/modules/gpu/src/cascadeclassifier.cpp +++ b/modules/gpu/src/cascadeclassifier.cpp @@ -129,7 +129,7 @@ struct cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl private: - static void NCVDebugOutputHandler(const char* msg) { CV_Error(CV_GpuApiCallError, msg); } + static void NCVDebugOutputHandler(const std::string &msg) { CV_Error(CV_GpuApiCallError, msg.c_str()); } NCVStatus load(const string& classifierFile) diff --git a/modules/gpu/src/nvidia/core/NCV.cu b/modules/gpu/src/nvidia/core/NCV.cu index 6b55740aba..a5af00a8ea 100644 --- a/modules/gpu/src/nvidia/core/NCV.cu +++ b/modules/gpu/src/nvidia/core/NCV.cu @@ -40,10 +40,9 @@ //M*/ -#include -#include +#include +#include #include -#include #include "NCV.hpp" using namespace std; @@ -56,24 +55,18 @@ using namespace std; //============================================================================== -static void stdioDebugOutput(const char *msg) +static void stdDebugOutput(const string &msg) { - printf("%s", msg); + cout << msg; } -static NCVDebugOutputHandler *debugOutputHandler = stdioDebugOutput; +static NCVDebugOutputHandler *debugOutputHandler = stdDebugOutput; -void ncvDebugOutput(const char *msg, ...) 
+void ncvDebugOutput(const string &msg) { - const int K_DEBUG_STRING_MAXLEN = 1024; - char buffer[K_DEBUG_STRING_MAXLEN]; - va_list args; - va_start(args, msg); - vsnprintf(buffer, K_DEBUG_STRING_MAXLEN, msg, args); - va_end (args); - debugOutputHandler(buffer); + debugOutputHandler(msg); } @@ -288,7 +281,7 @@ NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, allocBegin = NULL; - if (reusePtr == NULL) + if (reusePtr == NULL && capacity != 0) { bReusesMemory = false; switch (memT) @@ -329,7 +322,7 @@ NCVMemStackAllocator::~NCVMemStackAllocator() { ncvAssertPrintCheck(currentSize == 0, "NCVMemStackAllocator dtor:: not all objects were deallocated properly, forcing destruction"); - if (!bReusesMemory) + if (!bReusesMemory && (allocBegin != (Ncv8u *)(0x1))) { switch (_memType) { @@ -355,7 +348,7 @@ NCVStatus NCVMemStackAllocator::alloc(NCVMemSegment &seg, size_t size) seg.clear(); ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC); - size = alignUp(static_cast(size), this->_alignment); + size = alignUp(size, this->_alignment); this->currentSize += size; this->_maxSize = std::max(this->_maxSize, this->currentSize); @@ -464,7 +457,7 @@ NCVStatus NCVMemNativeAllocator::alloc(NCVMemSegment &seg, size_t size) break; } - this->currentSize += alignUp(static_cast(size), this->_alignment); + this->currentSize += alignUp(size, this->_alignment); this->_maxSize = std::max(this->_maxSize, this->currentSize); seg.begin.memtype = this->_memType; @@ -480,8 +473,8 @@ NCVStatus NCVMemNativeAllocator::dealloc(NCVMemSegment &seg) ncvAssertReturn(seg.begin.memtype == this->_memType, NCV_ALLOCATOR_BAD_DEALLOC); ncvAssertReturn(seg.begin.ptr != NULL, NCV_ALLOCATOR_BAD_DEALLOC); - ncvAssertReturn(currentSize >= alignUp(static_cast(seg.size), this->_alignment), NCV_ALLOCATOR_BAD_DEALLOC); - currentSize -= alignUp(static_cast(seg.size), this->_alignment); + ncvAssertReturn(currentSize >= alignUp(seg.size, this->_alignment), NCV_ALLOCATOR_BAD_DEALLOC); + 
currentSize -= alignUp(seg.size, this->_alignment); switch (this->_memType) { diff --git a/modules/gpu/src/nvidia/core/NCV.hpp b/modules/gpu/src/nvidia/core/NCV.hpp index f310e14c4b..530aa0b8e8 100644 --- a/modules/gpu/src/nvidia/core/NCV.hpp +++ b/modules/gpu/src/nvidia/core/NCV.hpp @@ -42,7 +42,7 @@ #ifndef _ncv_hpp_ #define _ncv_hpp_ -#if (defined WIN32 || defined _WIN32 || defined WINCE) && defined CVAPI_EXPORTS //&& !defined(__CUDACC__) +#if (defined WIN32 || defined _WIN32 || defined WINCE) && defined CVAPI_EXPORTS #define NCV_EXPORTS __declspec(dllexport) #else #define NCV_EXPORTS @@ -53,6 +53,8 @@ #endif #include +#include +#include //============================================================================== @@ -78,7 +80,7 @@ namespace NcvCTprep } -#define NCV_CT_PREP_PASTE_AUX(a,b) a##b ///< Concatenation indirection macro +#define NCV_CT_PREP_PASTE_AUX(a,b) a##b ///< Concatenation indirection macro #define NCV_CT_PREP_PASTE(a,b) NCV_CT_PREP_PASTE_AUX(a, b) ///< Concatenation macro @@ -181,6 +183,25 @@ struct NcvSize32u Ncv32u height; ///< Rectangle height. __host__ __device__ NcvSize32u() : width(0), height(0) {}; __host__ __device__ NcvSize32u(Ncv32u width, Ncv32u height) : width(width), height(height) {} + __host__ __device__ bool operator == (const NcvSize32u &another) const {return this->width == another.width && this->height == another.height;} +}; + + +struct NcvPoint2D32s +{ + Ncv32s x; ///< Point X. + Ncv32s y; ///< Point Y. + __host__ __device__ NcvPoint2D32s() : x(0), y(0) {}; + __host__ __device__ NcvPoint2D32s(Ncv32s x, Ncv32s y) : x(x), y(y) {} +}; + + +struct NcvPoint2D32u +{ + Ncv32u x; ///< Point X. + Ncv32u y; ///< Point Y. 
+ __host__ __device__ NcvPoint2D32u() : x(0), y(0) {}; + __host__ __device__ NcvPoint2D32u(Ncv32u x, Ncv32u y) : x(x), y(y) {} }; @@ -199,6 +220,7 @@ NCV_CT_ASSERT(sizeof(NcvRect8u) == sizeof(Ncv32u)); NCV_CT_ASSERT(sizeof(NcvRect32s) == 4 * sizeof(Ncv32s)); NCV_CT_ASSERT(sizeof(NcvRect32u) == 4 * sizeof(Ncv32u)); NCV_CT_ASSERT(sizeof(NcvSize32u) == 2 * sizeof(Ncv32u)); +NCV_CT_ASSERT(sizeof(NcvPoint2D32u) == 2 * sizeof(Ncv32u)); //============================================================================== @@ -219,49 +241,44 @@ const Ncv32u K_LOG2_WARP_SIZE = 5; //============================================================================== -#define NCV_CT_PREP_STRINGIZE_AUX(x) #x -#define NCV_CT_PREP_STRINGIZE(x) NCV_CT_PREP_STRINGIZE_AUX(x) - +NCV_EXPORTS void ncvDebugOutput(const std::string &msg); -NCV_EXPORTS void ncvDebugOutput(const char *msg, ...); - -typedef void NCVDebugOutputHandler(const char* msg); +typedef void NCVDebugOutputHandler(const std::string &msg); NCV_EXPORTS void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func); #define ncvAssertPrintCheck(pred, msg) \ - ((pred) ? 
true : (ncvDebugOutput("\n%s\n", \ - "NCV Assertion Failed: " msg ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__) \ - ), false)) - - -#define ncvAssertPrintReturn(pred, msg, err) \ - if (ncvAssertPrintCheck(pred, msg)) ; else return err - - -#define ncvAssertReturn(pred, err) \ do \ { \ if (!(pred)) \ { \ - ncvDebugOutput("\n%s%d%s\n", "NCV Assertion Failed: retcode=", (int)err, ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__)); \ - return err; \ + std::ostringstream oss; \ + oss << "NCV Assertion Failed: " << msg << ", file=" << __FILE__ << ", line=" << __LINE__ << std::endl; \ + ncvDebugOutput(oss.str()); \ } \ } while (0) +#define ncvAssertPrintReturn(pred, msg, err) \ + do \ + { \ + ncvAssertPrintCheck(pred, msg); \ + if (!(pred)) return err; \ + } while (0) + + +#define ncvAssertReturn(pred, err) \ + ncvAssertPrintReturn(pred, "retcode=" << (int)err, err) + + #define ncvAssertReturnNcvStat(ncvOp) \ do \ { \ NCVStatus _ncvStat = ncvOp; \ - if (NCV_SUCCESS != _ncvStat) \ - { \ - ncvDebugOutput("\n%s%d%s\n", "NCV Assertion Failed: NcvStat=", (int)_ncvStat, ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__)); \ - return _ncvStat; \ - } \ + ncvAssertPrintReturn(NCV_SUCCESS==_ncvStat, "NcvStat=" << (int)_ncvStat, _ncvStat); \ } while (0) @@ -270,18 +287,14 @@ NCV_EXPORTS void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func); { \ cudaError_t resCall = cudacall; \ cudaError_t resGLE = cudaGetLastError(); \ - if (cudaSuccess != resCall || cudaSuccess != resGLE) \ - { \ - ncvDebugOutput("\n%s%d%s\n", "NCV CUDA Assertion Failed: cudaError_t=", (int)(resCall | resGLE), ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__)); \ - return errCode; \ - } \ + ncvAssertPrintReturn(cudaSuccess==resCall && cudaSuccess==resGLE, "cudaError_t=" << (int)(resCall | resGLE), errCode); \ } while (0) /** * Return-codes for status notification, errors and warnings */ -enum NCVStatus +enum { //NCV statuses NCV_SUCCESS, @@ -338,9 +351,14 @@ enum 
NCVStatus NPPST_MEM_INSUFFICIENT_BUFFER, ///< Insufficient user-allocated buffer NPPST_MEM_RESIDENCE_ERROR, ///< Memory residence error detected (check if pointers should be device or pinned) NPPST_MEM_INTERNAL_ERROR, ///< Internal memory management error + + NCV_LAST_STATUS ///< Marker to continue error numeration in other files }; +typedef Ncv32u NCVStatus; + + #define NCV_SET_SKIP_COND(x) \ bool __ncv_skip_cond = x @@ -774,9 +792,20 @@ public: return ncvStat; } + T &at(Ncv32u x, Ncv32u y) const + { + if (x >= this->_width || y >= this->_height) + { + printf("Error addressing matrix at [%d, %d]\n", x, y); + return *this->_ptr; + } + return ((T *)((Ncv8u *)this->_ptr + y * this->_pitch))[x]; + } + T *ptr() const {return this->_ptr;} Ncv32u width() const {return this->_width;} Ncv32u height() const {return this->_height;} + NcvSize32u size() const {return NcvSize32u(this->_width, this->_height);} Ncv32u pitch() const {return this->_pitch;} NCVMemoryType memType() const {return this->_memtype;} @@ -923,7 +952,7 @@ public: this->_width = roi.width; this->_height = roi.height; this->_pitch = mat.pitch(); - this->_ptr = mat.ptr() + roi.y * mat.stride() + roi.x; + this->_ptr = &mat.at(roi.x, roi.y); this->_memtype = mat.memType(); this->bReused = true; @@ -962,4 +991,24 @@ NCV_EXPORTS NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst, Ncv32u dstStride, Ncv NCV_EXPORTS NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight, NcvRect32u *d_rects, Ncv32u numRects, Ncv32u color, cudaStream_t cuStream); + +#define CLAMP(x,a,b) ( (x) > (b) ? (b) : ( (x) < (a) ? (a) : (x) ) ) +#define CLAMP_TOP(x, a) (((x) > (a)) ? (a) : (x)) +#define CLAMP_BOTTOM(x, a) (((x) < (a)) ? 
(a) : (x)) +#define CLAMP_0_255(x) CLAMP(x,0,255) + + +#define SUB_BEGIN(type, name) struct { __inline type name +#define SUB_END(name) } name; +#define SUB_CALL(name) name.name + +#define SQR(x) ((x)*(x)) + + +#define ncvSafeMatAlloc(name, type, alloc, width, height, err) \ + NCVMatrixAlloc name(alloc, width, height); \ + ncvAssertReturn(name.isMemAllocated(), err); + + + #endif // _ncv_hpp_ diff --git a/modules/gpu/src/nvidia/core/NCVColorConversion.hpp b/modules/gpu/src/nvidia/core/NCVColorConversion.hpp new file mode 100644 index 0000000000..e7b4afbd1b --- /dev/null +++ b/modules/gpu/src/nvidia/core/NCVColorConversion.hpp @@ -0,0 +1,96 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. 
+// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef _ncv_color_conversion_hpp_ +#define _ncv_color_conversion_hpp_ + +#include "NCVPixelOperations.hpp" + +enum NCVColorSpace +{ + NCVColorSpaceGray, + NCVColorSpaceRGBA, +}; + +template struct __pixColorConv { +static void _pixColorConv(const Tin &pixIn, Tout &pixOut); +}; + +template struct __pixColorConv { +static void _pixColorConv(const Tin &pixIn, Tout &pixOut) +{ + Ncv32f luma = 0.299f * pixIn.x + 0.587f * pixIn.y + 0.114f * pixIn.z; + _TDemoteClampNN(luma, pixOut.x); +}}; + +template struct __pixColorConv { +static void _pixColorConv(const Tin &pixIn, Tout &pixOut) +{ + _TDemoteClampNN(pixIn.x, pixOut.x); + _TDemoteClampNN(pixIn.x, pixOut.y); + _TDemoteClampNN(pixIn.x, pixOut.z); + pixOut.w = 0; +}}; + +template +static +NCVStatus _ncvColorConv_host(const NCVMatrix &h_imgIn, + const NCVMatrix &h_imgOut) +{ + ncvAssertReturn(h_imgIn.size() == h_imgOut.size(), NCV_DIMENSIONS_INVALID); + ncvAssertReturn(h_imgIn.memType() == h_imgOut.memType() && + (h_imgIn.memType() == NCVMemoryTypeHostPinned || h_imgIn.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR); + NCV_SET_SKIP_COND(h_imgIn.memType() == NCVMemoryTypeNone); + NCV_SKIP_COND_BEGIN + + for 
(Ncv32u i=0; i::_pixColorConv(h_imgIn.at(j,i), h_imgOut.at(j,i)); + } + } + + NCV_SKIP_COND_END + return NCV_SUCCESS; +} + +#endif //_ncv_color_conversion_hpp_ diff --git a/modules/gpu/src/nvidia/core/NCVPixelOperations.hpp b/modules/gpu/src/nvidia/core/NCVPixelOperations.hpp new file mode 100644 index 0000000000..3951a2f435 --- /dev/null +++ b/modules/gpu/src/nvidia/core/NCVPixelOperations.hpp @@ -0,0 +1,350 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef _ncv_pixel_operations_hpp_ +#define _ncv_pixel_operations_hpp_ + +#include +#include +#include "NCV.hpp" + +template inline TBase _pixMaxVal(); +template<> static inline Ncv8u _pixMaxVal() {return UCHAR_MAX;} +template<> static inline Ncv16u _pixMaxVal() {return USHRT_MAX;} +template<> static inline Ncv32u _pixMaxVal() {return UINT_MAX;} +template<> static inline Ncv8s _pixMaxVal() {return CHAR_MAX;} +template<> static inline Ncv16s _pixMaxVal() {return SHRT_MAX;} +template<> static inline Ncv32s _pixMaxVal() {return INT_MAX;} +template<> static inline Ncv32f _pixMaxVal() {return FLT_MAX;} +template<> static inline Ncv64f _pixMaxVal() {return DBL_MAX;} + +template inline TBase _pixMinVal(); +template<> static inline Ncv8u _pixMinVal() {return 0;} +template<> static inline Ncv16u _pixMinVal() {return 0;} +template<> static inline Ncv32u _pixMinVal() {return 0;} +template<> static inline Ncv8s _pixMinVal() {return CHAR_MIN;} +template<> static inline Ncv16s _pixMinVal() {return SHRT_MIN;} +template<> static inline Ncv32s _pixMinVal() {return INT_MIN;} +template<> static inline Ncv32f _pixMinVal() {return FLT_MIN;} +template<> static inline Ncv64f _pixMinVal() {return DBL_MIN;} + +template struct TConvVec2Base; +template<> struct TConvVec2Base {typedef Ncv8u TBase;}; +template<> struct TConvVec2Base {typedef Ncv8u TBase;}; +template<> struct TConvVec2Base {typedef Ncv8u TBase;}; +template<> struct TConvVec2Base 
{typedef Ncv16u TBase;}; +template<> struct TConvVec2Base {typedef Ncv16u TBase;}; +template<> struct TConvVec2Base {typedef Ncv16u TBase;}; +template<> struct TConvVec2Base {typedef Ncv32u TBase;}; +template<> struct TConvVec2Base {typedef Ncv32u TBase;}; +template<> struct TConvVec2Base {typedef Ncv32u TBase;}; +template<> struct TConvVec2Base {typedef Ncv32f TBase;}; +template<> struct TConvVec2Base {typedef Ncv32f TBase;}; +template<> struct TConvVec2Base {typedef Ncv32f TBase;}; +template<> struct TConvVec2Base {typedef Ncv64f TBase;}; +template<> struct TConvVec2Base {typedef Ncv64f TBase;}; +template<> struct TConvVec2Base {typedef Ncv64f TBase;}; + +#define NC(T) (sizeof(T) / sizeof(TConvVec2Base::TBase)) + +template struct TConvBase2Vec; +template<> struct TConvBase2Vec {typedef uchar1 TVec;}; +template<> struct TConvBase2Vec {typedef uchar3 TVec;}; +template<> struct TConvBase2Vec {typedef uchar4 TVec;}; +template<> struct TConvBase2Vec {typedef ushort1 TVec;}; +template<> struct TConvBase2Vec {typedef ushort3 TVec;}; +template<> struct TConvBase2Vec {typedef ushort4 TVec;}; +template<> struct TConvBase2Vec {typedef uint1 TVec;}; +template<> struct TConvBase2Vec {typedef uint3 TVec;}; +template<> struct TConvBase2Vec {typedef uint4 TVec;}; +template<> struct TConvBase2Vec {typedef float1 TVec;}; +template<> struct TConvBase2Vec {typedef float3 TVec;}; +template<> struct TConvBase2Vec {typedef float4 TVec;}; +template<> struct TConvBase2Vec {typedef double1 TVec;}; +template<> struct TConvBase2Vec {typedef double3 TVec;}; +template<> struct TConvBase2Vec {typedef double4 TVec;}; + +//TODO: consider using CUDA intrinsics to avoid branching +template static inline void _TDemoteClampZ(Tin &a, Ncv8u &out) {out = (Ncv8u)CLAMP_0_255(a);}; +template static inline void _TDemoteClampZ(Tin &a, Ncv16u &out) {out = (Ncv16u)CLAMP(a, 0, USHRT_MAX);} +template static inline void _TDemoteClampZ(Tin &a, Ncv32u &out) {out = (Ncv32u)CLAMP(a, 0, UINT_MAX);} +template static 
inline void _TDemoteClampZ(Tin &a, Ncv32f &out) {out = (Ncv32f)a;} + +//TODO: consider using CUDA intrinsics to avoid branching +template static inline void _TDemoteClampNN(Tin &a, Ncv8u &out) {out = (Ncv8u)CLAMP_0_255(a+0.5f);} +template static inline void _TDemoteClampNN(Tin &a, Ncv16u &out) {out = (Ncv16u)CLAMP(a+0.5f, 0, USHRT_MAX);} +template static inline void _TDemoteClampNN(Tin &a, Ncv32u &out) {out = (Ncv32u)CLAMP(a+0.5f, 0, UINT_MAX);} +template static inline void _TDemoteClampNN(Tin &a, Ncv32f &out) {out = (Ncv32f)a;} + +template inline Tout _pixMakeZero(); +template<> static inline uchar1 _pixMakeZero() {return make_uchar1(0);} +template<> static inline uchar3 _pixMakeZero() {return make_uchar3(0,0,0);} +template<> static inline uchar4 _pixMakeZero() {return make_uchar4(0,0,0,0);} +template<> static inline ushort1 _pixMakeZero() {return make_ushort1(0);} +template<> static inline ushort3 _pixMakeZero() {return make_ushort3(0,0,0);} +template<> static inline ushort4 _pixMakeZero() {return make_ushort4(0,0,0,0);} +template<> static inline uint1 _pixMakeZero() {return make_uint1(0);} +template<> static inline uint3 _pixMakeZero() {return make_uint3(0,0,0);} +template<> static inline uint4 _pixMakeZero() {return make_uint4(0,0,0,0);} +template<> static inline float1 _pixMakeZero() {return make_float1(0.f);} +template<> static inline float3 _pixMakeZero() {return make_float3(0.f,0.f,0.f);} +template<> static inline float4 _pixMakeZero() {return make_float4(0.f,0.f,0.f,0.f);} +template<> static inline double1 _pixMakeZero() {return make_double1(0.);} +template<> static inline double3 _pixMakeZero() {return make_double3(0.,0.,0.);} +template<> static inline double4 _pixMakeZero() {return make_double4(0.,0.,0.,0.);} + +static inline uchar1 _pixMake(Ncv8u x) {return make_uchar1(x);} +static inline uchar3 _pixMake(Ncv8u x, Ncv8u y, Ncv8u z) {return make_uchar3(x,y,z);} +static inline uchar4 _pixMake(Ncv8u x, Ncv8u y, Ncv8u z, Ncv8u w) {return 
make_uchar4(x,y,z,w);} +static inline ushort1 _pixMake(Ncv16u x) {return make_ushort1(x);} +static inline ushort3 _pixMake(Ncv16u x, Ncv16u y, Ncv16u z) {return make_ushort3(x,y,z);} +static inline ushort4 _pixMake(Ncv16u x, Ncv16u y, Ncv16u z, Ncv16u w) {return make_ushort4(x,y,z,w);} +static inline uint1 _pixMake(Ncv32u x) {return make_uint1(x);} +static inline uint3 _pixMake(Ncv32u x, Ncv32u y, Ncv32u z) {return make_uint3(x,y,z);} +static inline uint4 _pixMake(Ncv32u x, Ncv32u y, Ncv32u z, Ncv32u w) {return make_uint4(x,y,z,w);} +static inline float1 _pixMake(Ncv32f x) {return make_float1(x);} +static inline float3 _pixMake(Ncv32f x, Ncv32f y, Ncv32f z) {return make_float3(x,y,z);} +static inline float4 _pixMake(Ncv32f x, Ncv32f y, Ncv32f z, Ncv32f w) {return make_float4(x,y,z,w);} +static inline double1 _pixMake(Ncv64f x) {return make_double1(x);} +static inline double3 _pixMake(Ncv64f x, Ncv64f y, Ncv64f z) {return make_double3(x,y,z);} +static inline double4 _pixMake(Ncv64f x, Ncv64f y, Ncv64f z, Ncv64f w) {return make_double4(x,y,z,w);} + + +template struct __pixDemoteClampZ_CN {static Tout _pixDemoteClampZ_CN(Tin &pix);}; + +template struct __pixDemoteClampZ_CN { +static Tout _pixDemoteClampZ_CN(Tin &pix) +{ + Tout out; + _TDemoteClampZ(pix.x, out.x); + return out; +}}; + +template struct __pixDemoteClampZ_CN { +static Tout _pixDemoteClampZ_CN(Tin &pix) +{ + Tout out; + _TDemoteClampZ(pix.x, out.x); + _TDemoteClampZ(pix.y, out.y); + _TDemoteClampZ(pix.z, out.z); + return out; +}}; + +template struct __pixDemoteClampZ_CN { +static Tout _pixDemoteClampZ_CN(Tin &pix) +{ + Tout out; + _TDemoteClampZ(pix.x, out.x); + _TDemoteClampZ(pix.y, out.y); + _TDemoteClampZ(pix.z, out.z); + _TDemoteClampZ(pix.w, out.w); + return out; +}}; + +template static inline Tout _pixDemoteClampZ(Tin &pix) +{ + return __pixDemoteClampZ_CN::_pixDemoteClampZ_CN(pix); +} + + +template struct __pixDemoteClampNN_CN {static Tout _pixDemoteClampNN_CN(Tin &pix);}; + +template struct 
__pixDemoteClampNN_CN { +static Tout _pixDemoteClampNN_CN(Tin &pix) +{ + Tout out; + _TDemoteClampNN(pix.x, out.x); + return out; +}}; + +template struct __pixDemoteClampNN_CN { +static Tout _pixDemoteClampNN_CN(Tin &pix) +{ + Tout out; + _TDemoteClampNN(pix.x, out.x); + _TDemoteClampNN(pix.y, out.y); + _TDemoteClampNN(pix.z, out.z); + return out; +}}; + +template struct __pixDemoteClampNN_CN { +static Tout _pixDemoteClampNN_CN(Tin &pix) +{ + Tout out; + _TDemoteClampNN(pix.x, out.x); + _TDemoteClampNN(pix.y, out.y); + _TDemoteClampNN(pix.z, out.z); + _TDemoteClampNN(pix.w, out.w); + return out; +}}; + +template static inline Tout _pixDemoteClampNN(Tin &pix) +{ + return __pixDemoteClampNN_CN::_pixDemoteClampNN_CN(pix); +} + + +template struct __pixScale_CN {static Tout _pixScale_CN(Tin &pix, Tw w);}; + +template struct __pixScale_CN { +static Tout _pixScale_CN(Tin &pix, Tw w) +{ + Tout out; + typedef typename TConvVec2Base::TBase TBout; + out.x = (TBout)(pix.x * w); + return out; +}}; + +template struct __pixScale_CN { +static Tout _pixScale_CN(Tin &pix, Tw w) +{ + Tout out; + typedef typename TConvVec2Base::TBase TBout; + out.x = (TBout)(pix.x * w); + out.y = (TBout)(pix.y * w); + out.z = (TBout)(pix.z * w); + return out; +}}; + +template struct __pixScale_CN { +static Tout _pixScale_CN(Tin &pix, Tw w) +{ + Tout out; + typedef typename TConvVec2Base::TBase TBout; + out.x = (TBout)(pix.x * w); + out.y = (TBout)(pix.y * w); + out.z = (TBout)(pix.z * w); + out.w = (TBout)(pix.w * w); + return out; +}}; + +template static Tout _pixScale(Tin &pix, Tw w) +{ + return __pixScale_CN::_pixScale_CN(pix, w); +} + + +template struct __pixAdd_CN {static Tout _pixAdd_CN(Tout &pix1, Tin &pix2);}; + +template struct __pixAdd_CN { +static Tout _pixAdd_CN(Tout &pix1, Tin &pix2) +{ + Tout out; + out.x = pix1.x + pix2.x; + return out; +}}; + +template struct __pixAdd_CN { +static Tout _pixAdd_CN(Tout &pix1, Tin &pix2) +{ + Tout out; + out.x = pix1.x + pix2.x; + out.y = pix1.y + 
pix2.y; + out.z = pix1.z + pix2.z; + return out; +}}; + +template struct __pixAdd_CN { +static Tout _pixAdd_CN(Tout &pix1, Tin &pix2) +{ + Tout out; + out.x = pix1.x + pix2.x; + out.y = pix1.y + pix2.y; + out.z = pix1.z + pix2.z; + out.w = pix1.w + pix2.w; + return out; +}}; + +template static Tout _pixAdd(Tout &pix1, Tin &pix2) +{ + return __pixAdd_CN::_pixAdd_CN(pix1, pix2); +} + + +template struct __pixDist_CN {static Tout _pixDist_CN(Tin &pix1, Tin &pix2);}; + +template struct __pixDist_CN { +static Tout _pixDist_CN(Tin &pix1, Tin &pix2) +{ + return Tout(SQR(pix1.x - pix2.x)); +}}; + +template struct __pixDist_CN { +static Tout _pixDist_CN(Tin &pix1, Tin &pix2) +{ + return Tout(SQR(pix1.x - pix2.x) + SQR(pix1.y - pix2.y) + SQR(pix1.z - pix2.z)); +}}; + +template struct __pixDist_CN { +static Tout _pixDist_CN(Tin &pix1, Tin &pix2) +{ + return Tout(SQR(pix1.x - pix2.x) + SQR(pix1.y - pix2.y) + SQR(pix1.z - pix2.z) + SQR(pix1.w - pix2.w)); +}}; + +template static Tout _pixDist(Tin &pix1, Tin &pix2) +{ + return __pixDist_CN::_pixDist_CN(pix1, pix2); +} + + +template struct TAccPixWeighted; +template<> struct TAccPixWeighted {typedef double1 type;}; +template<> struct TAccPixWeighted {typedef double3 type;}; +template<> struct TAccPixWeighted {typedef double4 type;}; +template<> struct TAccPixWeighted {typedef double1 type;}; +template<> struct TAccPixWeighted {typedef double3 type;}; +template<> struct TAccPixWeighted {typedef double4 type;}; +template<> struct TAccPixWeighted {typedef double1 type;}; +template<> struct TAccPixWeighted {typedef double3 type;}; +template<> struct TAccPixWeighted {typedef double4 type;}; + +template struct TAccPixDist {}; +template<> struct TAccPixDist {typedef Ncv32u type;}; +template<> struct TAccPixDist {typedef Ncv32u type;}; +template<> struct TAccPixDist {typedef Ncv32u type;}; +template<> struct TAccPixDist {typedef Ncv32u type;}; +template<> struct TAccPixDist {typedef Ncv32u type;}; +template<> struct TAccPixDist {typedef 
Ncv32u type;}; +template<> struct TAccPixDist {typedef Ncv32f type;}; +template<> struct TAccPixDist {typedef Ncv32f type;}; +template<> struct TAccPixDist {typedef Ncv32f type;}; + +#endif //_ncv_pixel_operations_hpp_ diff --git a/modules/gpu/src/nvidia/core/NCVPyramid.cu b/modules/gpu/src/nvidia/core/NCVPyramid.cu new file mode 100644 index 0000000000..20e52e580b --- /dev/null +++ b/modules/gpu/src/nvidia/core/NCVPyramid.cu @@ -0,0 +1,397 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + + +#include +#include +#include "NCV.hpp" +#include "NCVPyramid.hpp" +#include "NCVPixelOperations.hpp" + + +template struct __average4_CN {static T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11);}; + +template struct __average4_CN { +static T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11) +{ + T out; + out.x = ((Ncv32s)p00.x + p01.x + p10.x + p11.x + 2) / 4; + return out; +}}; + +template<> struct __average4_CN { +static float1 _average4_CN(const float1 &p00, const float1 &p01, const float1 &p10, const float1 &p11) +{ + float1 out; + out.x = (p00.x + p01.x + p10.x + p11.x) / 4; + return out; +}}; + +template<> struct __average4_CN { +static double1 _average4_CN(const double1 &p00, const double1 &p01, const double1 &p10, const double1 &p11) +{ + double1 out; + out.x = (p00.x + p01.x + p10.x + p11.x) / 4; + return out; +}}; + +template struct __average4_CN { +static T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11) +{ + T out; + out.x = ((Ncv32s)p00.x + p01.x + p10.x + p11.x + 2) / 4; + out.y = ((Ncv32s)p00.y + p01.y + p10.y + p11.y + 2) / 4; + out.z = ((Ncv32s)p00.z + p01.z + p10.z + p11.z + 2) / 4; + return out; +}}; + +template<> struct __average4_CN { +static float3 _average4_CN(const float3 &p00, const float3 &p01, const float3 &p10, const float3 &p11) +{ + float3 out; + out.x = (p00.x + p01.x + p10.x + p11.x) / 4; + out.y = (p00.y + p01.y + p10.y + 
p11.y) / 4; + out.z = (p00.z + p01.z + p10.z + p11.z) / 4; + return out; +}}; + +template<> struct __average4_CN { +static double3 _average4_CN(const double3 &p00, const double3 &p01, const double3 &p10, const double3 &p11) +{ + double3 out; + out.x = (p00.x + p01.x + p10.x + p11.x) / 4; + out.y = (p00.y + p01.y + p10.y + p11.y) / 4; + out.z = (p00.z + p01.z + p10.z + p11.z) / 4; + return out; +}}; + +template struct __average4_CN { +static T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11) +{ + T out; + out.x = ((Ncv32s)p00.x + p01.x + p10.x + p11.x + 2) / 4; + out.y = ((Ncv32s)p00.y + p01.y + p10.y + p11.y + 2) / 4; + out.z = ((Ncv32s)p00.z + p01.z + p10.z + p11.z + 2) / 4; + out.w = ((Ncv32s)p00.w + p01.w + p10.w + p11.w + 2) / 4; + return out; +}}; + +template<> struct __average4_CN { +static float4 _average4_CN(const float4 &p00, const float4 &p01, const float4 &p10, const float4 &p11) +{ + float4 out; + out.x = (p00.x + p01.x + p10.x + p11.x) / 4; + out.y = (p00.y + p01.y + p10.y + p11.y) / 4; + out.z = (p00.z + p01.z + p10.z + p11.z) / 4; + out.w = (p00.w + p01.w + p10.w + p11.w) / 4; + return out; +}}; + +template<> struct __average4_CN { +static double4 _average4_CN(const double4 &p00, const double4 &p01, const double4 &p10, const double4 &p11) +{ + double4 out; + out.x = (p00.x + p01.x + p10.x + p11.x) / 4; + out.y = (p00.y + p01.y + p10.y + p11.y) / 4; + out.z = (p00.z + p01.z + p10.z + p11.z) / 4; + out.w = (p00.w + p01.w + p10.w + p11.w) / 4; + return out; +}}; + +template static T _average4(const T &p00, const T &p01, const T &p10, const T &p11) +{ + return __average4_CN::_average4_CN(p00, p01, p10, p11); +} + + +template struct __lerp_CN {static Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d);}; + +template struct __lerp_CN { +static Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d) +{ + typedef typename TConvVec2Base::TBase TB; + return _pixMake(TB(b.x * d + a.x * (1 - d))); +}}; + +template struct __lerp_CN { +static Tout 
_lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
+{
+    typedef typename TConvVec2Base<Tout>::TBase TB;
+    return _pixMake(TB(b.x * d + a.x * (1 - d)),
+                    TB(b.y * d + a.y * (1 - d)),
+                    TB(b.z * d + a.z * (1 - d)));
+}};
+
+template<typename Tin, typename Tout> struct __lerp_CN<Tin, Tout, 4> {
+static Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
+{
+    typedef typename TConvVec2Base<Tout>::TBase TB;
+    return _pixMake(TB(b.x * d + a.x * (1 - d)),
+                    TB(b.y * d + a.y * (1 - d)),
+                    TB(b.z * d + a.z * (1 - d)),
+                    TB(b.w * d + a.w * (1 - d)));
+}};
+
+template<typename Tin, typename Tout> static Tout _lerp(const Tin &a, const Tin &b, Ncv32f d)
+{
+    return __lerp_CN<Tin, Tout, NC(Tin)>::_lerp_CN(a, b, d);
+}
+
+
+template<typename T>
+static T _interpLinear(const T &a, const T &b, Ncv32f d)
+{
+    typedef typename TConvBase2Vec<Ncv32f, NC(T)>::TVec TVFlt;
+    TVFlt tmp = _lerp<T, TVFlt>(a, b, d);
+    return _pixDemoteClampZ<TVFlt, T>(tmp);
+}
+
+
+template<typename T>
+static T _interpBilinear(const NCVMatrix<T> &refLayer, Ncv32f x, Ncv32f y)
+{
+    Ncv32u xl = (Ncv32u)x;
+    Ncv32u xh = xl+1;
+    Ncv32f dx = x - xl;
+    Ncv32u yl = (Ncv32u)y;
+    Ncv32u yh = yl+1;
+    Ncv32f dy = y - yl;
+    T p00, p01, p10, p11;
+    p00 = refLayer.at(xl, yl);
+    p01 = xh < refLayer.width() ? refLayer.at(xh, yl) : p00;
+    p10 = yh < refLayer.height() ? refLayer.at(xl, yh) : p00;
+    p11 = (xh < refLayer.width() && yh < refLayer.height()) ?
refLayer.at(xh, yh) : p00;
+    typedef typename TConvBase2Vec<Ncv32f, NC(T)>::TVec TVFlt;
+    TVFlt m_00_01 = _lerp<T, TVFlt>(p00, p01, dx);
+    TVFlt m_10_11 = _lerp<T, TVFlt>(p10, p11, dx);
+    TVFlt mixture = _lerp<TVFlt, TVFlt>(m_00_01, m_10_11, dy);
+    return _pixDemoteClampZ<TVFlt, T>(mixture);
+}
+
+
+template <class T>
+NCVImagePyramid<T>::NCVImagePyramid(const NCVMatrix<T> &img,
+                                    Ncv8u numLayers,
+                                    INCVMemAllocator &alloc,
+                                    cudaStream_t cuStream)
+{
+    this->_isInitialized = false;
+    ncvAssertPrintReturn(img.memType() == alloc.memType(), "NCVImagePyramid_host::ctor error", );
+
+    this->layer0 = &img;
+    NcvSize32u szLastLayer(img.width(), img.height());
+    this->nLayers = 1;
+
+    NCV_SET_SKIP_COND(alloc.isCounting());
+    NcvBool bDeviceCode = alloc.memType() == NCVMemoryTypeDevice;
+
+    if (numLayers == 0)
+    {
+        numLayers = 255; //it will cut-off when any of the dimensions goes 1
+    }
+
+    for (Ncv32u i=0; i<(Ncv32u)numLayers-1; i++)
+    {
+        NcvSize32u szCurLayer(szLastLayer.width / 2, szLastLayer.height / 2);
+        if (szCurLayer.width == 0 || szCurLayer.height == 0)
+        {
+            break;
+        }
+
+        this->pyramid.push_back(new NCVMatrixAlloc<T>(alloc, szCurLayer.width, szCurLayer.height));
+        ncvAssertPrintReturn(((NCVMatrixAlloc<T> *)(this->pyramid[i]))->isMemAllocated(), "NCVImagePyramid_host::ctor error", );
+        this->nLayers++;
+
+        //fill in the layer
+        NCV_SKIP_COND_BEGIN
+
+        const NCVMatrix<T> *prevLayer = i == 0 ?
this->layer0 : this->pyramid[i-1];
+        NCVMatrix<T> *curLayer = this->pyramid[i];
+
+        if (bDeviceCode)
+        {
+            //TODO: in cuStream
+        }
+        else
+        {
+            for (Ncv32u i=0; i<szCurLayer.height; i++)
+            {
+                for (Ncv32u j=0; j<szCurLayer.width; j++)
+                {
+                    T p00 = prevLayer->at(2*j+0, 2*i+0);
+                    T p01 = prevLayer->at(2*j+1, 2*i+0);
+                    T p10 = prevLayer->at(2*j+0, 2*i+1);
+                    T p11 = prevLayer->at(2*j+1, 2*i+1);
+                    curLayer->at(j, i) = _average4(p00, p01, p10, p11);
+                }
+            }
+        }
+
+        NCV_SKIP_COND_END
+
+        szLastLayer = szCurLayer;
+    }
+
+    this->_isInitialized = true;
+}
+
+
+template <class T>
+NCVImagePyramid<T>::~NCVImagePyramid()
+{
+}
+
+
+template <class T>
+NcvBool NCVImagePyramid<T>::isInitialized() const
+{
+    return this->_isInitialized;
+}
+
+
+template <class T>
+NCVStatus NCVImagePyramid<T>::getLayer(NCVMatrix<T> &outImg,
+                                       NcvSize32u outRoi,
+                                       NcvBool bTrilinear,
+                                       cudaStream_t cuStream) const
+{
+    ncvAssertReturn(this->isInitialized(), NCV_UNKNOWN_ERROR);
+    ncvAssertReturn(outImg.memType() == this->layer0->memType(), NCV_MEM_RESIDENCE_ERROR);
+    ncvAssertReturn(outRoi.width <= this->layer0->width() && outRoi.height <= this->layer0->height() &&
+                    outRoi.width > 0 && outRoi.height > 0, NCV_DIMENSIONS_INVALID);
+
+    if (outRoi.width == this->layer0->width() && outRoi.height == this->layer0->height())
+    {
+        ncvAssertReturnNcvStat(this->layer0->copy2D(outImg, NcvSize32u(this->layer0->width(), this->layer0->height()), cuStream));
+        return NCV_SUCCESS;
+    }
+
+    Ncv32f lastScale = 1.0f;
+    Ncv32f curScale;
+    const NCVMatrix<T> *lastLayer = this->layer0;
+    const NCVMatrix<T> *curLayer = NULL;
+    NcvBool bUse2Refs = false;
+
+    for (Ncv32u i=0; i<this->nLayers-1; i++)
+    {
+        curScale = lastScale * 0.5f;
+        curLayer = this->pyramid[i];
+
+        if (outRoi.width == curLayer->width() && outRoi.height == curLayer->height())
+        {
+            ncvAssertReturnNcvStat(this->pyramid[i]->copy2D(outImg, NcvSize32u(this->pyramid[i]->width(), this->pyramid[i]->height()), cuStream));
+            return NCV_SUCCESS;
+        }
+
+        if (outRoi.width >= curLayer->width() && outRoi.height >= curLayer->height())
+        {
+            if (outRoi.width < lastLayer->width() && outRoi.height < lastLayer->height())
+            {
+                bUse2Refs = true;
+            }
+            break;
+        }
+
+        lastScale = curScale;
+        lastLayer = curLayer;
+    }
+
+    bUse2Refs = bUse2Refs && bTrilinear;
+
+    NCV_SET_SKIP_COND(outImg.memType() == NCVMemoryTypeNone);
+    NcvBool bDeviceCode = this->layer0->memType() == NCVMemoryTypeDevice;
+
+    NCV_SKIP_COND_BEGIN
+
+    if (bDeviceCode)
+    {
+        //TODO: in cuStream
+    }
+    else
+    {
+        for (Ncv32u i=0; i<outRoi.height; i++)
+        {
+            for (Ncv32u j=0; j<outRoi.width; j++)
+            {
+                //top layer pixel (always exists)
+                NcvSize32u szTopLayer(lastLayer->width(), lastLayer->height());
+                Ncv32f ptTopX = 1.0f * (szTopLayer.width - 1) * j / (outRoi.width - 1);
+                Ncv32f ptTopY = 1.0f * (szTopLayer.height - 1) * i / (outRoi.height - 1);
+                T topPix = _interpBilinear(*lastLayer, ptTopX, ptTopY);
+                T trilinearPix = topPix;
+
+                if (bUse2Refs)
+                {
+                    //bottom layer pixel (exists only if the requested scale is greater than the smallest layer scale)
+                    NcvSize32u szBottomLayer(curLayer->width(), curLayer->height());
+                    Ncv32f ptBottomX = 1.0f * (szBottomLayer.width - 1) * j / (outRoi.width - 1);
+                    Ncv32f ptBottomY = 1.0f * (szBottomLayer.height - 1) * i / (outRoi.height - 1);
+                    T bottomPix = _interpBilinear(*curLayer, ptBottomX, ptBottomY);
+
+                    Ncv32f scale = (1.0f * outRoi.width / layer0->width() + 1.0f * outRoi.height / layer0->height()) / 2;
+                    Ncv32f dl = (scale - curScale) / (lastScale - curScale);
+                    dl = CLAMP(dl, 0.0f, 1.0f);
+                    trilinearPix = _interpLinear(bottomPix, topPix, dl);
+                }
+
+                outImg.at(j, i) = trilinearPix;
+            }
+        }
+    }
+
+    NCV_SKIP_COND_END
+
+    return NCV_SUCCESS;
+}
+
+
+template class NCVImagePyramid<uchar1>;
+template class NCVImagePyramid<uchar3>;
+template class NCVImagePyramid<uchar4>;
+template class NCVImagePyramid<ushort1>;
+template class NCVImagePyramid<ushort3>;
+template class NCVImagePyramid<ushort4>;
+template class NCVImagePyramid<uint1>;
+template class NCVImagePyramid<uint3>;
+template class NCVImagePyramid<uint4>;
+template class NCVImagePyramid<float1>;
+template class NCVImagePyramid<float3>;
+template class NCVImagePyramid<float4>;
diff --git a/modules/gpu/src/nvidia/core/NCVPyramid.hpp b/modules/gpu/src/nvidia/core/NCVPyramid.hpp
new file mode 100644
index 0000000000..92cf90fab9
--- /dev/null
+++
b/modules/gpu/src/nvidia/core/NCVPyramid.hpp @@ -0,0 +1,97 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+#ifndef _ncvpyramid_hpp_
+#define _ncvpyramid_hpp_
+
+#include <memory>
+#include <vector>
+#include "NCV.hpp"
+
+
+template <class T>
+class NCV_EXPORTS NCVMatrixStack
+{
+public:
+    NCVMatrixStack() {this->_arr.clear();}
+    ~NCVMatrixStack()
+    {
+        const Ncv32u nElem = this->_arr.size();
+        for (Ncv32u i=0; i<nElem; i++)
+        {
+            pop_back();
+        }
+    }
+    void push_back(NCVMatrix<T> *elem) {this->_arr.push_back(std::tr1::shared_ptr< NCVMatrix<T> >(elem));}
+    void pop_back() {this->_arr.pop_back();}
+    NCVMatrix<T> * operator [] (int i) const {return this->_arr[i].get();}
+private:
+    std::vector< std::tr1::shared_ptr< NCVMatrix<T> > > _arr;
+};
+
+
+template <class T>
+class NCV_EXPORTS NCVImagePyramid
+{
+public:
+
+    NCVImagePyramid(const NCVMatrix<T> &img,
+                    Ncv8u nLayers,
+                    INCVMemAllocator &alloc,
+                    cudaStream_t cuStream);
+    ~NCVImagePyramid();
+    NcvBool isInitialized() const;
+    NCVStatus getLayer(NCVMatrix<T> &outImg,
+                       NcvSize32u outRoi,
+                       NcvBool bTrilinear,
+                       cudaStream_t cuStream) const;
+
+private:
+
+    NcvBool _isInitialized;
+    const NCVMatrix<T> *layer0;
+    NCVMatrixStack<T> pyramid;
+    Ncv32u nLayers;
+};
+
+
+#endif //_ncvpyramid_hpp_
diff --git a/modules/gpu/src/optical_flow.cpp b/modules/gpu/src/optical_flow.cpp
index 19754c0578..9c6513a87c 100644
--- a/modules/gpu/src/optical_flow.cpp
+++ b/modules/gpu/src/optical_flow.cpp
@@ -68,10 +68,7 @@ namespace
 namespace
 {
-    void outputHandler(const char* msg)
-    {
-        CV_Error(CV_GpuApiCallError, msg);
-    }
+    static void outputHandler(const std::string &msg) { CV_Error(CV_GpuApiCallError,
msg.c_str()); } } void cv::gpu::BroxOpticalFlow::operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& s)