[+] Pixel types via templates

[+] Color conversions stub via pixel types [+] Pyramid calculation (required for mipmaps in CUDA 4.1) [~] Changed C strings to C++ throughout NCV [~] Fixed a couple of bugs in NCV
13 years ago · 5afb445283
parent 0b192cb4ea
commit 5afb445283
8 changed files with 1036 additions and 57 deletions
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@ -129,7 +129,7 @@ struct cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl
 private:


-    static void NCVDebugOutputHandler(const char* msg) { CV_Error(CV_GpuApiCallError, msg); }
+    static void NCVDebugOutputHandler(const std::string &msg) { CV_Error(CV_GpuApiCallError, msg.c_str()); }


    NCVStatus load(const string& classifierFile)
--- a/modules/gpu/src/nvidia/core/NCV.cu
+++ b/modules/gpu/src/nvidia/core/NCV.cu
@ -40,10 +40,9 @@
 //M*/


-#include <ios>
-#include <stdarg.h>
+#include <iostream>
+#include <string>
 #include <vector>
-#include <cstdio>
 #include "NCV.hpp"

 using namespace std;
@ -56,24 +55,18 @@ using namespace std;
 //==============================================================================


-static void stdioDebugOutput(const char *msg)
+static void stdDebugOutput(const string &msg)
 {
-    printf("%s", msg);
+    cout << msg;
 }


-static NCVDebugOutputHandler *debugOutputHandler = stdioDebugOutput;
+static NCVDebugOutputHandler *debugOutputHandler = stdDebugOutput;


-void ncvDebugOutput(const char *msg, ...)
+void ncvDebugOutput(const string &msg)
 {
-    const int K_DEBUG_STRING_MAXLEN = 1024;
-    char buffer[K_DEBUG_STRING_MAXLEN];
-    va_list args;
-    va_start(args, msg);
-    vsnprintf(buffer, K_DEBUG_STRING_MAXLEN, msg, args);
-    va_end (args);
-    debugOutputHandler(buffer);
+    debugOutputHandler(msg);
 }


@ -288,7 +281,7 @@ NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity,

    allocBegin = NULL;

-    if (reusePtr == NULL)
+    if (reusePtr == NULL && capacity != 0)
    {
        bReusesMemory = false;
        switch (memT)
@ -329,7 +322,7 @@ NCVMemStackAllocator::~NCVMemStackAllocator()
    {
        ncvAssertPrintCheck(currentSize == 0, "NCVMemStackAllocator dtor:: not all objects were deallocated properly, forcing destruction");

-        if (!bReusesMemory)
+        if (!bReusesMemory && (allocBegin != (Ncv8u *)(0x1)))
        {
            switch (_memType)
            {
@ -355,7 +348,7 @@ NCVStatus NCVMemStackAllocator::alloc(NCVMemSegment &seg, size_t size)
    seg.clear();
    ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);

-    size = alignUp(static_cast<Ncv32u>(size), this->_alignment);
+    size = alignUp(size, this->_alignment);
    this->currentSize += size;
    this->_maxSize = std::max(this->_maxSize, this->currentSize);

@ -464,7 +457,7 @@ NCVStatus NCVMemNativeAllocator::alloc(NCVMemSegment &seg, size_t size)
        break;
    }

-    this->currentSize += alignUp(static_cast<Ncv32u>(size), this->_alignment);
+    this->currentSize += alignUp(size, this->_alignment);
    this->_maxSize = std::max(this->_maxSize, this->currentSize);

    seg.begin.memtype = this->_memType;
@ -480,8 +473,8 @@ NCVStatus NCVMemNativeAllocator::dealloc(NCVMemSegment &seg)
    ncvAssertReturn(seg.begin.memtype == this->_memType, NCV_ALLOCATOR_BAD_DEALLOC);
    ncvAssertReturn(seg.begin.ptr != NULL, NCV_ALLOCATOR_BAD_DEALLOC);

-    ncvAssertReturn(currentSize >= alignUp(static_cast<Ncv32u>(seg.size), this->_alignment), NCV_ALLOCATOR_BAD_DEALLOC);
-    currentSize -= alignUp(static_cast<Ncv32u>(seg.size), this->_alignment);
+    ncvAssertReturn(currentSize >= alignUp(seg.size, this->_alignment), NCV_ALLOCATOR_BAD_DEALLOC);
+    currentSize -= alignUp(seg.size, this->_alignment);

    switch (this->_memType)
    {
--- a/modules/gpu/src/nvidia/core/NCV.hpp
+++ b/modules/gpu/src/nvidia/core/NCV.hpp
@ -42,7 +42,7 @@
 #ifndef _ncv_hpp_
 #define _ncv_hpp_

-#if (defined WIN32 || defined _WIN32 || defined WINCE) && defined CVAPI_EXPORTS //&& !defined(__CUDACC__) 
+#if (defined WIN32 || defined _WIN32 || defined WINCE) && defined CVAPI_EXPORTS
    #define NCV_EXPORTS __declspec(dllexport)
 #else
    #define NCV_EXPORTS
@ -53,6 +53,8 @@
 #endif

 #include <cuda_runtime.h>
+#include <sstream>
+#include <iostream>


 //==============================================================================
@ -78,7 +80,7 @@ namespace NcvCTprep
 }


-#define NCV_CT_PREP_PASTE_AUX(a,b)      a##b                           ///< Concatenation indirection macro
+#define NCV_CT_PREP_PASTE_AUX(a,b)      a##b                         ///< Concatenation indirection macro
 #define NCV_CT_PREP_PASTE(a,b)          NCV_CT_PREP_PASTE_AUX(a, b)  ///< Concatenation macro


@ -181,6 +183,25 @@ struct NcvSize32u
    Ncv32u height; ///< Rectangle height.
    __host__ __device__ NcvSize32u() : width(0), height(0) {};
    __host__ __device__ NcvSize32u(Ncv32u width, Ncv32u height) : width(width), height(height) {}
+    __host__ __device__ bool operator == (const NcvSize32u &another) const {return this->width == another.width && this->height == another.height;}
+};
+
+
+struct NcvPoint2D32s
+{
+    Ncv32s x; ///< Point X.
+    Ncv32s y; ///< Point Y.
+    __host__ __device__ NcvPoint2D32s() : x(0), y(0) {};
+    __host__ __device__ NcvPoint2D32s(Ncv32s x, Ncv32s y) : x(x), y(y) {}
+};
+
+
+struct NcvPoint2D32u
+{
+    Ncv32u x; ///< Point X.
+    Ncv32u y; ///< Point Y.
+    __host__ __device__ NcvPoint2D32u() : x(0), y(0) {};
+    __host__ __device__ NcvPoint2D32u(Ncv32u x, Ncv32u y) : x(x), y(y) {}
 };


@ -199,6 +220,7 @@ NCV_CT_ASSERT(sizeof(NcvRect8u) == sizeof(Ncv32u));
 NCV_CT_ASSERT(sizeof(NcvRect32s) == 4 * sizeof(Ncv32s));
 NCV_CT_ASSERT(sizeof(NcvRect32u) == 4 * sizeof(Ncv32u));
 NCV_CT_ASSERT(sizeof(NcvSize32u) == 2 * sizeof(Ncv32u));
+NCV_CT_ASSERT(sizeof(NcvPoint2D32u) == 2 * sizeof(Ncv32u));


 //==============================================================================
@ -219,49 +241,44 @@ const Ncv32u K_LOG2_WARP_SIZE = 5;
 //==============================================================================


-#define NCV_CT_PREP_STRINGIZE_AUX(x)    #x
-#define NCV_CT_PREP_STRINGIZE(x)        NCV_CT_PREP_STRINGIZE_AUX(x)
-
+NCV_EXPORTS void ncvDebugOutput(const std::string &msg);

-NCV_EXPORTS void ncvDebugOutput(const char *msg, ...);

-
-typedef void NCVDebugOutputHandler(const char* msg);
+typedef void NCVDebugOutputHandler(const std::string &msg);


 NCV_EXPORTS void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func);


 #define ncvAssertPrintCheck(pred, msg) \
-    ((pred) ? true : (ncvDebugOutput("\n%s\n", \
-    "NCV Assertion Failed: " msg ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__) \
-    ), false))
-
-
-#define ncvAssertPrintReturn(pred, msg, err) \
-    if (ncvAssertPrintCheck(pred, msg)) ; else return err
-
-
-#define ncvAssertReturn(pred, err) \
    do \
    { \
        if (!(pred)) \
        { \
-            ncvDebugOutput("\n%s%d%s\n", "NCV Assertion Failed: retcode=", (int)err, ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__)); \
-            return err; \
+            std::ostringstream oss; \
+            oss << "NCV Assertion Failed: " << msg << ", file=" << __FILE__ << ", line=" << __LINE__ << std::endl; \
+            ncvDebugOutput(oss.str()); \
        } \
    } while (0)


+#define ncvAssertPrintReturn(pred, msg, err) \
+    do \
+    { \
+        ncvAssertPrintCheck(pred, msg); \
+        if (!(pred)) return err; \
+    } while (0)
+
+
+#define ncvAssertReturn(pred, err) \
+    ncvAssertPrintReturn(pred, "retcode=" << (int)err, err)
+
+
 #define ncvAssertReturnNcvStat(ncvOp) \
    do \
    { \
        NCVStatus _ncvStat = ncvOp; \
-        if (NCV_SUCCESS != _ncvStat) \
-        { \
-            ncvDebugOutput("\n%s%d%s\n", "NCV Assertion Failed: NcvStat=", (int)_ncvStat, ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__)); \
-            return _ncvStat; \
-        } \
+        ncvAssertPrintReturn(NCV_SUCCESS==_ncvStat, "NcvStat=" << (int)_ncvStat, _ncvStat); \
    } while (0)


@ -270,18 +287,14 @@ NCV_EXPORTS void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func);
    { \
        cudaError_t resCall = cudacall; \
        cudaError_t resGLE = cudaGetLastError(); \
-        if (cudaSuccess != resCall || cudaSuccess != resGLE) \
-        { \
-            ncvDebugOutput("\n%s%d%s\n", "NCV CUDA Assertion Failed: cudaError_t=", (int)(resCall | resGLE), ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__)); \
-            return errCode; \
-        } \
+        ncvAssertPrintReturn(cudaSuccess==resCall && cudaSuccess==resGLE, "cudaError_t=" << (int)(resCall | resGLE), errCode); \
    } while (0)


 /**
 * Return-codes for status notification, errors and warnings
 */
-enum NCVStatus
+enum
 {
    //NCV statuses
    NCV_SUCCESS,
@ -338,9 +351,14 @@ enum NCVStatus
    NPPST_MEM_INSUFFICIENT_BUFFER,            ///< Insufficient user-allocated buffer
    NPPST_MEM_RESIDENCE_ERROR,                ///< Memory residence error detected (check if pointers should be device or pinned)
    NPPST_MEM_INTERNAL_ERROR,                 ///< Internal memory management error
+
+    NCV_LAST_STATUS                           ///< Marker to continue error numeration in other files
 };


+typedef Ncv32u NCVStatus;
+
+
 #define NCV_SET_SKIP_COND(x) \
    bool __ncv_skip_cond = x

@ -774,9 +792,20 @@ public:
        return ncvStat;
    }

+    T &at(Ncv32u x, Ncv32u y) const
+    {
+        if (x >= this->_width || y >= this->_height)
+        {
+            printf("Error addressing matrix at [%d, %d]\n", x, y);
+            return *this->_ptr;
+        }
+        return ((T *)((Ncv8u *)this->_ptr + y * this->_pitch))[x];
+    }
+
    T *ptr() const {return this->_ptr;}
    Ncv32u width() const {return this->_width;}
    Ncv32u height() const {return this->_height;}
+    NcvSize32u size() const {return NcvSize32u(this->_width, this->_height);}
    Ncv32u pitch() const {return this->_pitch;}
    NCVMemoryType memType() const {return this->_memtype;}

@ -923,7 +952,7 @@ public:
        this->_width = roi.width;
        this->_height = roi.height;
        this->_pitch = mat.pitch();
-        this->_ptr = mat.ptr() + roi.y * mat.stride() + roi.x;
+        this->_ptr = &mat.at(roi.x, roi.y);
        this->_memtype = mat.memType();

        this->bReused = true;
@ -962,4 +991,24 @@ NCV_EXPORTS NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst, Ncv32u dstStride, Ncv
 NCV_EXPORTS NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight,
                                              NcvRect32u *d_rects, Ncv32u numRects, Ncv32u color, cudaStream_t cuStream);

+
+#define CLAMP(x,a,b)        ( (x) > (b) ? (b) : ( (x) < (a) ? (a) : (x) ) )
+#define CLAMP_TOP(x, a)     (((x) > (a)) ? (a) : (x))
+#define CLAMP_BOTTOM(x, a)  (((x) < (a)) ? (a) : (x))
+#define CLAMP_0_255(x)      CLAMP(x,0,255)
+
+
+#define SUB_BEGIN(type, name)    struct { __inline type name
+#define SUB_END(name)            } name;
+#define SUB_CALL(name)           name.name
+
+#define SQR(x)              ((x)*(x))
+
+
+#define ncvSafeMatAlloc(name, type, alloc, width, height, err) \
+    NCVMatrixAlloc<type> name(alloc, width, height); \
+    ncvAssertReturn(name.isMemAllocated(), err);
+
+
+
 #endif // _ncv_hpp_
--- a/modules/gpu/src/nvidia/core/NCVColorConversion.hpp
+++ b/modules/gpu/src/nvidia/core/NCVColorConversion.hpp
@ -0,0 +1,96 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
+// 
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef _ncv_color_conversion_hpp_
+#define _ncv_color_conversion_hpp_
+
+#include "NCVPixelOperations.hpp"
+
+enum NCVColorSpace
+{
+    NCVColorSpaceGray,
+    NCVColorSpaceRGBA,
+};
+
+template<NCVColorSpace CSin, NCVColorSpace CSout, typename Tin, typename Tout> struct __pixColorConv {
+static void _pixColorConv(const Tin &pixIn, Tout &pixOut);
+};
+
+template<typename Tin, typename Tout> struct __pixColorConv<NCVColorSpaceRGBA, NCVColorSpaceGray, Tin, Tout> {
+static void _pixColorConv(const Tin &pixIn, Tout &pixOut)
+{
+    Ncv32f luma = 0.299f * pixIn.x + 0.587f * pixIn.y + 0.114f * pixIn.z;
+    _TDemoteClampNN(luma, pixOut.x);
+}};
+
+template<typename Tin, typename Tout> struct __pixColorConv<NCVColorSpaceGray, NCVColorSpaceRGBA, Tin, Tout> {
+static void _pixColorConv(const Tin &pixIn, Tout &pixOut)
+{
+    _TDemoteClampNN(pixIn.x, pixOut.x);
+    _TDemoteClampNN(pixIn.x, pixOut.y);
+    _TDemoteClampNN(pixIn.x, pixOut.z);
+    pixOut.w = 0;
+}};
+
+template<NCVColorSpace CSin, NCVColorSpace CSout, typename Tin, typename Tout>
+static
+NCVStatus _ncvColorConv_host(const NCVMatrix<Tin> &h_imgIn,
+                             const NCVMatrix<Tout> &h_imgOut)
+{
+    ncvAssertReturn(h_imgIn.size() == h_imgOut.size(), NCV_DIMENSIONS_INVALID);
+    ncvAssertReturn(h_imgIn.memType() == h_imgOut.memType() &&
+                    (h_imgIn.memType() == NCVMemoryTypeHostPinned || h_imgIn.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);
+    NCV_SET_SKIP_COND(h_imgIn.memType() == NCVMemoryTypeNone);
+    NCV_SKIP_COND_BEGIN
+
+    for (Ncv32u i=0; i<h_imgIn.height(); i++)
+    {
+        for (Ncv32u j=0; j<h_imgIn.width(); j++)
+        {
+            __pixColorConv<CSin, CSout, Tin, Tout>::_pixColorConv(h_imgIn.at(j,i), h_imgOut.at(j,i));
+        }
+    }
+
+    NCV_SKIP_COND_END
+    return NCV_SUCCESS;
+}
+
+#endif //_ncv_color_conversion_hpp_
--- a/modules/gpu/src/nvidia/core/NCVPixelOperations.hpp
+++ b/modules/gpu/src/nvidia/core/NCVPixelOperations.hpp
@ -0,0 +1,350 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
+// 
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef _ncv_pixel_operations_hpp_
+#define _ncv_pixel_operations_hpp_
+
+#include <limits.h>
+#include <float.h>
+#include "NCV.hpp"
+
+template<typename TBase> inline TBase _pixMaxVal();
+template<> static inline Ncv8u _pixMaxVal<Ncv8u>() {return UCHAR_MAX;}
+template<> static inline Ncv16u _pixMaxVal<Ncv16u>() {return USHRT_MAX;}
+template<> static inline Ncv32u _pixMaxVal<Ncv32u>() {return UINT_MAX;}
+template<> static inline Ncv8s _pixMaxVal<Ncv8s>() {return CHAR_MAX;}
+template<> static inline Ncv16s _pixMaxVal<Ncv16s>() {return SHRT_MAX;}
+template<> static inline Ncv32s _pixMaxVal<Ncv32s>() {return INT_MAX;}
+template<> static inline Ncv32f _pixMaxVal<Ncv32f>() {return FLT_MAX;}
+template<> static inline Ncv64f _pixMaxVal<Ncv64f>() {return DBL_MAX;}
+
+template<typename TBase> inline TBase _pixMinVal();
+template<> static inline Ncv8u _pixMinVal<Ncv8u>() {return 0;}
+template<> static inline Ncv16u _pixMinVal<Ncv16u>() {return 0;}
+template<> static inline Ncv32u _pixMinVal<Ncv32u>() {return 0;}
+template<> static inline Ncv8s _pixMinVal<Ncv8s>() {return CHAR_MIN;}
+template<> static inline Ncv16s _pixMinVal<Ncv16s>() {return SHRT_MIN;}
+template<> static inline Ncv32s _pixMinVal<Ncv32s>() {return INT_MIN;}
+template<> static inline Ncv32f _pixMinVal<Ncv32f>() {return FLT_MIN;}
+template<> static inline Ncv64f _pixMinVal<Ncv64f>() {return DBL_MIN;}
+
+template<typename Tvec> struct TConvVec2Base;
+template<> struct TConvVec2Base<uchar1> {typedef Ncv8u TBase;};
+template<> struct TConvVec2Base<uchar3> {typedef Ncv8u TBase;};
+template<> struct TConvVec2Base<uchar4> {typedef Ncv8u TBase;};
+template<> struct TConvVec2Base<ushort1> {typedef Ncv16u TBase;};
+template<> struct TConvVec2Base<ushort3> {typedef Ncv16u TBase;};
+template<> struct TConvVec2Base<ushort4> {typedef Ncv16u TBase;};
+template<> struct TConvVec2Base<uint1> {typedef Ncv32u TBase;};
+template<> struct TConvVec2Base<uint3> {typedef Ncv32u TBase;};
+template<> struct TConvVec2Base<uint4> {typedef Ncv32u TBase;};
+template<> struct TConvVec2Base<float1> {typedef Ncv32f TBase;};
+template<> struct TConvVec2Base<float3> {typedef Ncv32f TBase;};
+template<> struct TConvVec2Base<float4> {typedef Ncv32f TBase;};
+template<> struct TConvVec2Base<double1> {typedef Ncv64f TBase;};
+template<> struct TConvVec2Base<double3> {typedef Ncv64f TBase;};
+template<> struct TConvVec2Base<double4> {typedef Ncv64f TBase;};
+
+#define NC(T)       (sizeof(T) / sizeof(TConvVec2Base<T>::TBase))
+
+template<typename TBase, Ncv32u NC> struct TConvBase2Vec;
+template<> struct TConvBase2Vec<Ncv8u, 1> {typedef uchar1 TVec;};
+template<> struct TConvBase2Vec<Ncv8u, 3> {typedef uchar3 TVec;};
+template<> struct TConvBase2Vec<Ncv8u, 4> {typedef uchar4 TVec;};
+template<> struct TConvBase2Vec<Ncv16u, 1> {typedef ushort1 TVec;};
+template<> struct TConvBase2Vec<Ncv16u, 3> {typedef ushort3 TVec;};
+template<> struct TConvBase2Vec<Ncv16u, 4> {typedef ushort4 TVec;};
+template<> struct TConvBase2Vec<Ncv32u, 1> {typedef uint1 TVec;};
+template<> struct TConvBase2Vec<Ncv32u, 3> {typedef uint3 TVec;};
+template<> struct TConvBase2Vec<Ncv32u, 4> {typedef uint4 TVec;};
+template<> struct TConvBase2Vec<Ncv32f, 1> {typedef float1 TVec;};
+template<> struct TConvBase2Vec<Ncv32f, 3> {typedef float3 TVec;};
+template<> struct TConvBase2Vec<Ncv32f, 4> {typedef float4 TVec;};
+template<> struct TConvBase2Vec<Ncv64f, 1> {typedef double1 TVec;};
+template<> struct TConvBase2Vec<Ncv64f, 3> {typedef double3 TVec;};
+template<> struct TConvBase2Vec<Ncv64f, 4> {typedef double4 TVec;};
+
+//TODO: consider using CUDA intrinsics to avoid branching
+template<typename Tin> static inline void _TDemoteClampZ(Tin &a, Ncv8u &out) {out = (Ncv8u)CLAMP_0_255(a);};
+template<typename Tin> static inline void _TDemoteClampZ(Tin &a, Ncv16u &out) {out = (Ncv16u)CLAMP(a, 0, USHRT_MAX);}
+template<typename Tin> static inline void _TDemoteClampZ(Tin &a, Ncv32u &out) {out = (Ncv32u)CLAMP(a, 0, UINT_MAX);}
+template<typename Tin> static inline void _TDemoteClampZ(Tin &a, Ncv32f &out) {out = (Ncv32f)a;}
+
+//TODO: consider using CUDA intrinsics to avoid branching
+template<typename Tin> static inline void _TDemoteClampNN(Tin &a, Ncv8u &out) {out = (Ncv8u)CLAMP_0_255(a+0.5f);}
+template<typename Tin> static inline void _TDemoteClampNN(Tin &a, Ncv16u &out) {out = (Ncv16u)CLAMP(a+0.5f, 0, USHRT_MAX);}
+template<typename Tin> static inline void _TDemoteClampNN(Tin &a, Ncv32u &out) {out = (Ncv32u)CLAMP(a+0.5f, 0, UINT_MAX);}
+template<typename Tin> static inline void _TDemoteClampNN(Tin &a, Ncv32f &out) {out = (Ncv32f)a;}
+
+template<typename Tout> inline Tout _pixMakeZero();
+template<> static inline uchar1 _pixMakeZero<uchar1>() {return make_uchar1(0);}
+template<> static inline uchar3 _pixMakeZero<uchar3>() {return make_uchar3(0,0,0);}
+template<> static inline uchar4 _pixMakeZero<uchar4>() {return make_uchar4(0,0,0,0);}
+template<> static inline ushort1 _pixMakeZero<ushort1>() {return make_ushort1(0);}
+template<> static inline ushort3 _pixMakeZero<ushort3>() {return make_ushort3(0,0,0);}
+template<> static inline ushort4 _pixMakeZero<ushort4>() {return make_ushort4(0,0,0,0);}
+template<> static inline uint1 _pixMakeZero<uint1>() {return make_uint1(0);}
+template<> static inline uint3 _pixMakeZero<uint3>() {return make_uint3(0,0,0);}
+template<> static inline uint4 _pixMakeZero<uint4>() {return make_uint4(0,0,0,0);}
+template<> static inline float1 _pixMakeZero<float1>() {return make_float1(0.f);}
+template<> static inline float3 _pixMakeZero<float3>() {return make_float3(0.f,0.f,0.f);}
+template<> static inline float4 _pixMakeZero<float4>() {return make_float4(0.f,0.f,0.f,0.f);}
+template<> static inline double1 _pixMakeZero<double1>() {return make_double1(0.);}
+template<> static inline double3 _pixMakeZero<double3>() {return make_double3(0.,0.,0.);}
+template<> static inline double4 _pixMakeZero<double4>() {return make_double4(0.,0.,0.,0.);}
+
+static inline uchar1 _pixMake(Ncv8u x) {return make_uchar1(x);}
+static inline uchar3 _pixMake(Ncv8u x, Ncv8u y, Ncv8u z) {return make_uchar3(x,y,z);}
+static inline uchar4 _pixMake(Ncv8u x, Ncv8u y, Ncv8u z, Ncv8u w) {return make_uchar4(x,y,z,w);}
+static inline ushort1 _pixMake(Ncv16u x) {return make_ushort1(x);}
+static inline ushort3 _pixMake(Ncv16u x, Ncv16u y, Ncv16u z) {return make_ushort3(x,y,z);}
+static inline ushort4 _pixMake(Ncv16u x, Ncv16u y, Ncv16u z, Ncv16u w) {return make_ushort4(x,y,z,w);}
+static inline uint1 _pixMake(Ncv32u x) {return make_uint1(x);}
+static inline uint3 _pixMake(Ncv32u x, Ncv32u y, Ncv32u z) {return make_uint3(x,y,z);}
+static inline uint4 _pixMake(Ncv32u x, Ncv32u y, Ncv32u z, Ncv32u w) {return make_uint4(x,y,z,w);}
+static inline float1 _pixMake(Ncv32f x) {return make_float1(x);}
+static inline float3 _pixMake(Ncv32f x, Ncv32f y, Ncv32f z) {return make_float3(x,y,z);}
+static inline float4 _pixMake(Ncv32f x, Ncv32f y, Ncv32f z, Ncv32f w) {return make_float4(x,y,z,w);}
+static inline double1 _pixMake(Ncv64f x) {return make_double1(x);}
+static inline double3 _pixMake(Ncv64f x, Ncv64f y, Ncv64f z) {return make_double3(x,y,z);}
+static inline double4 _pixMake(Ncv64f x, Ncv64f y, Ncv64f z, Ncv64f w) {return make_double4(x,y,z,w);}
+
+
+template<typename Tin, typename Tout, Ncv32u CN> struct __pixDemoteClampZ_CN {static Tout _pixDemoteClampZ_CN(Tin &pix);};
+
+template<typename Tin, typename Tout> struct __pixDemoteClampZ_CN<Tin, Tout, 1> {
+static Tout _pixDemoteClampZ_CN(Tin &pix)
+{
+    Tout out;
+    _TDemoteClampZ(pix.x, out.x);
+    return out;
+}};
+
+template<typename Tin, typename Tout> struct __pixDemoteClampZ_CN<Tin, Tout, 3> {
+static Tout _pixDemoteClampZ_CN(Tin &pix)
+{
+    Tout out;
+    _TDemoteClampZ(pix.x, out.x);
+    _TDemoteClampZ(pix.y, out.y);
+    _TDemoteClampZ(pix.z, out.z);
+    return out;
+}};
+
+template<typename Tin, typename Tout> struct __pixDemoteClampZ_CN<Tin, Tout, 4> {
+static Tout _pixDemoteClampZ_CN(Tin &pix)
+{
+    Tout out;
+    _TDemoteClampZ(pix.x, out.x);
+    _TDemoteClampZ(pix.y, out.y);
+    _TDemoteClampZ(pix.z, out.z);
+    _TDemoteClampZ(pix.w, out.w);
+    return out;
+}};
+
+template<typename Tin, typename Tout> static inline Tout _pixDemoteClampZ(Tin &pix)
+{
+    return __pixDemoteClampZ_CN<Tin, Tout, NC(Tin)>::_pixDemoteClampZ_CN(pix);
+}
+
+
+template<typename Tin, typename Tout, Ncv32u CN> struct __pixDemoteClampNN_CN {static Tout _pixDemoteClampNN_CN(Tin &pix);};
+
+template<typename Tin, typename Tout> struct __pixDemoteClampNN_CN<Tin, Tout, 1> {
+static Tout _pixDemoteClampNN_CN(Tin &pix)
+{
+    Tout out;
+    _TDemoteClampNN(pix.x, out.x);
+    return out;
+}};
+
+template<typename Tin, typename Tout> struct __pixDemoteClampNN_CN<Tin, Tout, 3> {
+static Tout _pixDemoteClampNN_CN(Tin &pix)
+{
+    Tout out;
+    _TDemoteClampNN(pix.x, out.x);
+    _TDemoteClampNN(pix.y, out.y);
+    _TDemoteClampNN(pix.z, out.z);
+    return out;
+}};
+
+template<typename Tin, typename Tout> struct __pixDemoteClampNN_CN<Tin, Tout, 4> {
+static Tout _pixDemoteClampNN_CN(Tin &pix)
+{
+    Tout out;
+    _TDemoteClampNN(pix.x, out.x);
+    _TDemoteClampNN(pix.y, out.y);
+    _TDemoteClampNN(pix.z, out.z);
+    _TDemoteClampNN(pix.w, out.w);
+    return out;
+}};
+
+template<typename Tin, typename Tout> static inline Tout _pixDemoteClampNN(Tin &pix)
+{
+    return __pixDemoteClampNN_CN<Tin, Tout, NC(Tin)>::_pixDemoteClampNN_CN(pix);
+}
+
+
+template<typename Tin, typename Tout, typename Tw, Ncv32u CN> struct __pixScale_CN {static Tout _pixScale_CN(Tin &pix, Tw w);};
+
+template<typename Tin, typename Tout, typename Tw> struct __pixScale_CN<Tin, Tout, Tw, 1> {
+static Tout _pixScale_CN(Tin &pix, Tw w)
+{
+    Tout out;
+    typedef typename TConvVec2Base<Tout>::TBase TBout;
+    out.x = (TBout)(pix.x * w);
+    return out;
+}};
+
+template<typename Tin, typename Tout, typename Tw> struct __pixScale_CN<Tin, Tout, Tw, 3> {
+static Tout _pixScale_CN(Tin &pix, Tw w)
+{
+    Tout out;
+    typedef typename TConvVec2Base<Tout>::TBase TBout;
+    out.x = (TBout)(pix.x * w);
+    out.y = (TBout)(pix.y * w);
+    out.z = (TBout)(pix.z * w);
+    return out;
+}};
+
+template<typename Tin, typename Tout, typename Tw> struct __pixScale_CN<Tin, Tout, Tw, 4> {
+static Tout _pixScale_CN(Tin &pix, Tw w)
+{
+    Tout out;
+    typedef typename TConvVec2Base<Tout>::TBase TBout;
+    out.x = (TBout)(pix.x * w);
+    out.y = (TBout)(pix.y * w);
+    out.z = (TBout)(pix.z * w);
+    out.w = (TBout)(pix.w * w);
+    return out;
+}};
+
+template<typename Tin, typename Tout, typename Tw> static Tout _pixScale(Tin &pix, Tw w)
+{
+    return __pixScale_CN<Tin, Tout, Tw, NC(Tin)>::_pixScale_CN(pix, w);
+}
+
+
+template<typename Tin, typename Tout, Ncv32u CN> struct __pixAdd_CN {static Tout _pixAdd_CN(Tout &pix1, Tin &pix2);};
+
+template<typename Tin, typename Tout> struct __pixAdd_CN<Tin, Tout, 1> {
+static Tout _pixAdd_CN(Tout &pix1, Tin &pix2)
+{
+    Tout out;
+    out.x = pix1.x + pix2.x;
+    return out;
+}};
+
+template<typename Tin, typename Tout> struct __pixAdd_CN<Tin, Tout, 3> {
+static Tout _pixAdd_CN(Tout &pix1, Tin &pix2)
+{
+    Tout out;
+    out.x = pix1.x + pix2.x;
+    out.y = pix1.y + pix2.y;
+    out.z = pix1.z + pix2.z;
+    return out;
+}};
+
+template<typename Tin, typename Tout> struct __pixAdd_CN<Tin, Tout, 4> {
+static Tout _pixAdd_CN(Tout &pix1, Tin &pix2)
+{
+    Tout out;
+    out.x = pix1.x + pix2.x;
+    out.y = pix1.y + pix2.y;
+    out.z = pix1.z + pix2.z;
+    out.w = pix1.w + pix2.w;
+    return out;
+}};
+
+template<typename Tin, typename Tout> static Tout _pixAdd(Tout &pix1, Tin &pix2)
+{
+    return __pixAdd_CN<Tin, Tout, NC(Tin)>::_pixAdd_CN(pix1, pix2);
+}
+
+
+template<typename Tin, typename Tout, Ncv32u CN> struct __pixDist_CN {static Tout _pixDist_CN(Tin &pix1, Tin &pix2);};
+
+template<typename Tin, typename Tout> struct __pixDist_CN<Tin, Tout, 1> {
+static Tout _pixDist_CN(Tin &pix1, Tin &pix2)
+{
+    return Tout(SQR(pix1.x - pix2.x));
+}};
+
+template<typename Tin, typename Tout> struct __pixDist_CN<Tin, Tout, 3> {
+static Tout _pixDist_CN(Tin &pix1, Tin &pix2)
+{
+    return Tout(SQR(pix1.x - pix2.x) + SQR(pix1.y - pix2.y) + SQR(pix1.z - pix2.z));
+}};
+
+template<typename Tin, typename Tout> struct __pixDist_CN<Tin, Tout, 4> {
+static Tout _pixDist_CN(Tin &pix1, Tin &pix2)
+{
+    return Tout(SQR(pix1.x - pix2.x) + SQR(pix1.y - pix2.y) + SQR(pix1.z - pix2.z) + SQR(pix1.w - pix2.w));
+}};
+
+template<typename Tin, typename Tout> static Tout _pixDist(Tin &pix1, Tin &pix2)
+{
+    return __pixDist_CN<Tin, Tout, NC(Tin)>::_pixDist_CN(pix1, pix2);
+}
+
+
+template <typename T> struct TAccPixWeighted;
+template<> struct TAccPixWeighted<uchar1> {typedef double1 type;};
+template<> struct TAccPixWeighted<uchar3> {typedef double3 type;};
+template<> struct TAccPixWeighted<uchar4> {typedef double4 type;};
+template<> struct TAccPixWeighted<ushort1> {typedef double1 type;};
+template<> struct TAccPixWeighted<ushort3> {typedef double3 type;};
+template<> struct TAccPixWeighted<ushort4> {typedef double4 type;};
+template<> struct TAccPixWeighted<float1> {typedef double1 type;};
+template<> struct TAccPixWeighted<float3> {typedef double3 type;};
+template<> struct TAccPixWeighted<float4> {typedef double4 type;};
+
+template<typename Tfrom> struct TAccPixDist {};
+template<> struct TAccPixDist<uchar1> {typedef Ncv32u type;};
+template<> struct TAccPixDist<uchar3> {typedef Ncv32u type;};
+template<> struct TAccPixDist<uchar4> {typedef Ncv32u type;};
+template<> struct TAccPixDist<ushort1> {typedef Ncv32u type;};
+template<> struct TAccPixDist<ushort3> {typedef Ncv32u type;};
+template<> struct TAccPixDist<ushort4> {typedef Ncv32u type;};
+template<> struct TAccPixDist<float1> {typedef Ncv32f type;};
+template<> struct TAccPixDist<float3> {typedef Ncv32f type;};
+template<> struct TAccPixDist<float4> {typedef Ncv32f type;};
+
+#endif //_ncv_pixel_operations_hpp_
--- a/modules/gpu/src/nvidia/core/NCVPyramid.cu
+++ b/modules/gpu/src/nvidia/core/NCVPyramid.cu
@ -0,0 +1,397 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
+// 
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include "NCV.hpp"
+#include "NCVPyramid.hpp"
+#include "NCVPixelOperations.hpp"
+
+
+template<typename T, Ncv32u CN> struct __average4_CN {static T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11);};
+
+template<typename T> struct __average4_CN<T, 1> {
+static T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11)
+{
+    T out;
+    out.x = ((Ncv32s)p00.x + p01.x + p10.x + p11.x + 2) / 4;
+    return out;
+}};
+
+template<> struct __average4_CN<float1, 1> {
+static float1 _average4_CN(const float1 &p00, const float1 &p01, const float1 &p10, const float1 &p11)
+{
+    float1 out;
+    out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
+    return out;
+}};
+
+template<> struct __average4_CN<double1, 1> {
+static double1 _average4_CN(const double1 &p00, const double1 &p01, const double1 &p10, const double1 &p11)
+{
+    double1 out;
+    out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
+    return out;
+}};
+
+template<typename T> struct __average4_CN<T, 3> {
+static T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11)
+{
+    T out;
+    out.x = ((Ncv32s)p00.x + p01.x + p10.x + p11.x + 2) / 4;
+    out.y = ((Ncv32s)p00.y + p01.y + p10.y + p11.y + 2) / 4;
+    out.z = ((Ncv32s)p00.z + p01.z + p10.z + p11.z + 2) / 4;
+    return out;
+}};
+
+template<> struct __average4_CN<float3, 3> {
+static float3 _average4_CN(const float3 &p00, const float3 &p01, const float3 &p10, const float3 &p11)
+{
+    float3 out;
+    out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
+    out.y = (p00.y + p01.y + p10.y + p11.y) / 4;
+    out.z = (p00.z + p01.z + p10.z + p11.z) / 4;
+    return out;
+}};
+
+template<> struct __average4_CN<double3, 3> {
+static double3 _average4_CN(const double3 &p00, const double3 &p01, const double3 &p10, const double3 &p11)
+{
+    double3 out;
+    out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
+    out.y = (p00.y + p01.y + p10.y + p11.y) / 4;
+    out.z = (p00.z + p01.z + p10.z + p11.z) / 4;
+    return out;
+}};
+
+template<typename T> struct __average4_CN<T, 4> {
+static T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11)
+{
+    T out;
+    out.x = ((Ncv32s)p00.x + p01.x + p10.x + p11.x + 2) / 4;
+    out.y = ((Ncv32s)p00.y + p01.y + p10.y + p11.y + 2) / 4;
+    out.z = ((Ncv32s)p00.z + p01.z + p10.z + p11.z + 2) / 4;
+    out.w = ((Ncv32s)p00.w + p01.w + p10.w + p11.w + 2) / 4;
+    return out;
+}};
+
+template<> struct __average4_CN<float4, 4> {
+static float4 _average4_CN(const float4 &p00, const float4 &p01, const float4 &p10, const float4 &p11)
+{
+    float4 out;
+    out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
+    out.y = (p00.y + p01.y + p10.y + p11.y) / 4;
+    out.z = (p00.z + p01.z + p10.z + p11.z) / 4;
+    out.w = (p00.w + p01.w + p10.w + p11.w) / 4;
+    return out;
+}};
+
+template<> struct __average4_CN<double4, 4> {
+static double4 _average4_CN(const double4 &p00, const double4 &p01, const double4 &p10, const double4 &p11)
+{
+    double4 out;
+    out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
+    out.y = (p00.y + p01.y + p10.y + p11.y) / 4;
+    out.z = (p00.z + p01.z + p10.z + p11.z) / 4;
+    out.w = (p00.w + p01.w + p10.w + p11.w) / 4;
+    return out;
+}};
+
+template<typename T> static T _average4(const T &p00, const T &p01, const T &p10, const T &p11)
+{
+    return __average4_CN<T, NC(T)>::_average4_CN(p00, p01, p10, p11);
+}
+
+
+template<typename Tin, typename Tout, Ncv32u CN> struct __lerp_CN {static Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d);};
+
+template<typename Tin, typename Tout> struct __lerp_CN<Tin, Tout, 1> {
+static Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
+{
+    typedef typename TConvVec2Base<Tout>::TBase TB;
+    return _pixMake(TB(b.x * d + a.x * (1 - d)));
+}};
+
+template<typename Tin, typename Tout> struct __lerp_CN<Tin, Tout, 3> {
+static Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
+{
+    typedef typename TConvVec2Base<Tout>::TBase TB;
+    return _pixMake(TB(b.x * d + a.x * (1 - d)),
+                    TB(b.y * d + a.y * (1 - d)),
+                    TB(b.z * d + a.z * (1 - d)));
+}};
+
+template<typename Tin, typename Tout> struct __lerp_CN<Tin, Tout, 4> {
+static Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
+{
+    typedef typename TConvVec2Base<Tout>::TBase TB;
+    return _pixMake(TB(b.x * d + a.x * (1 - d)),
+                    TB(b.y * d + a.y * (1 - d)),
+                    TB(b.z * d + a.z * (1 - d)),
+                    TB(b.w * d + a.w * (1 - d)));
+}};
+
+template<typename Tin, typename Tout> static Tout _lerp(const Tin &a, const Tin &b, Ncv32f d)
+{
+    return __lerp_CN<Tin, Tout, NC(Tin)>::_lerp_CN(a, b, d);
+}
+
+
+template<typename T>
+static T _interpLinear(const T &a, const T &b, Ncv32f d)
+{
+    typedef typename TConvBase2Vec<Ncv32f, NC(T)>::TVec TVFlt;
+    TVFlt tmp = _lerp<T, TVFlt>(a, b, d);
+    return _pixDemoteClampZ<TVFlt, T>(tmp);
+}
+
+
+template<typename T>
+static T _interpBilinear(const NCVMatrix<T> &refLayer, Ncv32f x, Ncv32f y)
+{
+    Ncv32u xl = (Ncv32u)x;
+    Ncv32u xh = xl+1;
+    Ncv32f dx = x - xl;
+    Ncv32u yl = (Ncv32u)y;
+    Ncv32u yh = yl+1;
+    Ncv32f dy = y - yl;
+    T p00, p01, p10, p11;
+    p00 = refLayer.at(xl, yl);
+    p01 = xh < refLayer.width() ? refLayer.at(xh, yl) : p00;
+    p10 = yh < refLayer.height() ? refLayer.at(xl, yh) : p00;
+    p11 = (xh < refLayer.width() && yh < refLayer.height()) ? refLayer.at(xh, yh) : p00;
+    typedef typename TConvBase2Vec<Ncv32f, NC(T)>::TVec TVFlt;
+    TVFlt m_00_01 = _lerp<T, TVFlt>(p00, p01, dx);
+    TVFlt m_10_11 = _lerp<T, TVFlt>(p10, p11, dx);
+    TVFlt mixture = _lerp<TVFlt, TVFlt>(m_00_01, m_10_11, dy);
+    return _pixDemoteClampZ<TVFlt, T>(mixture);
+}
+
+
+template <class T>
+NCVImagePyramid<T>::NCVImagePyramid(const NCVMatrix<T> &img,
+                                    Ncv8u numLayers,
+                                    INCVMemAllocator &alloc,
+                                    cudaStream_t cuStream)
+{
+    this->_isInitialized = false;
+    ncvAssertPrintReturn(img.memType() == alloc.memType(), "NCVImagePyramid_host::ctor error", );
+
+    this->layer0 = &img;
+    NcvSize32u szLastLayer(img.width(), img.height());
+    this->nLayers = 1;
+
+    NCV_SET_SKIP_COND(alloc.isCounting());
+    NcvBool bDeviceCode = alloc.memType() == NCVMemoryTypeDevice;
+
+    if (numLayers == 0)
+    {
+        numLayers = 255; //it will cut-off when any of the dimensions goes 1
+    }
+
+    for (Ncv32u i=0; i<(Ncv32u)numLayers-1; i++)
+    {
+        NcvSize32u szCurLayer(szLastLayer.width / 2, szLastLayer.height / 2);
+        if (szCurLayer.width == 0 || szCurLayer.height == 0)
+        {
+            break;
+        }
+
+        this->pyramid.push_back(new NCVMatrixAlloc<T>(alloc, szCurLayer.width, szCurLayer.height));
+        ncvAssertPrintReturn(((NCVMatrixAlloc<T> *)(this->pyramid[i]))->isMemAllocated(), "NCVImagePyramid_host::ctor error", );
+        this->nLayers++;
+
+        //fill in the layer
+        NCV_SKIP_COND_BEGIN
+
+        const NCVMatrix<T> *prevLayer = i == 0 ? this->layer0 : this->pyramid[i-1];
+        NCVMatrix<T> *curLayer = this->pyramid[i];
+
+        if (bDeviceCode)
+        {
+            //TODO: in cuStream
+        }
+        else
+        {
+            for (Ncv32u i=0; i<szCurLayer.height; i++)
+            {
+                for (Ncv32u j=0; j<szCurLayer.width; j++)
+                {
+                    T p00 = prevLayer->at(2*j+0, 2*i+0);
+                    T p01 = prevLayer->at(2*j+1, 2*i+0);
+                    T p10 = prevLayer->at(2*j+0, 2*i+1);
+                    T p11 = prevLayer->at(2*j+1, 2*i+1);
+                    curLayer->at(j, i) = _average4(p00, p01, p10, p11);
+                }
+            }
+        }
+
+        NCV_SKIP_COND_END
+
+        szLastLayer = szCurLayer;
+    }
+
+    this->_isInitialized = true;
+}
+
+
+template <class T>
+NCVImagePyramid<T>::~NCVImagePyramid()
+{
+}
+
+
+template <class T>
+NcvBool NCVImagePyramid<T>::isInitialized() const
+{
+    return this->_isInitialized;
+}
+
+
+template <class T>
+NCVStatus NCVImagePyramid<T>::getLayer(NCVMatrix<T> &outImg,
+                                       NcvSize32u outRoi,
+                                       NcvBool bTrilinear,
+                                       cudaStream_t cuStream) const
+{
+    ncvAssertReturn(this->isInitialized(), NCV_UNKNOWN_ERROR);
+    ncvAssertReturn(outImg.memType() == this->layer0->memType(), NCV_MEM_RESIDENCE_ERROR);
+    ncvAssertReturn(outRoi.width <= this->layer0->width() && outRoi.height <= this->layer0->height() &&
+                    outRoi.width > 0 && outRoi.height > 0, NCV_DIMENSIONS_INVALID);
+
+    if (outRoi.width == this->layer0->width() && outRoi.height == this->layer0->height())
+    {
+        ncvAssertReturnNcvStat(this->layer0->copy2D(outImg, NcvSize32u(this->layer0->width(), this->layer0->height()), cuStream));
+        return NCV_SUCCESS;
+    }
+
+    Ncv32f lastScale = 1.0f;
+    Ncv32f curScale;
+    const NCVMatrix<T> *lastLayer = this->layer0;
+    const NCVMatrix<T> *curLayer = NULL;
+    NcvBool bUse2Refs = false;
+
+    for (Ncv32u i=0; i<this->nLayers-1; i++)
+    {
+        curScale = lastScale * 0.5f;
+        curLayer = this->pyramid[i];
+
+        if (outRoi.width == curLayer->width() && outRoi.height == curLayer->height())
+        {
+            ncvAssertReturnNcvStat(this->pyramid[i]->copy2D(outImg, NcvSize32u(this->pyramid[i]->width(), this->pyramid[i]->height()), cuStream));
+            return NCV_SUCCESS;
+        }
+
+        if (outRoi.width >= curLayer->width() && outRoi.height >= curLayer->height())
+        {
+            if (outRoi.width < lastLayer->width() && outRoi.height < lastLayer->height())
+            {
+                bUse2Refs = true;
+            }
+            break;
+        }
+
+        lastScale = curScale;
+        lastLayer = curLayer;
+    }
+
+    bUse2Refs = bUse2Refs && bTrilinear;
+
+    NCV_SET_SKIP_COND(outImg.memType() == NCVMemoryTypeNone);
+    NcvBool bDeviceCode = this->layer0->memType() == NCVMemoryTypeDevice;
+
+    NCV_SKIP_COND_BEGIN
+
+    if (bDeviceCode)
+    {
+        //TODO: in cuStream
+    }
+    else
+    {
+        for (Ncv32u i=0; i<outRoi.height; i++)
+        {
+            for (Ncv32u j=0; j<outRoi.width; j++)
+            {
+                //top layer pixel (always exists)
+                NcvSize32u szTopLayer(lastLayer->width(), lastLayer->height());
+                Ncv32f ptTopX = 1.0f * (szTopLayer.width - 1) * j / (outRoi.width - 1);
+                Ncv32f ptTopY = 1.0f * (szTopLayer.height - 1) * i / (outRoi.height - 1);
+                T topPix = _interpBilinear(*lastLayer, ptTopX, ptTopY);
+                T trilinearPix = topPix;
+
+                if (bUse2Refs)
+                {
+                    //bottom layer pixel (exists only if the requested scale is greater than the smallest layer scale)
+                    NcvSize32u szBottomLayer(curLayer->width(), curLayer->height());
+                    Ncv32f ptBottomX = 1.0f * (szBottomLayer.width - 1) * j / (outRoi.width - 1);
+                    Ncv32f ptBottomY = 1.0f * (szBottomLayer.height - 1) * i / (outRoi.height - 1);
+                    T bottomPix = _interpBilinear(*curLayer, ptBottomX, ptBottomY);
+
+                    Ncv32f scale = (1.0f * outRoi.width / layer0->width() + 1.0f * outRoi.height / layer0->height()) / 2;
+                    Ncv32f dl = (scale - curScale) / (lastScale - curScale);
+                    dl = CLAMP(dl, 0.0f, 1.0f);
+                    trilinearPix = _interpLinear(bottomPix, topPix, dl);
+                }
+
+                outImg.at(j, i) = trilinearPix;
+            }
+        }
+    }
+
+    NCV_SKIP_COND_END
+
+    return NCV_SUCCESS;
+}
+
+
+template class NCVImagePyramid<uchar1>;
+template class NCVImagePyramid<uchar3>;
+template class NCVImagePyramid<uchar4>;
+template class NCVImagePyramid<ushort1>;
+template class NCVImagePyramid<ushort3>;
+template class NCVImagePyramid<ushort4>;
+template class NCVImagePyramid<uint1>;
+template class NCVImagePyramid<uint3>;
+template class NCVImagePyramid<uint4>;
+template class NCVImagePyramid<float1>;
+template class NCVImagePyramid<float3>;
+template class NCVImagePyramid<float4>;
--- a/modules/gpu/src/nvidia/core/NCVPyramid.hpp
+++ b/modules/gpu/src/nvidia/core/NCVPyramid.hpp
@ -0,0 +1,97 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
+// 
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+#ifndef _ncvpyramid_hpp_
+#define _ncvpyramid_hpp_
+
+#include <memory>
+#include <vector>
+#include "NCV.hpp"
+
+
+template <class T>
+class NCV_EXPORTS NCVMatrixStack
+{
+public:
+    NCVMatrixStack() {this->_arr.clear();}
+    ~NCVMatrixStack()
+    {
+        const Ncv32u nElem = this->_arr.size();
+        for (Ncv32u i=0; i<nElem; i++)
+        {
+            pop_back();
+        }
+    }
+    void push_back(NCVMatrix<T> *elem) {this->_arr.push_back(std::tr1::shared_ptr< NCVMatrix<T> >(elem));}
+    void pop_back() {this->_arr.pop_back();}
+    NCVMatrix<T> * operator [] (int i) const {return this->_arr[i].get();}
+private:
+    std::vector< std::tr1::shared_ptr< NCVMatrix<T> > > _arr;
+};
+
+
+template <class T>
+class NCV_EXPORTS NCVImagePyramid
+{
+public:
+
+    NCVImagePyramid(const NCVMatrix<T> &img,
+                    Ncv8u nLayers,
+                    INCVMemAllocator &alloc,
+                    cudaStream_t cuStream);
+    ~NCVImagePyramid();
+    NcvBool isInitialized() const;
+    NCVStatus getLayer(NCVMatrix<T> &outImg,
+                       NcvSize32u outRoi,
+                       NcvBool bTrilinear,
+                       cudaStream_t cuStream) const;
+
+private:
+
+    NcvBool _isInitialized;
+    const NCVMatrix<T> *layer0;
+    NCVMatrixStack<T> pyramid;
+    Ncv32u nLayers;
+};
+
+
+#endif //_ncvpyramid_hpp_
--- a/modules/gpu/src/optical_flow.cpp
+++ b/modules/gpu/src/optical_flow.cpp
@ -68,10 +68,7 @@ namespace

 namespace
 {
-    void outputHandler(const char* msg) 
-    { 
-        CV_Error(CV_GpuApiCallError, msg); 
-    }
+    static void outputHandler(const std::string &msg) { CV_Error(CV_GpuApiCallError, msg.c_str()); }
 }

 void cv::gpu::BroxOpticalFlow::operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& s)