Merge pull request #3378 from cudawarped:replace_texture_ref_with_texture_obj

Fix CUDA texture bugs and replace all instances of CUDA texture references with texture objects
commit 8db3e627fb
Alexander Smorkalov (committed via GitHub)
33 changed files (lines changed in parentheses):

 1. modules/cudaarithm/src/cuda/lut.cu (84)
 2. modules/cudaarithm/src/lut.hpp (6)
 3. modules/cudaimgproc/src/cuda/canny.cu (217)
 4. modules/cudaimgproc/src/cuda/corners.cu (55)
 5. modules/cudaimgproc/src/cuda/debayer.cu (48)
 6. modules/cudaimgproc/src/cuda/gftt.cu (52)
 7. modules/cudaimgproc/src/cuda/hough_segments.cu (25)
 8. modules/cudaimgproc/src/cuda/mean_shift.cu (39)
 9. modules/cudaimgproc/src/gftt.cpp (24)
10. modules/cudaimgproc/test/test_color.cpp (18)
11. modules/cudaimgproc/test/test_hough.cpp (64)
12. modules/cudaimgproc/test/test_precomp.hpp (2)
13. modules/cudalegacy/include/opencv2/cudalegacy/NCV.hpp (4)
14. modules/cudalegacy/include/opencv2/cudalegacy/NPP_staging.hpp (10)
15. modules/cudalegacy/src/cuda/NCVBroxOpticalFlow.cu (363)
16. modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu (352)
17. modules/cudalegacy/src/cuda/NPP_staging.cu (428)
18. modules/cudalegacy/src/cuda/bm.cu (24)
19. modules/cudalegacy/test/TestHypothesesGrow.cpp (3)
20. modules/cudaobjdetect/src/cuda/hog.cu (52)
21. modules/cudaobjdetect/test/test_objdetect.cpp (13)
22. modules/cudaoptflow/src/cuda/pyrlk.cu (306)
23. modules/cudaoptflow/src/cuda/tvl1flow.cu (113)
24. modules/cudastereo/src/cuda/stereobm.cu (39)
25. modules/cudawarping/src/cuda/remap.cu (189)
26. modules/cudawarping/src/cuda/resize.cu (112)
27. modules/cudawarping/src/cuda/warp.cu (161)
28. modules/cudawarping/test/test_precomp.hpp (2)
29. modules/cudawarping/test/test_resize.cpp (54)
30. modules/cudev/include/opencv2/cudev/ptr2d/texture.hpp (429)
31. modules/cudev/include/opencv2/cudev/warp/shuffle.hpp (2)
32. modules/xfeatures2d/src/cuda/surf.cu (140)
33. modules/xfeatures2d/src/surf.cuda.cpp (34)

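Background for the diff below: the file-scope texture references (`texture<...>` globals) being deleted in every file here were deprecated for years and removed entirely in CUDA 12, and as global state they also break concurrent use of the same kernel from multiple streams or threads. Texture objects are ordinary run-time values passed to kernels as arguments. A minimal standalone sketch of the replacement pattern, using only the raw CUDA runtime API (none of this code is from the PR):

#include <cstdio>
#include <cuda_runtime.h>

// The legacy path relied on a file-scope texture reference, which is global
// state and no longer compiles on CUDA 12:
//
//   texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src;
//   ...bind per call with cudaBindTexture2D, unbind afterwards...
//
// The replacement passes a cudaTextureObject_t by value, so each call site
// owns its own handle and concurrent streams cannot race on a global binding.
__global__ void readThroughTexture(cudaTextureObject_t tex, unsigned char* out, int w, int h)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < w && y < h)
        out[y * w + x] = tex2D<unsigned char>(tex, x, y); // object, not global ref
}

int main()
{
    const int w = 64, h = 64;
    size_t pitch = 0;
    unsigned char *src = nullptr, *dst = nullptr;
    cudaMallocPitch(&src, &pitch, w, h);
    cudaMalloc(&dst, w * h);

    // Describe the pitched 2D buffer...
    cudaResourceDesc resDesc = {};
    resDesc.resType = cudaResourceTypePitch2D;
    resDesc.res.pitch2D.devPtr = src;
    resDesc.res.pitch2D.width = w;
    resDesc.res.pitch2D.height = h;
    resDesc.res.pitch2D.pitchInBytes = pitch;
    resDesc.res.pitch2D.desc = cudaCreateChannelDesc<unsigned char>();

    // ...and how it should be sampled (point filtering, clamped addressing).
    cudaTextureDesc texDesc = {};
    texDesc.filterMode = cudaFilterModePoint;
    texDesc.addressMode[0] = cudaAddressModeClamp;
    texDesc.addressMode[1] = cudaAddressModeClamp;

    cudaTextureObject_t tex = 0;
    cudaCreateTextureObject(&tex, &resDesc, &texDesc, nullptr);

    readThroughTexture<<<dim3(8, 8), dim3(8, 8)>>>(tex, dst, w, h);
    cudaDeviceSynchronize();

    cudaDestroyTextureObject(tex); // must outlive every kernel that samples it
    cudaFree(dst);
    cudaFree(src);
    printf("done: %s\n", cudaGetErrorString(cudaGetLastError()));
    return 0;
}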
modules/cudaarithm/src/cuda/lut.cu

@@ -53,6 +53,7 @@
 #include "opencv2/cudaarithm.hpp"
 #include "opencv2/cudev.hpp"
 #include "opencv2/core/private.cuda.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 using namespace cv;
 using namespace cv::cuda;
@@ -60,8 +61,6 @@ using namespace cv::cudev;
 namespace cv { namespace cuda {
-    texture<uchar, cudaTextureType1D, cudaReadModeElementType> texLutTable;
 LookUpTableImpl::LookUpTableImpl(InputArray _lut)
 {
     if (_lut.kind() == _InputArray::CUDA_GPU_MAT)
@@ -73,83 +72,28 @@ namespace cv { namespace cuda {
         Mat h_lut = _lut.getMat();
         d_lut.upload(Mat(1, 256, h_lut.type(), h_lut.data));
     }
     CV_Assert( d_lut.depth() == CV_8U );
     CV_Assert( d_lut.rows == 1 && d_lut.cols == 256 );
+    szInBytes = 256 * d_lut.channels() * sizeof(uchar);
-    cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);
-    if (cc30)
-    {
-        // Use the texture object
-        cudaResourceDesc texRes;
-        std::memset(&texRes, 0, sizeof(texRes));
-        texRes.resType = cudaResourceTypeLinear;
-        texRes.res.linear.devPtr = d_lut.data;
-        texRes.res.linear.desc = cudaCreateChannelDesc<uchar>();
-        texRes.res.linear.sizeInBytes = 256 * d_lut.channels() * sizeof(uchar);
-        cudaTextureDesc texDescr;
-        std::memset(&texDescr, 0, sizeof(texDescr));
-        CV_CUDEV_SAFE_CALL( cudaCreateTextureObject(&texLutTableObj, &texRes, &texDescr, 0) );
-    }
-    else
-    {
-        // Use the texture reference
-        cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar>();
-        CV_CUDEV_SAFE_CALL( cudaBindTexture(0, &texLutTable, d_lut.data, &desc) );
-    }
-}
-LookUpTableImpl::~LookUpTableImpl()
-{
-    if (cc30)
-    {
-        // Use the texture object
-        cudaDestroyTextureObject(texLutTableObj);
-    }
-    else
-    {
-        // Use the texture reference
-        cudaUnbindTexture(texLutTable);
-    }
 }
 struct LutTablePtrC1
 {
     typedef uchar value_type;
     typedef uchar index_type;
-    cudaTextureObject_t texLutTableObj;
-    __device__ __forceinline__ uchar operator ()(uchar, uchar x) const
-    {
-    #if CV_CUDEV_ARCH < 300
-        // Use the texture reference
-        return tex1Dfetch(texLutTable, x);
-    #else
-        // Use the texture object
-        return tex1Dfetch<uchar>(texLutTableObj, x);
-    #endif
+    cv::cudev::TexturePtr<uchar> tex;
+    __device__ __forceinline__ uchar operator ()(uchar, uchar x) const {
+        return tex(x);
     }
 };
 struct LutTablePtrC3
 {
     typedef uchar3 value_type;
     typedef uchar3 index_type;
-    cudaTextureObject_t texLutTableObj;
-    __device__ __forceinline__ uchar3 operator ()(const uchar3&, const uchar3& x) const
-    {
-    #if CV_CUDEV_ARCH < 300
-        // Use the texture reference
-        return make_uchar3(tex1Dfetch(texLutTable, x.x * 3), tex1Dfetch(texLutTable, x.y * 3 + 1), tex1Dfetch(texLutTable, x.z * 3 + 2));
-    #else
-        // Use the texture object
-        return make_uchar3(tex1Dfetch<uchar>(texLutTableObj, x.x * 3), tex1Dfetch<uchar>(texLutTableObj, x.y * 3 + 1), tex1Dfetch<uchar>(texLutTableObj, x.z * 3 + 2));
-    #endif
+    cv::cudev::TexturePtr<uchar> tex;
+    __device__ __forceinline__ uchar3 operator ()(const uchar3&, const uchar3& x) const {
+        return make_uchar3(tex(x.x * 3), tex(x.y * 3 + 1), tex(x.z * 3 + 2));
     }
 };
@@ -169,20 +113,18 @@ namespace cv { namespace cuda {
         {
             GpuMat_<uchar> src1(src.reshape(1));
             GpuMat_<uchar> dst1(dst.reshape(1));
+            cv::cudev::Texture<uchar> tex(szInBytes, reinterpret_cast<uchar*>(d_lut.data));
             LutTablePtrC1 tbl;
-            tbl.texLutTableObj = texLutTableObj;
+            tbl.tex = TexturePtr<uchar>(tex);
             dst1.assign(lut_(src1, tbl), stream);
         }
         else if (lut_cn == 3)
         {
             GpuMat_<uchar3>& src3 = (GpuMat_<uchar3>&) src;
             GpuMat_<uchar3>& dst3 = (GpuMat_<uchar3>&) dst;
+            cv::cudev::Texture<uchar> tex(szInBytes, reinterpret_cast<uchar*>(d_lut.data));
             LutTablePtrC3 tbl;
-            tbl.texLutTableObj = texLutTableObj;
+            tbl.tex = TexturePtr<uchar>(tex);
             dst3.assign(lut_(src3, tbl), stream);
         }

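The lut.cu change above also moves lifetime management into the call: transform() now builds a scoped texture object over d_lut on each invocation instead of binding a global reference in the constructor and unbinding in a destructor. A minimal sketch of the RAII idea, with hypothetical names (the real wrapper is the new cv::cudev::Texture in modules/cudev/include/opencv2/cudev/ptr2d/texture.hpp, which has a richer interface):

#include <cstddef>
#include <cuda_runtime.h>

// ScopedLinearTexture is illustrative only; it wraps an existing linear
// device buffer in a texture object and destroys the object at scope exit.
template <typename T>
class ScopedLinearTexture {
public:
    ScopedLinearTexture(std::size_t szBytes, T* devPtr) : tex_(0) {
        cudaResourceDesc res = {};
        res.resType = cudaResourceTypeLinear;
        res.res.linear.devPtr = devPtr;
        res.res.linear.desc = cudaCreateChannelDesc<T>();
        res.res.linear.sizeInBytes = szBytes;
        cudaTextureDesc td = {};
        cudaCreateTextureObject(&tex_, &res, &td, nullptr);
    }
    ~ScopedLinearTexture() { cudaDestroyTextureObject(tex_); }
    ScopedLinearTexture(const ScopedLinearTexture&) = delete;
    ScopedLinearTexture& operator=(const ScopedLinearTexture&) = delete;
    cudaTextureObject_t handle() const { return tex_; }
private:
    cudaTextureObject_t tex_;
};

__global__ void applyLut(cudaTextureObject_t lut, const unsigned char* in, unsigned char* out, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = tex1Dfetch<unsigned char>(lut, in[i]); // 1D fetch, as in LutTablePtrC1
}

int main()
{
    const int n = 1024;
    unsigned char *lutData, *in, *out;
    cudaMalloc(&lutData, 256);
    cudaMalloc(&in, n);
    cudaMalloc(&out, n);
    {
        ScopedLinearTexture<unsigned char> lut(256, lutData);
        applyLut<<<(n + 255) / 256, 256>>>(lut.handle(), in, out, n);
        cudaDeviceSynchronize(); // finish sampling before the wrapper destroys the object
    }
    cudaFree(lutData); cudaFree(in); cudaFree(out);
    return 0;
}

Note the handle must stay alive until every kernel that samples it has finished, which is why transform() keeps the Texture in the calling scope for the duration of the call.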
modules/cudaarithm/src/lut.hpp

@@ -15,14 +15,10 @@ class LookUpTableImpl : public LookUpTable
 {
 public:
     LookUpTableImpl(InputArray lut);
-    ~LookUpTableImpl();
     void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) CV_OVERRIDE;
 private:
     GpuMat d_lut;
-    cudaTextureObject_t texLutTableObj;
-    bool cc30;
+    size_t szInBytes = 0;
 };
 }}

modules/cudaimgproc/src/cuda/canny.cu

@@ -48,6 +48,7 @@
 #include "opencv2/core/cuda/functional.hpp"
 #include "opencv2/core/cuda/utility.hpp"
 #include "opencv2/core/cuda.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 using namespace cv::cuda;
 using namespace cv::cuda::device;
@@ -90,47 +91,8 @@ namespace cv { namespace cuda { namespace device
 namespace canny
 {
-    struct SrcTex
-    {
-        virtual ~SrcTex() {}
-        __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
-        __device__ __forceinline__ virtual int operator ()(int y, int x) const = 0;
-        int xoff;
-        int yoff;
-    };
-    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
-    struct SrcTexRef : SrcTex
-    {
-        __host__ SrcTexRef(int _xoff, int _yoff) : SrcTex(_xoff, _yoff) {}
-        __device__ __forceinline__ int operator ()(int y, int x) const override
-        {
-            return tex2D(tex_src, x + xoff, y + yoff);
-        }
-    };
-    struct SrcTexObj : SrcTex
-    {
-        __host__ SrcTexObj(int _xoff, int _yoff, cudaTextureObject_t _tex_src_object) : SrcTex(_xoff, _yoff), tex_src_object(_tex_src_object) { }
-        __device__ __forceinline__ int operator ()(int y, int x) const override
-        {
-            return tex2D<uchar>(tex_src_object, x + xoff, y + yoff);
-        }
-        cudaTextureObject_t tex_src_object;
-    };
-    template <
-        class T,
-        class Norm,
-        typename = typename std::enable_if<std::is_base_of<SrcTex, T>::value>::type
-    >
-    __global__ void calcMagnitudeKernel(const T src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
+    template <class Norm>
+    __global__ void calcMagnitudeKernel(cv::cudev::TextureOffPtr<uchar> texSrc, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
     {
         const int x = blockIdx.x * blockDim.x + threadIdx.x;
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -138,8 +100,8 @@ namespace canny
         if (y >= mag.rows || x >= mag.cols)
             return;
-        int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
-        int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
+        int dxVal = (texSrc(y - 1, x + 1) + 2 * texSrc(y, x + 1) + texSrc(y + 1, x + 1)) - (texSrc(y - 1, x - 1) + 2 * texSrc(y, x - 1) + texSrc(y + 1, x - 1));
+        int dyVal = (texSrc(y + 1, x - 1) + 2 * texSrc(y + 1, x) + texSrc(y + 1, x + 1)) - (texSrc(y - 1, x - 1) + 2 * texSrc(y - 1, x) + texSrc(y - 1, x + 1));
         dx(y, x) = dxVal;
         dy(y, x) = dyVal;
@@ -151,63 +113,20 @@ namespace canny
     {
         const dim3 block(16, 16);
         const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
+        cv::cudev::TextureOff<uchar> texSrc(srcWhole, yoff, xoff);
-        bool cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);
-        if (cc30)
+        if (L2Grad)
         {
-            cudaTextureDesc texDesc;
-            memset(&texDesc, 0, sizeof(texDesc));
-            texDesc.addressMode[0] = cudaAddressModeClamp;
-            texDesc.addressMode[1] = cudaAddressModeClamp;
-            texDesc.addressMode[2] = cudaAddressModeClamp;
-            cudaTextureObject_t tex = 0;
-            createTextureObjectPitch2D(&tex, srcWhole, texDesc);
-            SrcTexObj src(xoff, yoff, tex);
-            if (L2Grad)
-            {
-                L2 norm;
-                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
-            }
-            else
-            {
-                L1 norm;
-                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
-            }
-            cudaSafeCall( cudaGetLastError() );
-            if (stream == NULL)
-                cudaSafeCall( cudaDeviceSynchronize() );
-            else
-                cudaSafeCall( cudaStreamSynchronize(stream) );
-            cudaSafeCall( cudaDestroyTextureObject(tex) );
+            L2 norm;
+            calcMagnitudeKernel<<<grid, block, 0, stream>>>(texSrc, dx, dy, mag, norm);
         }
         else
         {
-            bindTexture(&tex_src, srcWhole);
-            SrcTexRef src(xoff, yoff);
-            if (L2Grad)
-            {
-                L2 norm;
-                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
-            }
-            else
-            {
-                L1 norm;
-                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
-            }
-            cudaSafeCall( cudaGetLastError() );
-            if (stream == NULL)
-                cudaSafeCall( cudaDeviceSynchronize() );
+            L1 norm;
+            calcMagnitudeKernel<<<grid, block, 0, stream>>>(texSrc, dx, dy, mag, norm);
         }
+        if (stream == NULL)
+            cudaSafeCall(cudaDeviceSynchronize());
     }
     void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
@@ -229,8 +148,7 @@ namespace canny
 namespace canny
 {
-    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);
-    __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
+    __global__ void calcMapKernel(cv::cudev::TexturePtr<float> texMag, const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
     {
         const int CANNY_SHIFT = 15;
         const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
@@ -245,7 +163,7 @@ namespace canny
         int dyVal = dy(y, x);
         const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
-        const float m = tex2D(tex_mag, x, y);
+        const float m = texMag(y, x);
         dxVal = ::abs(dxVal);
         dyVal = ::abs(dyVal);
@@ -264,69 +182,17 @@ namespace canny
         if (dyVal < tg22x)
         {
-            if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
+            if (m > texMag(y, x - 1) && m >= texMag(y, x + 1))
                 edge_type = 1 + (int)(m > high_thresh);
         }
         else if(dyVal > tg67x)
         {
-            if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
+            if (m > texMag(y - 1, x) && m >= texMag(y + 1, x))
                 edge_type = 1 + (int)(m > high_thresh);
         }
         else
         {
-            if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
-                edge_type = 1 + (int)(m > high_thresh);
-        }
-    }
-    map(y, x) = edge_type;
-}
-__global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh, cudaTextureObject_t tex_mag)
-{
-    const int CANNY_SHIFT = 15;
-    const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-    if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
-        return;
-    int dxVal = dx(y, x);
-    int dyVal = dy(y, x);
-    const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
-    const float m = tex2D<float>(tex_mag, x, y);
-    dxVal = ::abs(dxVal);
-    dyVal = ::abs(dyVal);
-    // 0 - the pixel can not belong to an edge
-    // 1 - the pixel might belong to an edge
-    // 2 - the pixel does belong to an edge
-    int edge_type = 0;
-    if (m > low_thresh)
-    {
-        const int tg22x = dxVal * TG22;
-        const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);
-        dyVal <<= CANNY_SHIFT;
-        if (dyVal < tg22x)
-        {
-            if (m > tex2D<float>(tex_mag, x - 1, y) && m >= tex2D<float>(tex_mag, x + 1, y))
-                edge_type = 1 + (int)(m > high_thresh);
-        }
-        else if(dyVal > tg67x)
-        {
-            if (m > tex2D<float>(tex_mag, x, y - 1) && m >= tex2D<float>(tex_mag, x, y + 1))
-                edge_type = 1 + (int)(m > high_thresh);
-        }
-        else
-        {
-            if (m > tex2D<float>(tex_mag, x - s, y - 1) && m >= tex2D<float>(tex_mag, x + s, y + 1))
+            if (m > texMag(y - 1, x - s) && m >= texMag(y + 1, x + s))
                 edge_type = 1 + (int)(m > high_thresh);
         }
     }
@@ -338,47 +204,10 @@ namespace canny
     {
         const dim3 block(16, 16);
         const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));
+        cv::cudev::Texture<float> texMag(mag);
+        calcMapKernel<<<grid, block, 0, stream>>>(texMag, dx, dy, map, low_thresh, high_thresh);
+        if (stream == NULL)
+            cudaSafeCall( cudaDeviceSynchronize() );
-        if (deviceSupports(FEATURE_SET_COMPUTE_30))
-        {
-            // Use the texture object
-            cudaResourceDesc resDesc;
-            memset(&resDesc, 0, sizeof(resDesc));
-            resDesc.resType = cudaResourceTypePitch2D;
-            resDesc.res.pitch2D.devPtr = mag.ptr();
-            resDesc.res.pitch2D.height = mag.rows;
-            resDesc.res.pitch2D.width = mag.cols;
-            resDesc.res.pitch2D.pitchInBytes = mag.step;
-            resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
-            cudaTextureDesc texDesc;
-            memset(&texDesc, 0, sizeof(texDesc));
-            texDesc.addressMode[0] = cudaAddressModeClamp;
-            texDesc.addressMode[1] = cudaAddressModeClamp;
-            texDesc.addressMode[2] = cudaAddressModeClamp;
-            cudaTextureObject_t tex=0;
-            cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
-            calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh, tex);
-            cudaSafeCall( cudaGetLastError() );
-            if (stream == NULL)
-                cudaSafeCall( cudaDeviceSynchronize() );
-            else
-                cudaSafeCall( cudaStreamSynchronize(stream) );
-            cudaSafeCall( cudaDestroyTextureObject(tex) );
-        }
-        else
-        {
-            // Use the texture reference
-            bindTexture(&tex_mag, mag);
-            calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh);
-            cudaSafeCall( cudaGetLastError() );
-            if (stream == NULL)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
     }
 }

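A recurring device-side pattern in this PR: calcMagnitude above samples the source through cv::cudev::TextureOff, a texture plus a fixed row/column offset, so kernels can index in ROI coordinates while the texture object covers the whole image. A standalone sketch of that idea; OffsetTex is a hypothetical stand-in for cv::cudev::TextureOffPtr<uchar>, not the PR's implementation:

#include <cstdio>
#include <cuda_runtime.h>

struct OffsetTex
{
    cudaTextureObject_t tex;
    int xoff, yoff; // ROI origin inside the whole image
    __device__ __forceinline__ int operator()(int y, int x) const
    {
        // kernels index in ROI coordinates; the offset maps them back to the
        // whole-image coordinates the texture object was created over
        return tex2D<unsigned char>(tex, x + xoff, y + yoff);
    }
};

__global__ void sobelDx(OffsetTex src, int* dx, int w, int h)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= w || y >= h)
        return;
    // clamped addressing on the texture object takes care of the borders
    dx[y * w + x] = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1))
                  - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
}

int main()
{
    const int W = 128, H = 128;  // whole image
    const int w = 64, h = 64;    // ROI size
    size_t pitch = 0;
    unsigned char* img = nullptr;
    int* dx = nullptr;
    cudaMallocPitch(&img, &pitch, W, H);
    cudaMalloc(&dx, w * h * sizeof(int));

    cudaResourceDesc res = {};
    res.resType = cudaResourceTypePitch2D;
    res.res.pitch2D.devPtr = img;
    res.res.pitch2D.width = W;
    res.res.pitch2D.height = H;
    res.res.pitch2D.pitchInBytes = pitch;
    res.res.pitch2D.desc = cudaCreateChannelDesc<unsigned char>();
    cudaTextureDesc td = {};
    td.filterMode = cudaFilterModePoint;
    td.addressMode[0] = cudaAddressModeClamp;
    td.addressMode[1] = cudaAddressModeClamp;
    cudaTextureObject_t tex = 0;
    cudaCreateTextureObject(&tex, &res, &td, nullptr);

    OffsetTex src = { tex, 32, 32 }; // ROI starts at (32, 32)
    sobelDx<<<dim3(8, 8), dim3(8, 8)>>>(src, dx, w, h);
    cudaDeviceSynchronize();

    cudaDestroyTextureObject(tex);
    cudaFree(dx);
    cudaFree(img);
    printf("%s\n", cudaGetErrorString(cudaGetLastError()));
    return 0;
}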
modules/cudaimgproc/src/cuda/corners.cu

@@ -47,6 +47,7 @@
 #include "opencv2/core/cuda/vec_math.hpp"
 #include "opencv2/core/cuda/saturate_cast.hpp"
 #include "opencv2/core/cuda/border_interpolate.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 #include "opencv2/opencv_modules.hpp"
@@ -58,10 +59,7 @@ namespace cv { namespace cuda { namespace device
 {
     /////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////
-    texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDxTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-    texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDyTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-    __global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst)
+    __global__ void cornerHarris_kernel(cv::cudev::TexturePtr<float> texDx, cv::cudev::TexturePtr<float> texDy, const int block_size, const float k, PtrStepSzf dst)
     {
         const int x = blockIdx.x * blockDim.x + threadIdx.x;
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -81,8 +79,8 @@ namespace cv { namespace cuda { namespace device
         {
             for (int j = jbegin; j < jend; ++j)
             {
-                float dx = tex2D(harrisDxTex, j, i);
-                float dy = tex2D(harrisDyTex, j, i);
+                float dx = texDx(i, j);
+                float dy = texDy(i, j);
                 a += dx * dx;
                 b += dx * dy;
@@ -95,7 +93,7 @@ namespace cv { namespace cuda { namespace device
     }
     template <typename BR, typename BC>
-    __global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst, const BR border_row, const BC border_col)
+    __global__ void cornerHarris_kernel(cv::cudev::TexturePtr<float> texDx, cv::cudev::TexturePtr<float> texDy, const int block_size, const float k, PtrStepSzf dst, const BR border_row, const BC border_col)
     {
         const int x = blockIdx.x * blockDim.x + threadIdx.x;
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -119,8 +117,8 @@ namespace cv { namespace cuda { namespace device
             {
                 const int x = border_row.idx_col(j);
-                float dx = tex2D(harrisDxTex, x, y);
-                float dy = tex2D(harrisDyTex, x, y);
+                float dx = texDx(y, x);
+                float dy = texDy(y, x);
                 a += dx * dx;
                 b += dx * dy;
@@ -136,22 +134,20 @@ namespace cv { namespace cuda { namespace device
     {
         dim3 block(32, 8);
         dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));
+        cv::cudev::Texture<float> texDx(Dx);
+        cv::cudev::Texture<float> texDy(Dy);
-        bindTexture(&harrisDxTex, Dx);
-        bindTexture(&harrisDyTex, Dy);
         switch (border_type)
         {
         case BORDER_REFLECT101:
-            cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
+            cornerHarris_kernel<<<grid, block, 0, stream>>>(texDx, texDy, block_size, k, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
            break;
         case BORDER_REFLECT:
-            cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
+            cornerHarris_kernel<<<grid, block, 0, stream>>>(texDx, texDy, block_size, k, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
            break;
         case BORDER_REPLICATE:
-            cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst);
+            cornerHarris_kernel<<<grid, block, 0, stream>>>(texDx, texDy, block_size, k, dst);
            break;
         }
@@ -163,10 +159,7 @@ namespace cv { namespace cuda { namespace device
     /////////////////////////////////////////// Corner Min Eigen Val /////////////////////////////////////////////////
-    texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDxTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-    texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDyTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-    __global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst)
+    __global__ void cornerMinEigenVal_kernel(cv::cudev::TexturePtr<float> texMinEigenValDx, cv::cudev::TexturePtr<float> texMinEigenValDy, const int block_size, PtrStepSzf dst)
     {
         const int x = blockIdx.x * blockDim.x + threadIdx.x;
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -186,8 +179,8 @@ namespace cv { namespace cuda { namespace device
         {
             for (int j = jbegin; j < jend; ++j)
             {
-                float dx = tex2D(minEigenValDxTex, j, i);
-                float dy = tex2D(minEigenValDyTex, j, i);
+                float dx = texMinEigenValDx(i, j);
+                float dy = texMinEigenValDy(i, j);
                 a += dx * dx;
                 b += dx * dy;
@@ -204,7 +197,7 @@ namespace cv { namespace cuda { namespace device
     template <typename BR, typename BC>
-    __global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst, const BR border_row, const BC border_col)
+    __global__ void cornerMinEigenVal_kernel(cv::cudev::TexturePtr<float> texMinEigenValDx, cv::cudev::TexturePtr<float> texMinEigenValDy, const int block_size, PtrStepSzf dst, const BR border_row, const BC border_col)
     {
         const int x = blockIdx.x * blockDim.x + threadIdx.x;
         const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -228,8 +221,8 @@ namespace cv { namespace cuda { namespace device
             {
                 int x = border_row.idx_col(j);
-                float dx = tex2D(minEigenValDxTex, x, y);
-                float dy = tex2D(minEigenValDyTex, x, y);
+                float dx = texMinEigenValDx(y, x);
+                float dy = texMinEigenValDy(y, x);
                 a += dx * dx;
                 b += dx * dy;
@@ -248,22 +241,20 @@ namespace cv { namespace cuda { namespace device
     {
         dim3 block(32, 8);
         dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));
+        cv::cudev::Texture<float> texMinEigenValDx(Dx);
+        cv::cudev::Texture<float> texMinEigenValDy(Dy);
-        bindTexture(&minEigenValDxTex, Dx);
-        bindTexture(&minEigenValDyTex, Dy);
         switch (border_type)
         {
         case BORDER_REFLECT101:
-            cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
+            cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(texMinEigenValDx, texMinEigenValDy, block_size, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
            break;
         case BORDER_REFLECT:
-            cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
+            cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(texMinEigenValDx, texMinEigenValDy, block_size, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
            break;
         case BORDER_REPLICATE:
-            cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst);
+            cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(texMinEigenValDx, texMinEigenValDy, block_size, dst);
            break;
         }

modules/cudaimgproc/src/cuda/debayer.cu

@@ -48,6 +48,7 @@
 #include "opencv2/core/cuda/limits.hpp"
 #include "opencv2/core/cuda/color.hpp"
 #include "opencv2/core/cuda/saturate_cast.hpp"
+#include "opencv2/cudev/ptr2d/texture.hpp"
 namespace cv { namespace cuda { namespace device
 {
@@ -389,10 +390,8 @@ namespace cv { namespace cuda { namespace device
     //
     // ported to CUDA
-    texture<uchar, cudaTextureType2D, cudaReadModeElementType> sourceTex(false, cudaFilterModePoint, cudaAddressModeClamp);
-    template <typename DstType>
-    __global__ void MHCdemosaic(PtrStepSz<DstType> dst, const int2 sourceOffset, const int2 firstRed)
+    template <typename DstType, class Ptr2D>
+    __global__ void MHCdemosaic(PtrStepSz<DstType> dst, Ptr2D src, const int2 firstRed)
     {
         const float kAx = -1.0f / 8.0f, kAy = -1.5f / 8.0f, kAz = 0.5f / 8.0f /*kAw = -1.0f / 8.0f*/;
         const float kBx = 2.0f / 8.0f, /*kBy = 0.0f / 8.0f,*/ /*kBz = 0.0f / 8.0f,*/ kBw = 4.0f / 8.0f ;
@@ -408,8 +407,8 @@ namespace cv { namespace cuda { namespace device
             return;
         int2 center;
-        center.x = x + sourceOffset.x;
-        center.y = y + sourceOffset.y;
+        center.x = x;
+        center.y = y;
         int4 xCoord;
         xCoord.x = center.x - 2;
@@ -423,25 +422,26 @@ namespace cv { namespace cuda { namespace device
         yCoord.z = center.y + 1;
         yCoord.w = center.y + 2;
-        float C = tex2D(sourceTex, center.x, center.y); // ( 0, 0)
+        float C = src(center.y, center.x); // ( 0, 0)
         float4 Dvec;
-        Dvec.x = tex2D(sourceTex, xCoord.y, yCoord.y); // (-1,-1)
-        Dvec.y = tex2D(sourceTex, xCoord.y, yCoord.z); // (-1, 1)
-        Dvec.z = tex2D(sourceTex, xCoord.z, yCoord.y); // ( 1,-1)
-        Dvec.w = tex2D(sourceTex, xCoord.z, yCoord.z); // ( 1, 1)
+        Dvec.x = src(yCoord.y, xCoord.y); // (-1,-1)
+        Dvec.y = src(yCoord.z, xCoord.y); // (-1, 1)
+        Dvec.z = src(yCoord.y, xCoord.z); // ( 1,-1)
+        Dvec.w = src(yCoord.z, xCoord.z); // ( 1, 1)
         float4 value;
-        value.x = tex2D(sourceTex, center.x, yCoord.x); // ( 0,-2) A0
-        value.y = tex2D(sourceTex, center.x, yCoord.y); // ( 0,-1) B0
-        value.z = tex2D(sourceTex, xCoord.x, center.y); // (-2, 0) E0
-        value.w = tex2D(sourceTex, xCoord.y, center.y); // (-1, 0) F0
+        value.x = src(yCoord.x, center.x); // ( 0,-2) A0
+        value.y = src(yCoord.y, center.x); // ( 0,-1) B0
+        value.z = src(center.y, xCoord.x); // (-2, 0) E0
+        value.w = src(center.y, xCoord.y); // (-1, 0) F0
         // (A0 + A1), (B0 + B1), (E0 + E1), (F0 + F1)
-        value.x += tex2D(sourceTex, center.x, yCoord.w); // ( 0, 2) A1
-        value.y += tex2D(sourceTex, center.x, yCoord.z); // ( 0, 1) B1
-        value.z += tex2D(sourceTex, xCoord.w, center.y); // ( 2, 0) E1
-        value.w += tex2D(sourceTex, xCoord.z, center.y); // ( 1, 0) F1
+        value.x += src(yCoord.w, center.x); // ( 0, 2) A1
+        value.y += src(yCoord.z, center.x); // ( 0, 1) B1
+        value.z += src(center.y, xCoord.w); // ( 2, 0) E1
+        value.w += src(center.y, xCoord.z); // ( 1, 0) F1
         float4 PATTERN;
         PATTERN.x = kCx * C;
@@ -527,9 +527,15 @@ namespace cv { namespace cuda { namespace device
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
-        bindTexture(&sourceTex, src);
-        MHCdemosaic<dst_t><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, sourceOffset, firstRed);
+        if (sourceOffset.x || sourceOffset.y) {
+            cv::cudev::TextureOff<uchar> texSrc(src, sourceOffset.y, sourceOffset.x);
+            MHCdemosaic<dst_t, cv::cudev::TextureOffPtr<uchar>><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, texSrc, firstRed);
+        }
+        else {
+            cv::cudev::Texture<uchar> texSrc(src);
+            MHCdemosaic<dst_t, cv::cudev::TexturePtr<uchar>><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, texSrc, firstRed);
+        }
         cudaSafeCall( cudaGetLastError() );
         if (stream == 0)

modules/cudaimgproc/src/cuda/gftt.cu

@@ -45,36 +45,36 @@
 #include <thrust/device_ptr.h>
 #include <thrust/sort.h>
-#include "opencv2/core/cuda/common.hpp"
 #include "opencv2/core/cuda/utility.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 #include <thrust/execution_policy.h>
 namespace cv { namespace cuda { namespace device
 {
     namespace gfft
     {
-        template <class Mask> __global__ void findCorners(float threshold, const Mask mask, float2* corners, int max_count, int rows, int cols, cudaTextureObject_t eigTex, int *g_counter)
+        template <class Mask> __global__ void findCorners(cv::cudev::TexturePtr<float> tex, float threshold, const Mask mask, float2* corners, int max_count, int rows, int cols, int *g_counter)
         {
             const int j = blockIdx.x * blockDim.x + threadIdx.x;
             const int i = blockIdx.y * blockDim.y + threadIdx.y;
             if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1 && mask(i, j))
             {
-                float val = tex2D<float>(eigTex, j, i);
+                float val = tex(i, j);
                 if (val > threshold)
                 {
                     float maxVal = val;
-                    maxVal = ::fmax(tex2D<float>(eigTex, j - 1, i - 1), maxVal);
-                    maxVal = ::fmax(tex2D<float>(eigTex, j    , i - 1), maxVal);
-                    maxVal = ::fmax(tex2D<float>(eigTex, j + 1, i - 1), maxVal);
-                    maxVal = ::fmax(tex2D<float>(eigTex, j - 1, i), maxVal);
-                    maxVal = ::fmax(tex2D<float>(eigTex, j + 1, i), maxVal);
-                    maxVal = ::fmax(tex2D<float>(eigTex, j - 1, i + 1), maxVal);
-                    maxVal = ::fmax(tex2D<float>(eigTex, j    , i + 1), maxVal);
-                    maxVal = ::fmax(tex2D<float>(eigTex, j + 1, i + 1), maxVal);
+                    maxVal = ::fmax(tex(i - 1, j - 1), maxVal);
+                    maxVal = ::fmax(tex(i - 1, j), maxVal);
+                    maxVal = ::fmax(tex(i - 1, j + 1), maxVal);
+                    maxVal = ::fmax(tex(i, j - 1), maxVal);
+                    maxVal = ::fmax(tex(i, j + 1), maxVal);
+                    maxVal = ::fmax(tex(i + 1, j - 1), maxVal);
+                    maxVal = ::fmax(tex(i + 1, j), maxVal);
+                    maxVal = ::fmax(tex(i + 1, j + 1), maxVal);
                     if (val == maxVal)
                     {
@@ -87,17 +87,18 @@ namespace cv { namespace cuda { namespace device
             }
         }
-        int findCorners_gpu(const cudaTextureObject_t &eigTex, const int &rows, const int &cols, float threshold, PtrStepSzb mask, float2* corners, int max_count, int* counterPtr, cudaStream_t stream)
+        int findCorners_gpu(const PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, int* counterPtr, cudaStream_t stream)
         {
             cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
+            cv::cudev::Texture<float> tex(eig);
             dim3 block(16, 16);
-            dim3 grid(divUp(cols, block.x), divUp(rows, block.y));
+            dim3 grid(divUp(eig.cols, block.x), divUp(eig.rows, block.y));
             if (mask.data)
-                findCorners<<<grid, block, 0, stream>>>(threshold, SingleMask(mask), corners, max_count, rows, cols, eigTex, counterPtr);
+                findCorners<<<grid, block, 0, stream>>>(tex, threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols, counterPtr);
             else
-                findCorners<<<grid, block, 0, stream>>>(threshold, WithOutMask(), corners, max_count, rows, cols, eigTex, counterPtr);
+                findCorners<<<grid, block, 0, stream>>>(tex, threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols, counterPtr);
             cudaSafeCall( cudaGetLastError() );
@@ -113,27 +114,24 @@ namespace cv { namespace cuda { namespace device
         class EigGreater
         {
         public:
-            EigGreater(const cudaTextureObject_t &eigTex_) : eigTex(eigTex_)
-            {
-            }
-            __device__ __forceinline__ bool operator()(float2 a, float2 b) const
-            {
-                return tex2D<float>(eigTex, a.x, a.y) > tex2D<float>(eigTex, b.x, b.y);
+            EigGreater(cv::cudev::TexturePtr<float> tex_) : tex(tex_) {}
+            __device__ __forceinline__ bool operator()(float2 a, float2 b) const{
+                return tex(a.y, a.x) > tex(b.y, b.x);
             }
-            cudaTextureObject_t eigTex;
+            cv::cudev::TexturePtr<float> tex;
         };
-        void sortCorners_gpu(const cudaTextureObject_t &eigTex, float2* corners, int count, cudaStream_t stream)
+        void sortCorners_gpu(const PtrStepSzf eig, float2* corners, int count, cudaStream_t stream)
         {
+            cv::cudev::Texture<float> tex(eig);
             thrust::device_ptr<float2> ptr(corners);
 #if THRUST_VERSION >= 100802
             if (stream)
-                thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), ptr, ptr + count, EigGreater(eigTex));
+                thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), ptr, ptr + count, EigGreater(tex));
             else
-                thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()), ptr, ptr + count, EigGreater(eigTex));
+                thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()), ptr, ptr + count, EigGreater(tex));
 #else
-            thrust::sort(ptr, ptr + count, EigGreater(eigTex));
+            thrust::sort(ptr, ptr + count, EigGreater(tex));
 #endif
         }
     } // namespace optical_flow

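In gftt.cu above, sortCorners_gpu hands thrust::sort a comparator that ranks candidate corners by sampling the eigenvalue map. The same pattern, reduced to a standalone sketch with a plain device pointer standing in for the cv::cudev::TexturePtr<float> the PR passes to EigGreater (names here are illustrative, not PR code):

#include <cstdio>
#include <thrust/device_vector.h>
#include <thrust/sort.h>

struct ResponseGreater
{
    const float* resp; // device pointer into the per-pixel response map
    int step;          // row length in elements
    __host__ __device__ ResponseGreater(const float* r, int s) : resp(r), step(s) {}
    __device__ bool operator()(float2 a, float2 b) const
    {
        // order corners by the response at their (x, y) location, descending
        return resp[(int)a.y * step + (int)a.x] > resp[(int)b.y * step + (int)b.x];
    }
};

int main()
{
    const int w = 4;
    const float h_resp[] = { 0.1f, 0.9f, 0.3f, 0.5f,
                             0.2f, 0.8f, 0.4f, 0.6f };
    thrust::device_vector<float> resp(h_resp, h_resp + 8);
    thrust::device_vector<float2> corners;
    corners.push_back(make_float2(0, 0)); // response 0.1
    corners.push_back(make_float2(1, 0)); // response 0.9
    corners.push_back(make_float2(3, 1)); // response 0.6

    thrust::sort(corners.begin(), corners.end(),
                 ResponseGreater(thrust::raw_pointer_cast(resp.data()), w));

    float2 best = corners[0];
    printf("strongest corner: (%g, %g)\n", best.x, best.y); // expect (1, 0)
    return 0;
}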
modules/cudaimgproc/src/cuda/hough_segments.cu

@@ -50,7 +50,8 @@ namespace cv { namespace cuda { namespace device
 {
     namespace hough_segments
     {
-        __global__ void houghLinesProbabilistic(cv::cudev::Texture<uchar> src, const PtrStepSzi accum,
+        template<class Ptr2D>
+        __global__ void houghLinesProbabilistic(Ptr2D src, const PtrStepSzi accum,
                                                 int4* out, const int maxSize,
                                                 const float rho, const float theta,
                                                 const int lineGap, const int lineLength,
@@ -219,15 +220,18 @@ namespace cv { namespace cuda { namespace device
         const dim3 block(32, 8);
         const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
-        cv::cudev::GpuMat_<uchar> src_(mask);
-        cv::cudev::Texture<uchar> tex(src_, false, cudaFilterModePoint, cudaAddressModeClamp);
-        houghLinesProbabilistic<<<grid, block, 0, stream>>>(tex, accum,
-            out, maxSize,
-            rho, theta,
-            lineGap, lineLength,
-            mask.rows, mask.cols,
-            counterPtr);
+        Size wholeSize;
+        Point ofs;
+        mask.locateROI(wholeSize, ofs);
+        if (ofs.x || ofs.y) {
+            cv::cudev::TextureOff<uchar> texMask(wholeSize.height, wholeSize.width, mask.datastart, mask.step, ofs.y, ofs.x);
+            houghLinesProbabilistic<cv::cudev::TextureOffPtr<uchar>><<<grid, block, 0, stream>>>(texMask, accum, out, maxSize, rho, theta, lineGap, lineLength, mask.rows, mask.cols, counterPtr);
+        }
+        else {
+            cv::cudev::Texture<uchar> texMask(mask);
+            houghLinesProbabilistic<cv::cudev::TexturePtr<uchar>><<<grid, block, 0, stream>>>(texMask, accum, out, maxSize, rho, theta, lineGap, lineLength, mask.rows, mask.cols, counterPtr);
+        }
         cudaSafeCall( cudaGetLastError() );
         int totalCount;
@@ -236,7 +240,6 @@ namespace cv { namespace cuda { namespace device
         cudaSafeCall( cudaStreamSynchronize(stream) );
         totalCount = ::min(totalCount, maxSize);
-
         return totalCount;
     }
 }

modules/cudaimgproc/src/cuda/mean_shift.cu

@@ -47,19 +47,16 @@
 #include "opencv2/core/cuda/vec_math.hpp"
 #include "opencv2/core/cuda/saturate_cast.hpp"
 #include "opencv2/core/cuda/border_interpolate.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 namespace cv { namespace cuda { namespace device
 {
     namespace imgproc
     {
-        texture<uchar4, 2> tex_meanshift;
-        __device__ short2 do_mean_shift(int x0, int y0, unsigned char* out,
-                                        size_t out_step, int cols, int rows,
-                                        int sp, int sr, int maxIter, float eps)
+        __device__ short2 do_mean_shift(cv::cudev::TexturePtr<uchar4> tex, int x0, int y0, unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps)
         {
             int isr2 = sr*sr;
-            uchar4 c = tex2D(tex_meanshift, x0, y0 );
+            uchar4 c = tex(y0, x0);
             // iterate meanshift procedure
             for( int iter = 0; iter < maxIter; iter++ )
@@ -79,7 +76,7 @@ namespace cv { namespace cuda { namespace device
                 int rowCount = 0;
                 for( int x = minx; x <= maxx; x++ )
                 {
-                    uchar4 t = tex2D( tex_meanshift, x, y );
+                    uchar4 t = tex(y, x);
                     int norm2 = (t.x - c.x) * (t.x - c.x) + (t.y - c.y) * (t.y - c.y) + (t.z - c.z) * (t.z - c.z);
                     if( norm2 <= isr2 )
@@ -119,13 +116,13 @@ namespace cv { namespace cuda { namespace device
             return make_short2((short)x0, (short)y0);
         }
-        __global__ void meanshift_kernel(unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps )
+        __global__ void meanshift_kernel(cv::cudev::TexturePtr<uchar4> tex, unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps )
         {
             int x0 = blockIdx.x * blockDim.x + threadIdx.x;
             int y0 = blockIdx.y * blockDim.y + threadIdx.y;
             if( x0 < cols && y0 < rows )
-                do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);
+                do_mean_shift(tex, x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);
         }
         void meanShiftFiltering_gpu(const PtrStepSzb& src, PtrStepSzb dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
@@ -134,21 +131,15 @@ namespace cv { namespace cuda { namespace device
             dim3 threads(32, 8, 1);
             grid.x = divUp(src.cols, threads.x);
             grid.y = divUp(src.rows, threads.y);
+            cv::cudev::Texture<uchar4> tex(src.rows, src.cols, (uchar4*)src.data, src.step);
-            cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
-            cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
-            meanshift_kernel<<< grid, threads, 0, stream >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
+            meanshift_kernel<<< grid, threads, 0, stream >>>( tex, dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
             cudaSafeCall( cudaGetLastError() );
             if (stream == 0)
                 cudaSafeCall( cudaDeviceSynchronize() );
         }
-        __global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep,
-                                             unsigned char* outsp, size_t outspstep,
-                                             int cols, int rows,
-                                             int sp, int sr, int maxIter, float eps)
+        __global__ void meanshiftproc_kernel(cv::cudev::TexturePtr<uchar4> tex, unsigned char* outr, size_t outrstep, unsigned char* outsp, size_t outspstep,
+                                             int cols, int rows, int sp, int sr, int maxIter, float eps)
         {
             int x0 = blockIdx.x * blockDim.x + threadIdx.x;
             int y0 = blockIdx.y * blockDim.y + threadIdx.y;
@@ -156,7 +147,7 @@ namespace cv { namespace cuda { namespace device
             if( x0 < cols && y0 < rows )
             {
                 int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short);
-                *(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
+                *(short2*)(outsp + basesp) = do_mean_shift(tex, x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
             }
         }
@@ -166,13 +157,9 @@ namespace cv { namespace cuda { namespace device
             dim3 threads(32, 8, 1);
             grid.x = divUp(src.cols, threads.x);
             grid.y = divUp(src.rows, threads.y);
+            cv::cudev::Texture<uchar4> tex(src.rows, src.cols, (uchar4*)src.data, src.step);
-            cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
-            cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
-            meanshiftproc_kernel<<< grid, threads, 0, stream >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
+            meanshiftproc_kernel<<< grid, threads, 0, stream >>>( tex, dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
             cudaSafeCall( cudaGetLastError() );
             if (stream == 0)
                 cudaSafeCall( cudaDeviceSynchronize() );
         }

modules/cudaimgproc/src/gftt.cpp

@@ -55,8 +55,8 @@ namespace cv { namespace cuda { namespace device
 {
     namespace gfft
     {
-        int findCorners_gpu(const cudaTextureObject_t &eigTex_, const int &rows, const int &cols, float threshold, PtrStepSzb mask, float2* corners, int max_count, int* counterPtr, cudaStream_t stream);
-        void sortCorners_gpu(const cudaTextureObject_t &eigTex_, float2* corners, int count, cudaStream_t stream);
+        int findCorners_gpu(const PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, int* counterPtr, cudaStream_t stream);
+        void sortCorners_gpu(const PtrStepSzf eig, float2* corners, int count, cudaStream_t stream);
     }
 }}}
@@ -120,31 +120,15 @@ namespace
         cudaStream_t stream_ = StreamAccessor::getStream(stream);
         ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
-        //create texture object for findCorners_gpu and sortCorners_gpu
-        cudaTextureDesc texDesc;
-        memset(&texDesc, 0, sizeof(texDesc));
-        texDesc.readMode = cudaReadModeElementType;
-        texDesc.filterMode = cudaFilterModePoint;
-        texDesc.addressMode[0] = cudaAddressModeClamp;
-        texDesc.addressMode[1] = cudaAddressModeClamp;
-        texDesc.addressMode[2] = cudaAddressModeClamp;
-        cudaTextureObject_t eigTex_;
-        PtrStepSzf eig = eig_;
-        cv::cuda::device::createTextureObjectPitch2D<float>(&eigTex_, eig, texDesc);
-        int total = findCorners_gpu(eigTex_, eig_.rows, eig_.cols, static_cast<float>(maxVal * qualityLevel_), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols, counterPtr_, stream_);
+        int total = findCorners_gpu(eig_, static_cast<float>(maxVal * qualityLevel_), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols, counterPtr_, stream_);
         if (total == 0)
         {
             _corners.release();
-            cudaSafeCall( cudaDestroyTextureObject(eigTex_) );
             return;
         }
-        sortCorners_gpu(eigTex_, tmpCorners_.ptr<float2>(), total, stream_);
-        cudaSafeCall( cudaDestroyTextureObject(eigTex_) );
+        sortCorners_gpu(eig_, tmpCorners_.ptr<float2>(), total, stream_);
         if (minDistance_ < 1)
         {

modules/cudaimgproc/test/test_color.cpp

@@ -2294,14 +2294,15 @@ INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CvtColor, testing::Combine(
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // Demosaicing
-struct Demosaicing : testing::TestWithParam<cv::cuda::DeviceInfo>
+struct Demosaicing : testing::TestWithParam<testing::tuple<cv::cuda::DeviceInfo, bool>>
 {
     cv::cuda::DeviceInfo devInfo;
+    bool useRoi;
     virtual void SetUp()
     {
-        devInfo = GetParam();
+        devInfo = GET_PARAM(0);
+        useRoi = GET_PARAM(1);
         cv::cuda::setDevice(devInfo.deviceID());
     }
@@ -2419,7 +2420,7 @@ CUDA_TEST_P(Demosaicing, BayerBG2BGR_MHT)
     mosaic(img, src, cv::Point(1, 1));
     cv::cuda::GpuMat dst;
-    cv::cuda::demosaicing(loadMat(src), dst, cv::cuda::COLOR_BayerBG2BGR_MHT);
+    cv::cuda::demosaicing(loadMat(src, useRoi), dst, cv::cuda::COLOR_BayerBG2BGR_MHT);
     EXPECT_MAT_SIMILAR(img, dst, 5e-3);
 }
@@ -2433,7 +2434,7 @@ CUDA_TEST_P(Demosaicing, BayerGB2BGR_MHT)
     mosaic(img, src, cv::Point(0, 1));
     cv::cuda::GpuMat dst;
-    cv::cuda::demosaicing(loadMat(src), dst, cv::cuda::COLOR_BayerGB2BGR_MHT);
+    cv::cuda::demosaicing(loadMat(src, useRoi), dst, cv::cuda::COLOR_BayerGB2BGR_MHT);
     EXPECT_MAT_SIMILAR(img, dst, 5e-3);
 }
@@ -2447,7 +2448,7 @@ CUDA_TEST_P(Demosaicing, BayerRG2BGR_MHT)
     mosaic(img, src, cv::Point(0, 0));
     cv::cuda::GpuMat dst;
-    cv::cuda::demosaicing(loadMat(src), dst, cv::cuda::COLOR_BayerRG2BGR_MHT);
+    cv::cuda::demosaicing(loadMat(src, useRoi), dst, cv::cuda::COLOR_BayerRG2BGR_MHT);
     EXPECT_MAT_SIMILAR(img, dst, 5e-3);
 }
@@ -2461,12 +2462,11 @@ CUDA_TEST_P(Demosaicing, BayerGR2BGR_MHT)
     mosaic(img, src, cv::Point(1, 0));
     cv::cuda::GpuMat dst;
-    cv::cuda::demosaicing(loadMat(src), dst, cv::cuda::COLOR_BayerGR2BGR_MHT);
+    cv::cuda::demosaicing(loadMat(src, useRoi), dst, cv::cuda::COLOR_BayerGR2BGR_MHT);
     EXPECT_MAT_SIMILAR(img, dst, 5e-3);
 }
-INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, Demosaicing, ALL_DEVICES);
+INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, Demosaicing, testing::Combine(ALL_DEVICES, WHOLE_SUBMAT));
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // swapChannels

modules/cudaimgproc/test/test_hough.cpp

@@ -115,8 +115,20 @@ INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, HoughLines, testing::Combine(
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // HoughLines Probabilistic
-PARAM_TEST_CASE(HoughLinesProbabilistic, cv::cuda::DeviceInfo, cv::Size, UseRoi)
+PARAM_TEST_CASE(HoughLinesProbabilistic, DeviceInfo, Size, UseRoi)
 {
+    cv::cuda::DeviceInfo devInfo;
+    bool useRoi;
+    Size size;
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        useRoi = GET_PARAM(2);
+        cv::cuda::setDevice(devInfo.deviceID());
+    }
     static void generateLines(cv::Mat& img)
     {
         img.setTo(cv::Scalar::all(0));
@@ -140,11 +152,6 @@ PARAM_TEST_CASE(HoughLinesProbabilistic, cv::cuda::DeviceInfo, cv::Size, UseRoi)
 CUDA_TEST_P(HoughLinesProbabilistic, Accuracy)
 {
-    const cv::cuda::DeviceInfo devInfo = GET_PARAM(0);
-    cv::cuda::setDevice(devInfo.deviceID());
-    const cv::Size size = GET_PARAM(1);
-    const bool useRoi = GET_PARAM(2);
     const float rho = 1.0f;
     const float theta = (float) (1.0 * CV_PI / 180.0);
     const int minLineLength = 15;
@@ -169,12 +176,55 @@ CUDA_TEST_P(HoughLinesProbabilistic, Accuracy)
 }
+void HoughLinesProbabilisticThread(const Ptr<HoughSegmentDetector> detector, const GpuMat& imgIn, const std::vector<GpuMat>& linesOut, Stream& stream) {
+    for (auto& lines : linesOut)
+        detector->detect(imgIn, lines, stream);
+    stream.waitForCompletion();
+}
+CUDA_TEST_P(HoughLinesProbabilistic, Async)
+{
+    constexpr int nThreads = 5;
+    constexpr int nIters = 5;
+    vector<Stream> streams(nThreads); // async test only
+    vector<GpuMat> imgsIn;
+    vector<Ptr<HoughSegmentDetector>> detectors;
+    vector<vector<GpuMat>> linesOut(nThreads);
+    const float rho = 1.0f;
+    const float theta = (float)(1.0 * CV_PI / 180.0);
+    const int minLineLength = 15;
+    const int maxLineGap = 8;
+    cv::Mat src(size, CV_8UC1);
+    generateLines(src);
+    for (int i = 0; i < nThreads; i++) {
+        imgsIn.push_back(loadMat(src, useRoi));
+        detectors.push_back(createHoughSegmentDetector(rho, theta, minLineLength, maxLineGap));
+        linesOut.push_back(vector<GpuMat>(nIters));
+    }
+    vector<std::thread> thread(nThreads);
+    for (int i = 0; i < nThreads; i++) thread.at(i) = std::thread(HoughLinesProbabilisticThread, detectors.at(i), std::ref(imgsIn.at(i)), std::ref(linesOut.at(i)), std::ref(streams.at(i)));
+    for (int i = 0; i < nThreads; i++) thread.at(i).join();
+    for (int i = 0; i < nThreads; i++) {
+        std::vector<cv::Vec4i> linesSegment;
+        std::vector<cv::Vec2f> lines;
+        for (const auto& line : linesOut.at(i)) {
+            line.download(linesSegment);
+            cv::Mat dst(size, CV_8UC1);
+            drawLines(dst, linesSegment);
+            ASSERT_MAT_NEAR(src, dst, 0.0);
+        }
+    }
+}
 INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, HoughLinesProbabilistic, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
     WHOLE_SUBMAT));
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // HoughCircles

modules/cudaimgproc/test/test_precomp.hpp

@@ -49,4 +49,6 @@
 #include "cvconfig.h"
+
+#include <thread>
 #endif

modules/cudalegacy/include/opencv2/cudalegacy/NCV.hpp

@@ -119,9 +119,9 @@ typedef bool NcvBool;
 typedef long long Ncv64s;
 #if defined(__APPLE__) && !defined(__CUDACC__)
-typedef uint64_t Ncv64u;
+typedef uint64 Ncv64u;
 #else
-typedef unsigned long long Ncv64u;
+typedef uint64 Ncv64u;
 #endif
 typedef int Ncv32s;

modules/cudalegacy/include/opencv2/cudalegacy/NPP_staging.hpp

@@ -174,7 +174,7 @@ NCVStatus nppiStInterpolateFrames(const NppStInterpolationState *pState);
  * \return NCV status code
  */
 CV_EXPORTS
-NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
+NCVStatus nppiStFilterRowBorder_32f_C1R(Ncv32f *pSrc,
                                         NcvSize32u srcSize,
                                         Ncv32u nSrcStep,
                                         Ncv32f *pDst,
@@ -182,7 +182,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
                                         Ncv32u nDstStep,
                                         NcvRect32u oROI,
                                         NppStBorderType borderType,
-                                        const Ncv32f *pKernel,
+                                        Ncv32f *pKernel,
                                         Ncv32s nKernelSize,
                                         Ncv32s nAnchor,
                                         Ncv32f multiplier);
@@ -208,7 +208,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
  * \return NCV status code
  */
 CV_EXPORTS
-NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
+NCVStatus nppiStFilterColumnBorder_32f_C1R(Ncv32f *pSrc,
                                            NcvSize32u srcSize,
                                            Ncv32u nSrcStep,
                                            Ncv32f *pDst,
@@ -216,7 +216,7 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
                                            Ncv32u nDstStep,
                                            NcvRect32u oROI,
                                            NppStBorderType borderType,
-                                           const Ncv32f *pKernel,
+                                           Ncv32f *pKernel,
                                            Ncv32s nKernelSize,
                                            Ncv32s nAnchor,
                                            Ncv32f multiplier);
@@ -319,7 +319,7 @@ NCVStatus nppiStVectorWarp_PSF2x2_32f_C1(const Ncv32f *pSrc,
  * \return NCV status code
  */
 CV_EXPORTS
-NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc,
+NCVStatus nppiStResize_32f_C1R(Ncv32f *pSrc,
                                NcvSize32u srcSize,
                                Ncv32u nSrcStep,
                                NcvRect32u srcROI,

modules/cudalegacy/src/cuda/NCVBroxOpticalFlow.cu

@@ -65,9 +65,12 @@
 #include "opencv2/cudalegacy/NPP_staging.hpp"
 #include "opencv2/cudalegacy/NCVBroxOpticalFlow.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 typedef NCVVectorAlloc<Ncv32f> FloatVector;
+typedef cv::cudev::TexturePtr<float> Ptr2D;
+typedef cv::cudev::Texture<float> Texture;
 /////////////////////////////////////////////////////////////////////////////////////////
 // Implementation specific constants
@@ -84,39 +87,6 @@
     return (a + b - 1)/b;
 }
-/////////////////////////////////////////////////////////////////////////////////////////
-// Texture references
-/////////////////////////////////////////////////////////////////////////////////////////
-texture<float, 2, cudaReadModeElementType> tex_coarse;
-texture<float, 2, cudaReadModeElementType> tex_fine;
-texture<float, 2, cudaReadModeElementType> tex_I1;
-texture<float, 2, cudaReadModeElementType> tex_I0;
-texture<float, 2, cudaReadModeElementType> tex_Ix;
-texture<float, 2, cudaReadModeElementType> tex_Ixx;
-texture<float, 2, cudaReadModeElementType> tex_Ix0;
-texture<float, 2, cudaReadModeElementType> tex_Iy;
-texture<float, 2, cudaReadModeElementType> tex_Iyy;
-texture<float, 2, cudaReadModeElementType> tex_Iy0;
-texture<float, 2, cudaReadModeElementType> tex_Ixy;
-texture<float, 1, cudaReadModeElementType> tex_u;
-texture<float, 1, cudaReadModeElementType> tex_v;
-texture<float, 1, cudaReadModeElementType> tex_du;
-texture<float, 1, cudaReadModeElementType> tex_dv;
-texture<float, 1, cudaReadModeElementType> tex_numerator_dudv;
-texture<float, 1, cudaReadModeElementType> tex_numerator_u;
-texture<float, 1, cudaReadModeElementType> tex_numerator_v;
-texture<float, 1, cudaReadModeElementType> tex_inv_denominator_u;
-texture<float, 1, cudaReadModeElementType> tex_inv_denominator_v;
-texture<float, 1, cudaReadModeElementType> tex_diffusivity_x;
-texture<float, 1, cudaReadModeElementType> tex_diffusivity_y;
 /////////////////////////////////////////////////////////////////////////////////////////
 // SUPPLEMENTARY FUNCTIONS
 /////////////////////////////////////////////////////////////////////////////////////////
@@ -265,8 +235,7 @@ __forceinline__ __device__ void diffusivity_along_y(float *s, int pos, const fl
 ///\param h number of rows in global memory array
 ///\param p global memory array pitch in floats
 ///////////////////////////////////////////////////////////////////////////////
-template<int tex_id>
-__forceinline__ __device__ void load_array_element(float *smem, int is, int js, int i, int j, int w, int h, int p)
+__forceinline__ __device__ void load_array_element(Ptr2D texSrc, float *smem, int is, int js, int i, int j, int w, int h, int p)
 {
     //position within shared memory array
     const int ijs = js * PSOR_PITCH + is;
@@ -276,20 +245,7 @@ __forceinline__ __device__ void load_array_element(float *smem, int is, int js,
     j = max(j, -j-1);
     j = min(j, h-j+h-1);
     const int pos = j * p + i;
-    switch(tex_id){
-        case 0:
-            smem[ijs] = tex1Dfetch(tex_u, pos);
-            break;
-        case 1:
-            smem[ijs] = tex1Dfetch(tex_v, pos);
-            break;
-        case 2:
-            smem[ijs] = tex1Dfetch(tex_du, pos);
-            break;
-        case 3:
-            smem[ijs] = tex1Dfetch(tex_dv, pos);
-            break;
-    }
+    smem[ijs] = texSrc(pos);
 }
 ///////////////////////////////////////////////////////////////////////////////
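The load_array_element rewrite above is the recurring dispatch change in this PR: with texture references, the texture's identity had to be a compile-time template parameter (template<int tex_id> plus a switch), because each reference is a distinct global. With texture objects the identity is run-time data, so the four instantiations collapse into one function. A standalone sketch of the new style (hypothetical names, not PR code):

#include <cuda_runtime.h>

// One ordinary fetch helper replaces the per-texture template instantiations;
// the same code path serves u, v, du and dv because the handle is a value.
__device__ __forceinline__ float fetch(cudaTextureObject_t tex, int pos)
{
    return tex1Dfetch<float>(tex, pos);
}

__global__ void addFields(cudaTextureObject_t texU, cudaTextureObject_t texDu,
                          float* out, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = fetch(texU, i) + fetch(texDu, i); // no per-texture template
}

int main()
{
    const int n = 256;
    float *u, *du, *out;
    cudaMalloc(&u, n * sizeof(float));
    cudaMalloc(&du, n * sizeof(float));
    cudaMalloc(&out, n * sizeof(float));
    cudaMemset(u, 0, n * sizeof(float));
    cudaMemset(du, 0, n * sizeof(float));

    // helper to wrap a linear float buffer in a texture object
    auto makeTex = [](float* ptr, int count) {
        cudaResourceDesc res = {};
        res.resType = cudaResourceTypeLinear;
        res.res.linear.devPtr = ptr;
        res.res.linear.desc = cudaCreateChannelDesc<float>();
        res.res.linear.sizeInBytes = count * sizeof(float);
        cudaTextureDesc td = {};
        cudaTextureObject_t t = 0;
        cudaCreateTextureObject(&t, &res, &td, nullptr);
        return t;
    };
    cudaTextureObject_t texU = makeTex(u, n), texDu = makeTex(du, n);

    addFields<<<1, n>>>(texU, texDu, out, n);
    cudaDeviceSynchronize();

    cudaDestroyTextureObject(texU);
    cudaDestroyTextureObject(texDu);
    cudaFree(u); cudaFree(du); cudaFree(out);
    return 0;
}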
@@ -301,49 +257,48 @@ __forceinline__ __device__ void load_array_element(...)
 ///\param h number of rows in global memory array
 ///\param p global memory array pitch in floats
 ///////////////////////////////////////////////////////////////////////////////
-template<int tex>
-__forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, int h, int p)
+__forceinline__ __device__ void load_array(Ptr2D texSrc, float *smem, int ig, int jg, int w, int h, int p)
 {
     const int i = threadIdx.x + 2;
     const int j = threadIdx.y + 2;
-    load_array_element<tex>(smem, i, j, ig, jg, w, h, p);//load current pixel
+    load_array_element(texSrc, smem, i, j, ig, jg, w, h, p);//load current pixel
     __syncthreads();
     if(threadIdx.y < 2)
     {
         //load bottom shadow elements
-        load_array_element<tex>(smem, i, j-2, ig, jg-2, w, h, p);
+        load_array_element(texSrc, smem, i, j-2, ig, jg-2, w, h, p);
         if(threadIdx.x < 2)
         {
             //load bottom right shadow elements
-            load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j-2, ig+PSOR_TILE_WIDTH, jg-2, w, h, p);
+            load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j-2, ig+PSOR_TILE_WIDTH, jg-2, w, h, p);
             //load middle right shadow elements
-            load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
+            load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
         }
         else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
         {
             //load bottom left shadow elements
-            load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j-2, ig-PSOR_TILE_WIDTH, jg-2, w, h, p);
+            load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j-2, ig-PSOR_TILE_WIDTH, jg-2, w, h, p);
             //load middle left shadow elements
-            load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
+            load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
         }
     }
     else if(threadIdx.y >= PSOR_TILE_HEIGHT-2)
     {
         //load upper shadow elements
-        load_array_element<tex>(smem, i, j+2, ig, jg+2, w, h, p);
+        load_array_element(texSrc, smem, i, j+2, ig, jg+2, w, h, p);
         if(threadIdx.x < 2)
         {
             //load upper right shadow elements
-            load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j+2, ig+PSOR_TILE_WIDTH, jg+2, w, h, p);
+            load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j+2, ig+PSOR_TILE_WIDTH, jg+2, w, h, p);
             //load middle right shadow elements
-            load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
+            load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
         }
         else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
        {
             //load upper left shadow elements
-            load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j+2, ig-PSOR_TILE_WIDTH, jg+2, w, h, p);
+            load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j+2, ig-PSOR_TILE_WIDTH, jg+2, w, h, p);
             //load middle left shadow elements
-            load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
+            load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
         }
} }
else else
@ -352,12 +307,12 @@ __forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, i
if(threadIdx.x < 2) if(threadIdx.x < 2)
{ {
//load middle right shadow elements //load middle right shadow elements
load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p); load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
} }
else if(threadIdx.x >= PSOR_TILE_WIDTH-2) else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
{ {
//load middle left shadow elements //load middle left shadow elements
load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p); load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
} }
} }
__syncthreads(); __syncthreads();
@ -382,13 +337,9 @@ __forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, i
/// \param alpha (in) alpha in Brox model (flow smoothness) /// \param alpha (in) alpha in Brox model (flow smoothness)
/// \param gamma (in) gamma in Brox model (edge importance) /// \param gamma (in) gamma in Brox model (edge importance)
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
__global__ void prepare_sor_stage_1_tex(Ptr2D texU, Ptr2D texV, Ptr2D texDu, Ptr2D texDv, Ptr2D texI0, Ptr2D texI1, Ptr2D texIx, Ptr2D texIxx, Ptr2D texIx0, Ptr2D texIy, Ptr2D texIyy,
__global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity_y, Ptr2D texIy0, Ptr2D texIxy, float *diffusivity_x, float *diffusivity_y, float *denominator_u, float *denominator_v, float *numerator_dudv, float *numerator_u, float *numerator_v,
float *denominator_u, float *denominator_v, int w, int h, int s, float alpha, float gamma)
float *numerator_dudv,
float *numerator_u, float *numerator_v,
int w, int h, int s,
float alpha, float gamma)
{ {
__shared__ float u[PSOR_PITCH * PSOR_HEIGHT]; __shared__ float u[PSOR_PITCH * PSOR_HEIGHT];
__shared__ float v[PSOR_PITCH * PSOR_HEIGHT]; __shared__ float v[PSOR_PITCH * PSOR_HEIGHT];
@ -408,24 +359,24 @@ __global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity
float x = (float)ig + 0.5f; float x = (float)ig + 0.5f;
float y = (float)jg + 0.5f; float y = (float)jg + 0.5f;
//load u and v to smem //load u and v to smem
load_array<0>(u, ig, jg, w, h, s); load_array(texU, u, ig, jg, w, h, s);
load_array<1>(v, ig, jg, w, h, s); load_array(texV, v, ig, jg, w, h, s);
load_array<2>(du, ig, jg, w, h, s); load_array(texDu, du, ig, jg, w, h, s);
load_array<3>(dv, ig, jg, w, h, s); load_array(texDv, dv, ig, jg, w, h, s);
//warped position //warped position
float wx = (x + u[ijs])/(float)w; float wx = (x + u[ijs])/(float)w;
float wy = (y + v[ijs])/(float)h; float wy = (y + v[ijs])/(float)h;
x /= (float)w; x /= (float)w;
y /= (float)h; y /= (float)h;
//compute image derivatives //compute image derivatives
const float Iz = tex2D(tex_I1, wx, wy) - tex2D(tex_I0, x, y); const float Iz = texI1(wy, wx) - texI0(y, x);
const float Ix = tex2D(tex_Ix, wx, wy); const float Ix = texIx(wy, wx);
const float Ixz = Ix - tex2D(tex_Ix0, x, y); const float Ixz = Ix - texIx0(y, x);
const float Ixy = tex2D(tex_Ixy, wx, wy); const float Ixy = texIxy(wy, wx);
const float Ixx = tex2D(tex_Ixx, wx, wy); const float Ixx = texIxx(wy, wx);
const float Iy = tex2D(tex_Iy, wx, wy); const float Iy = texIy(wy, wx);
const float Iyz = Iy - tex2D(tex_Iy0, x, y); const float Iyz = Iy - texIy0(y, x);
const float Iyy = tex2D(tex_Iyy, wx, wy); const float Iyy = texIyy(wy, wx);
//compute data term //compute data term
float q0, q1, q2; float q0, q1, q2;
q0 = Iz + Ix * du[ijs] + Iy * dv[ijs]; q0 = Iz + Ix * du[ijs] + Iy * dv[ijs];
@ -462,8 +413,7 @@ __global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity
///\param h ///\param h
///\param s ///\param s
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
__global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denominator_v, __global__ void prepare_sor_stage_2(Ptr2D texDiffX, Ptr2D texDiffY, float *inv_denominator_u, float *inv_denominator_v, int w, int h, int s)
int w, int h, int s)
{ {
__shared__ float sx[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)]; __shared__ float sx[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)];
__shared__ float sy[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)]; __shared__ float sy[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)];
@ -486,8 +436,8 @@ __global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denomin
} }
if(inside) if(inside)
{ {
sx[ijs] = tex1Dfetch(tex_diffusivity_x, ijg); sx[ijs] = texDiffX(ijg);
sy[ijs] = tex1Dfetch(tex_diffusivity_y, ijg); sy[ijs] = texDiffY(ijg);
} }
else else
{ {
@ -498,25 +448,17 @@ __global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denomin
if(j == PSOR_TILE_HEIGHT-1) if(j == PSOR_TILE_HEIGHT-1)
{ {
if(jg < h-1 && inside) if(jg < h-1 && inside)
{ sy[up] = texDiffY(ijg + s);
sy[up] = tex1Dfetch(tex_diffusivity_y, ijg + s);
}
else else
{
sy[up] = 0.0f; sy[up] = 0.0f;
}
} }
int right = ijs + 1; int right = ijs + 1;
if(threadIdx.x == PSOR_TILE_WIDTH-1) if(threadIdx.x == PSOR_TILE_WIDTH-1)
{ {
if(ig < w-1 && inside) if(ig < w-1 && inside)
{ sx[right] = texDiffX(ijg + 1);
sx[right] = tex1Dfetch(tex_diffusivity_x, ijg + 1);
}
else else
{
sx[right] = 0.0f; sx[right] = 0.0f;
}
} }
__syncthreads(); __syncthreads();
float diffusivity_sum; float diffusivity_sum;
@ -534,17 +476,8 @@ __global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denomin
// Red-Black SOR // Red-Black SOR
///////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////
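The two instantiations of the kernel below, sor_pass<0> and sor_pass<1>, split the grid into a checkerboard: a cell (i, j) is updated by the pass whose template parameter equals (i + j) % 2, so each half-sweep reads only neighbours of the opposite colour written by the previous half-sweep. A minimal sketch of the ordering (illustration only, not part of the patch):

// Cells touched by sor_pass<isBlack>: the (i + j) % 2 == isBlack half of the grid.
// Each cell's four neighbours (i-1, j), (i+1, j), (i, j-1), (i, j+1) have the
// opposite parity, which is what keeps the red-black SOR sweep free of data races.
for (int j = 0; j < height; ++j)
    for (int i = 0; i < width; ++i)
        if ((i + j) % 2 == isBlack)
            update(i, j);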
template<int isBlack> __global__ void sor_pass(float *new_du, template<int isBlack> __global__ void sor_pass(Ptr2D texU, Ptr2D texV, Ptr2D texDu, Ptr2D texDv, Ptr2D texDiffX, Ptr2D texDiffY, float *new_du, float *new_dv, const float *g_inv_denominator_u,
float *new_dv, const float *g_inv_denominator_v, const float *g_numerator_u, const float *g_numerator_v, const float *g_numerator_dudv, float omega, int width, int height, int stride)
const float *g_inv_denominator_u,
const float *g_inv_denominator_v,
const float *g_numerator_u,
const float *g_numerator_v,
const float *g_numerator_dudv,
float omega,
int width,
int height,
int stride)
{ {
int i = blockIdx.x * blockDim.x + threadIdx.x; int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y; int j = blockIdx.y * blockDim.y + threadIdx.y;
@ -560,14 +493,14 @@ template<int isBlack> __global__ void sor_pass(float *new_du,
//load smooth term //load smooth term
float s_up, s_left, s_right, s_down; float s_up, s_left, s_right, s_down;
s_left = tex1Dfetch(tex_diffusivity_x, pos); s_left = texDiffX(pos);
s_down = tex1Dfetch(tex_diffusivity_y, pos); s_down = texDiffY(pos);
if(i < width-1) if(i < width-1)
s_right = tex1Dfetch(tex_diffusivity_x, pos_r); s_right = texDiffX(pos_r);
else else
s_right = 0.0f; //Neumann BC s_right = 0.0f; //Neumann BC
if(j < height-1) if(j < height-1)
s_up = tex1Dfetch(tex_diffusivity_y, pos_u); s_up = texDiffY(pos_u);
else else
s_up = 0.0f; //Neumann BC s_up = 0.0f; //Neumann BC
@ -577,30 +510,29 @@ template<int isBlack> __global__ void sor_pass(float *new_du,
float du_up, du_left, du_right, du_down, du; float du_up, du_left, du_right, du_down, du;
float dv_up, dv_left, dv_right, dv_down, dv; float dv_up, dv_left, dv_right, dv_down, dv;
u_left = tex1Dfetch(tex_u, pos_l); u_left = texU(pos_l);
u_right = tex1Dfetch(tex_u, pos_r); u_right = texU(pos_r);
u_down = tex1Dfetch(tex_u, pos_d); u_down = texU(pos_d);
u_up = tex1Dfetch(tex_u, pos_u); u_up = texU(pos_u);
u = tex1Dfetch(tex_u, pos); u = texU(pos);
v_left = tex1Dfetch(tex_v, pos_l); v_left = texV(pos_l);
v_right = tex1Dfetch(tex_v, pos_r); v_right = texV(pos_r);
v_down = tex1Dfetch(tex_v, pos_d); v_down = texV(pos_d);
v = tex1Dfetch(tex_v, pos); v = texV(pos);
v_up = tex1Dfetch(tex_v, pos_u); v_up = texV(pos_u);
du = tex1Dfetch(tex_du, pos); du = texDu(pos);
du_left = tex1Dfetch(tex_du, pos_l); du_left = texDu(pos_l);
du_right = tex1Dfetch(tex_du, pos_r); du_right = texDu(pos_r);
du_down = tex1Dfetch(tex_du, pos_d); du_down = texDu(pos_d);
du_up = tex1Dfetch(tex_du, pos_u); du_up = texDu(pos_u);
dv = tex1Dfetch(tex_dv, pos); dv = texDv(pos);
dv_left = tex1Dfetch(tex_dv, pos_l); dv_left = texDv(pos_l);
dv_right = tex1Dfetch(tex_dv, pos_r); dv_right = texDv(pos_r);
dv_down = tex1Dfetch(tex_dv, pos_d); dv_down = texDv(pos_d);
dv_up = tex1Dfetch(tex_dv, pos_u); dv_up = texDv(pos_u);
float numerator_dudv = g_numerator_dudv[pos]; float numerator_dudv = g_numerator_dudv[pos];
if((i+j)%2 == isBlack) if((i+j)%2 == isBlack)
@ -624,52 +556,6 @@ template<int isBlack> __global__ void sor_pass(float *new_du,
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// utility functions // utility functions
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
void initTexture1D(texture<float, 1, cudaReadModeElementType> &tex)
{
tex.addressMode[0] = cudaAddressModeClamp;
tex.filterMode = cudaFilterModePoint;
tex.normalized = false;
}
void initTexture2D(texture<float, 2, cudaReadModeElementType> &tex)
{
tex.addressMode[0] = cudaAddressModeMirror;
tex.addressMode[1] = cudaAddressModeMirror;
tex.filterMode = cudaFilterModeLinear;
tex.normalized = true;
}
void InitTextures()
{
initTexture2D(tex_I0);
initTexture2D(tex_I1);
initTexture2D(tex_fine); // for downsampling
initTexture2D(tex_coarse); // for prolongation
initTexture2D(tex_Ix);
initTexture2D(tex_Ixx);
initTexture2D(tex_Ix0);
initTexture2D(tex_Iy);
initTexture2D(tex_Iyy);
initTexture2D(tex_Iy0);
initTexture2D(tex_Ixy);
initTexture1D(tex_u);
initTexture1D(tex_v);
initTexture1D(tex_du);
initTexture1D(tex_dv);
initTexture1D(tex_diffusivity_x);
initTexture1D(tex_diffusivity_y);
initTexture1D(tex_inv_denominator_u);
initTexture1D(tex_inv_denominator_v);
initTexture1D(tex_numerator_dudv);
initTexture1D(tex_numerator_u);
initTexture1D(tex_numerator_v);
}
namespace namespace
{ {
struct ImagePyramid struct ImagePyramid
@ -804,8 +690,6 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
ncvAssertCUDAReturn(cudaMemcpy(derivativeFilter.ptr(), derivativeFilterHost, sizeof(float) * kDFilterSize, ncvAssertCUDAReturn(cudaMemcpy(derivativeFilter.ptr(), derivativeFilterHost, sizeof(float) * kDFilterSize,
cudaMemcpyHostToDevice), NCV_CUDA_ERROR); cudaMemcpyHostToDevice), NCV_CUDA_ERROR);
InitTextures();
} }
//prepare image pyramid //prepare image pyramid
@ -909,9 +793,6 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
ncvAssertCUDAReturn(cudaMemsetAsync(v.ptr(), 0, kSizeInPixelsAligned * sizeof(float), stream), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaMemsetAsync(v.ptr(), 0, kSizeInPixelsAligned * sizeof(float), stream), NCV_CUDA_ERROR);
//select images with lowest resolution //select images with lowest resolution
size_t pitch = alignUp(pyr.w.back(), kStrideAlignmentFloat) * sizeof(float);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, pyr.img0.back()->ptr(), channel_desc, pyr.w.back(), pyr.h.back(), pitch), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, pyr.img1.back()->ptr(), channel_desc, pyr.w.back(), pyr.h.back(), pitch), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
FloatVector* ptrU = &u; FloatVector* ptrU = &u;
@ -941,17 +822,14 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
ncvAssertCUDAReturn(cudaMemsetAsync(du.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaMemsetAsync(du.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaMemsetAsync(dv.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaMemsetAsync(dv.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);
//texture format descriptor
cudaChannelFormatDesc ch_desc = cudaCreateChannelDesc<float>();
I0 = *img0Iter; I0 = *img0Iter;
I1 = *img1Iter; I1 = *img1Iter;
++img0Iter; ++img0Iter;
++img1Iter; ++img1Iter;
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, I0->ptr(), ch_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR); Texture texI0(kLevelHeight, kLevelWidth, I0->ptr(), kLevelStride * sizeof(float), true, cudaFilterModeLinear, cudaAddressModeMirror);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, I1->ptr(), ch_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR); Texture texI1(kLevelHeight, kLevelWidth, I1->ptr(), kLevelStride * sizeof(float), true, cudaFilterModeLinear, cudaAddressModeMirror);
//compute derivatives //compute derivatives
dim3 dBlocks(iDivUp(kLevelWidth, 32), iDivUp(kLevelHeight, 6)); dim3 dBlocks(iDivUp(kLevelWidth, 32), iDivUp(kLevelHeight, 6));
@ -991,20 +869,24 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI, ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) ); nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix, Ix.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); Texture texIx(kLevelHeight, kLevelWidth, Ix.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixx, Ixx.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); Texture texIxx(kLevelHeight, kLevelWidth, Ixx.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix0, Ix0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); Texture texIx0(kLevelHeight, kLevelWidth, Ix0.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy, Iy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); Texture texIy(kLevelHeight, kLevelWidth, Iy.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iyy, Iyy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); Texture texIyy(kLevelHeight, kLevelWidth, Iyy.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy0, Iy0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); Texture texIy0(kLevelHeight, kLevelWidth, Iy0.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixy, Ixy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR); Texture texIxy(kLevelHeight, kLevelWidth, Ixy.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
Texture texDiffX(1, kLevelSizeInBytes / sizeof(float), diffusivity_x.ptr(), kLevelSizeInBytes);
Texture texDiffY(1, kLevelSizeInBytes / sizeof(float), diffusivity_y.ptr(), kLevelSizeInBytes);
// flow // flow
ncvAssertCUDAReturn(cudaBindTexture(0, tex_u, ptrU->ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); Texture texU(1, kLevelSizeInBytes / sizeof(float), ptrU->ptr(), kLevelSizeInBytes);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_v, ptrV->ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); Texture texV(1, kLevelSizeInBytes / sizeof(float), ptrV->ptr(), kLevelSizeInBytes);
// flow increments // flow increments
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); Texture texDu(1, kLevelSizeInBytes / sizeof(float), du.ptr(), kLevelSizeInBytes);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); Texture texDv(1, kLevelSizeInBytes / sizeof(float), dv.ptr(), kLevelSizeInBytes);
Texture texDuNew(1, kLevelSizeInBytes / sizeof(float), du_new.ptr(), kLevelSizeInBytes);
Texture texDvNew(1, kLevelSizeInBytes / sizeof(float), dv_new.ptr(), kLevelSizeInBytes);
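Each cv::cudev::Texture constructed above wraps a cudaTextureObject_t that is destroyed when the object goes out of scope at the end of the level loop, so no explicit unbind is needed. As a rough sketch of what such a constructor presumably does for texI0 (raw runtime API, illustration only; the mapping of its arguments onto a pitch2D resource is an assumption):

cudaResourceDesc resDesc = {};
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = I0->ptr();
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
resDesc.res.pitch2D.width = kLevelWidth;
resDesc.res.pitch2D.height = kLevelHeight;
resDesc.res.pitch2D.pitchInBytes = kLevelStride * sizeof(float);

cudaTextureDesc texDesc = {};
texDesc.normalizedCoords = 1;            // matches the 'true' argument above
texDesc.filterMode = cudaFilterModeLinear;
texDesc.addressMode[0] = cudaAddressModeMirror;
texDesc.addressMode[1] = cudaAddressModeMirror;

cudaTextureObject_t tex = 0;
cudaCreateTextureObject(&tex, &resDesc, &texDesc, nullptr);
// ... launch kernels taking tex ...
cudaDestroyTextureObject(tex);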
dim3 psor_blocks(iDivUp(kLevelWidth, PSOR_TILE_WIDTH), iDivUp(kLevelHeight, PSOR_TILE_HEIGHT)); dim3 psor_blocks(iDivUp(kLevelWidth, PSOR_TILE_WIDTH), iDivUp(kLevelHeight, PSOR_TILE_HEIGHT));
dim3 psor_threads(PSOR_TILE_WIDTH, PSOR_TILE_HEIGHT); dim3 psor_threads(PSOR_TILE_WIDTH, PSOR_TILE_HEIGHT);
@ -1018,89 +900,30 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
for (Ncv32u current_inner_iteration = 0; current_inner_iteration < desc.number_of_inner_iterations; ++current_inner_iteration) for (Ncv32u current_inner_iteration = 0; current_inner_iteration < desc.number_of_inner_iterations; ++current_inner_iteration)
{ {
//compute coefficients //compute coefficients
prepare_sor_stage_1_tex<<<psor_blocks, psor_threads, 0, stream>>> prepare_sor_stage_1_tex<<<psor_blocks, psor_threads, 0, stream>>> (texU, texV, texDu, texDv, texI0, texI1, texIx, texIxx, texIx0, texIy, texIyy, texIy0, texIxy,
(diffusivity_x.ptr(), diffusivity_x.ptr(), diffusivity_y.ptr(), denom_u.ptr(), denom_v.ptr(), num_dudv.ptr(), num_u.ptr(), num_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride, alpha, gamma);
diffusivity_y.ptr(),
denom_u.ptr(),
denom_v.ptr(),
num_dudv.ptr(),
num_u.ptr(),
num_v.ptr(),
kLevelWidth,
kLevelHeight,
kLevelStride,
alpha,
gamma);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR); ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); prepare_sor_stage_2<<<psor_blocks, psor_threads, 0, stream>>>(texDiffX, texDiffY, denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
prepare_sor_stage_2<<<psor_blocks, psor_threads, 0, stream>>>(denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR); ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
// linear system coefficients
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_u, denom_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_v, denom_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
//solve linear system //solve linear system
for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration) for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration)
{ {
float omega = 1.99f; float omega = 1.99f;
sor_pass<0><<<sor_blocks, sor_threads, 0, stream>>>(texU, texV, texDu, texDv, texDiffX, texDiffY, du_new.ptr(), dv_new.ptr(), denom_u.ptr(), denom_v.ptr(),
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); num_u.ptr(), num_v.ptr(), num_dudv.ptr(), omega, kLevelWidth, kLevelHeight, kLevelStride);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
sor_pass<0><<<sor_blocks, sor_threads, 0, stream>>>
(du_new.ptr(),
dv_new.ptr(),
denom_u.ptr(),
denom_v.ptr(),
num_u.ptr(),
num_v.ptr(),
num_dudv.ptr(),
omega,
kLevelWidth,
kLevelHeight,
kLevelStride);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR); ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du_new.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv_new.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
sor_pass<1><<<sor_blocks, sor_threads, 0, stream>>>
(du.ptr(),
dv.ptr(),
denom_u.ptr(),
denom_v.ptr(),
num_u.ptr(),
num_v.ptr(),
num_dudv.ptr(),
omega,
kLevelWidth,
kLevelHeight,
kLevelStride);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR); sor_pass<1><<<sor_blocks, sor_threads, 0, stream>>>(texU, texV, texDuNew, texDvNew, texDiffX, texDiffY, du.ptr(), dv.ptr(), denom_u.ptr(), denom_v.ptr(), num_u.ptr(),
num_v.ptr(), num_dudv.ptr(), omega, kLevelWidth, kLevelHeight, kLevelStride);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR); ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
}//end of solver loop }//end of solver loop
}// end of inner loop }// end of inner loop
@ -72,6 +72,7 @@
#include "opencv2/cudalegacy/NCV.hpp" #include "opencv2/cudalegacy/NCV.hpp"
#include "opencv2/cudalegacy/NPP_staging.hpp" #include "opencv2/cudalegacy/NPP_staging.hpp"
#include "opencv2/cudalegacy/NCVHaarObjectDetection.hpp" #include "opencv2/cudalegacy/NCVHaarObjectDetection.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
#include "NCVRuntimeTemplates.hpp" #include "NCVRuntimeTemplates.hpp"
#include "NCVAlg.hpp" #include "NCVAlg.hpp"
@ -94,24 +95,6 @@ const Ncv32u NUM_THREADS_ANCHORSPARALLEL = 64;
#define NUM_THREADS_CLASSIFIERPARALLEL (1 << NUM_THREADS_CLASSIFIERPARALLEL_LOG2) #define NUM_THREADS_CLASSIFIERPARALLEL (1 << NUM_THREADS_CLASSIFIERPARALLEL_LOG2)
/** \internal
* Haar features solid array.
*/
texture<uint2, 1, cudaReadModeElementType> texHaarFeatures;
/** \internal
* Container for the flattened Haar classifier trees.
* Two parts: the first holds root nodes, the second the nodes referenced by root nodes.
* Drawback: breaks tree locality (might cause more cache misses)
* Advantage: no need for an additional 32-bit field to index root node offsets
*/
texture<uint4, 1, cudaReadModeElementType> texHaarClassifierNodes;
texture<Ncv32u, 1, cudaReadModeElementType> texIImage;
__device__ HaarStage64 getStage(Ncv32u iStage, HaarStage64 *d_Stages) __device__ HaarStage64 getStage(Ncv32u iStage, HaarStage64 *d_Stages)
{ {
return d_Stages[iStage]; return d_Stages[iStage];
@ -119,51 +102,37 @@ __device__ HaarStage64 getStage(Ncv32u iStage, HaarStage64 *d_Stages)
template <NcvBool tbCacheTextureCascade> template <NcvBool tbCacheTextureCascade>
__device__ HaarClassifierNode128 getClassifierNode(Ncv32u iNode, HaarClassifierNode128 *d_ClassifierNodes) __device__ HaarClassifierNode128 getClassifierNode(cv::cudev::TexturePtr<uint4> texHaarClassifierNodes, Ncv32u iNode, HaarClassifierNode128 *d_ClassifierNodes)
{ {
HaarClassifierNode128 tmpNode; HaarClassifierNode128 tmpNode;
if (tbCacheTextureCascade) if (tbCacheTextureCascade)
{ tmpNode._ui4 = texHaarClassifierNodes(iNode);
tmpNode._ui4 = tex1Dfetch(texHaarClassifierNodes, iNode);
}
else else
{
tmpNode = d_ClassifierNodes[iNode]; tmpNode = d_ClassifierNodes[iNode];
}
return tmpNode; return tmpNode;
} }
template <NcvBool tbCacheTextureCascade> template <NcvBool tbCacheTextureCascade>
__device__ void getFeature(Ncv32u iFeature, HaarFeature64 *d_Features, __device__ void getFeature(cv::cudev::TexturePtr<uint2> texHaarFeatures, Ncv32u iFeature, HaarFeature64* d_Features, Ncv32f* weight, Ncv32u* rectX, Ncv32u* rectY, Ncv32u* rectWidth, Ncv32u* rectHeight)
Ncv32f *weight,
Ncv32u *rectX, Ncv32u *rectY, Ncv32u *rectWidth, Ncv32u *rectHeight)
{ {
HaarFeature64 feature; HaarFeature64 feature;
if (tbCacheTextureCascade) if (tbCacheTextureCascade)
{ feature._ui2 = texHaarFeatures(iFeature);
feature._ui2 = tex1Dfetch(texHaarFeatures, iFeature);
}
else else
{
feature = d_Features[iFeature]; feature = d_Features[iFeature];
}
feature.getRect(rectX, rectY, rectWidth, rectHeight); feature.getRect(rectX, rectY, rectWidth, rectHeight);
*weight = feature.getWeight(); *weight = feature.getWeight();
} }
template <NcvBool tbCacheTextureIImg> template <NcvBool tbCacheTextureIImg>
__device__ Ncv32u getElemIImg(Ncv32u x, Ncv32u *d_IImg) __device__ Ncv32u getElemIImg(cv::cudev::TexturePtr<Ncv32u> texImg, Ncv32u x, Ncv32u *d_IImg)
{ {
if (tbCacheTextureIImg) if (tbCacheTextureIImg)
{ return texImg(x);
return tex1Dfetch(texIImage, x);
}
else else
{
return d_IImg[x]; return d_IImg[x];
}
} }
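The getElemIImg helper above feeds the standard integral-image box sum used by the kernels below: with II(x, y) holding the sum over [0, x) x [0, y), the sum over any rectangle takes only four fetches. A small worked example (illustration only):

// Source image:
//   1 2 3
//   4 5 6
//   7 8 9
// Sum of the bottom-right 2x2 block {5, 6, 8, 9}, TL = (1, 1), w = h = 2,
// in the same fetch order as the kernels (BR - BL + TL - TR):
//   II(3,3) - II(1,3) + II(1,1) - II(3,1)
// = 45 - 12 + 1 - 6 = 28 = 5 + 6 + 8 + 9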
@ -203,17 +172,10 @@ __device__ void compactBlockWriteOutAnchorParallel(Ncv32u threadPassFlag, Ncv32u
} }
template <NcvBool tbInitMaskPositively, template <NcvBool tbInitMaskPositively, NcvBool tbCacheTextureIImg, NcvBool tbCacheTextureCascade, NcvBool tbReadPixelIndexFromVector, NcvBool tbDoAtomicCompaction>
NcvBool tbCacheTextureIImg, __global__ void applyHaarClassifierAnchorParallel(cv::cudev::TexturePtr<Ncv32u> texImg, cv::cudev::TexturePtr<uint2> texHaarFeatures, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes,
NcvBool tbCacheTextureCascade, Ncv32u *d_IImg, Ncv32u IImgStride, Ncv32f *d_weights, Ncv32u weightsStride, HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, Ncv32u *d_inMask,
NcvBool tbReadPixelIndexFromVector, Ncv32u *d_outMask, Ncv32u mask1Dlen, Ncv32u mask2Dstride, NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
NcvBool tbDoAtomicCompaction>
__global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStride,
Ncv32f *d_weights, Ncv32u weightsStride,
HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{ {
Ncv32u y_offs; Ncv32u y_offs;
Ncv32u x_offs; Ncv32u x_offs;
@ -299,7 +261,7 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
{ {
while (bMoreNodesToTraverse) while (bMoreNodesToTraverse)
{ {
HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes); HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(texHaarClassifierNodes, iNode, d_ClassifierNodes);
HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc(); HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures(); Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
Ncv32u iFeature = featuresDesc.getFeaturesOffset(); Ncv32u iFeature = featuresDesc.getFeaturesOffset();
@ -310,19 +272,17 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
{ {
Ncv32f rectWeight; Ncv32f rectWeight;
Ncv32u rectX, rectY, rectWidth, rectHeight; Ncv32u rectX, rectY, rectWidth, rectHeight;
getFeature<tbCacheTextureCascade> getFeature<tbCacheTextureCascade> (texHaarFeatures, iFeature + iRect, d_Features, &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);
(iFeature + iRect, d_Features,
&rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);
Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX); Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
Ncv32u iioffsTR = iioffsTL + rectWidth; Ncv32u iioffsTR = iioffsTL + rectWidth;
Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride; Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
Ncv32u iioffsBR = iioffsBL + rectWidth; Ncv32u iioffsBR = iioffsBL + rectWidth;
Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) - Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(texImg, iioffsBR, d_IImg) -
getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) + getElemIImg<tbCacheTextureIImg>(texImg, iioffsBL, d_IImg) +
getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) - getElemIImg<tbCacheTextureIImg>(texImg, iioffsTL, d_IImg) -
getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg); getElemIImg<tbCacheTextureIImg>(texImg, iioffsTR, d_IImg);
#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY #if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight); curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
@ -393,15 +353,10 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
} }
template <NcvBool tbCacheTextureIImg, template <NcvBool tbCacheTextureIImg, NcvBool tbCacheTextureCascade, NcvBool tbDoAtomicCompaction>
NcvBool tbCacheTextureCascade, __global__ void applyHaarClassifierClassifierParallel(cv::cudev::TexturePtr<Ncv32u> texImg, cv::cudev::TexturePtr<uint2> texHaarFeatures, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes, Ncv32u *d_IImg,
NcvBool tbDoAtomicCompaction> Ncv32u IImgStride, Ncv32f *d_weights, Ncv32u weightsStride, HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, Ncv32u *d_inMask, Ncv32u *d_outMask,
__global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IImgStride, Ncv32u mask1Dlen, Ncv32u mask2Dstride, NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
Ncv32f *d_weights, Ncv32u weightsStride,
HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{ {
Ncv32u maskOffset = MAX_GRID_DIM * blockIdx.y + blockIdx.x; Ncv32u maskOffset = MAX_GRID_DIM * blockIdx.y + blockIdx.x;
@ -439,7 +394,7 @@ __global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IIm
while (bMoreNodesToTraverse) while (bMoreNodesToTraverse)
{ {
HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes); HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(texHaarClassifierNodes, iNode, d_ClassifierNodes);
HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc(); HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures(); Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
Ncv32u iFeature = featuresDesc.getFeaturesOffset(); Ncv32u iFeature = featuresDesc.getFeaturesOffset();
@ -450,19 +405,17 @@ __global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IIm
{ {
Ncv32f rectWeight; Ncv32f rectWeight;
Ncv32u rectX, rectY, rectWidth, rectHeight; Ncv32u rectX, rectY, rectWidth, rectHeight;
getFeature<tbCacheTextureCascade> getFeature<tbCacheTextureCascade> (texHaarFeatures, iFeature + iRect, d_Features, &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);
(iFeature + iRect, d_Features,
&rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);
Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX); Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
Ncv32u iioffsTR = iioffsTL + rectWidth; Ncv32u iioffsTR = iioffsTL + rectWidth;
Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride; Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
Ncv32u iioffsBR = iioffsBL + rectWidth; Ncv32u iioffsBR = iioffsBL + rectWidth;
Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) - Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(texImg, iioffsBR, d_IImg) -
getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) + getElemIImg<tbCacheTextureIImg>(texImg, iioffsBL, d_IImg) +
getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) - getElemIImg<tbCacheTextureIImg>(texImg, iioffsTL, d_IImg) -
getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg); getElemIImg<tbCacheTextureIImg>(texImg, iioffsTR, d_IImg);
#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY #if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight); curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
@ -578,8 +531,9 @@ struct applyHaarClassifierAnchorParallelFunctor
{ {
dim3 gridConf, blockConf; dim3 gridConf, blockConf;
cudaStream_t cuStream; cudaStream_t cuStream;
cv::cudev::TexturePtr<Ncv32u> texImg;
//Kernel arguments are stored as members; cv::cudev::TexturePtr<uint2> texHaarFeatures;
cv::cudev::TexturePtr<uint4> texHaarClassifierNodes;
Ncv32u *d_IImg; Ncv32u *d_IImg;
Ncv32u IImgStride; Ncv32u IImgStride;
Ncv32f *d_weights; Ncv32f *d_weights;
@ -597,32 +551,12 @@ struct applyHaarClassifierAnchorParallelFunctor
Ncv32f scaleArea; Ncv32f scaleArea;
//Arguments are passed through the constructor //Arguments are passed through the constructor
applyHaarClassifierAnchorParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream, applyHaarClassifierAnchorParallelFunctor(cv::cudev::TexturePtr<Ncv32u> texImg_, cv::cudev::TexturePtr<uint2> texHaarFeatures_, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes_, dim3 _gridConf,
Ncv32u *_d_IImg, Ncv32u _IImgStride, dim3 _blockConf, cudaStream_t _cuStream, Ncv32u *_d_IImg, Ncv32u _IImgStride, Ncv32f *_d_weights, Ncv32u _weightsStride, HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes,
Ncv32f *_d_weights, Ncv32u _weightsStride, HaarStage64 *_d_Stages, Ncv32u *_d_inMask, Ncv32u *_d_outMask, Ncv32u _mask1Dlen, Ncv32u _mask2Dstride, NcvSize32u _anchorsRoi, Ncv32u _startStageInc, Ncv32u _endStageExc, Ncv32f _scaleArea) :
HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages, gridConf(_gridConf), blockConf(_blockConf), cuStream(_cuStream), texImg(texImg_), texHaarFeatures(texHaarFeatures_), texHaarClassifierNodes(texHaarClassifierNodes_), d_IImg(_d_IImg), IImgStride(_IImgStride),
Ncv32u *_d_inMask, Ncv32u *_d_outMask, d_weights(_d_weights), weightsStride(_weightsStride), d_Features(_d_Features), d_ClassifierNodes(_d_ClassifierNodes), d_Stages(_d_Stages), d_inMask(_d_inMask), d_outMask(_d_outMask), mask1Dlen(_mask1Dlen),
Ncv32u _mask1Dlen, Ncv32u _mask2Dstride, mask2Dstride(_mask2Dstride), anchorsRoi(_anchorsRoi), startStageInc(_startStageInc), endStageExc(_endStageExc), scaleArea(_scaleArea)
NcvSize32u _anchorsRoi, Ncv32u _startStageInc,
Ncv32u _endStageExc, Ncv32f _scaleArea) :
gridConf(_gridConf),
blockConf(_blockConf),
cuStream(_cuStream),
d_IImg(_d_IImg),
IImgStride(_IImgStride),
d_weights(_d_weights),
weightsStride(_weightsStride),
d_Features(_d_Features),
d_ClassifierNodes(_d_ClassifierNodes),
d_Stages(_d_Stages),
d_inMask(_d_inMask),
d_outMask(_d_outMask),
mask1Dlen(_mask1Dlen),
mask2Dstride(_mask2Dstride),
anchorsRoi(_anchorsRoi),
startStageInc(_startStageInc),
endStageExc(_endStageExc),
scaleArea(_scaleArea)
{} {}
template<class TList> template<class TList>
@ -635,43 +569,19 @@ struct applyHaarClassifierAnchorParallelFunctor
Loki::TL::TypeAt<TList, 2>::Result::value, Loki::TL::TypeAt<TList, 2>::Result::value,
Loki::TL::TypeAt<TList, 3>::Result::value, Loki::TL::TypeAt<TList, 3>::Result::value,
Loki::TL::TypeAt<TList, 4>::Result::value > Loki::TL::TypeAt<TList, 4>::Result::value >
<<<gridConf, blockConf, 0, cuStream>>> <<<gridConf, blockConf, 0, cuStream>>> (texImg, texHaarFeatures, texHaarClassifierNodes, d_IImg, IImgStride, d_weights, weightsStride, d_Features, d_ClassifierNodes, d_Stages, d_inMask,
(d_IImg, IImgStride, d_outMask, mask1Dlen, mask2Dstride, anchorsRoi, startStageInc, endStageExc, scaleArea);
d_weights, weightsStride,
d_Features, d_ClassifierNodes, d_Stages,
d_inMask, d_outMask,
mask1Dlen, mask2Dstride,
anchorsRoi, startStageInc,
endStageExc, scaleArea);
} }
}; };
void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively, void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively, NcvBool tbCacheTextureIImg, NcvBool tbCacheTextureCascade, NcvBool tbReadPixelIndexFromVector, NcvBool tbDoAtomicCompaction,
NcvBool tbCacheTextureIImg, dim3 gridConf, dim3 blockConf, cudaStream_t cuStream, cv::cudev::TexturePtr<Ncv32u> texImg, cv::cudev::TexturePtr<uint2> texHaarFeatures, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes, Ncv32u *d_IImg,
NcvBool tbCacheTextureCascade, Ncv32u IImgStride, Ncv32f *d_weights, Ncv32u weightsStride, HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, Ncv32u *d_inMask, Ncv32u *d_outMask,
NcvBool tbReadPixelIndexFromVector, Ncv32u mask1Dlen, Ncv32u mask2Dstride, NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
NcvBool tbDoAtomicCompaction,
dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,
Ncv32u *d_IImg, Ncv32u IImgStride,
Ncv32f *d_weights, Ncv32u weightsStride,
HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u startStageInc,
Ncv32u endStageExc, Ncv32f scaleArea)
{ {
applyHaarClassifierAnchorParallelFunctor functor(texImg, texHaarFeatures, texHaarClassifierNodes, gridConf, blockConf, cuStream, d_IImg, IImgStride, d_weights, weightsStride, d_Features, d_ClassifierNodes, d_Stages,
applyHaarClassifierAnchorParallelFunctor functor(gridConf, blockConf, cuStream, d_inMask, d_outMask, mask1Dlen, mask2Dstride, anchorsRoi, startStageInc, endStageExc, scaleArea);
d_IImg, IImgStride,
d_weights, weightsStride,
d_Features, d_ClassifierNodes, d_Stages,
d_inMask, d_outMask,
mask1Dlen, mask2Dstride,
anchorsRoi, startStageInc,
endStageExc, scaleArea);
//Second parameter is the number of "dynamic" template parameters //Second parameter is the number of "dynamic" template parameters
NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 5, applyHaarClassifierAnchorParallelFunctor> NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 5, applyHaarClassifierAnchorParallelFunctor>
@ -688,8 +598,9 @@ struct applyHaarClassifierClassifierParallelFunctor
{ {
dim3 gridConf, blockConf; dim3 gridConf, blockConf;
cudaStream_t cuStream; cudaStream_t cuStream;
cv::cudev::TexturePtr<Ncv32u> texImg;
//Kernel arguments are stored as members; cv::cudev::TexturePtr<uint2> texHaarFeatures;
cv::cudev::TexturePtr<uint4> texHaarClassifierNodes;
Ncv32u *d_IImg; Ncv32u *d_IImg;
Ncv32u IImgStride; Ncv32u IImgStride;
Ncv32f *d_weights; Ncv32f *d_weights;
@ -707,32 +618,13 @@ struct applyHaarClassifierClassifierParallelFunctor
Ncv32f scaleArea; Ncv32f scaleArea;
//Arguments are passed through the constructor //Arguments are passed through the constructor
applyHaarClassifierClassifierParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream, applyHaarClassifierClassifierParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream, cv::cudev::TexturePtr<Ncv32u> texImg_, cv::cudev::TexturePtr<uint2> texHaarFeatures_,
Ncv32u *_d_IImg, Ncv32u _IImgStride, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes_, Ncv32u *_d_IImg, Ncv32u _IImgStride, Ncv32f *_d_weights, Ncv32u _weightsStride, HaarFeature64 *_d_Features,
Ncv32f *_d_weights, Ncv32u _weightsStride, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages, Ncv32u *_d_inMask, Ncv32u *_d_outMask, Ncv32u _mask1Dlen, Ncv32u _mask2Dstride, NcvSize32u _anchorsRoi,
HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages, Ncv32u _startStageInc, Ncv32u _endStageExc, Ncv32f _scaleArea) : gridConf(_gridConf), blockConf(_blockConf), cuStream(_cuStream), texImg(texImg_), texHaarFeatures(texHaarFeatures_),
Ncv32u *_d_inMask, Ncv32u *_d_outMask, texHaarClassifierNodes(texHaarClassifierNodes_), d_IImg(_d_IImg), IImgStride(_IImgStride), d_weights(_d_weights), weightsStride(_weightsStride), d_Features(_d_Features),
Ncv32u _mask1Dlen, Ncv32u _mask2Dstride, d_ClassifierNodes(_d_ClassifierNodes), d_Stages(_d_Stages), d_inMask(_d_inMask), d_outMask(_d_outMask), mask1Dlen(_mask1Dlen), mask2Dstride(_mask2Dstride), anchorsRoi(_anchorsRoi),
NcvSize32u _anchorsRoi, Ncv32u _startStageInc, startStageInc(_startStageInc), endStageExc(_endStageExc), scaleArea(_scaleArea)
Ncv32u _endStageExc, Ncv32f _scaleArea) :
gridConf(_gridConf),
blockConf(_blockConf),
cuStream(_cuStream),
d_IImg(_d_IImg),
IImgStride(_IImgStride),
d_weights(_d_weights),
weightsStride(_weightsStride),
d_Features(_d_Features),
d_ClassifierNodes(_d_ClassifierNodes),
d_Stages(_d_Stages),
d_inMask(_d_inMask),
d_outMask(_d_outMask),
mask1Dlen(_mask1Dlen),
mask2Dstride(_mask2Dstride),
anchorsRoi(_anchorsRoi),
startStageInc(_startStageInc),
endStageExc(_endStageExc),
scaleArea(_scaleArea)
{} {}
template<class TList> template<class TList>
@ -743,40 +635,19 @@ struct applyHaarClassifierClassifierParallelFunctor
Loki::TL::TypeAt<TList, 0>::Result::value, Loki::TL::TypeAt<TList, 0>::Result::value,
Loki::TL::TypeAt<TList, 1>::Result::value, Loki::TL::TypeAt<TList, 1>::Result::value,
Loki::TL::TypeAt<TList, 2>::Result::value > Loki::TL::TypeAt<TList, 2>::Result::value >
<<<gridConf, blockConf, 0, cuStream>>> <<<gridConf, blockConf, 0, cuStream>>> (texImg, texHaarFeatures, texHaarClassifierNodes, d_IImg, IImgStride, d_weights, weightsStride, d_Features, d_ClassifierNodes, d_Stages, d_inMask,
(d_IImg, IImgStride, d_outMask, mask1Dlen, mask2Dstride, anchorsRoi, startStageInc, endStageExc, scaleArea);
d_weights, weightsStride,
d_Features, d_ClassifierNodes, d_Stages,
d_inMask, d_outMask,
mask1Dlen, mask2Dstride,
anchorsRoi, startStageInc,
endStageExc, scaleArea);
} }
}; };
void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg, void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg, NcvBool tbCacheTextureCascade, NcvBool tbDoAtomicCompaction, dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,
NcvBool tbCacheTextureCascade, cv::cudev::TexturePtr<Ncv32u> texImg, cv::cudev::TexturePtr<uint2> texHaarFeatures, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes, Ncv32u *d_IImg, Ncv32u IImgStride, Ncv32f *d_weights,
NcvBool tbDoAtomicCompaction, Ncv32u weightsStride, HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, Ncv32u *d_inMask, Ncv32u *d_outMask, Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,
Ncv32u *d_IImg, Ncv32u IImgStride,
Ncv32f *d_weights, Ncv32u weightsStride,
HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u startStageInc,
Ncv32u endStageExc, Ncv32f scaleArea)
{ {
applyHaarClassifierClassifierParallelFunctor functor(gridConf, blockConf, cuStream, applyHaarClassifierClassifierParallelFunctor functor(gridConf, blockConf, cuStream, texImg, texHaarFeatures, texHaarClassifierNodes, d_IImg, IImgStride, d_weights, weightsStride, d_Features,
d_IImg, IImgStride, d_ClassifierNodes, d_Stages, d_inMask, d_outMask, mask1Dlen, mask2Dstride, anchorsRoi, startStageInc, endStageExc, scaleArea);
d_weights, weightsStride,
d_Features, d_ClassifierNodes, d_Stages,
d_inMask, d_outMask,
mask1Dlen, mask2Dstride,
anchorsRoi, startStageInc,
endStageExc, scaleArea);
//Second parameter is the number of "dynamic" template parameters //Second parameter is the number of "dynamic" template parameters
NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 3, applyHaarClassifierClassifierParallelFunctor> NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 3, applyHaarClassifierClassifierParallelFunctor>
@ -1015,31 +886,15 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
NCV_SKIP_COND_BEGIN NCV_SKIP_COND_BEGIN
cv::cudev::Texture<Ncv32u> texImg;
if (bTexCacheIImg) if (bTexCacheIImg)
{ texImg = cv::cudev::Texture<Ncv32u>((anchorsRoi.height + haar.ClassifierSize.height) * integral.pitch(), integral.ptr());
cudaChannelFormatDesc cfdTexIImage;
cfdTexIImage = cudaCreateChannelDesc<Ncv32u>();
size_t alignmentOffset; cv::cudev::Texture<uint2> texHaarFeatures;
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texIImage, integral.ptr(), cfdTexIImage, cv::cudev::Texture<uint4> texHaarClassifierNodes;
(anchorsRoi.height + haar.ClassifierSize.height) * integral.pitch()), NCV_CUDA_ERROR); if (bTexCacheCascade) {
ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR); texHaarFeatures = cv::cudev::Texture<uint2>(sizeof(HaarFeature64) * haar.NumFeatures, reinterpret_cast<uint2*>(d_HaarFeatures.ptr()));
} texHaarClassifierNodes = cv::cudev::Texture<uint4>(sizeof(HaarClassifierNode128) * haar.NumClassifierTotalNodes, reinterpret_cast<uint4*>(d_HaarNodes.ptr()));
if (bTexCacheCascade)
{
cudaChannelFormatDesc cfdTexHaarFeatures;
cudaChannelFormatDesc cfdTexHaarClassifierNodes;
cfdTexHaarFeatures = cudaCreateChannelDesc<uint2>();
cfdTexHaarClassifierNodes = cudaCreateChannelDesc<uint4>();
size_t alignmentOffset;
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarFeatures,
d_HaarFeatures.ptr(), cfdTexHaarFeatures,sizeof(HaarFeature64) * haar.NumFeatures), NCV_CUDA_ERROR);
ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarClassifierNodes,
d_HaarNodes.ptr(), cfdTexHaarClassifierNodes, sizeof(HaarClassifierNode128) * haar.NumClassifierTotalNodes), NCV_CUDA_ERROR);
ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
} }
Ncv32u stageStartAnchorParallel = 0; Ncv32u stageStartAnchorParallel = 0;
@ -1130,26 +985,10 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
dim3 grid1(((d_pixelMask.stride() + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL), dim3 grid1(((d_pixelMask.stride() + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL),
anchorsRoi.height); anchorsRoi.height);
dim3 block1(NUM_THREADS_ANCHORSPARALLEL); dim3 block1(NUM_THREADS_ANCHORSPARALLEL);
applyHaarClassifierAnchorParallelDynTemplate( applyHaarClassifierAnchorParallelDynTemplate( true, bTexCacheIImg, bTexCacheCascade, pixParallelStageStops[pixParallelStageStopsIndex] != 0, bDoAtomicCompaction, grid1, block1, cuStream,
true, //tbInitMaskPositively texImg, texHaarFeatures, texHaarClassifierNodes, integral.ptr(), integral.stride(), d_weights.ptr(), d_weights.stride(), d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
bTexCacheIImg, //tbCacheTextureIImg d_ptrNowData->ptr(), bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), 0, d_pixelMask.stride(), anchorsRoi, pixParallelStageStops[pixParallelStageStopsIndex],
bTexCacheCascade, //tbCacheTextureCascade pixParallelStageStops[pixParallelStageStopsIndex+1], scaleAreaPixels);
pixParallelStageStops[pixParallelStageStopsIndex] != 0,//tbReadPixelIndexFromVector
bDoAtomicCompaction, //tbDoAtomicCompaction
grid1,
block1,
cuStream,
integral.ptr(), integral.stride(),
d_weights.ptr(), d_weights.stride(),
d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
d_ptrNowData->ptr(),
bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
0,
d_pixelMask.stride(),
anchorsRoi,
pixParallelStageStops[pixParallelStageStopsIndex],
pixParallelStageStops[pixParallelStageStopsIndex+1],
scaleAreaPixels);
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
if (bDoAtomicCompaction) if (bDoAtomicCompaction)
@ -1200,26 +1039,10 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
} }
dim3 block2(NUM_THREADS_ANCHORSPARALLEL); dim3 block2(NUM_THREADS_ANCHORSPARALLEL);
applyHaarClassifierAnchorParallelDynTemplate( applyHaarClassifierAnchorParallelDynTemplate( false, bTexCacheIImg, bTexCacheCascade, pixParallelStageStops[pixParallelStageStopsIndex] != 0 || pixelStep != 1 || bMaskElements, bDoAtomicCompaction,
false, //tbInitMaskPositively grid2, block2, cuStream, texImg, texHaarFeatures, texHaarClassifierNodes, integral.ptr(), integral.stride(), d_weights.ptr(), d_weights.stride(), d_HaarFeatures.ptr(), d_HaarNodes.ptr(),
bTexCacheIImg, //tbCacheTextureIImg d_HaarStages.ptr(), d_ptrNowData->ptr(), bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), numDetections, d_pixelMask.stride(), anchorsRoi,
bTexCacheCascade, //tbCacheTextureCascade pixParallelStageStops[pixParallelStageStopsIndex], pixParallelStageStops[pixParallelStageStopsIndex+1], scaleAreaPixels);
pixParallelStageStops[pixParallelStageStopsIndex] != 0 || pixelStep != 1 || bMaskElements,//tbReadPixelIndexFromVector
bDoAtomicCompaction, //tbDoAtomicCompaction
grid2,
block2,
cuStream,
integral.ptr(), integral.stride(),
d_weights.ptr(), d_weights.stride(),
d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
d_ptrNowData->ptr(),
bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
numDetections,
d_pixelMask.stride(),
anchorsRoi,
pixParallelStageStops[pixParallelStageStopsIndex],
pixParallelStageStops[pixParallelStageStopsIndex+1],
scaleAreaPixels);
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
if (bDoAtomicCompaction) if (bDoAtomicCompaction)
@ -1263,24 +1086,9 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
} }
dim3 block3(NUM_THREADS_CLASSIFIERPARALLEL); dim3 block3(NUM_THREADS_CLASSIFIERPARALLEL);
applyHaarClassifierClassifierParallelDynTemplate( applyHaarClassifierClassifierParallelDynTemplate(bTexCacheIImg, bTexCacheCascade, bDoAtomicCompaction, grid3, block3, cuStream, texImg, texHaarFeatures, texHaarClassifierNodes, integral.ptr(), integral.stride(),
bTexCacheIImg, //tbCacheTextureIImg d_weights.ptr(), d_weights.stride(), d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(), d_ptrNowData->ptr(), bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), numDetections,
bTexCacheCascade, //tbCacheTextureCascade d_pixelMask.stride(), anchorsRoi, stageMiddleSwitch, stageEndClassifierParallel, scaleAreaPixels);
bDoAtomicCompaction, //tbDoAtomicCompaction
grid3,
block3,
cuStream,
integral.ptr(), integral.stride(),
d_weights.ptr(), d_weights.stride(),
d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
d_ptrNowData->ptr(),
bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
numDetections,
d_pixelMask.stride(),
anchorsRoi,
stageMiddleSwitch,
stageEndClassifierParallel,
scaleAreaPixels);
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR); ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
if (bDoAtomicCompaction) if (bDoAtomicCompaction)
@ -48,12 +48,7 @@
#include "opencv2/cudev.hpp" #include "opencv2/cudev.hpp"
#include "opencv2/cudalegacy/NPP_staging.hpp" #include "opencv2/cudalegacy/NPP_staging.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
texture<Ncv8u, 1, cudaReadModeElementType> tex8u;
texture<Ncv32u, 1, cudaReadModeElementType> tex32u;
texture<uint2, 1, cudaReadModeElementType> tex64u;
//============================================================================== //==============================================================================
// //
@ -71,7 +66,6 @@ cudaStream_t nppStGetActiveCUDAstream(void)
} }
cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream) cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream)
{ {
cudaStream_t tmp = nppStream; cudaStream_t tmp = nppStream;
@ -117,25 +111,25 @@ private:
template<class T> template<class T>
inline __device__ T readElem(T *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs); inline __device__ T readElem(cv::cudev::TexturePtr<Ncv8u> tex8u, T *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs);
template<> template<>
inline __device__ Ncv8u readElem<Ncv8u>(Ncv8u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs) inline __device__ Ncv8u readElem<Ncv8u>(cv::cudev::TexturePtr<Ncv8u> tex8u, Ncv8u* d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
{ {
return tex1Dfetch(tex8u, texOffs + srcStride * blockIdx.x + curElemOffs); return tex8u(texOffs + srcStride * blockIdx.x + curElemOffs);
} }
template<> template<>
inline __device__ Ncv32u readElem<Ncv32u>(Ncv32u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs) inline __device__ Ncv32u readElem<Ncv32u>(cv::cudev::TexturePtr<Ncv8u> tex8u, Ncv32u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
{ {
return d_src[curElemOffs]; return d_src[curElemOffs];
} }
template<> template<>
inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs) inline __device__ Ncv32f readElem<Ncv32f>(cv::cudev::TexturePtr<Ncv8u> tex8u, Ncv32f *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
{ {
return d_src[curElemOffs]; return d_src[curElemOffs];
} }
@ -160,8 +154,7 @@ inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u texOffs, Ncv32u
* \return None * \return None
*/ */
template <class T_in, class T_out, bool tbDoSqr> template <class T_in, class T_out, bool tbDoSqr>
__global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u srcStride, __global__ void scanRows(cv::cudev::TexturePtr<Ncv8u> tex8u, T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u srcStride, T_out *d_II, Ncv32u IIstride)
T_out *d_II, Ncv32u IIstride)
{ {
//advance pointers to the current line //advance pointers to the current line
if (sizeof(T_in) != 1) if (sizeof(T_in) != 1)
@ -190,7 +183,7 @@ __global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u sr
if (curElemOffs < srcWidth) if (curElemOffs < srcWidth)
{ {
//load elements //load elements
curElem = readElem<T_in>(d_src, texOffs, srcStride, curElemOffs); curElem = readElem<T_in>(tex8u, d_src, texOffs, srcStride, curElemOffs);
} }
curElemMod = _scanElemOp<T_in, T_out>::scanElemOp<tbDoSqr>(curElem); curElemMod = _scanElemOp<T_in, T_out>::scanElemOp<tbDoSqr>(curElem);
@ -224,25 +217,9 @@ template <bool tbDoSqr, class T_in, class T_out>
NCVStatus scanRowsWrapperDevice(T_in *d_src, Ncv32u srcStride, NCVStatus scanRowsWrapperDevice(T_in *d_src, Ncv32u srcStride,
T_out *d_dst, Ncv32u dstStride, NcvSize32u roi) T_out *d_dst, Ncv32u dstStride, NcvSize32u roi)
{ {
cudaChannelFormatDesc cfdTex; cv::cudev::Texture<Ncv8u> tex8u(static_cast<size_t>(roi.height * srcStride), (Ncv8u*)d_src);
size_t alignmentOffset = 0; scanRows <T_in, T_out, tbDoSqr> <<<roi.height, NUM_SCAN_THREADS, 0, nppStGetActiveCUDAstream()>>> (tex8u, d_src, 0, roi.width, srcStride, d_dst, dstStride);
if (sizeof(T_in) == 1)
{
cfdTex = cudaCreateChannelDesc<Ncv8u>();
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR);
if (alignmentOffset > 0)
{
ncvAssertCUDAReturn(cudaUnbindTexture(tex8u), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, alignmentOffset + roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR);
}
}
scanRows
<T_in, T_out, tbDoSqr>
<<<roi.height, NUM_SCAN_THREADS, 0, nppStGetActiveCUDAstream()>>>
(d_src, (Ncv32u)alignmentOffset, roi.width, srcStride, d_dst, dstStride);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR); ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
return NPPST_SUCCESS; return NPPST_SUCCESS;
} }
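
For readers new to the texture-object API, the shape of the change above reduced to a minimal sketch (kernel and launcher names are illustrative, not part of the patch): the global texture<> reference plus the cudaBindTexture/cudaUnbindTexture pair becomes a scoped cv::cudev::Texture that is passed to the kernel by value, converting to a cv::cudev::TexturePtr.

#include <opencv2/cudev/ptr2d/texture.hpp>

// Hypothetical kernel: copies n bytes read through a 1D texture object.
// tex(i) replaces the old tex1Dfetch(texRef, i).
__global__ void copyThroughTex(cv::cudev::TexturePtr<uchar> tex, uchar* dst, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = tex(i);
}

void launchCopy(uchar* d_src, uchar* d_dst, int n, cudaStream_t stream)
{
    // RAII: the texture object lives for the scope of the call, so there is
    // no global state and no unbind step (size is in bytes for 1D textures).
    cv::cudev::Texture<uchar> tex(static_cast<size_t>(n), d_src);
    copyThroughTex<<<(n + 255) / 256, 256, 0, stream>>>(tex, d_dst, n);
}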
@ -585,59 +562,25 @@ NCVStatus nppiStSqrIntegral_8u64u_C1R_host(Ncv8u *h_src, Ncv32u srcStep,
const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_X = 32; const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_X = 32;
const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_Y = 8; const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_Y = 8;
template <class T>
template<class T, NcvBool tbCacheTexture> __global__ void decimate_C1R(T* d_src, Ncv32u srcStep, T* d_dst, Ncv32u dstStep, NcvSize32u dstRoi, Ncv32u scale)
__device__ T getElem_Decimate(Ncv32u x, T *d_src);
template<>
__device__ Ncv32u getElem_Decimate<Ncv32u, true>(Ncv32u x, Ncv32u *d_src)
{
return tex1Dfetch(tex32u, x);
}
template<>
__device__ Ncv32u getElem_Decimate<Ncv32u, false>(Ncv32u x, Ncv32u *d_src)
{
return d_src[x];
}
template<>
__device__ Ncv64u getElem_Decimate<Ncv64u, true>(Ncv32u x, Ncv64u *d_src)
{
uint2 tmp = tex1Dfetch(tex64u, x);
Ncv64u res = (Ncv64u)tmp.y;
res <<= 32;
res |= tmp.x;
return res;
}
template<>
__device__ Ncv64u getElem_Decimate<Ncv64u, false>(Ncv32u x, Ncv64u *d_src)
{ {
return d_src[x]; int curX = blockIdx.x * blockDim.x + threadIdx.x;
int curY = blockIdx.y * blockDim.y + threadIdx.y;
if (curX >= dstRoi.width || curY >= dstRoi.height) return;
d_dst[curY * dstStep + curX] = d_src[(curY * srcStep + curX) * scale];
} }
template <class T>
template <class T, NcvBool tbCacheTexture> __global__ void decimate_C1R(cv::cudev::TexturePtr<T> texSrc, Ncv32u srcStep, T* d_dst, Ncv32u dstStep,
__global__ void decimate_C1R(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u dstStep, NcvSize32u dstRoi, Ncv32u scale)
NcvSize32u dstRoi, Ncv32u scale)
{ {
int curX = blockIdx.x * blockDim.x + threadIdx.x; int curX = blockIdx.x * blockDim.x + threadIdx.x;
int curY = blockIdx.y * blockDim.y + threadIdx.y; int curY = blockIdx.y * blockDim.y + threadIdx.y;
if (curX >= dstRoi.width || curY >= dstRoi.height) return;
if (curX >= dstRoi.width || curY >= dstRoi.height) d_dst[curY * dstStep + curX] = texSrc((curY * srcStep + curX) * scale);
{
return;
}
d_dst[curY * dstStep + curX] = getElem_Decimate<T, tbCacheTexture>((curY * srcStep + curX) * scale, d_src);
} }
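
The removed getElem_Decimate<Ncv64u, true> specialization is worth a note: CUDA textures have no 64-bit integer channel format, so the reference-based path fetched a uint2 and reassembled it, as sketched below with the texture-object API; the cv::cudev::Texture<Ncv64u> used in the new wrapper is assumed to perform the equivalent split internally.

// Reassembling a 64-bit value from a two-channel 32-bit texel, as the old
// texture-reference path did:
__device__ __forceinline__ Ncv64u fetch64(cudaTextureObject_t tex, int x)
{
    const uint2 tmp = tex1Dfetch<uint2>(tex, x);
    Ncv64u res = (Ncv64u)tmp.y;  // high 32 bits
    res <<= 32;
    res |= tmp.x;                // low 32 bits
    return res;
}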
template <class T> template <class T>
static NCVStatus decimateWrapperDevice(T *d_src, Ncv32u srcStep, static NCVStatus decimateWrapperDevice(T *d_src, Ncv32u srcStep,
T *d_dst, Ncv32u dstStep, T *d_dst, Ncv32u dstStep,
@ -659,39 +602,12 @@ static NCVStatus decimateWrapperDevice(T *d_src, Ncv32u srcStep,
dim3 grid((dstRoi.width + NUM_DOWNSAMPLE_NEAREST_THREADS_X - 1) / NUM_DOWNSAMPLE_NEAREST_THREADS_X, dim3 grid((dstRoi.width + NUM_DOWNSAMPLE_NEAREST_THREADS_X - 1) / NUM_DOWNSAMPLE_NEAREST_THREADS_X,
(dstRoi.height + NUM_DOWNSAMPLE_NEAREST_THREADS_Y - 1) / NUM_DOWNSAMPLE_NEAREST_THREADS_Y); (dstRoi.height + NUM_DOWNSAMPLE_NEAREST_THREADS_Y - 1) / NUM_DOWNSAMPLE_NEAREST_THREADS_Y);
dim3 block(NUM_DOWNSAMPLE_NEAREST_THREADS_X, NUM_DOWNSAMPLE_NEAREST_THREADS_Y); dim3 block(NUM_DOWNSAMPLE_NEAREST_THREADS_X, NUM_DOWNSAMPLE_NEAREST_THREADS_Y);
if (!readThruTexture) {
if (!readThruTexture) decimate_C1R<T><<<grid, block, 0, nppStGetActiveCUDAstream()>>>(d_src, srcStep, d_dst, dstStep, dstRoi, scale);
{
decimate_C1R
<T, false>
<<<grid, block, 0, nppStGetActiveCUDAstream()>>>
(d_src, srcStep, d_dst, dstStep, dstRoi, scale);
} }
else else {
{ cv::cudev::Texture<T> texSrc(srcRoi.height * srcStep * sizeof(T), d_src);
cudaChannelFormatDesc cfdTexSrc; decimate_C1R<T><<<grid, block, 0, nppStGetActiveCUDAstream()>>>(texSrc, srcStep, d_dst, dstStep, dstRoi, scale);
if (sizeof(T) == sizeof(Ncv32u))
{
cfdTexSrc = cudaCreateChannelDesc<Ncv32u>();
size_t alignmentOffset;
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex32u, d_src, cfdTexSrc, srcRoi.height * srcStep * sizeof(T)), NPPST_TEXTURE_BIND_ERROR);
ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
}
else
{
cfdTexSrc = cudaCreateChannelDesc<uint2>();
size_t alignmentOffset;
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex64u, d_src, cfdTexSrc, srcRoi.height * srcStep * sizeof(T)), NPPST_TEXTURE_BIND_ERROR);
ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
}
decimate_C1R
<T, true>
<<<grid, block, 0, nppStGetActiveCUDAstream()>>>
(d_src, srcStep, d_dst, dstStep, dstRoi, scale);
} }
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR); ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
@ -753,11 +669,7 @@ static NCVStatus decimateWrapperHost(T *h_src, Ncv32u srcStep,
implementNppDecimate(32, u) implementNppDecimate(32, u)
implementNppDecimate(32, s)
implementNppDecimate(32, f)
implementNppDecimate(64, u) implementNppDecimate(64, u)
implementNppDecimate(64, s)
implementNppDecimate(64, f)
implementNppDecimateHost(32, u) implementNppDecimateHost(32, u)
implementNppDecimateHost(32, s) implementNppDecimateHost(32, s)
implementNppDecimateHost(32, f) implementNppDecimateHost(32, f)
@ -776,43 +688,29 @@ implementNppDecimateHost(64, f)
const Ncv32u NUM_RECTSTDDEV_THREADS = 128; const Ncv32u NUM_RECTSTDDEV_THREADS = 128;
template <NcvBool tbCacheTexture> template <NcvBool tbCacheTexture, class Ptr2D>
__device__ Ncv32u getElemSum(Ncv32u x, Ncv32u *d_sum) __device__ Ncv32u getElemSum(Ptr2D tex, Ncv32u x, Ncv32u *d_sum)
{ {
if (tbCacheTexture) if (tbCacheTexture)
{ return tex(x);
return tex1Dfetch(tex32u, x);
}
else else
{
return d_sum[x]; return d_sum[x];
}
} }
template <NcvBool tbCacheTexture> template <NcvBool tbCacheTexture, class Ptr2D>
__device__ Ncv64u getElemSqSum(Ncv32u x, Ncv64u *d_sqsum) __device__ Ncv64u getElemSqSum(Ptr2D tex, Ncv32u x, Ncv64u *d_sqsum)
{ {
if (tbCacheTexture) if (tbCacheTexture)
{ return tex(x);
uint2 tmp = tex1Dfetch(tex64u, x);
Ncv64u res = (Ncv64u)tmp.y;
res <<= 32;
res |= tmp.x;
return res;
}
else else
{
return d_sqsum[x]; return d_sqsum[x];
}
} }
template <NcvBool tbCacheTexture> template <NcvBool tbCacheTexture>
__global__ void rectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep, __global__ void rectStdDev_32f_C1R(cv::cudev::TexturePtr<Ncv32u> texSum, cv::cudev::TexturePtr<Ncv64u> texSumSq, Ncv32u *d_sum, Ncv32u sumStep, Ncv64u *d_sqsum, Ncv32u sqsumStep,
Ncv64u *d_sqsum, Ncv32u sqsumStep, Ncv32f *d_norm, Ncv32u normStep, NcvSize32u roi, NcvRect32u rect, Ncv32f invRectArea)
Ncv32f *d_norm, Ncv32u normStep,
NcvSize32u roi, NcvRect32u rect, Ncv32f invRectArea)
{ {
Ncv32u x_offs = blockIdx.x * NUM_RECTSTDDEV_THREADS + threadIdx.x; Ncv32u x_offs = blockIdx.x * NUM_RECTSTDDEV_THREADS + threadIdx.x;
if (x_offs >= roi.width) if (x_offs >= roi.width)
@ -824,17 +722,17 @@ __global__ void rectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
Ncv32u sqsum_offset = blockIdx.y * sqsumStep + x_offs; Ncv32u sqsum_offset = blockIdx.y * sqsumStep + x_offs;
//OPT: try swapping order (could change cache hit/miss ratio) //OPT: try swapping order (could change cache hit/miss ratio)
Ncv32u sum_tl = getElemSum<tbCacheTexture>(sum_offset + rect.y * sumStep + rect.x, d_sum); Ncv32u sum_tl = getElemSum<tbCacheTexture>(texSum, sum_offset + rect.y * sumStep + rect.x, d_sum);
Ncv32u sum_bl = getElemSum<tbCacheTexture>(sum_offset + (rect.y + rect.height) * sumStep + rect.x, d_sum); Ncv32u sum_bl = getElemSum<tbCacheTexture>(texSum, sum_offset + (rect.y + rect.height) * sumStep + rect.x, d_sum);
Ncv32u sum_tr = getElemSum<tbCacheTexture>(sum_offset + rect.y * sumStep + rect.x + rect.width, d_sum); Ncv32u sum_tr = getElemSum<tbCacheTexture>(texSum, sum_offset + rect.y * sumStep + rect.x + rect.width, d_sum);
Ncv32u sum_br = getElemSum<tbCacheTexture>(sum_offset + (rect.y + rect.height) * sumStep + rect.x + rect.width, d_sum); Ncv32u sum_br = getElemSum<tbCacheTexture>(texSum, sum_offset + (rect.y + rect.height) * sumStep + rect.x + rect.width, d_sum);
Ncv32u sum_val = sum_br + sum_tl - sum_tr - sum_bl; Ncv32u sum_val = sum_br + sum_tl - sum_tr - sum_bl;
Ncv64u sqsum_tl, sqsum_bl, sqsum_tr, sqsum_br; Ncv64u sqsum_tl, sqsum_bl, sqsum_tr, sqsum_br;
sqsum_tl = getElemSqSum<tbCacheTexture>(sqsum_offset + rect.y * sqsumStep + rect.x, d_sqsum); sqsum_tl = getElemSqSum<tbCacheTexture>(texSumSq, sqsum_offset + rect.y * sqsumStep + rect.x, d_sqsum);
sqsum_bl = getElemSqSum<tbCacheTexture>(sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x, d_sqsum); sqsum_bl = getElemSqSum<tbCacheTexture>(texSumSq, sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x, d_sqsum);
sqsum_tr = getElemSqSum<tbCacheTexture>(sqsum_offset + rect.y * sqsumStep + rect.x + rect.width, d_sqsum); sqsum_tr = getElemSqSum<tbCacheTexture>(texSumSq, sqsum_offset + rect.y * sqsumStep + rect.x + rect.width, d_sqsum);
sqsum_br = getElemSqSum<tbCacheTexture>(sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x + rect.width, d_sqsum); sqsum_br = getElemSqSum<tbCacheTexture>(texSumSq, sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x + rect.width, d_sqsum);
Ncv64u sqsum_val = sqsum_br + sqsum_tl - sqsum_tr - sqsum_bl; Ncv64u sqsum_val = sqsum_br + sqsum_tl - sqsum_tr - sqsum_bl;
Ncv32f mean = sum_val * invRectArea; Ncv32f mean = sum_val * invRectArea;
@ -897,31 +795,12 @@ NCVStatus nppiStRectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
dim3 grid(((roi.width + NUM_RECTSTDDEV_THREADS - 1) / NUM_RECTSTDDEV_THREADS), roi.height); dim3 grid(((roi.width + NUM_RECTSTDDEV_THREADS - 1) / NUM_RECTSTDDEV_THREADS), roi.height);
dim3 block(NUM_RECTSTDDEV_THREADS); dim3 block(NUM_RECTSTDDEV_THREADS);
cv::cudev::Texture<Ncv32u> texSum((roi.height + rect.y + rect.height) * sumStep * sizeof(Ncv32u), d_sum);
cv::cudev::Texture<Ncv64u> texSumSq((roi.height + rect.y + rect.height) * sqsumStep * sizeof(Ncv64u), d_sqsum);
if (!readThruTexture) if (!readThruTexture)
{ rectStdDev_32f_C1R<false><<<grid, block, 0, nppStGetActiveCUDAstream()>>>(texSum, texSumSq, d_sum, sumStep, d_sqsum, sqsumStep, d_norm, normStep, roi, rect, invRectArea);
rectStdDev_32f_C1R
<false>
<<<grid, block, 0, nppStGetActiveCUDAstream()>>>
(d_sum, sumStep, d_sqsum, sqsumStep, d_norm, normStep, roi, rect, invRectArea);
}
else else
{ rectStdDev_32f_C1R<true><<<grid, block, 0, nppStGetActiveCUDAstream()>>>(texSum, texSumSq, NULL, sumStep, NULL, sqsumStep, d_norm, normStep, roi, rect, invRectArea);
cudaChannelFormatDesc cfdTexSrc;
cudaChannelFormatDesc cfdTexSqr;
cfdTexSrc = cudaCreateChannelDesc<Ncv32u>();
cfdTexSqr = cudaCreateChannelDesc<uint2>();
size_t alignmentOffset;
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex32u, d_sum, cfdTexSrc, (roi.height + rect.y + rect.height) * sumStep * sizeof(Ncv32u)), NPPST_TEXTURE_BIND_ERROR);
ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex64u, d_sqsum, cfdTexSqr, (roi.height + rect.y + rect.height) * sqsumStep * sizeof(Ncv64u)), NPPST_TEXTURE_BIND_ERROR);
ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
rectStdDev_32f_C1R
<true>
<<<grid, block, 0, nppStGetActiveCUDAstream()>>>
(NULL, sumStep, NULL, sqsumStep, d_norm, normStep, roi, rect, invRectArea);
}
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR); ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
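
For reference, the statistics this kernel computes: four corner fetches per integral image give a rectangle sum in O(1), from which mean and standard deviation follow. A host-side sketch (names illustrative; the kernel's tail is cut off in this hunk, so the stddev line assumes the conventional sqrt(E[x^2] - mean^2) with a clamp against slightly negative variance from rounding; steps are in elements):

float rectStdDevRef(const Ncv32u* sum, Ncv32u sumStep,     // integral image
                    const Ncv64u* sqsum, Ncv32u sqsumStep, // squared integral image
                    int x, int y, int w, int h, float invRectArea)
{
    const Ncv32u s = sum[(y + h) * sumStep + x + w] + sum[y * sumStep + x]
                   - sum[y * sumStep + x + w]       - sum[(y + h) * sumStep + x];
    const Ncv64u sq = sqsum[(y + h) * sqsumStep + x + w] + sqsum[y * sqsumStep + x]
                    - sqsum[y * sqsumStep + x + w]       - sqsum[(y + h) * sqsumStep + x];
    const float mean = s * invRectArea;
    return sqrtf(fmaxf((float)sq * invRectArea - mean * mean, 0.0f));
}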
@ -1553,40 +1432,24 @@ NCVStatus nppsStCompact_32f_host(Ncv32f *h_src, Ncv32u srcLen,
// //
//============================================================================== //==============================================================================
__forceinline__ __device__ float getValueMirrorRow(cv::cudev::TexturePtr<Ncv32f> tex, const int rowOffset, int i, int w)
texture <float, 1, cudaReadModeElementType> texSrc;
texture <float, 1, cudaReadModeElementType> texKernel;
__forceinline__ __device__ float getValueMirrorRow(const int rowOffset,
int i,
int w)
{ {
if (i < 0) i = 1 - i; if (i < 0) i = 1 - i;
if (i >= w) i = w + w - i - 1; if (i >= w) i = w + w - i - 1;
return tex1Dfetch (texSrc, rowOffset + i); return tex(rowOffset + i);
} }
__forceinline__ __device__ float getValueMirrorColumn(const int offset, __forceinline__ __device__ float getValueMirrorColumn(cv::cudev::TexturePtr<Ncv32f> tex, const int offset, const int rowStep, int j, int h)
const int rowStep,
int j,
int h)
{ {
if (j < 0) j = 1 - j; if (j < 0) j = 1 - j;
if (j >= h) j = h + h - j - 1; if (j >= h) j = h + h - j - 1;
return tex1Dfetch (texSrc, offset + j * rowStep); return tex(offset + j * rowStep);
} }
__global__ void FilterRowBorderMirror_32f_C1R(Ncv32u srcStep, __global__ void FilterRowBorderMirror_32f_C1R(cv::cudev::TexturePtr<Ncv32f> texSrc, cv::cudev::TexturePtr<Ncv32f> texKernel1, Ncv32u srcStep, Ncv32f *pDst, NcvSize32u dstSize, Ncv32u dstStep,
Ncv32f *pDst, NcvRect32u roi, Ncv32s nKernelSize, Ncv32s nAnchor, Ncv32f multiplier)
NcvSize32u dstSize,
Ncv32u dstStep,
NcvRect32u roi,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier)
{ {
// position within ROI // position within ROI
const int ix = blockDim.x * blockIdx.x + threadIdx.x; const int ix = blockDim.x * blockIdx.x + threadIdx.x;
@ -1606,22 +1469,16 @@ __global__ void FilterRowBorderMirror_32f_C1R(Ncv32u srcStep,
float sum = 0.0f; float sum = 0.0f;
for (int m = 0; m < nKernelSize; ++m) for (int m = 0; m < nKernelSize; ++m)
{ {
sum += getValueMirrorRow (rowOffset, ix + m - p, roi.width) sum += getValueMirrorRow(texSrc, rowOffset, ix + m - p, roi.width)
* tex1Dfetch (texKernel, m); * texKernel1(m);
} }
pDst[iy * dstStep + ix] = sum * multiplier; pDst[iy * dstStep + ix] = sum * multiplier;
} }
__global__ void FilterColumnBorderMirror_32f_C1R(Ncv32u srcStep, __global__ void FilterColumnBorderMirror_32f_C1R(cv::cudev::TexturePtr<Ncv32f> texSrc, cv::cudev::TexturePtr<Ncv32f> texKernel, Ncv32u srcStep, Ncv32f *pDst, NcvSize32u dstSize, Ncv32u dstStep,
Ncv32f *pDst, NcvRect32u roi, Ncv32s nKernelSize, Ncv32s nAnchor, Ncv32f multiplier)
NcvSize32u dstSize,
Ncv32u dstStep,
NcvRect32u roi,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier)
{ {
const int ix = blockDim.x * blockIdx.x + threadIdx.x; const int ix = blockDim.x * blockIdx.x + threadIdx.x;
const int iy = blockDim.y * blockIdx.y + threadIdx.y; const int iy = blockDim.y * blockIdx.y + threadIdx.y;
@ -1638,15 +1495,15 @@ __global__ void FilterColumnBorderMirror_32f_C1R(Ncv32u srcStep,
float sum = 0.0f; float sum = 0.0f;
for (int m = 0; m < nKernelSize; ++m) for (int m = 0; m < nKernelSize; ++m)
{ {
sum += getValueMirrorColumn (offset, srcStep, iy + m - p, roi.height) sum += getValueMirrorColumn(texSrc, offset, srcStep, iy + m - p, roi.height)
* tex1Dfetch (texKernel, m); * texKernel(m);
} }
pDst[ix + iy * dstStep] = sum * multiplier; pDst[ix + iy * dstStep] = sum * multiplier;
} }
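
The reflection both helpers apply, isolated for clarity (a sketch; it assumes the out-of-range offset is less than one full period, which holds here because the kernel anchor never exceeds the kernel size):

// Mirror indexing as used by getValueMirrorRow/getValueMirrorColumn:
//   i < 0   ->  1 - i          (e.g. -1 maps to 2)
//   i >= w  ->  2 * w - i - 1  (e.g.  w maps to w - 1)
__device__ __forceinline__ int mirrorIndex(int i, int w)
{
    if (i < 0)  i = 1 - i;
    if (i >= w) i = w + w - i - 1;
    return i;
}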
NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc, NCVStatus nppiStFilterRowBorder_32f_C1R(Ncv32f *pSrc,
NcvSize32u srcSize, NcvSize32u srcSize,
Ncv32u nSrcStep, Ncv32u nSrcStep,
Ncv32f *pDst, Ncv32f *pDst,
@ -1654,7 +1511,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
Ncv32u nDstStep, Ncv32u nDstStep,
NcvRect32u oROI, NcvRect32u oROI,
NppStBorderType borderType, NppStBorderType borderType,
const Ncv32f *pKernel, Ncv32f *pKernel,
Ncv32s nKernelSize, Ncv32s nKernelSize,
Ncv32s nAnchor, Ncv32s nAnchor,
Ncv32f multiplier) Ncv32f multiplier)
@ -1686,12 +1543,8 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
oROI.height = srcSize.height - oROI.y; oROI.height = srcSize.height - oROI.y;
} }
cudaChannelFormatDesc floatChannel = cudaCreateChannelDesc <float> (); cv::cudev::Texture<Ncv32f> texSrc(srcSize.height * nSrcStep, pSrc);
texSrc.normalized = false; cv::cudev::Texture<Ncv32f> texKernel(nKernelSize * sizeof(Ncv32f), pKernel);
texKernel.normalized = false;
cudaBindTexture (0, texSrc, pSrc, floatChannel, srcSize.height * nSrcStep);
cudaBindTexture (0, texKernel, pKernel, floatChannel, nKernelSize * sizeof (Ncv32f));
dim3 ctaSize (32, 6); dim3 ctaSize (32, 6);
dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x, dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x,
@ -1706,8 +1559,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
case nppStBorderWrap: case nppStBorderWrap:
return NPPST_ERROR; return NPPST_ERROR;
case nppStBorderMirror: case nppStBorderMirror:
FilterRowBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>> FilterRowBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>(texSrc, texKernel, srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
(srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR); ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
break; break;
default: default:
@ -1718,7 +1570,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
} }
NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc, NCVStatus nppiStFilterColumnBorder_32f_C1R(Ncv32f *pSrc,
NcvSize32u srcSize, NcvSize32u srcSize,
Ncv32u nSrcStep, Ncv32u nSrcStep,
Ncv32f *pDst, Ncv32f *pDst,
@ -1726,7 +1578,7 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
Ncv32u nDstStep, Ncv32u nDstStep,
NcvRect32u oROI, NcvRect32u oROI,
NppStBorderType borderType, NppStBorderType borderType,
const Ncv32f *pKernel, Ncv32f *pKernel,
Ncv32s nKernelSize, Ncv32s nKernelSize,
Ncv32s nAnchor, Ncv32s nAnchor,
Ncv32f multiplier) Ncv32f multiplier)
@ -1758,12 +1610,8 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
oROI.height = srcSize.height - oROI.y; oROI.height = srcSize.height - oROI.y;
} }
cudaChannelFormatDesc floatChannel = cudaCreateChannelDesc <float> (); cv::cudev::Texture<Ncv32f> texSrc(srcSize.height * nSrcStep, pSrc);
texSrc.normalized = false; cv::cudev::Texture<Ncv32f> texKernel(nKernelSize * sizeof(Ncv32f), pKernel);
texKernel.normalized = false;
cudaBindTexture (0, texSrc, pSrc, floatChannel, srcSize.height * nSrcStep);
cudaBindTexture (0, texKernel, pKernel, floatChannel, nKernelSize * sizeof (Ncv32f));
dim3 ctaSize (32, 6); dim3 ctaSize (32, 6);
dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x, dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x,
@ -1776,8 +1624,7 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
case nppStBorderWrap: case nppStBorderWrap:
return NPPST_ERROR; return NPPST_ERROR;
case nppStBorderMirror: case nppStBorderMirror:
FilterColumnBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>> FilterColumnBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>(texSrc, texKernel, srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
(srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR); ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
break; break;
default: default:
@ -1800,16 +1647,11 @@ inline Ncv32u iDivUp(Ncv32u num, Ncv32u denom)
return (num + denom - 1)/denom; return (num + denom - 1)/denom;
} }
__global__ void BlendFramesKernel(cv::cudev::TexturePtr<Ncv32f> texSrc0, cv::cudev::TexturePtr<Ncv32f> texSrc1,
texture<float, 2, cudaReadModeElementType> tex_src1; const float *u, const float *v, // forward flow
texture<float, 2, cudaReadModeElementType> tex_src0; const float *ur, const float *vr, // backward flow
const float *o0, const float *o1, // coverage masks
int w, int h, int s, float theta, float *out)
__global__ void BlendFramesKernel(const float *u, const float *v, // forward flow
const float *ur, const float *vr, // backward flow
const float *o0, const float *o1, // coverage masks
int w, int h, int s,
float theta, float *out)
{ {
const int ix = threadIdx.x + blockDim.x * blockIdx.x; const int ix = threadIdx.x + blockDim.x * blockIdx.x;
const int iy = threadIdx.y + blockDim.y * blockIdx.y; const int iy = threadIdx.y + blockDim.y * blockIdx.y;
@ -1829,27 +1671,17 @@ __global__ void BlendFramesKernel(const float *u, const float *v, // forward f
bool b0 = o0[pos] > 1e-4f; bool b0 = o0[pos] > 1e-4f;
bool b1 = o1[pos] > 1e-4f; bool b1 = o1[pos] > 1e-4f;
if (b0 && b1) if (b0 && b1) // pixel is visible on both frames
{ out[pos] = texSrc0(y - _v * theta, x - _u * theta) * (1.0f - theta) + texSrc1(y + _v * (1.0f - theta), x + _u * (1.0f - theta)) * theta;
// pixel is visible on both frames else if (b0) // visible on the first frame only
out[pos] = tex2D(tex_src0, x - _u * theta, y - _v * theta) * (1.0f - theta) + out[pos] = texSrc0(y - _v * theta, x - _u * theta);
tex2D(tex_src1, x + _u * (1.0f - theta), y + _v * (1.0f - theta)) * theta; else // visible on the second frame only
} out[pos] = texSrc1(y - _vr * (1.0f - theta), x - _ur * (1.0f - theta));
else if (b0)
{
// visible on the first frame only
out[pos] = tex2D(tex_src0, x - _u * theta, y - _v * theta);
}
else
{
// visible on the second frame only
out[pos] = tex2D(tex_src1, x - _ur * (1.0f - theta), y - _vr * (1.0f - theta));
}
} }
NCVStatus BlendFrames(const Ncv32f *src0, NCVStatus BlendFrames(Ncv32f *src0,
const Ncv32f *src1, Ncv32f *src1,
const Ncv32f *ufi, const Ncv32f *ufi,
const Ncv32f *vfi, const Ncv32f *vfi,
const Ncv32f *ubi, const Ncv32f *ubi,
@ -1862,29 +1694,13 @@ NCVStatus BlendFrames(const Ncv32f *src0,
Ncv32f theta, Ncv32f theta,
Ncv32f *out) Ncv32f *out)
{ {
tex_src1.addressMode[0] = cudaAddressModeClamp;
tex_src1.addressMode[1] = cudaAddressModeClamp;
tex_src1.filterMode = cudaFilterModeLinear;
tex_src1.normalized = false;
tex_src0.addressMode[0] = cudaAddressModeClamp;
tex_src0.addressMode[1] = cudaAddressModeClamp;
tex_src0.filterMode = cudaFilterModeLinear;
tex_src0.normalized = false;
cudaChannelFormatDesc desc = cudaCreateChannelDesc <float> ();
const Ncv32u pitch = stride * sizeof (float); const Ncv32u pitch = stride * sizeof (float);
ncvAssertCUDAReturn (cudaBindTexture2D (0, tex_src1, src1, desc, width, height, pitch), NPPST_TEXTURE_BIND_ERROR); cv::cudev::Texture<Ncv32f> texSrc0(height, width, src0, pitch, false, cudaFilterModeLinear);
ncvAssertCUDAReturn (cudaBindTexture2D (0, tex_src0, src0, desc, width, height, pitch), NPPST_TEXTURE_BIND_ERROR); cv::cudev::Texture<Ncv32f> texSrc1(height, width, src1, pitch, false, cudaFilterModeLinear);
dim3 threads (32, 4); dim3 threads (32, 4);
dim3 blocks (iDivUp (width, threads.x), iDivUp (height, threads.y)); dim3 blocks (iDivUp (width, threads.x), iDivUp (height, threads.y));
BlendFramesKernel<<<blocks, threads, 0, nppStGetActiveCUDAstream ()>>>(texSrc0, texSrc1, ufi, vfi, ubi, vbi, o1, o2, width, height, stride, theta, out);
BlendFramesKernel<<<blocks, threads, 0, nppStGetActiveCUDAstream ()>>>
(ufi, vfi, ubi, vbi, o1, o2, width, height, stride, theta, out);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR); ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
return NPPST_SUCCESS; return NPPST_SUCCESS;
} }
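
A pattern that recurs throughout this patch and is easy to miss in the diff: tex2D(ref, x, y) takes the column first, while cv::cudev::TexturePtr's operator() takes (row, column), so every migrated call site swaps its arguments. A minimal sketch (kernel is illustrative):

// tex2D(ref, x, y) becomes tex(y, x) with the texture-object wrapper.
__global__ void sampleBilinear(cv::cudev::TexturePtr<float> tex, float* out,
                               int w, int h, int strideElems)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= w || y >= h) return;
    // With cudaFilterModeLinear, sampling at the texel centre (+0.5f) returns
    // the texel itself; fractional offsets give hardware bilinear blending.
    out[y * strideElems + x] = tex(y + 0.5f, x + 0.5f);
}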
@ -2255,44 +2071,27 @@ NCVStatus nppiStVectorWarp_PSF2x2_32f_C1(const Ncv32f *pSrc,
// //
//============================================================================== //==============================================================================
texture <float, 2, cudaReadModeElementType> texSrc2D;
__forceinline__ __forceinline__
__device__ float processLine(int spos, __device__ float processLine(cv::cudev::TexturePtr<Ncv32f> tex, int spos, float xmin, float xmax, int ixmin, int ixmax, float fxmin, float cxmax)
float xmin,
float xmax,
int ixmin,
int ixmax,
float fxmin,
float cxmax)
{ {
// first element // first element
float wsum = 1.0f - xmin + fxmin; float wsum = 1.0f - xmin + fxmin;
float sum = tex1Dfetch(texSrc, spos) * (1.0f - xmin + fxmin); float sum = tex(spos) * (1.0f - xmin + fxmin);
spos++; spos++;
for (int ix = ixmin + 1; ix < ixmax; ++ix) for (int ix = ixmin + 1; ix < ixmax; ++ix)
{ {
sum += tex1Dfetch(texSrc, spos); sum += tex(spos);
spos++; spos++;
wsum += 1.0f; wsum += 1.0f;
} }
sum += tex1Dfetch(texSrc, spos) * (cxmax - xmax); sum += tex(spos) * (cxmax - xmax);
wsum += cxmax - xmax; wsum += cxmax - xmax;
return sum / wsum; return sum / wsum;
} }
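
What processLine computes, restated on the host (a sketch; the device version receives the floor/ceil values precomputed by its caller): a weighted mean over [xmin, xmax] where the two partial end texels get fractional weights and interior texels weight 1.

float processLineRef(const float* row, float xmin, float xmax)
{
    const int ixmin = (int)floorf(xmin), ixmax = (int)floorf(xmax);
    const float fxmin = floorf(xmin), cxmax = ceilf(xmax);
    float wsum = 1.0f - xmin + fxmin;            // partial first texel
    float sum  = row[ixmin] * wsum;
    for (int ix = ixmin + 1; ix < ixmax; ++ix)   // full interior texels
    {
        sum  += row[ix];
        wsum += 1.0f;
    }
    sum  += row[ixmax] * (cxmax - xmax);         // partial last texel
    wsum += cxmax - xmax;
    return sum / wsum;
}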
__global__ void resizeSuperSample_32f(NcvSize32u srcSize, __global__ void resizeSuperSample_32f(cv::cudev::TexturePtr<Ncv32f> texSrc, NcvSize32u srcSize, Ncv32u srcStep, NcvRect32u srcROI, Ncv32f *dst, NcvSize32u dstSize, Ncv32u dstStep,
Ncv32u srcStep, NcvRect32u dstROI, Ncv32f scaleX, Ncv32f scaleY)
NcvRect32u srcROI,
Ncv32f *dst,
NcvSize32u dstSize,
Ncv32u dstStep,
NcvRect32u dstROI,
Ncv32f scaleX,
Ncv32f scaleY)
{ {
// position within dst ROI // position within dst ROI
const int ix = blockIdx.x * blockDim.x + threadIdx.x; const int ix = blockIdx.x * blockDim.x + threadIdx.x;
@ -2332,18 +2131,18 @@ __global__ void resizeSuperSample_32f(NcvSize32u srcSize,
float wsum = 1.0f - yBegin + floorYBegin; float wsum = 1.0f - yBegin + floorYBegin;
float sum = processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin, float sum = processLine (texSrc, pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
ceilXEnd) * (1.0f - yBegin + floorYBegin); ceilXEnd) * (1.0f - yBegin + floorYBegin);
pos += srcStep; pos += srcStep;
for (int iy = iYBegin + 1; iy < iYEnd; ++iy) for (int iy = iYBegin + 1; iy < iYEnd; ++iy)
{ {
sum += processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin, sum += processLine (texSrc, pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
ceilXEnd); ceilXEnd);
pos += srcStep; pos += srcStep;
wsum += 1.0f; wsum += 1.0f;
} }
sum += processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin, sum += processLine (texSrc, pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
ceilXEnd) * (ceilYEnd - yEnd); ceilXEnd) * (ceilYEnd - yEnd);
wsum += ceilYEnd - yEnd; wsum += ceilYEnd - yEnd;
sum /= wsum; sum /= wsum;
@ -2372,14 +2171,7 @@ __device__ float bicubicCoeff(float x_)
} }
__global__ void resizeBicubic(NcvSize32u srcSize, __global__ void resizeBicubic(cv::cudev::TexturePtr<Ncv32f> texSrc, NcvSize32u srcSize, NcvRect32u srcROI, NcvSize32u dstSize, Ncv32u dstStep, Ncv32f *dst, NcvRect32u dstROI, Ncv32f scaleX, Ncv32f scaleY)
NcvRect32u srcROI,
NcvSize32u dstSize,
Ncv32u dstStep,
Ncv32f *dst,
NcvRect32u dstROI,
Ncv32f scaleX,
Ncv32f scaleY)
{ {
const int ix = blockIdx.x * blockDim.x + threadIdx.x; const int ix = blockIdx.x * blockDim.x + threadIdx.x;
const int iy = blockIdx.y * blockDim.y + threadIdx.y; const int iy = blockIdx.y * blockDim.y + threadIdx.y;
@ -2433,7 +2225,7 @@ __global__ void resizeBicubic(NcvSize32u srcSize,
float wx = bicubicCoeff (xDist); float wx = bicubicCoeff (xDist);
float wy = bicubicCoeff (yDist); float wy = bicubicCoeff (yDist);
wx *= wy; wx *= wy;
sum += wx * tex2D (texSrc2D, cx * dx, cy * dy); sum += wx * texSrc(cy * dy, cx * dx);
wsum += wx; wsum += wx;
} }
} }
@ -2441,7 +2233,7 @@ __global__ void resizeBicubic(NcvSize32u srcSize,
} }
NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc, NCVStatus nppiStResize_32f_C1R(Ncv32f *pSrc,
NcvSize32u srcSize, NcvSize32u srcSize,
Ncv32u nSrcStep, Ncv32u nSrcStep,
NcvRect32u srcROI, NcvRect32u srcROI,
@ -2469,33 +2261,17 @@ NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc,
if (interpolation == nppStSupersample) if (interpolation == nppStSupersample)
{ {
// bind texture cv::cudev::Texture<Ncv32f> texSrc(srcSize.height * nSrcStep, pSrc);
cudaBindTexture (0, texSrc, pSrc, srcSize.height * nSrcStep);
// invoke kernel
dim3 ctaSize (32, 6); dim3 ctaSize (32, 6);
dim3 gridSize ((dstROI.width + ctaSize.x - 1) / ctaSize.x, dim3 gridSize ((dstROI.width + ctaSize.x - 1) / ctaSize.x,(dstROI.height + ctaSize.y - 1) / ctaSize.y);
(dstROI.height + ctaSize.y - 1) / ctaSize.y); resizeSuperSample_32f <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>> (texSrc, srcSize, srcStep, srcROI, pDst, dstSize, dstStep, dstROI, 1.0f / xFactor, 1.0f / yFactor);
resizeSuperSample_32f <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
(srcSize, srcStep, srcROI, pDst, dstSize, dstStep, dstROI, 1.0f / xFactor, 1.0f / yFactor);
} }
else if (interpolation == nppStBicubic) else if (interpolation == nppStBicubic)
{ {
texSrc2D.addressMode[0] = cudaAddressModeMirror; cv::cudev::Texture<float> texSrc(srcSize.height, srcSize.width, pSrc, nSrcStep, true, cudaFilterModePoint, cudaAddressModeMirror);
texSrc2D.addressMode[1] = cudaAddressModeMirror;
texSrc2D.normalized = true;
cudaChannelFormatDesc desc = cudaCreateChannelDesc <float> ();
cudaBindTexture2D (0, texSrc2D, pSrc, desc, srcSize.width, srcSize.height,
nSrcStep);
dim3 ctaSize (32, 6); dim3 ctaSize (32, 6);
dim3 gridSize ((dstSize.width + ctaSize.x - 1) / ctaSize.x, dim3 gridSize ((dstSize.width + ctaSize.x - 1) / ctaSize.x, (dstSize.height + ctaSize.y - 1) / ctaSize.y);
(dstSize.height + ctaSize.y - 1) / ctaSize.y); resizeBicubic <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>> (texSrc, srcSize, srcROI, dstSize, dstStep, pDst, dstROI, 1.0f / xFactor, 1.0f / yFactor);
resizeBicubic <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
(srcSize, srcROI, dstSize, dstStep, pDst, dstROI, 1.0f / xFactor, 1.0f / yFactor);
} }
else else
{ {

@ -46,29 +46,27 @@
#include "opencv2/core/cuda/limits.hpp" #include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/functional.hpp" #include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp" #include "opencv2/core/cuda/reduce.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
using namespace cv::cuda; using namespace cv::cuda;
using namespace cv::cuda::device; using namespace cv::cuda::device;
namespace optflowbm namespace optflowbm
{ {
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_prev(false, cudaFilterModePoint, cudaAddressModeClamp); __device__ int cmpBlocks(cv::cudev::TexturePtr<uchar> texCurr, cv::cudev::TexturePtr<uchar> texPrev, int X1, int Y1, int X2, int Y2, int2 blockSize)
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_curr(false, cudaFilterModePoint, cudaAddressModeClamp);
__device__ int cmpBlocks(int X1, int Y1, int X2, int Y2, int2 blockSize)
{ {
int s = 0; int s = 0;
for (int y = 0; y < blockSize.y; ++y) for (int y = 0; y < blockSize.y; ++y)
{ {
for (int x = 0; x < blockSize.x; ++x) for (int x = 0; x < blockSize.x; ++x)
s += ::abs(tex2D(tex_prev, X1 + x, Y1 + y) - tex2D(tex_curr, X2 + x, Y2 + y)); s += ::abs(texPrev(Y1 + y, X1 + x) - texCurr(Y2 + y, X2 + x));
} }
return s; return s;
} }
__global__ void calcOptFlowBM(PtrStepSzf velx, PtrStepf vely, const int2 blockSize, const int2 shiftSize, const bool usePrevious, __global__ void calcOptFlowBM(cv::cudev::TexturePtr<uchar> texPrev, cv::cudev::TexturePtr<uchar> texCurr, PtrStepSzf velx, PtrStepf vely, const int2 blockSize, const int2 shiftSize, const bool usePrevious,
const int maxX, const int maxY, const int acceptLevel, const int escapeLevel, const int maxX, const int maxY, const int acceptLevel, const int escapeLevel,
const short2* ss, const int ssCount) const short2* ss, const int ssCount)
{ {
@ -90,7 +88,7 @@ namespace optflowbm
int dist = numeric_limits<int>::max(); int dist = numeric_limits<int>::max();
if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY) if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY)
dist = cmpBlocks(X1, Y1, X2, Y2, blockSize); dist = cmpBlocks(texPrev, texCurr, X1, Y1, X2, Y2, blockSize);
int countMin = 1; int countMin = 1;
int sumx = offX; int sumx = offX;
@ -111,7 +109,7 @@ namespace optflowbm
if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY) if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY)
{ {
const int tmpDist = cmpBlocks(X1, Y1, X2, Y2, blockSize); const int tmpDist = cmpBlocks(texPrev, texCurr, X1, Y1, X2, Y2, blockSize);
if (tmpDist < acceptLevel) if (tmpDist < acceptLevel)
{ {
sumx = dx; sumx = dx;
@ -151,16 +149,12 @@ namespace optflowbm
void calc(PtrStepSzb prev, PtrStepSzb curr, PtrStepSzf velx, PtrStepSzf vely, int2 blockSize, int2 shiftSize, bool usePrevious, void calc(PtrStepSzb prev, PtrStepSzb curr, PtrStepSzf velx, PtrStepSzf vely, int2 blockSize, int2 shiftSize, bool usePrevious,
int maxX, int maxY, int acceptLevel, int escapeLevel, const short2* ss, int ssCount, cudaStream_t stream) int maxX, int maxY, int acceptLevel, int escapeLevel, const short2* ss, int ssCount, cudaStream_t stream)
{ {
bindTexture(&tex_prev, prev); cv::cudev::Texture<uchar> texPrev(prev);
bindTexture(&tex_curr, curr); cv::cudev::Texture<uchar> texCurr(curr);
const dim3 block(32, 8); const dim3 block(32, 8);
const dim3 grid(divUp(velx.cols, block.x), divUp(vely.rows, block.y)); const dim3 grid(divUp(velx.cols, block.x), divUp(vely.rows, block.y));
calcOptFlowBM<<<grid, block, 0, stream>>>(texPrev, texCurr, velx, vely, blockSize, shiftSize, usePrevious, maxX, maxY, acceptLevel, escapeLevel, ss, ssCount);
calcOptFlowBM<<<grid, block, 0, stream>>>(velx, vely, blockSize, shiftSize, usePrevious,
maxX, maxY, acceptLevel, escapeLevel, ss, ssCount);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
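
Note the third constructor form used in calc above: cv::cudev::Texture can be built directly from a PtrStepSz, inferring rows, cols and step. A sketch of the launch pattern, assuming that overload and that its defaults (point sampling, clamped addressing) match the removed texture references, which were declared with exactly those modes:

void launchBM(PtrStepSzb prev, PtrStepSzb curr, /* remaining args elided */ cudaStream_t stream)
{
    // rows/cols/step come from the PtrStepSz arguments themselves.
    cv::cudev::Texture<uchar> texPrev(prev);
    cv::cudev::Texture<uchar> texCurr(curr);
    // calcOptFlowBM<<<grid, block, 0, stream>>>(texPrev, texCurr, ...);
}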

@ -100,7 +100,8 @@ bool TestHypothesesGrow::process()
NCV_SKIP_COND_BEGIN NCV_SKIP_COND_BEGIN
ncvAssertReturn(this->src.fill(h_vecSrc), false); ncvAssertReturn(this->src.fill(h_vecSrc), false);
memset(h_vecDst.ptr(), 0, h_vecDst.length() * sizeof(NcvRect32u));
*h_vecDst.ptr() = {};
NCVVectorReuse<Ncv32u> h_vecDst_as32u(h_vecDst.getSegment(), lenDst * sizeof(NcvRect32u) / sizeof(Ncv32u)); NCVVectorReuse<Ncv32u> h_vecDst_as32u(h_vecDst.getSegment(), lenDst * sizeof(NcvRect32u) / sizeof(Ncv32u));
ncvAssertReturn(h_vecDst_as32u.isMemReused(), false); ncvAssertReturn(h_vecDst_as32u.isMemReused(), false);
ncvAssertReturn(this->src.fill(h_vecDst_as32u), false); ncvAssertReturn(this->src.fill(h_vecDst_as32u), false);

@ -46,6 +46,7 @@
#include "opencv2/core/cuda/reduce.hpp" #include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/functional.hpp" #include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/warp_shuffle.hpp" #include "opencv2/core/cuda/warp_shuffle.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
namespace cv { namespace cuda { namespace device namespace cv { namespace cuda { namespace device
{ {
@ -825,64 +826,57 @@ namespace cv { namespace cuda { namespace device
//------------------------------------------------------------------- //-------------------------------------------------------------------
// Resize // Resize
texture<uchar4, 2, cudaReadModeNormalizedFloat> resize8UC4_tex; __global__ void resize_for_hog_kernel(cv::cudev::TexturePtr<uchar, float> src, float sx, float sy, PtrStepSz<uchar> dst)
texture<uchar, 2, cudaReadModeNormalizedFloat> resize8UC1_tex;
__global__ void resize_for_hog_kernel(float sx, float sy, PtrStepSz<uchar> dst, int colOfs)
{ {
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows) if (x < dst.cols && y < dst.rows)
dst.ptr(y)[x] = tex2D(resize8UC1_tex, x * sx + colOfs, y * sy) * 255; dst.ptr(y)[x] = src(y * sy, x * sx) * 255;
} }
__global__ void resize_for_hog_kernel(float sx, float sy, PtrStepSz<uchar4> dst, int colOfs) __global__ void resize_for_hog_kernel(cv::cudev::TexturePtr<uchar4, float4> src, float sx, float sy, PtrStepSz<uchar4> dst)
{ {
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows) if (x < dst.cols && y < dst.rows)
{ {
float4 val = tex2D(resize8UC4_tex, x * sx + colOfs, y * sy); float4 val = src(y * sy, x * sx);
dst.ptr(y)[x] = make_uchar4(val.x * 255, val.y * 255, val.z * 255, val.w * 255); dst.ptr(y)[x] = make_uchar4(val.x * 255, val.y * 255, val.z * 255, val.w * 255);
} }
} }
template<class T, class TEX> static void resize_for_hog_8UC1(const PtrStepSzb& src, PtrStepSzb dst)
static void resize_for_hog(const PtrStepSzb& src, PtrStepSzb dst, TEX& tex)
{ {
tex.filterMode = cudaFilterModeLinear; cv::cudev::Texture<uchar,float> tex(src.rows, src.cols, src.data, src.step, false, cudaFilterModeLinear, cudaAddressModeClamp, cudaReadModeNormalizedFloat);
size_t texOfs = 0;
int colOfs = 0;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );
if (texOfs != 0)
{
colOfs = static_cast<int>( texOfs/sizeof(T) );
cudaSafeCall( cudaUnbindTexture(tex) );
cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );
}
dim3 threads(32, 8); dim3 threads(32, 8);
dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y)); dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
float sx = static_cast<float>(src.cols) / dst.cols; float sx = static_cast<float>(src.cols) / dst.cols;
float sy = static_cast<float>(src.rows) / dst.rows; float sy = static_cast<float>(src.rows) / dst.rows;
resize_for_hog_kernel<<<grid, threads>>>(sx, sy, (PtrStepSz<T>)dst, colOfs); resize_for_hog_kernel<<<grid, threads>>>(tex, sx, sy, (PtrStepSz<uchar>)dst);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
}
static void resize_for_hog_8UC4(const PtrStepSzb& src, PtrStepSzb dst)
{
cv::cudev::Texture<uchar4, float4> tex(src.rows, src.cols, reinterpret_cast<uchar4*>(src.data), src.step, false, cudaFilterModeLinear, cudaAddressModeClamp, cudaReadModeNormalizedFloat);
dim3 threads(32, 8);
dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
float sx = static_cast<float>(src.cols) / dst.cols;
float sy = static_cast<float>(src.rows) / dst.rows;
cudaSafeCall( cudaUnbindTexture(tex) ); resize_for_hog_kernel<<<grid, threads>>>(tex, sx, sy, (PtrStepSz<uchar4>)dst);
cudaSafeCall(cudaGetLastError());
cudaSafeCall(cudaDeviceSynchronize());
} }
void resize_8UC1(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); } void resize_8UC1(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog_8UC1(src, dst); }
void resize_8UC4(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); } void resize_8UC4(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog_8UC4(src, dst); }
} // namespace hog } // namespace hog
}}} // namespace cv { namespace cuda { namespace cudev }}} // namespace cv { namespace cuda { namespace cudev
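
One subtlety in these kernels: with cudaReadModeNormalizedFloat an 8-bit texel is returned as a float in [0, 1] — which is what allows cudaFilterModeLinear to interpolate integer data at all — hence the multiply by 255 before storing. The round trip in isolation, assuming the Texture<uchar, float> constructor used above:

// Normalized-float round trip: an 8-bit texel t reads back as t / 255.0f, so
// a bilinear sample s in [0, 1] is restored to 8-bit range with s * 255.
__device__ __forceinline__ uchar sampleTo8U(cv::cudev::TexturePtr<uchar, float> tex, float y, float x)
{
    return static_cast<uchar>(tex(y, x) * 255.0f);
}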

@ -222,7 +222,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_ObjDetect, HOG, ALL_DEVICES);
*/ */
//============== caltech hog tests =====================// //============== caltech hog tests =====================//
struct CalTech : public ::testing::TestWithParam<tuple<cv::cuda::DeviceInfo, std::string> > struct CalTech : public ::testing::TestWithParam<tuple<cv::cuda::DeviceInfo, std::string, bool>>
{ {
cv::cuda::DeviceInfo devInfo; cv::cuda::DeviceInfo devInfo;
cv::Mat img; cv::Mat img;
@ -232,7 +232,13 @@ struct CalTech : public ::testing::TestWithParam<tuple<cv::cuda::DeviceInfo, std
devInfo = GET_PARAM(0); devInfo = GET_PARAM(0);
cv::cuda::setDevice(devInfo.deviceID()); cv::cuda::setDevice(devInfo.deviceID());
img = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE); const bool grayScale = GET_PARAM(2);
if(grayScale)
img = readImage(GET_PARAM(1), IMREAD_GRAYSCALE);
else {
Mat imgBgr = readImage(GET_PARAM(1));
cv::cvtColor(imgBgr, img, COLOR_BGR2BGRA);
}
ASSERT_FALSE(img.empty()); ASSERT_FALSE(img.empty());
} }
}; };
@ -263,10 +269,11 @@ CUDA_TEST_P(CalTech, HOG)
#endif #endif
} }
#define GREYSCALE true, false
INSTANTIATE_TEST_CASE_P(detect, CalTech, testing::Combine(ALL_DEVICES, INSTANTIATE_TEST_CASE_P(detect, CalTech, testing::Combine(ALL_DEVICES,
::testing::Values<std::string>("caltech/image_00000009_0.png", "caltech/image_00000032_0.png", ::testing::Values<std::string>("caltech/image_00000009_0.png", "caltech/image_00000032_0.png",
"caltech/image_00000165_0.png", "caltech/image_00000261_0.png", "caltech/image_00000469_0.png", "caltech/image_00000165_0.png", "caltech/image_00000261_0.png", "caltech/image_00000469_0.png",
"caltech/image_00000527_0.png", "caltech/image_00000574_0.png"))); "caltech/image_00000527_0.png", "caltech/image_00000574_0.png"), testing::Values(GREYSCALE)));
//------------------------variable GPU HOG Tests------------------------// //------------------------variable GPU HOG Tests------------------------//

@ -50,8 +50,7 @@
#include "opencv2/core/cuda/reduce.hpp" #include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/filters.hpp" #include "opencv2/core/cuda/filters.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp" #include "opencv2/core/cuda/border_interpolate.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
#include <iostream>
using namespace cv::cuda; using namespace cv::cuda;
using namespace cv::cuda::device; using namespace cv::cuda::device;
@ -64,224 +63,6 @@ namespace pyrlk
__constant__ int c_halfWin_y; __constant__ int c_halfWin_y;
__constant__ int c_iters; __constant__ int c_iters;
texture<uchar, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_I8U(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_I8UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<ushort4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_I16UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_If(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_If4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_Ib(false, cudaFilterModePoint, cudaAddressModeClamp);
texture<uchar, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_J8U(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_J8UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<ushort4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_J16UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_Jf(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_Jf4(false, cudaFilterModeLinear, cudaAddressModeClamp);
template <int cn, typename T> struct Tex_I
{
static __host__ __forceinline__ void bindTexture_(PtrStepSz<typename TypeVec<T, cn>::vec_type> I)
{
CV_UNUSED(I);
}
};
template <> struct Tex_I<1, uchar>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_I8U, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar>& I)
{
bindTexture(&tex_I8U, I);
}
};
template <> struct Tex_I<1, ushort>
{
static __device__ __forceinline__ float read(float x, float y)
{
return 0.0;
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort>& I)
{
CV_UNUSED(I);
}
};
template <> struct Tex_I<1, int>
{
static __device__ __forceinline__ float read(float x, float y)
{
return 0.0;
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<int>& I)
{
CV_UNUSED(I);
}
};
template <> struct Tex_I<1, float>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_If, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float>& I)
{
bindTexture(&tex_If, I);
}
};
// ****************** 3 channel specializations ************************
template <> struct Tex_I<3, uchar>
{
static __device__ __forceinline__ float3 read(float x, float y)
{
return make_float3(0,0,0);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar3> I)
{
CV_UNUSED(I);
}
};
template <> struct Tex_I<3, ushort>
{
static __device__ __forceinline__ float3 read(float x, float y)
{
return make_float3(0, 0, 0);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort3> I)
{
CV_UNUSED(I);
}
};
template <> struct Tex_I<3, int>
{
static __device__ __forceinline__ float3 read(float x, float y)
{
return make_float3(0, 0, 0);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<int3> I)
{
CV_UNUSED(I);
}
};
template <> struct Tex_I<3, float>
{
static __device__ __forceinline__ float3 read(float x, float y)
{
return make_float3(0, 0, 0);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float3> I)
{
CV_UNUSED(I);
}
};
// ****************** 4 channel specializations ************************
template <> struct Tex_I<4, uchar>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_I8UC4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar4>& I)
{
bindTexture(&tex_I8UC4, I);
}
};
template <> struct Tex_I<4, ushort>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_I16UC4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort4>& I)
{
bindTexture(&tex_I16UC4, I);
}
};
template <> struct Tex_I<4, float>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_If4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float4>& I)
{
bindTexture(&tex_If4, I);
}
};
// ************* J ***************
template <int cn, typename T> struct Tex_J
{
static __host__ __forceinline__ void bindTexture_(PtrStepSz<typename TypeVec<T,cn>::vec_type>& J)
{
CV_UNUSED(J);
}
};
template <> struct Tex_J<1, uchar>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_J8U, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar>& J)
{
bindTexture(&tex_J8U, J);
}
};
template <> struct Tex_J<1, float>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_Jf, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float>& J)
{
bindTexture(&tex_Jf, J);
}
};
// ************* 4 channel specializations ***************
template <> struct Tex_J<4, uchar>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_J8UC4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar4>& J)
{
bindTexture(&tex_J8UC4, J);
}
};
template <> struct Tex_J<4, ushort>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_J16UC4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort4>& J)
{
bindTexture(&tex_J16UC4, J);
}
};
template <> struct Tex_J<4, float>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_Jf4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float4>& J)
{
bindTexture(&tex_Jf4, J);
}
};
__device__ __forceinline__ void accum(float& dst, const float& val) __device__ __forceinline__ void accum(float& dst, const float& val)
{ {
dst += val; dst += val;
@ -364,8 +145,8 @@ namespace pyrlk
} }
}; };
template <int cn, int PATCH_X, int PATCH_Y, bool calcErr, typename T> template <int cn, int PATCH_X, int PATCH_Y, bool calcErr, typename T, class Ptr2D>
__global__ void sparseKernel(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols) __global__ void sparseKernel(const Ptr2D texI, const Ptr2D texJ, const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
{ {
#if __CUDA_ARCH__ <= 110 #if __CUDA_ARCH__ <= 110
const int BLOCK_SIZE = 128; const int BLOCK_SIZE = 128;
@ -413,15 +194,14 @@ namespace pyrlk
float x = prevPt.x + xBase + 0.5f; float x = prevPt.x + xBase + 0.5f;
float y = prevPt.y + yBase + 0.5f; float y = prevPt.y + yBase + 0.5f;
I_patch[i][j] = Tex_I<cn, T>::read(x, y); I_patch[i][j] = texI(y, x);
// Scharr Deriv // Scharr Deriv
work_type dIdx = 3.0f * texI(y - 1, x + 1) + 10.0f * texI(y, x + 1) + 3.0f * texI(y + 1, x + 1) -
(3.0f * texI(y - 1, x - 1) + 10.0f * texI(y, x - 1) + 3.0f * texI(y + 1, x - 1));
work_type dIdx = 3.0f * Tex_I<cn,T>::read(x+1, y-1) + 10.0f * Tex_I<cn, T>::read(x+1, y) + 3.0f * Tex_I<cn,T>::read(x+1, y+1) - work_type dIdy = 3.0f * texI(y + 1, x - 1) + 10.0f * texI(y + 1, x) + 3.0f * texI(y + 1, x + 1) -
(3.0f * Tex_I<cn,T>::read(x-1, y-1) + 10.0f * Tex_I<cn, T>::read(x-1, y) + 3.0f * Tex_I<cn,T>::read(x-1, y+1)); (3.0f * texI(y - 1, x - 1) + 10.0f * texI(y - 1, x) + 3.0f * texI(y - 1, x + 1));
work_type dIdy = 3.0f * Tex_I<cn,T>::read(x-1, y+1) + 10.0f * Tex_I<cn, T>::read(x, y+1) + 3.0f * Tex_I<cn,T>::read(x+1, y+1) -
(3.0f * Tex_I<cn,T>::read(x-1, y-1) + 10.0f * Tex_I<cn, T>::read(x, y-1) + 3.0f * Tex_I<cn,T>::read(x+1, y-1));
dIdx_patch[i][j] = dIdx; dIdx_patch[i][j] = dIdx;
dIdy_patch[i][j] = dIdy; dIdy_patch[i][j] = dIdy;
@ -490,7 +270,8 @@ namespace pyrlk
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j) for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{ {
work_type I_val = I_patch[i][j]; work_type I_val = I_patch[i][j];
work_type J_val = Tex_J<cn, T>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
work_type J_val = texJ(nextPt.y + y + 0.5f, nextPt.x + x + 0.5f);
work_type diff = (J_val - I_val) * 32.0f; work_type diff = (J_val - I_val) * 32.0f;
@ -533,7 +314,8 @@ namespace pyrlk
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j) for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{ {
work_type I_val = I_patch[i][j]; work_type I_val = I_patch[i][j];
work_type J_val = Tex_J<cn, T>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
work_type J_val = texJ(nextPt.y + y + 0.5f, nextPt.x + x + 0.5f);
work_type diff = J_val - I_val; work_type diff = J_val - I_val;
@ -749,6 +531,27 @@ namespace pyrlk
} }
} // __global__ void sparseKernel_ } // __global__ void sparseKernel_
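
Both the sparse and dense kernels build their spatial gradients with the 3/10/3 Scharr stencil; for reference, the x-derivative in isolation (a host sketch, single channel, step in elements):

// Scharr x-derivative at (x, y): right column minus left column, with the
// three rows weighted 3, 10, 3. The y-derivative is the transpose.
float scharrDx(const float* img, int step, int x, int y)
{
    return  3.f * img[(y - 1) * step + x + 1] + 10.f * img[y * step + x + 1] + 3.f * img[(y + 1) * step + x + 1]
         - (3.f * img[(y - 1) * step + x - 1] + 10.f * img[y * step + x - 1] + 3.f * img[(y + 1) * step + x - 1]);
}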
// Specialization for non-float data: with integer element types, cudaFilterModeLinear is only compatible with cudaReadModeNormalizedFloat.
template<int cn, class T> class TextureLinear : public cv::cudev::Texture<typename TypeVec<T, cn>::vec_type, typename TypeVec<float, cn>::vec_type> {
public:
typedef typename TypeVec<T, cn>::vec_type elem_type;
typedef typename TypeVec<float, cn>::vec_type ret_type;
__host__ TextureLinear(PtrStepSz<elem_type> src, const bool normalizedCoords = false, const cudaTextureAddressMode addressMode = cudaAddressModeClamp) :
cv::cudev::Texture<elem_type, ret_type>(src, normalizedCoords, cudaFilterModeLinear, addressMode, cudaReadModeNormalizedFloat)
{
}
};
// Specialization for float data, which supports only cudaReadModeElementType (cudaReadModeNormalizedFloat is restricted to 8- and 16-bit integer types).
template<int cn> class TextureLinear<cn, float> : public cv::cudev::Texture<typename TypeVec<float, cn>::vec_type, typename TypeVec<float, cn>::vec_type>
{
public:
typedef typename TypeVec<float, cn>::vec_type float_type;
__host__ TextureLinear(PtrStepSz<float_type> src, const bool normalizedCoords = false, const cudaTextureAddressMode addressMode = cudaAddressModeClamp) :
cv::cudev::Texture <float_type, float_type>(src, normalizedCoords, cudaFilterModeLinear, addressMode, cudaReadModeElementType)
{
}
};
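
A usage sketch for the helper (hypothetical caller; the point is that the read-mode split is resolved by the template specialization, not at each call site):

// Identical call syntax for integer and float images; the specialization
// selects the only read mode each element type supports with linear filtering.
void makeTextures(PtrStepSz<uchar> I8u, PtrStepSz<float> I32f)
{
    TextureLinear<1, uchar> tex8(I8u);   // reads return float in [0, 1]
    TextureLinear<1, float> texf(I32f);  // reads return the element values
    // Both convert to cv::cudev::TexturePtr<elem_type, float> for the launch.
}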
template <int cn, int PATCH_X, int PATCH_Y, typename T> class sparse_caller template <int cn, int PATCH_X, int PATCH_Y, typename T> class sparse_caller
{ {
@ -756,16 +559,16 @@ namespace pyrlk
static void call(PtrStepSz<typename TypeVec<T, cn>::vec_type> I, PtrStepSz<typename TypeVec<T, cn>::vec_type> J, int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount, static void call(PtrStepSz<typename TypeVec<T, cn>::vec_type> I, PtrStepSz<typename TypeVec<T, cn>::vec_type> J, int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream) int level, dim3 block, cudaStream_t stream)
{ {
typedef typename TypeVec<T, cn>::vec_type dType;
typedef typename TypeVec<float, cn>::vec_type rType;
TextureLinear<cn,T> texI(I);
TextureLinear<cn,T> texJ(J);
dim3 grid(ptcount); dim3 grid(ptcount);
CV_UNUSED(I);
CV_UNUSED(J);
if (level == 0 && err) if (level == 0 && err)
sparseKernel<cn, PATCH_X, PATCH_Y, true, T> <<<grid, block, 0, stream >>>(prevPts, nextPts, status, err, level, rows, cols); sparseKernel<cn, PATCH_X, PATCH_Y, true, T, cv::cudev::TexturePtr<dType,rType>><<<grid, block, 0, stream>>>(texI, texJ, prevPts, nextPts, status, err, level, rows, cols);
else else
sparseKernel<cn, PATCH_X, PATCH_Y, false, T> <<<grid, block, 0, stream >>>(prevPts, nextPts, status, err, level, rows, cols); sparseKernel<cn, PATCH_X, PATCH_Y, false, T, cv::cudev::TexturePtr<dType, rType>><<<grid, block, 0, stream>>>(texI, texJ, prevPts, nextPts, status, err, level, rows, cols);
cudaSafeCall(cudaGetLastError()); cudaSafeCall(cudaGetLastError());
if (stream == 0) if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize()); cudaSafeCall(cudaDeviceSynchronize());
} }
@ -903,8 +706,8 @@ namespace pyrlk
}; };
template <bool calcErr> template <bool calcErr, class Ptr2D>
__global__ void denseKernel(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols) __global__ void denseKernel(const Ptr2D texI, const Ptr2D texJ, PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
{ {
extern __shared__ int smem[]; extern __shared__ int smem[];
@ -925,15 +728,15 @@ namespace pyrlk
float x = xBase - c_halfWin_x + j + 0.5f; float x = xBase - c_halfWin_x + j + 0.5f;
float y = yBase - c_halfWin_y + i + 0.5f; float y = yBase - c_halfWin_y + i + 0.5f;
I_patch[i * patchWidth + j] = tex2D(tex_If, x, y); I_patch[i * patchWidth + j] = texI(y, x);
// Scharr Deriv // Scharr Deriv
dIdx_patch[i * patchWidth + j] = 3 * tex2D(tex_If, x+1, y-1) + 10 * tex2D(tex_If, x+1, y) + 3 * tex2D(tex_If, x+1, y+1) - dIdx_patch[i * patchWidth + j] = 3 * texI(y - 1, x + 1) + 10 * texI(y, x + 1) + 3 * texI(y + 1, x + 1) -
(3 * tex2D(tex_If, x-1, y-1) + 10 * tex2D(tex_If, x-1, y) + 3 * tex2D(tex_If, x-1, y+1)); (3 * texI(y - 1, x - 1) + 10 * texI(y, x - 1) + 3 * texI(y + 1, x - 1));
dIdy_patch[i * patchWidth + j] = 3 * tex2D(tex_If, x-1, y+1) + 10 * tex2D(tex_If, x, y+1) + 3 * tex2D(tex_If, x+1, y+1) - dIdy_patch[i * patchWidth + j] = 3 * texI(y + 1, x - 1) + 10 * texI(y + 1, x) + 3 * texI(y + 1, x + 1) -
(3 * tex2D(tex_If, x-1, y-1) + 10 * tex2D(tex_If, x, y-1) + 3 * tex2D(tex_If, x+1, y-1)); (3 * texI(y - 1, x - 1) + 10 * texI(y - 1, x) + 3 * texI(y - 1, x + 1));
} }
} }
@ -1004,7 +807,7 @@ namespace pyrlk
for (int j = 0; j < c_winSize_x; ++j) for (int j = 0; j < c_winSize_x; ++j)
{ {
int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j]; int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f); int J = texJ(nextPt.y - c_halfWin_y + i + 0.5f, nextPt.x - c_halfWin_x + j + 0.5f);
int diff = (J - I) * 32; int diff = (J - I) * 32;
@ -1040,7 +843,8 @@ namespace pyrlk
for (int j = 0; j < c_winSize_x; ++j) for (int j = 0; j < c_winSize_x; ++j)
{ {
int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j]; int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
int J = texJ(nextPt.y - c_halfWin_y + i + 0.5f, nextPt.x - c_halfWin_x + j + 0.5f);
errval += ::abs(J - I); errval += ::abs(J - I);
} }
@ -1109,9 +913,6 @@ namespace pyrlk
{ sparse_caller<cn, 1, 5,T>::call, sparse_caller<cn, 2, 5,T>::call, sparse_caller<cn, 3, 5,T>::call, sparse_caller<cn, 4, 5,T>::call, sparse_caller<cn, 5, 5,T>::call } { sparse_caller<cn, 1, 5,T>::call, sparse_caller<cn, 2, 5,T>::call, sparse_caller<cn, 3, 5,T>::call, sparse_caller<cn, 4, 5,T>::call, sparse_caller<cn, 5, 5,T>::call }
}; };
Tex_I<cn, T>::bindTexture_(I);
Tex_J<cn, T>::bindTexture_(J);
funcs[patch.y - 1][patch.x - 1](I, J, I.rows, I.cols, prevPts, nextPts, status, err, ptcount, funcs[patch.y - 1][patch.x - 1](I, J, I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
level, block, stream); level, block, stream);
} }
@ -1119,9 +920,8 @@ namespace pyrlk
{ {
dim3 block(16, 16); dim3 block(16, 16);
dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y)); dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y));
Tex_I<1, T>::bindTexture_(I); TextureLinear<1, T> texI(I);
Tex_J<1, T>::bindTexture_(J); TextureLinear<1, T> texJ(J);
int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2); int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
const int patchWidth = block.x + 2 * halfWin.x; const int patchWidth = block.x + 2 * halfWin.x;
const int patchHeight = block.y + 2 * halfWin.y; const int patchHeight = block.y + 2 * halfWin.y;
@ -1129,12 +929,12 @@ namespace pyrlk
if (err.data) if (err.data)
{ {
denseKernel<true> << <grid, block, smem_size, stream >> >(u, v, prevU, prevV, err, I.rows, I.cols); denseKernel<true, cv::cudev::TexturePtr<T, float>><<<grid, block, smem_size, stream>>>(texI, texJ, u, v, prevU, prevV, err, I.rows, I.cols);
cudaSafeCall(cudaGetLastError()); cudaSafeCall(cudaGetLastError());
} }
else else
{ {
denseKernel<false> << <grid, block, smem_size, stream >> >(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols); denseKernel<false, cv::cudev::TexturePtr<T, float>><<<grid, block, smem_size, stream>>>(texI, texJ, u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);
cudaSafeCall(cudaGetLastError()); cudaSafeCall(cudaGetLastError());
} }
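Note on the pyrlk.cu hunks above: the core pattern of the rewrite is that a texture object is created per call and handed to the kernel as an argument, instead of binding a file-scope texture<> reference. Below is a minimal sketch of that pattern against the plain CUDA runtime API; kernel and variable names are illustrative, and the cv::cudev::TextureLinear/TexturePtr wrappers in the diff presumably encapsulate the same calls with RAII.

#include <cuda_runtime.h>

// The texture object travels as a kernel argument; there is no file-scope
// texture<> reference to bind and no hidden global state between launches.
__global__ void sampleKernel(cudaTextureObject_t tex, float* out, int w, int h)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < w && y < h)
        out[y * w + x] = tex2D<float>(tex, x + 0.5f, y + 0.5f); // bilinear fetch at texel centers
}

int main()
{
    const int w = 64, h = 64;
    size_t pitch;
    float *dSrc, *dDst;
    cudaMallocPitch(&dSrc, &pitch, w * sizeof(float), h);
    cudaMalloc(&dDst, w * h * sizeof(float));

    cudaResourceDesc res = {};                       // describe the pitched 2D source
    res.resType = cudaResourceTypePitch2D;
    res.res.pitch2D.devPtr = dSrc;
    res.res.pitch2D.desc = cudaCreateChannelDesc<float>();
    res.res.pitch2D.width = w;
    res.res.pitch2D.height = h;
    res.res.pitch2D.pitchInBytes = pitch;

    cudaTextureDesc td = {};                         // sampling state, owned per object
    td.addressMode[0] = td.addressMode[1] = cudaAddressModeClamp;
    td.filterMode = cudaFilterModeLinear;            // what a "linear" wrapper would request
    td.readMode = cudaReadModeElementType;

    cudaTextureObject_t tex = 0;
    cudaCreateTextureObject(&tex, &res, &td, nullptr);

    sampleKernel<<<dim3((w + 15) / 16, (h + 15) / 16), dim3(16, 16)>>>(tex, dDst, w, h);
    cudaDeviceSynchronize();

    cudaDestroyTextureObject(tex);                   // the wrapper would do this in a destructor
    cudaFree(dSrc);
    cudaFree(dDst);
    return 0;
}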
@ -46,6 +46,7 @@
#include "opencv2/core/cuda/border_interpolate.hpp" #include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/limits.hpp" #include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda.hpp" #include "opencv2/core/cuda.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
using namespace cv::cuda; using namespace cv::cuda;
using namespace cv::cuda::device; using namespace cv::cuda::device;
@ -102,63 +103,8 @@ namespace tvl1flow
} }
} }
struct SrcTex
{
virtual ~SrcTex() {}
__device__ __forceinline__ virtual float I1(float x, float y) const = 0;
__device__ __forceinline__ virtual float I1x(float x, float y) const = 0;
__device__ __forceinline__ virtual float I1y(float x, float y) const = 0;
};
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1 (false, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1x(false, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1y(false, cudaFilterModePoint, cudaAddressModeClamp);
struct SrcTexRef : SrcTex
{
__device__ __forceinline__ float I1(float x, float y) const CV_OVERRIDE
{
return tex2D(tex_I1, x, y);
}
__device__ __forceinline__ float I1x(float x, float y) const CV_OVERRIDE
{
return tex2D(tex_I1x, x, y);
}
__device__ __forceinline__ float I1y(float x, float y) const CV_OVERRIDE
{
return tex2D(tex_I1y, x, y);
}
};
struct SrcTexObj : SrcTex
{
__host__ SrcTexObj(cudaTextureObject_t tex_obj_I1_, cudaTextureObject_t tex_obj_I1x_, cudaTextureObject_t tex_obj_I1y_)
: tex_obj_I1(tex_obj_I1_), tex_obj_I1x(tex_obj_I1x_), tex_obj_I1y(tex_obj_I1y_) {}
__device__ __forceinline__ float I1(float x, float y) const CV_OVERRIDE
{
return tex2D<float>(tex_obj_I1, x, y);
}
__device__ __forceinline__ float I1x(float x, float y) const CV_OVERRIDE
{
return tex2D<float>(tex_obj_I1x, x, y);
}
__device__ __forceinline__ float I1y(float x, float y) const CV_OVERRIDE
{
return tex2D<float>(tex_obj_I1y, x, y);
}
cudaTextureObject_t tex_obj_I1;
cudaTextureObject_t tex_obj_I1x;
cudaTextureObject_t tex_obj_I1y;
};
template <
typename T,
typename = typename std::enable_if<std::is_base_of<SrcTex, T>::value>::type
>
__global__ void warpBackwardKernel( __global__ void warpBackwardKernel(
const PtrStepSzf I0, const T src, const PtrStepf u1, const PtrStepf u2, const PtrStepSzf I0, const cv::cudev::TexturePtr<float> I1, const cv::cudev::TexturePtr<float> I1x, const cv::cudev::TexturePtr<float> I1y, const PtrStepf u1, const PtrStepf u2,
PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho) PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -189,11 +135,9 @@ namespace tvl1flow
for (int cx = xmin; cx <= xmax; ++cx) for (int cx = xmin; cx <= xmax; ++cx)
{ {
const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy); const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
sum += w * src.I1(cx, cy); sum += w * I1(cy, cx);
sumx += w * src.I1x(cx, cy); sumx += w * I1x(cy, cx);
sumy += w * src.I1y(cx, cy); sumy += w * I1y(cy, cx);
wsum += w; wsum += w;
} }
} }
@ -224,49 +168,14 @@ namespace tvl1flow
PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho, PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho,
cudaStream_t stream) cudaStream_t stream)
{ {
cv::cudev::Texture<float> texI1(I1);
cv::cudev::Texture<float> texI1x(I1x);
cv::cudev::Texture<float> texI1y(I1y);
const dim3 block(32, 8); const dim3 block(32, 8);
const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y)); const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y));
warpBackwardKernel<<<grid, block, 0, stream>>>(I0, texI1, texI1x, texI1y, u1, u2, I1w, I1wx, I1wy, grad, rho);
bool cc30 = deviceSupports(FEATURE_SET_COMPUTE_30); if (!stream)
cudaSafeCall(cudaDeviceSynchronize());
if (cc30)
{
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.addressMode[2] = cudaAddressModeClamp;
cudaTextureObject_t texObj_I1 = 0, texObj_I1x = 0, texObj_I1y = 0;
createTextureObjectPitch2D(&texObj_I1, I1, texDesc);
createTextureObjectPitch2D(&texObj_I1x, I1x, texDesc);
createTextureObjectPitch2D(&texObj_I1y, I1y, texDesc);
warpBackwardKernel << <grid, block, 0, stream >> > (I0, SrcTexObj(texObj_I1, texObj_I1x, texObj_I1y), u1, u2, I1w, I1wx, I1wy, grad, rho);
cudaSafeCall(cudaGetLastError());
if (!stream)
cudaSafeCall(cudaDeviceSynchronize());
else
cudaSafeCall(cudaStreamSynchronize(stream));
cudaSafeCall(cudaDestroyTextureObject(texObj_I1));
cudaSafeCall(cudaDestroyTextureObject(texObj_I1x));
cudaSafeCall(cudaDestroyTextureObject(texObj_I1y));
}
else
{
bindTexture(&tex_I1, I1);
bindTexture(&tex_I1x, I1x);
bindTexture(&tex_I1y, I1y);
warpBackwardKernel << <grid, block, 0, stream >> > (I0, SrcTexRef(), u1, u2, I1w, I1wx, I1wy, grad, rho);
cudaSafeCall(cudaGetLastError());
if (!stream)
cudaSafeCall(cudaDeviceSynchronize());
}
} }
} }
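Note on the tvl1flow.cu hunks: warpBackwardKernel now receives I1, I1x and I1y as TexturePtr arguments, which removes both the virtual SrcTex hierarchy and the runtime cc30 branch. A hedged sketch of the same shape in plain CUDA runtime terms (makeTex and combine are hypothetical names, not part of the diff):

#include <cuda_runtime.h>

// Hypothetical helper: builds a clamped, point-filtered float texture over
// pitched device memory, roughly what the removed cc30 branch assembled by
// hand with createTextureObjectPitch2D.
static cudaTextureObject_t makeTex(float* dev, size_t pitch, int w, int h)
{
    cudaResourceDesc res = {};
    res.resType = cudaResourceTypePitch2D;
    res.res.pitch2D.devPtr = dev;
    res.res.pitch2D.desc = cudaCreateChannelDesc<float>();
    res.res.pitch2D.width = w;
    res.res.pitch2D.height = h;
    res.res.pitch2D.pitchInBytes = pitch;

    cudaTextureDesc td = {};
    td.addressMode[0] = td.addressMode[1] = cudaAddressModeClamp;
    td.filterMode = cudaFilterModePoint;
    td.readMode = cudaReadModeElementType;

    cudaTextureObject_t tex = 0;
    cudaCreateTextureObject(&tex, &res, &td, nullptr);
    return tex;
}

// One kernel, one code path: all three planes arrive as arguments, so there
// is no global texture state and no compute-capability branch on the host.
__global__ void combine(cudaTextureObject_t I1, cudaTextureObject_t I1x,
                        cudaTextureObject_t I1y, float* out, int w, int h)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < w && y < h)
        out[y * w + x] = tex2D<float>(I1, x, y) + tex2D<float>(I1x, x, y) + tex2D<float>(I1y, x, y);
}

int main()
{
    const int w = 64, h = 64;
    size_t p1, p2, p3;
    float *d1, *d2, *d3, *dOut;
    cudaMallocPitch(&d1, &p1, w * sizeof(float), h);
    cudaMallocPitch(&d2, &p2, w * sizeof(float), h);
    cudaMallocPitch(&d3, &p3, w * sizeof(float), h);
    cudaMalloc(&dOut, w * h * sizeof(float));

    cudaTextureObject_t t1 = makeTex(d1, p1, w, h);
    cudaTextureObject_t t2 = makeTex(d2, p2, w, h);
    cudaTextureObject_t t3 = makeTex(d3, p3, w, h);
    combine<<<dim3(4, 8), dim3(16, 8)>>>(t1, t2, t3, dOut, w, h);
    cudaDeviceSynchronize();

    cudaDestroyTextureObject(t1); cudaDestroyTextureObject(t2); cudaDestroyTextureObject(t3);
    cudaFree(d1); cudaFree(d2); cudaFree(d3); cudaFree(dOut);
    return 0;
}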
@ -43,8 +43,10 @@
#if !defined CUDA_DISABLER #if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp" #include "opencv2/core/cuda/common.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
#include <limits.h> #include <limits.h>
namespace cv { namespace cuda { namespace device namespace cv { namespace cuda { namespace device
{ {
namespace stereobm namespace stereobm
@ -601,13 +603,12 @@ namespace cv { namespace cuda { namespace device
/////////////////////////////////// Textureness filtering //////////////////////////////////////// /////////////////////////////////// Textureness filtering ////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
texture<unsigned char, 2, cudaReadModeNormalizedFloat> texForTF; __device__ __forceinline__ float sobel(cv::cudev::TexturePtr<uchar, float> texSrc, int x, int y)
__device__ __forceinline__ float sobel(int x, int y)
{ {
float conv = tex2D(texForTF, x - 1, y - 1) * (-1) + tex2D(texForTF, x + 1, y - 1) * (1) + float conv = texSrc(y - 1, x - 1) * (-1) + texSrc(y - 1, x + 1) * (1) +
tex2D(texForTF, x - 1, y ) * (-2) + tex2D(texForTF, x + 1, y ) * (2) + texSrc(y, x - 1) * (-2) + texSrc(y, x + 1) * (2) +
tex2D(texForTF, x - 1, y + 1) * (-1) + tex2D(texForTF, x + 1, y + 1) * (1); texSrc(y + 1, x - 1) * (-1) + texSrc(y + 1, x + 1) * (1);
return fabs(conv); return fabs(conv);
} }
@ -635,7 +636,7 @@ namespace cv { namespace cuda { namespace device
#define RpT (2 * ROWSperTHREAD) // got experimentally #define RpT (2 * ROWSperTHREAD) // got experimentally
__global__ void textureness_kernel(PtrStepSzb disp, int winsz, float threshold) __global__ void textureness_kernel(cv::cudev::TexturePtr<uchar,float> texSrc, PtrStepSzb disp, int winsz, float threshold)
{ {
int winsz2 = winsz/2; int winsz2 = winsz/2;
int n_dirty_pixels = (winsz2) * 2; int n_dirty_pixels = (winsz2) * 2;
@ -657,9 +658,9 @@ namespace cv { namespace cuda { namespace device
for(int i = y - winsz2; i <= y + winsz2; ++i) for(int i = y - winsz2; i <= y + winsz2; ++i)
{ {
sum += sobel(x - winsz2, i); sum += sobel(texSrc, x - winsz2, i);
if (cols_extra) if (cols_extra)
sum_extra += sobel(x + blockDim.x - winsz2, i); sum_extra += sobel(texSrc, x + blockDim.x - winsz2, i);
} }
*cols = sum; *cols = sum;
if (cols_extra) if (cols_extra)
@ -675,12 +676,12 @@ namespace cv { namespace cuda { namespace device
for(int y = beg_row + 1; y < end_row; ++y) for(int y = beg_row + 1; y < end_row; ++y)
{ {
sum = sum - sobel(x - winsz2, y - winsz2 - 1) + sobel(x - winsz2, y + winsz2); sum = sum - sobel(texSrc, x - winsz2, y - winsz2 - 1) + sobel(texSrc, x - winsz2, y + winsz2);
*cols = sum; *cols = sum;
if (cols_extra) if (cols_extra)
{ {
sum_extra = sum_extra - sobel(x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(x + blockDim.x - winsz2, y + winsz2); sum_extra = sum_extra - sobel(texSrc, x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(texSrc, x + blockDim.x - winsz2, y + winsz2);
*cols_extra = sum_extra; *cols_extra = sum_extra;
} }
@ -697,28 +698,16 @@ namespace cv { namespace cuda { namespace device
void postfilter_textureness(const PtrStepSzb& input, int winsz, float avgTexturenessThreshold, const PtrStepSzb& disp, cudaStream_t & stream) void postfilter_textureness(const PtrStepSzb& input, int winsz, float avgTexturenessThreshold, const PtrStepSzb& disp, cudaStream_t & stream)
{ {
avgTexturenessThreshold *= winsz * winsz; avgTexturenessThreshold *= winsz * winsz;
cv::cudev::Texture<unsigned char, float> tex(input, false, cudaFilterModeLinear, cudaAddressModeWrap, cudaReadModeNormalizedFloat);
texForTF.filterMode = cudaFilterModeLinear;
texForTF.addressMode[0] = cudaAddressModeWrap;
texForTF.addressMode[1] = cudaAddressModeWrap;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaSafeCall( cudaBindTexture2D( 0, texForTF, input.data, desc, input.cols, input.rows, input.step ) );
dim3 threads(128, 1, 1); dim3 threads(128, 1, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
grid.x = divUp(input.cols, threads.x); grid.x = divUp(input.cols, threads.x);
grid.y = divUp(input.rows, RpT); grid.y = divUp(input.rows, RpT);
size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float); size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);
textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold); textureness_kernel<<<grid, threads, smem_size, stream>>>(tex, disp, winsz, avgTexturenessThreshold);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture (texForTF) );
} }
} // namespace stereobm } // namespace stereobm
}}} // namespace cv { namespace cuda { namespace cudev }}} // namespace cv { namespace cuda { namespace cudev
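Note on the stereobm.cu hunks: the textureness filter keeps reading the 8-bit input as a normalized float, now through a texture object constructed with cudaReadModeNormalizedFloat. A standalone sketch of that read mode with the same 3x3 Sobel taps as sobel() above; clamp addressing and point filtering are used below for simplicity, whereas the wrapper call in the diff requests wrap addressing and linear filtering (wrap addressing in the texture-object API requires normalized coordinates):

#include <cuda_runtime.h>

// uchar texels come back as floats in [0,1] thanks to the read mode below.
__device__ __forceinline__ float sobelX(cudaTextureObject_t t, int x, int y)
{
    return fabsf(tex2D<float>(t, x - 1, y - 1) * -1.f + tex2D<float>(t, x + 1, y - 1) * 1.f +
                 tex2D<float>(t, x - 1, y    ) * -2.f + tex2D<float>(t, x + 1, y    ) * 2.f +
                 tex2D<float>(t, x - 1, y + 1) * -1.f + tex2D<float>(t, x + 1, y + 1) * 1.f);
}

__global__ void sobelKernel(cudaTextureObject_t t, float* out, int w, int h)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < w && y < h)
        out[y * w + x] = sobelX(t, x, y);
}

int main()
{
    const int w = 32, h = 32;
    size_t pitch;
    unsigned char* dSrc; float* dOut;
    cudaMallocPitch(&dSrc, &pitch, w, h);
    cudaMalloc(&dOut, w * h * sizeof(float));

    cudaResourceDesc res = {};
    res.resType = cudaResourceTypePitch2D;
    res.res.pitch2D.devPtr = dSrc;
    res.res.pitch2D.desc = cudaCreateChannelDesc<unsigned char>();
    res.res.pitch2D.width = w;
    res.res.pitch2D.height = h;
    res.res.pitch2D.pitchInBytes = pitch;

    cudaTextureDesc td = {};
    td.addressMode[0] = td.addressMode[1] = cudaAddressModeClamp;
    td.filterMode = cudaFilterModePoint;
    td.readMode = cudaReadModeNormalizedFloat;   // 8-bit input fetched as [0,1] float

    cudaTextureObject_t tex = 0;
    cudaCreateTextureObject(&tex, &res, &td, nullptr);
    sobelKernel<<<dim3(2, 4), dim3(16, 8)>>>(tex, dOut, w, h);
    cudaDeviceSynchronize();

    cudaDestroyTextureObject(tex);
    cudaFree(dSrc); cudaFree(dOut);
    return 0;
}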
@ -48,6 +48,7 @@
#include "opencv2/core/cuda/vec_math.hpp" #include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp" #include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/filters.hpp" #include "opencv2/core/cuda/filters.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
namespace cv { namespace cuda { namespace device namespace cv { namespace cuda { namespace device
{ {
@ -77,8 +78,8 @@ namespace cv { namespace cuda { namespace device
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd); BorderReader<PtrStep<T>, B<work_type>> brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc); Filter<BorderReader<PtrStep<T>, B<work_type>>> filter_src(brdSrc);
remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst); remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
@ -98,8 +99,8 @@ namespace cv { namespace cuda { namespace device
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd); BorderReader<PtrStep<T>, B<work_type>> brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc); Filter<BorderReader<PtrStep<T>, B<work_type>>> filter_src(brdSrc);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
@ -108,88 +109,96 @@ namespace cv { namespace cuda { namespace device
} }
}; };
#define OPENCV_CUDA_IMPLEMENT_REMAP_TEX(type) \ template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStreamTex
texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \ {
struct tex_remap_ ## type ## _reader \ static void call(PtrStepSz< T > src, PtrStepSz< T > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy,
{ \ PtrStepSz< T > dst, const float* borderValue, bool cc20)
typedef type elem_type; \ {
typedef int index_type; \ typedef typename TypeVec<float, VecTraits< T >::cn>::vec_type work_type;
int xoff, yoff; \ dim3 block(32, cc20 ? 8 : 4);
tex_remap_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \ dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \ if (srcWhole.cols == src.cols && srcWhole.rows == src.rows)
{ \ {
return tex2D(tex_remap_ ## type , x + xoff, y + yoff); \ cudev::Texture<T> texSrcWhole(srcWhole);
} \ B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
}; \ BorderReader<cudev::TexturePtr<T>, B<work_type>> brdSrc(texSrcWhole, brd);
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \ Filter<BorderReader<cudev::TexturePtr<T>, B<work_type>>> filter_src(brdSrc);
{ \ remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
PtrStepSz< type > dst, const float* borderValue, bool cc20) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, cc20 ? 8 : 4); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_remap_ ## type , srcWhole); \
tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
PtrStepSz< type > dst, const float*, bool) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_remap_ ## type , srcWhole); \
tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
} \
else \
{ \
BrdReplicate<type> brd(src.rows, src.cols); \
BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(uchar)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(uchar2)
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(uchar4)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(schar)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(char2)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(char4)
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(ushort) }
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(ushort2) else {
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(ushort4) cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader<cudev::TextureOffPtr<T>, B<work_type>> brdSrc(texSrcWhole, brd);
Filter<BorderReader<cudev::TextureOffPtr<T>, B<work_type>>> filter_src(brdSrc);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
}
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(short) cudaSafeCall( cudaGetLastError() );
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(short2) cudaSafeCall( cudaDeviceSynchronize() );
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(short4) }
};
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(int) template <template <typename> class Filter, typename T> struct RemapDispatcherNonStreamTex<Filter, BrdReplicate, T>
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(int2) {
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(int4) static void call(PtrStepSz< T > src, PtrStepSz< T > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy,
PtrStepSz< T > dst, const float*, bool)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows)
{
cudev::Texture<T> texSrcWhole(srcWhole);
Filter<cudev::TexturePtr<T>> filter_src(texSrcWhole);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
}
else
{
cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>> brdSrc(texSrcWhole, brd);
Filter<BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>>> filter_src(brdSrc);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(float)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(float2)
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(float4)
#undef OPENCV_CUDA_IMPLEMENT_REMAP_TEX template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, uchar> :
RemapDispatcherNonStreamTex<Filter, B, uchar> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, uchar4> :
RemapDispatcherNonStreamTex<Filter, B, uchar4> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, ushort> :
RemapDispatcherNonStreamTex<Filter, B, ushort> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, ushort4> :
RemapDispatcherNonStreamTex<Filter, B, ushort4> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, short> :
RemapDispatcherNonStreamTex<Filter, B, short> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, short4> :
RemapDispatcherNonStreamTex<Filter, B, short4> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, float> :
RemapDispatcherNonStreamTex<Filter, B, float> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, float4> :
RemapDispatcherNonStreamTex<Filter, B, float4> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, uchar> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, uchar> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, uchar4> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, uchar4> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, ushort> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, ushort> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, ushort4> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, ushort4> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, short> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, short> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, short4> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, short4> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, float> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, float> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, float4> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, float4> {};
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
{ {
@ -234,37 +243,23 @@ namespace cv { namespace cuda { namespace device
} }
}; };
callers[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, xmap, ymap, callers[interpolation][borderMode](static_cast<PtrStepSz<T>>(src), static_cast<PtrStepSz<T>>(srcWhole), xoff, yoff, xmap, ymap,
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20); static_cast<PtrStepSz<T>>(dst), borderValue, stream, cc20);
} }
template void remap_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void remap_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void remap_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void remap_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void remap_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void remap_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void remap_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void remap_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
} // namespace imgproc } // namespace imgproc
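Note on the remap.cu hunks: the per-type OPENCV_CUDA_IMPLEMENT_REMAP_TEX macro becomes one RemapDispatcherNonStreamTex class template, and the texture-capable element types opt in through empty derived specializations of RemapDispatcherNonStream. A stripped-down, host-only sketch of that dispatch trick (all names below are hypothetical):

#include <cstdio>

template <typename T> struct DispatcherGlobMem
{
    static void call(const T*) { std::printf("global-memory path\n"); }
};

template <typename T> struct DispatcherTex
{
    static void call(const T*) { std::printf("texture-object path\n"); }
};

// Primary template: every element type goes through the generic path...
template <typename T> struct Dispatcher : DispatcherGlobMem<T> {};

// ...and each type CUDA textures can sample opts in with one empty derived
// specialization per type, replacing one macro expansion per type.
template <> struct Dispatcher<unsigned char> : DispatcherTex<unsigned char> {};
template <> struct Dispatcher<float> : DispatcherTex<float> {};

int main()
{
    unsigned char c = 0; float f = 0.f; double d = 0.0;
    Dispatcher<unsigned char>::call(&c); // texture-object path
    Dispatcher<float>::call(&f);         // texture-object path
    Dispatcher<double>::call(&d);        // global-memory path
    return 0;
}

The payoff is the same as in the diff: adding or removing a supported type is one line, and the compiler rather than the preprocessor checks each instantiation.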
@ -49,6 +49,7 @@
#include "opencv2/core/cuda/vec_math.hpp" #include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp" #include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/filters.hpp" #include "opencv2/core/cuda/filters.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
namespace cv { namespace cuda { namespace device namespace cv { namespace cuda { namespace device
{ {
@ -105,7 +106,7 @@ namespace cv { namespace cuda { namespace device
} }
} }
template <class Ptr2D, typename T> __global__ void resize(const Ptr2D src, PtrStepSz<T> dst, const float fy, const float fx) template <class Ptr2D, typename T> __global__ void resize(Ptr2D src, PtrStepSz<T> dst, const float fy, const float fx)
{ {
const int dst_x = blockDim.x * blockIdx.x + threadIdx.x; const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
const int dst_y = blockDim.y * blockIdx.y + threadIdx.y; const int dst_y = blockDim.y * blockIdx.y + threadIdx.y;
@ -130,54 +131,6 @@ namespace cv { namespace cuda { namespace device
} }
} }
// textures
template <typename T> struct TextureAccessor;
#define OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(type) \
texture<type, cudaTextureType2D, cudaReadModeElementType> tex_resize_##type (0, cudaFilterModePoint, cudaAddressModeClamp); \
template <> struct TextureAccessor<type> \
{ \
typedef type elem_type; \
typedef int index_type; \
int xoff; \
int yoff; \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_resize_##type, x + xoff, y + yoff); \
} \
__host__ static void bind(const PtrStepSz<type>& mat) \
{ \
bindTexture(&tex_resize_##type, mat); \
} \
};
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(uchar)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(uchar4)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(ushort)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(ushort4)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(short)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(short4)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(float)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(float4)
#undef OPENCV_CUDA_IMPLEMENT_RESIZE_TEX
template <typename T>
TextureAccessor<T> texAccessor(const PtrStepSz<T>& mat, int yoff, int xoff)
{
TextureAccessor<T>::bind(mat);
TextureAccessor<T> t;
t.xoff = xoff;
t.yoff = yoff;
return t;
}
// callers for nearest interpolation // callers for nearest interpolation
template <typename T> template <typename T>
@ -194,14 +147,19 @@ namespace cv { namespace cuda { namespace device
} }
template <typename T> template <typename T>
void call_resize_nearest_tex(const PtrStepSz<T>& /*src*/, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx) void call_resize_nearest_tex(const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx)
{ {
const dim3 block(32, 8); const dim3 block(32, 8);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
if (xoff || yoff) {
resize<<<grid, block>>>(texAccessor(srcWhole, yoff, xoff), dst, fy, fx); cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
resize<cudev::TextureOffPtr<T>><<<grid, block>>>(texSrcWhole, dst, fy, fx);
}
else {
cudev::Texture<T> texSrcWhole(srcWhole);
resize<cudev::TexturePtr<T>><<<grid, block>>>(texSrcWhole, dst, fy, fx);
}
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
@ -225,27 +183,21 @@ namespace cv { namespace cuda { namespace device
{ {
const dim3 block(32, 8); const dim3 block(32, 8);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
if (srcWhole.data == src.data) if (srcWhole.data == src.data)
{ {
TextureAccessor<T> texSrc = texAccessor(src, 0, 0); cudev::Texture<T> texSrc(src);
LinearFilter< TextureAccessor<T> > filteredSrc(texSrc); LinearFilter<cudev::TexturePtr<T>> filteredSrc(texSrc);
resize<<<grid, block>>>(filteredSrc, dst, fy, fx); resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
} }
else else
{ {
TextureAccessor<T> texSrc = texAccessor(srcWhole, yoff, xoff); cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
BrdReplicate<T> brd(src.rows, src.cols); BrdReplicate<T> brd(src.rows, src.cols);
BorderReader<TextureAccessor<T>, BrdReplicate<T> > brdSrc(texSrc, brd); BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>> brdSrc(texSrcWhole, brd);
LinearFilter< BorderReader<TextureAccessor<T>, BrdReplicate<T> > > filteredSrc(brdSrc); LinearFilter<BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>>> filteredSrc(brdSrc);
resize<<<grid, block>>>(filteredSrc, dst, fy, fx); resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
} }
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
@ -258,8 +210,8 @@ namespace cv { namespace cuda { namespace device
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols); BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd); BorderReader<PtrStep<T>, BrdReplicate<T>> brdSrc(src, brd);
CubicFilter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc); CubicFilter<BorderReader<PtrStep<T>, BrdReplicate<T>>> filteredSrc(brdSrc);
resize<<<grid, block, 0, stream>>>(filteredSrc, dst, fy, fx); resize<<<grid, block, 0, stream>>>(filteredSrc, dst, fy, fx);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
@ -273,27 +225,21 @@ namespace cv { namespace cuda { namespace device
{ {
const dim3 block(32, 8); const dim3 block(32, 8);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
if (srcWhole.data == src.data) if (srcWhole.data == src.data)
{ {
TextureAccessor<T> texSrc = texAccessor(src, 0, 0); cudev::Texture<T> texSrc(src);
CubicFilter< TextureAccessor<T> > filteredSrc(texSrc); CubicFilter<cudev::TexturePtr<T>> filteredSrc(texSrc);
resize<<<grid, block>>>(filteredSrc, dst, fy, fx); resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
} }
else else
{ {
TextureAccessor<T> texSrc = texAccessor(srcWhole, yoff, xoff); cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
BrdReplicate<T> brd(src.rows, src.cols); BrdReplicate<T> brd(src.rows, src.cols);
BorderReader<TextureAccessor<T>, BrdReplicate<T> > brdSrc(texSrc, brd); BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>> brdSrc(texSrcWhole, brd);
CubicFilter< BorderReader<TextureAccessor<T>, BrdReplicate<T> > > filteredSrc(brdSrc); CubicFilter<BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>>> filteredSrc(brdSrc);
resize<<<grid, block>>>(filteredSrc, dst, fy, fx); resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
} }
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
@ -318,7 +264,7 @@ namespace cv { namespace cuda { namespace device
if (fx > 1 || fy > 1) if (fx > 1 || fy > 1)
call_resize_nearest_glob(src, dst, fy, fx, 0); call_resize_nearest_glob(src, dst, fy, fx, 0);
else else
call_resize_nearest_tex(src, srcWhole, yoff, xoff, dst, fy, fx); call_resize_nearest_tex(srcWhole, yoff, xoff, dst, fy, fx);
} }
} }
}; };
@ -389,7 +335,7 @@ namespace cv { namespace cuda { namespace device
{ {
if (stream) if (stream)
call_resize_cubic_glob(src, dst, fy, fx, stream); call_resize_cubic_glob(src, dst, fy, fx, stream);
else else
call_resize_cubic_tex(src, srcWhole, yoff, xoff, dst, fy, fx); call_resize_cubic_tex(src, srcWhole, yoff, xoff, dst, fy, fx);
} }
}; };
@ -421,16 +367,16 @@ namespace cv { namespace cuda { namespace device
if (std::abs(fx - iscale_x) < FLT_MIN && std::abs(fy - iscale_y) < FLT_MIN) if (std::abs(fx - iscale_x) < FLT_MIN && std::abs(fy - iscale_y) < FLT_MIN)
{ {
BrdConstant<T> brd(src.rows, src.cols); BrdConstant<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd); BorderReader<PtrStep<T>, BrdConstant<T>> brdSrc(src, brd);
IntegerAreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy); IntegerAreaFilter<BorderReader<PtrStep<T>, BrdConstant<T>>> filteredSrc(brdSrc, fx, fy);
resize_area<<<grid, block, 0, stream>>>(filteredSrc, dst); resize_area<<<grid, block, 0, stream>>>(filteredSrc, dst);
} }
else else
{ {
BrdConstant<T> brd(src.rows, src.cols); BrdConstant<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd); BorderReader<PtrStep<T>, BrdConstant<T>> brdSrc(src, brd);
AreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy); AreaFilter<BorderReader<PtrStep<T>, BrdConstant<T>>> filteredSrc(brdSrc, fx, fy);
resize_area<<<grid, block, 0, stream>>>(filteredSrc, dst); resize_area<<<grid, block, 0, stream>>>(filteredSrc, dst);
} }
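Note on the resize.cu hunks: the removed TextureAccessor carried an (xoff, yoff) ROI offset next to a bound texture reference; cudev::TextureOff evidently plays the same role over a texture object. An illustrative reduction of that idea (TexOff and resizeNearest are made-up names, not the cv::cudev API):

#include <cuda_runtime.h>

// Illustrative functor: the ROI offset lives next to the texture object, and
// the whole thing is an ordinary value that can be passed to a kernel.
template <typename T> struct TexOff
{
    typedef T elem_type;
    typedef int index_type;
    cudaTextureObject_t tex;
    int xoff, yoff;
    __device__ __forceinline__ T operator()(int y, int x) const
    {
        return tex2D<T>(tex, x + xoff, y + yoff); // fold the ROI offset into the fetch
    }
};

template <class Ptr2D, typename T>
__global__ void resizeNearest(Ptr2D src, T* dst, int w, int h, float fy, float fx)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < w && y < h)
        dst[y * w + x] = src(__float2int_rz(y * fy), __float2int_rz(x * fx));
}

int main()
{
    const int w = 32, h = 32;
    size_t pitch;
    float *dSrc, *dDst;
    cudaMallocPitch(&dSrc, &pitch, w * sizeof(float), h);
    cudaMalloc(&dDst, w * h * sizeof(float));

    cudaResourceDesc res = {};
    res.resType = cudaResourceTypePitch2D;
    res.res.pitch2D.devPtr = dSrc;
    res.res.pitch2D.desc = cudaCreateChannelDesc<float>();
    res.res.pitch2D.width = w;
    res.res.pitch2D.height = h;
    res.res.pitch2D.pitchInBytes = pitch;

    cudaTextureDesc td = {};
    td.addressMode[0] = td.addressMode[1] = cudaAddressModeClamp; // clamp covers out-of-ROI reads
    td.filterMode = cudaFilterModePoint;
    td.readMode = cudaReadModeElementType;

    cudaTextureObject_t tex = 0;
    cudaCreateTextureObject(&tex, &res, &td, nullptr);

    TexOff<float> src = { tex, 0, 0 };   // zero offset: the whole image is the ROI
    resizeNearest<<<dim3(2, 4), dim3(16, 8)>>>(src, dDst, w, h, 2.f, 2.f);
    cudaDeviceSynchronize();

    cudaDestroyTextureObject(tex);
    cudaFree(dSrc); cudaFree(dDst);
    return 0;
}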
@ -48,6 +48,7 @@
#include "opencv2/core/cuda/vec_math.hpp" #include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp" #include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/filters.hpp" #include "opencv2/core/cuda/filters.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
namespace cv { namespace cuda { namespace device namespace cv { namespace cuda { namespace device
{ {
@ -164,8 +165,8 @@ namespace cv { namespace cuda { namespace device
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd); BorderReader<PtrStep<T>, B<work_type>> brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc); Filter<BorderReader<PtrStep<T>, B<work_type>>> filter_src(brdSrc);
warp<Transform><<<grid, block, 0, stream>>>(filter_src, dst, warpMat); warp<Transform><<<grid, block, 0, stream>>>(filter_src, dst, warpMat);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
@ -186,8 +187,8 @@ namespace cv { namespace cuda { namespace device
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd); BorderReader<PtrStep<T>, B<work_type>> brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc); Filter<BorderReader<PtrStep<T>, B<work_type>>> filter_src(brdSrc);
warp<Transform><<<grid, block>>>(filter_src, dst, warpMat); warp<Transform><<<grid, block>>>(filter_src, dst, warpMat);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
@ -196,86 +197,48 @@ namespace cv { namespace cuda { namespace device
} }
}; };
#define OPENCV_CUDA_IMPLEMENT_WARP_TEX(type) \ template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherNonStreamTex
texture< type , cudaTextureType2D > tex_warp_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \ {
struct tex_warp_ ## type ## _reader \ static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, const float warpMat[Transform::rows*3], bool cc20)
{ \ {
typedef type elem_type; \ typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
typedef int index_type; \ dim3 block(32, cc20 ? 8 : 4);
int xoff, yoff; \ dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
tex_warp_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \ if (xoff || yoff) {
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \ cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
{ \ B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
return tex2D(tex_warp_ ## type , x + xoff, y + yoff); \ BorderReader<cudev::TextureOffPtr<T>, B<work_type>> brdSrc(texSrcWhole, brd);
} \ Filter<BorderReader<cudev::TextureOffPtr<T>, B<work_type>>> filter_src(brdSrc);
}; \ warp<Transform><<<grid, block>>>(filter_src, dst, warpMat);
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, type> \ }
{ \ else {
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, const float warpMat[Transform::rows*3], bool cc20) \ cudev::Texture<T> texSrcWhole(srcWhole);
{ \ B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \ BorderReader<cudev::TexturePtr<T>, B<work_type>> brdSrc(texSrcWhole, brd);
dim3 block(32, cc20 ? 8 : 4); \ Filter<BorderReader<cudev::TexturePtr<T>, B<work_type>>> filter_src(brdSrc);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \ warp<Transform><<<grid, block>>>(filter_src, dst, warpMat);
bindTexture(&tex_warp_ ## type , srcWhole); \ }
tex_warp_ ## type ##_reader texSrc(xoff, yoff); \ cudaSafeCall( cudaGetLastError() );
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \ cudaSafeCall( cudaDeviceSynchronize() );
BorderReader< tex_warp_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \ }
Filter< BorderReader< tex_warp_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \ };
warp<Transform><<<grid, block>>>(filter_src, dst, warpMat); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <class Transform, template <typename> class Filter> struct WarpDispatcherNonStream<Transform, Filter, BrdReplicate, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, const float warpMat[Transform::rows*3], bool) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_warp_ ## type , srcWhole); \
tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
Filter< tex_warp_ ## type ##_reader > filter_src(texSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst, warpMat); \
} \
else \
{ \
BrdReplicate<type> brd(src.rows, src.cols); \
BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst, warpMat); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_CUDA_IMPLEMENT_WARP_TEX(uchar)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(uchar2)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(uchar4)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(schar)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(char2)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(char4)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(ushort)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(ushort2)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(ushort4)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(short)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(short2)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(short4)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(int)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(int2)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(int4)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(float)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(float2)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(float4)
#undef OPENCV_CUDA_IMPLEMENT_WARP_TEX template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, uchar> :
WarpDispatcherNonStreamTex<Transform, Filter, B, uchar> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, uchar4> :
WarpDispatcherNonStreamTex<Transform, Filter, B, uchar4> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, ushort> :
WarpDispatcherNonStreamTex<Transform, Filter, B, ushort> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, ushort4> :
WarpDispatcherNonStreamTex<Transform, Filter, B, ushort4> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, short> :
WarpDispatcherNonStreamTex<Transform, Filter, B, short> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, short4> :
WarpDispatcherNonStreamTex<Transform, Filter, B, short4> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, float> :
WarpDispatcherNonStreamTex<Transform, Filter, B, float> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, float4> :
WarpDispatcherNonStreamTex<Transform, Filter, B, float4> {};
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher
{ {
@ -319,8 +282,8 @@ namespace cv { namespace cuda { namespace device
} }
}; };
funcs[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, funcs[interpolation][borderMode](static_cast<PtrStepSz<T>>(src), static_cast<PtrStepSz<T>>(srcWhole), xoff, yoff,
static_cast< PtrStepSz<T> >(dst), borderValue, warpMat, stream, cc20); static_cast<PtrStepSz<T>>(dst), borderValue, warpMat, stream, cc20);
} }
template <typename T> void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, template <typename T> void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
@ -330,32 +293,18 @@ namespace cv { namespace cuda { namespace device
} }
template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20); template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpAffine_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpAffine_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpAffine_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpAffine_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpAffine_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpAffine_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpAffine_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpAffine_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpAffine_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpAffine_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpAffine_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
@@ -366,32 +315,18 @@ namespace cv { namespace cuda { namespace device
 }
 
 template void warpPerspective_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpPerspective_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpPerspective_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpPerspective_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpPerspective_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpPerspective_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpPerspective_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpPerspective_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpPerspective_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpPerspective_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpPerspective_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpPerspective_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-//template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpPerspective_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 template void warpPerspective_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
 } // namespace imgproc

@@ -42,6 +42,8 @@
 #ifndef __OPENCV_TEST_PRECOMP_HPP__
 #define __OPENCV_TEST_PRECOMP_HPP__
 
+#include <thread>
+
 #include "opencv2/ts.hpp"
 #include "opencv2/ts/cuda_test.hpp"

@@ -206,6 +206,60 @@ INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeSameAsHost, testing::Combine(
     testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_AREA)),
     WHOLE_SUBMAT));
 
+PARAM_TEST_CASE(ResizeTextures, cv::cuda::DeviceInfo, Interpolation)
+{
+    cv::cuda::DeviceInfo devInfo;
+    Interpolation interpolation;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        interpolation = GET_PARAM(1);
+        cv::cuda::setDevice(devInfo.deviceID());
+    }
+};
+
+void ResizeThread(const Interpolation interp, const GpuMat& imgIn, const std::vector<GpuMat>& imgsOut, Stream& stream) {
+    for (auto& imgOut : imgsOut)
+        cv::cuda::resize(imgIn, imgOut, imgOut.size(), 0, 0, interp, stream);
+}
+
+CUDA_TEST_P(ResizeTextures, Accuracy)
+{
+    constexpr int nThreads = 5;
+    constexpr int nIters = 5;
+    const Size szIn(100, 100);
+    const Size szOut(200, 200);
+    vector<Stream> streams(nThreads, cv::cuda::Stream::Null());
+    vector<GpuMat> imgsIn;
+    vector<vector<GpuMat>> imgsOut;
+    for (int i = 0; i < nThreads; i++) {
+        imgsIn.push_back(GpuMat(szIn, CV_8UC1, i));
+        vector<GpuMat> imgsOutPerThread;
+        for (int j = 0; j < nIters; j++)
+            imgsOutPerThread.push_back(GpuMat(szOut, CV_8UC1));
+        imgsOut.push_back(imgsOutPerThread);
+    }
+
+    vector<std::thread> thread(nThreads);
+    for (int i = 0; i < nThreads; i++) thread.at(i) = std::thread(ResizeThread, interpolation, std::ref(imgsIn.at(i)), std::ref(imgsOut.at(i)), std::ref(streams.at(i)));
+    for (int i = 0; i < nThreads; i++) thread.at(i).join();
+
+    for (int i = 0; i < nThreads; i++) {
+        GpuMat imgOutGs;
+        cv::cuda::resize(imgsIn.at(i), imgOutGs, szOut, 0, 0, interpolation, streams.at(i));
+        Mat imgOutGsHost; imgOutGs.download(imgOutGsHost);
+        for (const auto& imgOut : imgsOut.at(i)) {
+            Mat imgOutHost; imgOut.download(imgOutHost);
+            ASSERT_TRUE(cv::norm(imgOutHost, imgOutGsHost, NORM_INF) == 0);
+        }
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeTextures, testing::Combine(
+    ALL_DEVICES,
+    testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC))));
+
 }} // namespace
 #endif // HAVE_CUDA
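
Note: the ResizeTextures test above drives cv::cuda::resize from several host threads at once. The failure mode it guards against is the legacy global texture-reference state, where every call bound its source image to one shared reference, so concurrent calls could sample each other's inputs. Texture objects remove the shared state because each call creates its own handle. A minimal sketch of that per-call pattern, assuming a pitched 8-bit image already on the device (the helper name makeTex2D is illustrative, not part of this patch):

    #include <cuda_runtime.h>

    // Build a 2D pitched texture object for one call. Every caller gets its own
    // handle, so concurrent callers cannot clobber each other the way a single
    // global texture reference could.
    static cudaTextureObject_t makeTex2D(unsigned char* devPtr, int rows, int cols, size_t step)
    {
        cudaResourceDesc res = {};
        res.resType = cudaResourceTypePitch2D;
        res.res.pitch2D.devPtr = devPtr;
        res.res.pitch2D.height = rows;
        res.res.pitch2D.width = cols;
        res.res.pitch2D.pitchInBytes = step;
        res.res.pitch2D.desc = cudaCreateChannelDesc<unsigned char>();

        cudaTextureDesc td = {};
        td.filterMode = cudaFilterModePoint;
        td.addressMode[0] = cudaAddressModeClamp;
        td.addressMode[1] = cudaAddressModeClamp;
        td.readMode = cudaReadModeElementType;

        cudaTextureObject_t tex = 0;
        cudaCreateTextureObject(&tex, &res, &td, 0);
        return tex; // caller destroys it with cudaDestroyTextureObject(tex)
    }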

@@ -1,147 +1,159 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                          License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#pragma once
-
-#ifndef OPENCV_CUDEV_PTR2D_TEXTURE_HPP
-#define OPENCV_CUDEV_PTR2D_TEXTURE_HPP
-
-#include <cstring>
-#include "../common.hpp"
-#include "glob.hpp"
-#include "gpumat.hpp"
-#include "traits.hpp"
-
-#if CUDART_VERSION >= 5050
-
-namespace
-{
-    template <typename T> struct CvCudevTextureRef
-    {
-        typedef texture<T, cudaTextureType2D, cudaReadModeElementType> TexRef;
-
-        static TexRef ref;
-
-        __host__ static void bind(const cv::cudev::GlobPtrSz<T>& mat,
-                                  bool normalizedCoords = false,
-                                  cudaTextureFilterMode filterMode = cudaFilterModePoint,
-                                  cudaTextureAddressMode addressMode = cudaAddressModeClamp)
-        {
-            ref.normalized = normalizedCoords;
-            ref.filterMode = filterMode;
-            ref.addressMode[0] = addressMode;
-            ref.addressMode[1] = addressMode;
-            ref.addressMode[2] = addressMode;
-
-            cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
-
-            CV_CUDEV_SAFE_CALL( cudaBindTexture2D(0, &ref, mat.data, &desc, mat.cols, mat.rows, mat.step) );
-        }
-
-        __host__ static void unbind()
-        {
-            cudaUnbindTexture(ref);
-        }
-    };
-
-    template <typename T>
-    typename CvCudevTextureRef<T>::TexRef CvCudevTextureRef<T>::ref;
-}
-
-#endif
-
-namespace cv { namespace cudev {
-
-//! @addtogroup cudev
-//! @{
-
-#if CUDART_VERSION >= 5050
-
-template <typename T> struct TexturePtr
-{
-    typedef T     value_type;
-    typedef float index_type;
-
-    cudaTextureObject_t texObj;
-
-    __device__ __forceinline__ T operator ()(float y, float x) const
-    {
-    #if CV_CUDEV_ARCH < 300
-        // Use the texture reference
-        return tex2D(CvCudevTextureRef<T>::ref, x, y);
-    #else
-        // Use the texture object
-        return tex2D<T>(texObj, x, y);
-    #endif
-    }
-};
-
-template <typename T> struct Texture : TexturePtr<T>
-{
-    int rows, cols;
-    bool cc30;
-
-    __host__ explicit Texture(const GlobPtrSz<T>& mat,
-                              bool normalizedCoords = false,
-                              cudaTextureFilterMode filterMode = cudaFilterModePoint,
-                              cudaTextureAddressMode addressMode = cudaAddressModeClamp)
-    {
-        cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);
-
-        rows = mat.rows;
-        cols = mat.cols;
-
-        if (cc30)
-        {
-            // Use the texture object
-            cudaResourceDesc texRes;
-            std::memset(&texRes, 0, sizeof(texRes));
-            texRes.resType = cudaResourceTypePitch2D;
-            texRes.res.pitch2D.devPtr = mat.data;
-            texRes.res.pitch2D.height = mat.rows;
-            texRes.res.pitch2D.width = mat.cols;
-            texRes.res.pitch2D.pitchInBytes = mat.step;
-            texRes.res.pitch2D.desc = cudaCreateChannelDesc<T>();
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CUDEV_PTR2D_TEXTURE_OBJECT_HPP
+#define OPENCV_CUDEV_PTR2D_TEXTURE_OBJECT_HPP
+
+#include <opencv2/core.hpp>
+#include <opencv2/core/utils/logger.hpp>
+#include <opencv2/core/cuda_types.hpp>
+#include <opencv2/cudev/common.hpp>
+#include <opencv2/cudev/ptr2d/traits.hpp>
+
+/** \file texture.hpp
+ */
+
+namespace cv { namespace cudev {
+
+//! @addtogroup cudev
+//! @{
+
+/** @brief Simple lightweight structures that encapsulate information about an image texture on the device.
+ * They are intended to be passed to nvcc-compiled code.
+ */
+template<class T, class R = T>
+struct TexturePtr {
+    typedef R elem_type, value_type;
+    typedef float index_type;
+    __host__ TexturePtr() {};
+    __host__ TexturePtr(const cudaTextureObject_t tex_) : tex(tex_) {};
+    __device__ __forceinline__ R operator ()(index_type y, index_type x) const {
+        return tex2D<R>(tex, x, y);
+    }
+    __device__ __forceinline__ R operator ()(index_type x) const {
+        return tex1Dfetch<R>(tex, x);
+    }
+private:
+    cudaTextureObject_t tex;
+};
+
+// textures are a maximum of 32 bits wide, 64 bits is read as two 32 bit wide values
+template <class R>
+struct TexturePtr<uint64, R> {
+    typedef float index_type;
+    __host__ TexturePtr() {};
+    __host__ TexturePtr(const cudaTextureObject_t tex_) : tex(tex_) {};
+    __device__ __forceinline__ R operator ()(index_type y, index_type x) const {
+        const uint2 retVal = tex2D<uint2>(tex, x, y);
+        return *(reinterpret_cast<const R*>(&retVal));
+    }
+    __device__ __forceinline__ R operator ()(index_type x) const {
+        const uint2 retVal = tex1Dfetch<uint2>(tex, x);
+        return *(reinterpret_cast<const R*>(&retVal));
+    }
+private:
+    cudaTextureObject_t tex;
+};
+
+template<class T, class R = T>
+struct TextureOffPtr {
+    typedef R elem_type;
+    typedef float index_type;
+    __host__ TextureOffPtr(const cudaTextureObject_t tex_, const int yoff_, const int xoff_) : tex(tex_), yoff(yoff_), xoff(xoff_) {};
+    __device__ __forceinline__ R operator ()(index_type y, index_type x) const {
+        return tex2D<R>(tex, x + xoff, y + yoff);
+    }
+private:
+    cudaTextureObject_t tex;
+    int xoff = 0;
+    int yoff = 0;
+};
+
+/** @brief non-copyable smart CUDA texture object
+ *
+ * UniqueTexture is a smart non-sharable wrapper for a cudaTextureObject_t handle which ensures that the handle is destroyed after use.
+ */
+template<class T, class R = T>
+class UniqueTexture {
+public:
+    __host__ UniqueTexture() noexcept { }
+    __host__ UniqueTexture(UniqueTexture&) = delete;
+    __host__ UniqueTexture(UniqueTexture&& other) noexcept {
+        tex = other.tex;
+        other.tex = 0;
+    }
+
+    __host__ UniqueTexture(const int rows, const int cols, T* data, const size_t step, const bool normalizedCoords = false,
+        const cudaTextureFilterMode filterMode = cudaFilterModePoint, const cudaTextureAddressMode addressMode = cudaAddressModeClamp,
+        const cudaTextureReadMode readMode = cudaReadModeElementType)
+    {
+        create(rows, cols, data, step, normalizedCoords, filterMode, addressMode, readMode);
+    }
+
+    __host__ UniqueTexture(const size_t sizeInBytes, T* data, const bool normalizedCoords = false, const cudaTextureFilterMode filterMode = cudaFilterModePoint,
+        const cudaTextureAddressMode addressMode = cudaAddressModeClamp, const cudaTextureReadMode readMode = cudaReadModeElementType)
+    {
+        create(1, static_cast<int>(sizeInBytes/sizeof(T)), data, sizeInBytes, normalizedCoords, filterMode, addressMode, readMode);
+    }
+
+    __host__ ~UniqueTexture() {
+        if (tex != cudaTextureObject_t()) {
+            try {
+                CV_CUDEV_SAFE_CALL(cudaDestroyTextureObject(tex));
+            }
+            catch (const cv::Exception& ex) {
+                std::ostringstream os;
+                os << "Exception caught during CUDA texture object destruction.\n";
+                os << ex.what();
+                os << "Exception will be ignored.\n";
+                CV_LOG_WARNING(0, os.str().c_str());
+            }
+        }
+    }
+
+    __host__ UniqueTexture& operator=(const UniqueTexture&) = delete;
+    __host__ UniqueTexture& operator=(UniqueTexture&& other) noexcept {
+        CV_Assert(other);
+        if (&other != this) {
+            UniqueTexture(std::move(*this)); /* destroy current texture object */
+            tex = other.tex;
+            other.tex = cudaTextureObject_t();
+        }
+        return *this;
+    }
+
+    __host__ cudaTextureObject_t get() const noexcept {
+        CV_Assert(tex);
+        return tex;
+    }
+
+    __host__ explicit operator bool() const noexcept { return tex != cudaTextureObject_t(); }
+
+private:
+
+    template <class T1>
+    __host__ void create(const int rows, const int cols, T1* data, const size_t step, const bool normalizedCoords, const cudaTextureFilterMode filterMode,
+        const cudaTextureAddressMode addressMode, const cudaTextureReadMode readMode)
+    {
+        cudaResourceDesc texRes;
+        std::memset(&texRes, 0, sizeof(texRes));
+        if (rows == 1) {
+            CV_Assert(rows == 1 && cols*sizeof(T) == step);
+            texRes.resType = cudaResourceTypeLinear;
+            texRes.res.linear.devPtr = data;
+            texRes.res.linear.sizeInBytes = step;
+            texRes.res.linear.desc = cudaCreateChannelDesc<T1>();
+        }
+        else {
+            texRes.resType = cudaResourceTypePitch2D;
+            texRes.res.pitch2D.devPtr = data;
+            texRes.res.pitch2D.height = rows;
+            texRes.res.pitch2D.width = cols;
+            texRes.res.pitch2D.pitchInBytes = step;
+            texRes.res.pitch2D.desc = cudaCreateChannelDesc<T1>();
+        }
 
         cudaTextureDesc texDescr;
         std::memset(&texDescr, 0, sizeof(texDescr));
@@ -150,109 +162,112 @@ template <typename T> struct Texture : TexturePtr<T>
        texDescr.addressMode[0] = addressMode;
        texDescr.addressMode[1] = addressMode;
        texDescr.addressMode[2] = addressMode;
-            texDescr.readMode = cudaReadModeElementType;
-
-            CV_CUDEV_SAFE_CALL( cudaCreateTextureObject(&this->texObj, &texRes, &texDescr, 0) );
-        }
-        else
-        {
-            // Use the texture reference
-            CvCudevTextureRef<T>::bind(mat, normalizedCoords, filterMode, addressMode);
-        }
-    }
-
-    __host__ ~Texture()
-    {
-        if (cc30)
-        {
-            // Use the texture object
-            cudaDestroyTextureObject(this->texObj);
-        }
-        else
-        {
-            // Use the texture reference
-            CvCudevTextureRef<T>::unbind();
-        }
-    }
-};
-
-template <typename T> struct PtrTraits< Texture<T> > : PtrTraitsBase<Texture<T>, TexturePtr<T> >
-{
-};
-
-#else
-
-template <typename T> struct TexturePtr
-{
-    typedef T     value_type;
-    typedef float index_type;
-
-    cudaTextureObject_t texObj;
-
-    __device__ __forceinline__ T operator ()(float y, float x) const
-    {
-    #if CV_CUDEV_ARCH >= 300
-        // Use the texture object
-        return tex2D<T>(texObj, x, y);
-    #else
-        CV_UNUSED(y);
-        CV_UNUSED(x);
-        return T();
-    #endif
-    }
-};
-
-template <typename T> struct Texture : TexturePtr<T>
-{
-    int rows, cols;
-
-    __host__ explicit Texture(const GlobPtrSz<T>& mat,
-                              bool normalizedCoords = false,
-                              cudaTextureFilterMode filterMode = cudaFilterModePoint,
-                              cudaTextureAddressMode addressMode = cudaAddressModeClamp)
-    {
-        CV_Assert( deviceSupports(FEATURE_SET_COMPUTE_30) );
-
-        rows = mat.rows;
-        cols = mat.cols;
-
-        // Use the texture object
-        cudaResourceDesc texRes;
-        std::memset(&texRes, 0, sizeof(texRes));
-        texRes.resType = cudaResourceTypePitch2D;
-        texRes.res.pitch2D.devPtr = mat.data;
-        texRes.res.pitch2D.height = mat.rows;
-        texRes.res.pitch2D.width = mat.cols;
-        texRes.res.pitch2D.pitchInBytes = mat.step;
-        texRes.res.pitch2D.desc = cudaCreateChannelDesc<T>();
-
-        cudaTextureDesc texDescr;
-        std::memset(&texDescr, 0, sizeof(texDescr));
-        texDescr.normalizedCoords = normalizedCoords;
-        texDescr.filterMode = filterMode;
-        texDescr.addressMode[0] = addressMode;
-        texDescr.addressMode[1] = addressMode;
-        texDescr.addressMode[2] = addressMode;
-        texDescr.readMode = cudaReadModeElementType;
-
-        CV_CUDEV_SAFE_CALL( cudaCreateTextureObject(&this->texObj, &texRes, &texDescr, 0) );
-    }
-
-    __host__ ~Texture()
-    {
-        // Use the texture object
-        cudaDestroyTextureObject(this->texObj);
-    }
-};
-
-template <typename T> struct PtrTraits< Texture<T> > : PtrTraitsBase<Texture<T>, TexturePtr<T> >
-{
-};
-
-#endif
-
-//! @}
+        texDescr.readMode = readMode;
+
+        CV_CUDEV_SAFE_CALL(cudaCreateTextureObject(&tex, &texRes, &texDescr, 0));
+    }
+
+    __host__ void create(const int rows, const int cols, uint64* data, const size_t step, const bool normalizedCoords, const cudaTextureFilterMode filterMode,
+        const cudaTextureAddressMode addressMode, const cudaTextureReadMode readMode)
+    {
+        create<uint2>(rows, cols, (uint2*)data, step, normalizedCoords, filterMode, addressMode, readMode);
+    }
+
+private:
+    cudaTextureObject_t tex;
+};
+
+/** @brief sharable smart CUDA texture object
+ *
+ * Texture is a smart sharable wrapper for a cudaTextureObject_t handle which ensures that the handle is destroyed after use.
+ */
+template<class T, class R = T>
+class Texture {
+public:
+    Texture() = default;
+    Texture(const Texture&) = default;
+    Texture(Texture&&) = default;
+
+    __host__ Texture(const int rows_, const int cols_, T* data, const size_t step, const bool normalizedCoords = false, const cudaTextureFilterMode filterMode = cudaFilterModePoint,
+        const cudaTextureAddressMode addressMode = cudaAddressModeClamp, const cudaTextureReadMode readMode = cudaReadModeElementType) :
+        rows(rows_), cols(cols_), texture(std::make_shared<UniqueTexture<T,R>>(rows, cols, data, step, normalizedCoords, filterMode, addressMode, readMode))
+    {
+    }
+
+    __host__ Texture(const size_t sizeInBytes, T* data, const bool normalizedCoords = false, const cudaTextureFilterMode filterMode = cudaFilterModePoint,
+        const cudaTextureAddressMode addressMode = cudaAddressModeClamp, const cudaTextureReadMode readMode = cudaReadModeElementType) :
+        rows(1), cols(static_cast<int>(sizeInBytes/sizeof(T))), texture(std::make_shared<UniqueTexture<T, R>>(sizeInBytes, data, normalizedCoords, filterMode, addressMode, readMode))
+    {
+    }
+
+    __host__ Texture(PtrStepSz<T> src, const bool normalizedCoords = false, const cudaTextureFilterMode filterMode = cudaFilterModePoint,
+        const cudaTextureAddressMode addressMode = cudaAddressModeClamp, const cudaTextureReadMode readMode = cudaReadModeElementType) :
+        Texture(src.rows, src.cols, src.data, src.step, normalizedCoords, filterMode, addressMode, readMode)
+    {
+    }
+
+    Texture& operator=(const Texture&) = default;
+    Texture& operator=(Texture&&) = default;
+
+    __host__ explicit operator bool() const noexcept {
+        if (!texture)
+            return false;
+        return texture->operator bool();
+    }
+
+    __host__ operator TexturePtr<T, R>() const {
+        if (texture)
+            return TexturePtr<T, R>(texture->get());
+        else
+            return TexturePtr<T, R>(cudaTextureObject_t());
+    }
+
+    int rows = 0;
+    int cols = 0;
+
+protected:
+    std::shared_ptr<UniqueTexture<T, R>> texture = 0;
+};
+
+template <typename T, typename R> struct PtrTraits<Texture<T, R>> : PtrTraitsBase<Texture<T, R>, TexturePtr<T, R>>
+{
+};
+
+/** @brief sharable smart CUDA texture object with offset
+ *
+ * TextureOff is a smart sharable wrapper for a cudaTextureObject_t handle which ensures that the handle is destroyed after use.
+ */
+template<class T, class R = T>
+class TextureOff {
+public:
+    TextureOff(const TextureOff&) = default;
+    TextureOff(TextureOff&&) = default;
+
+    __host__ TextureOff(const int rows, const int cols, T* data, const size_t step, const int yoff_ = 0, const int xoff_ = 0, const bool normalizedCoords = false,
+        const cudaTextureFilterMode filterMode = cudaFilterModePoint, const cudaTextureAddressMode addressMode = cudaAddressModeClamp,
+        const cudaTextureReadMode readMode = cudaReadModeElementType) :
+        texture(std::make_shared<UniqueTexture<T, R>>(rows, cols, data, step, normalizedCoords, filterMode, addressMode, readMode)), xoff(xoff_), yoff(yoff_)
+    {
+    }
+
+    __host__ TextureOff(PtrStepSz<T> src, const int yoff = 0, const int xoff = 0, const bool normalizedCoords = false, const cudaTextureFilterMode filterMode = cudaFilterModePoint,
+        const cudaTextureAddressMode addressMode = cudaAddressModeClamp, const cudaTextureReadMode readMode = cudaReadModeElementType) :
+        TextureOff(src.rows, src.cols, src.data, src.step, yoff, xoff, normalizedCoords, filterMode, addressMode, readMode)
+    {
+    }
+
+    TextureOff& operator=(const TextureOff&) = default;
+    TextureOff& operator=(TextureOff&&) = default;
+
+    __host__ operator TextureOffPtr<T, R>() const {
+        return TextureOffPtr<T, R>(texture->get(), yoff, xoff);
+    }
+
+private:
+    int xoff = 0;
+    int yoff = 0;
+    std::shared_ptr<UniqueTexture<T, R>> texture = 0;
+};
+
 }}
 
 #endif
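
A minimal usage sketch of the wrappers above (the kernel and function names are illustrative, not part of this patch), assuming a single-channel 8-bit GpuMat: the host side holds the RAII Texture, and the kernel takes the lightweight TexturePtr, obtained through Texture's implicit conversion at launch, which is the same pattern the kernels in this patch use.

    #include <opencv2/cudev/ptr2d/texture.hpp>

    using namespace cv;

    __global__ void copyThroughTex(cudev::TexturePtr<uchar> src, cuda::PtrStepSzb dst)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;
        if (x < dst.cols && y < dst.rows)
            dst(y, x) = src(y, x); // operator()(y, x) maps to tex2D(tex, x, y)
    }

    void copyViaTexture(const cuda::GpuMat& src, cuda::GpuMat& dst)
    {
        // Texture owns the cudaTextureObject_t; destruction is deferred until
        // the last copy of the shared handle goes away.
        cudev::Texture<uchar> tex(src.rows, src.cols, src.data, src.step);
        const dim3 block(32, 8);
        const dim3 grid((src.cols + block.x - 1) / block.x, (src.rows + block.y - 1) / block.y);
        copyThroughTex<<<grid, block>>>(tex, dst);
    }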

@@ -213,7 +213,7 @@ __device__ double shfl_up(double val, uint delta, int width = warpSize)
     return __hiloint2double(hi, lo);
 }
 
-__device__ __forceinline__ unsigned long long shfl_up(unsigned long long val, uint delta, int width = warpSize)
+__device__ __forceinline__ uint64 shfl_up(uint64 val, uint delta, int width = warpSize)
 {
     return __shfl_up(val, delta, width);
 }
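
The uint64 overload above assumes a shuffle that can move 64 bits at once; where only 32-bit shuffles are available, the same result is obtained by shuffling the two halves separately, exactly as the double overload's __hiloint2double path does. A sketch of that split (the name shfl_up64 is illustrative, not part of this patch):

    __device__ __forceinline__ unsigned long long shfl_up64(unsigned long long val, unsigned int delta, int width)
    {
        int lo = static_cast<int>(val & 0xffffffffULL);
        int hi = static_cast<int>(val >> 32);
        lo = __shfl_up(lo, delta, width); // shuffle low 32 bits
        hi = __shfl_up(hi, delta, width); // shuffle high 32 bits
        return (static_cast<unsigned long long>(static_cast<unsigned int>(hi)) << 32)
             | static_cast<unsigned long long>(static_cast<unsigned int>(lo));
    }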

@@ -51,6 +51,7 @@
 #include "opencv2/core/cuda/utility.hpp"
 #include "opencv2/core/cuda/functional.hpp"
 #include "opencv2/core/cuda/filters.hpp"
+#include <opencv2/cudev/ptr2d/texture.hpp>
 
 namespace cv { namespace cuda { namespace device
 {
@@ -59,23 +60,19 @@ namespace cv { namespace cuda { namespace device
     void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
     void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
 
-    void bindImgTex(PtrStepSzb img);
-    size_t bindSumTex(PtrStepSz<unsigned int> sum);
-    size_t bindMaskSumTex(PtrStepSz<unsigned int> maskSum);
-
-    void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
+    void icvCalcLayerDetAndTrace_gpu(const PtrStepSz<unsigned int>& sum, const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
         int octave, int nOctaveLayer);
 
-    void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
+    void icvFindMaximaInLayer_gpu(const PtrStepSz<unsigned int>& maskSum, const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
         int img_rows, int img_cols, int octave, bool use_mask, int nLayers);
 
     void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
         float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
         unsigned int* featureCounter);
 
-    void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
+    void icvCalcOrientation_gpu(const PtrStepSz<unsigned int>& sum, const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
 
-    void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
+    void compute_descriptors_gpu(const PtrStepSzb& img, PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
 }
}}}
@@ -121,34 +118,8 @@ namespace cv { namespace cuda { namespace device
         cudaSafeCall( cudaMemcpyToSymbol(c_layer_cols, &layer_cols, sizeof(layer_cols)) );
     }
 
-    ////////////////////////////////////////////////////////////////////////
-    // Integral image texture
-
-    texture<unsigned char, 2, cudaReadModeElementType> imgTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-    texture<unsigned int, 2, cudaReadModeElementType> sumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-    texture<unsigned int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-
-    void bindImgTex(PtrStepSzb img)
-    {
-        bindTexture(&imgTex, img);
-    }
-
-    size_t bindSumTex(PtrStepSz<uint> sum)
-    {
-        size_t offset;
-        cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc<uint>();
-        cudaSafeCall( cudaBindTexture2D(&offset, sumTex, sum.data, desc_sum, sum.cols, sum.rows, sum.step));
-        return offset / sizeof(uint);
-    }
-
-    size_t bindMaskSumTex(PtrStepSz<uint> maskSum)
-    {
-        size_t offset;
-        cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc<uint>();
-        cudaSafeCall( cudaBindTexture2D(&offset, maskSumTex, maskSum.data, desc_sum, maskSum.cols, maskSum.rows, maskSum.step));
-        return offset / sizeof(uint);
-    }
-
-    template <int N> __device__ float icvCalcHaarPatternSum(const float src[][5], int oldSize, int newSize, int y, int x)
+    template <int N> __device__ float icvCalcHaarPatternSum(cudev::TexturePtr<unsigned int> texSum, const float src[][5], int oldSize, int newSize, int y, int x)
     {
     #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 200
         typedef double real_t;
@@ -169,10 +140,10 @@ namespace cv { namespace cuda { namespace device
             int dy2 = __float2int_rn(ratio * src[k][3]);
 
             real_t t = 0;
-            t += tex2D(sumTex, x + dx1, y + dy1);
-            t -= tex2D(sumTex, x + dx1, y + dy2);
-            t -= tex2D(sumTex, x + dx2, y + dy1);
-            t += tex2D(sumTex, x + dx2, y + dy2);
+            t += texSum(y + dy1, x + dx1);
+            t -= texSum(y + dy2, x + dx1);
+            t -= texSum(y + dy1, x + dx2);
+            t += texSum(y + dy2, x + dx2);
 
             d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
         }
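
For reference, the four texSum reads above are the standard integral-image rectangle sum: with S the integral of src (zero first row and column, as cv::cuda::integral produces), the sum of src over the half-open rectangle [x1, x2) x [y1, y2) is S(y2, x2) - S(y1, x2) - S(y2, x1) + S(y1, x1). The same computation without the texture path, as a sketch (helper name illustrative, not part of this patch):

    // S is the integral image: S(y, x) = sum of src over rows [0, y) and cols [0, x).
    __device__ float rectSum(const cv::cuda::PtrStep<unsigned int> S, int x1, int y1, int x2, int y2)
    {
        return static_cast<float>(S(y2, x2) - S(y1, x2) - S(y2, x1) + S(y1, x1));
    }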
@@ -201,7 +172,7 @@ namespace cv { namespace cuda { namespace device
         return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
     }
 
-    __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace)
+    __global__ void icvCalcLayerDetAndTrace(cudev::TexturePtr<unsigned int> texSum, PtrStepf det, PtrStepf trace)
     {
         // Determine the indices
         const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2);
@@ -222,29 +193,29 @@ namespace cv { namespace cuda { namespace device
         if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
         {
-            const float dx  = icvCalcHaarPatternSum<3>(c_DX , 9, size, (i << c_octave), (j << c_octave));
-            const float dy  = icvCalcHaarPatternSum<3>(c_DY , 9, size, (i << c_octave), (j << c_octave));
-            const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, (i << c_octave), (j << c_octave));
+            const float dx  = icvCalcHaarPatternSum<3>(texSum, c_DX , 9, size, (i << c_octave), (j << c_octave));
+            const float dy  = icvCalcHaarPatternSum<3>(texSum, c_DY , 9, size, (i << c_octave), (j << c_octave));
+            const float dxy = icvCalcHaarPatternSum<4>(texSum, c_DXY, 9, size, (i << c_octave), (j << c_octave));
 
             det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy;
             trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy;
         }
     }
 
-    void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
+    void icvCalcLayerDetAndTrace_gpu(const PtrStepSz<unsigned int>& sum, const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
         int octave, int nOctaveLayers)
     {
         const int min_size = calcSize(octave, 0);
         const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
         const int max_samples_j = 1 + ((img_cols - min_size) >> octave);
 
+        cudev::Texture<unsigned int> texSum(sum);
+
         dim3 threads(16, 16);
 
         dim3 grid;
         grid.x = divUp(max_samples_j, threads.x);
         grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2);
 
-        icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);
+        icvCalcLayerDetAndTrace<<<grid, threads>>>(texSum, det, trace);
         cudaSafeCall( cudaGetLastError() );
 
         cudaSafeCall( cudaDeviceSynchronize() );
@@ -255,10 +226,14 @@ namespace cv { namespace cuda { namespace device
     __constant__ float c_DM[5] = {0, 0, 9, 9, 1};
 
-    struct WithMask
+    template<bool useMask = true>
+    struct Mask
     {
-        static __device__ bool check(int sum_i, int sum_j, int size)
+        __host__ Mask(){};
+        __host__ Mask(cudev::TexturePtr<unsigned int> tex_): tex(tex_) {};
+        __device__ bool check(int sum_i, int sum_j, int size)
         {
+            if (!useMask) return true;
             float ratio = (float)size / 9.0f;
 
             float d = 0;
@@ -269,19 +244,20 @@ namespace cv { namespace cuda { namespace device
             int dy2 = __float2int_rn(ratio * c_DM[3]);
 
             float t = 0;
-            t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1);
-            t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2);
-            t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1);
-            t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2);
+            t += tex(sum_i + dy1, sum_j + dx1);
+            t -= tex(sum_i + dy2, sum_j + dx1);
+            t -= tex(sum_i + dy1, sum_j + dx2);
+            t += tex(sum_i + dy2, sum_j + dx2);
 
             d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));
 
             return (d >= 0.5f);
         }
+
+        cudev::TexturePtr<unsigned int> tex;
     };
 
-    template <typename Mask>
-    __global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer,
+    template<class T>
+    __global__ void icvFindMaximaInLayer(T mask, const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer,
         unsigned int* maxCounter)
     {
     #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
@@ -323,7 +299,7 @@ namespace cv { namespace cuda { namespace device
             const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
             const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
 
-            if (Mask::check(sum_i, sum_j, size))
+            if (mask.check(sum_i, sum_j, size))
             {
                 // Check to see if we have a max (in its 26 neighbours)
                 const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff]
@@ -374,7 +350,7 @@ namespace cv { namespace cuda { namespace device
     #endif
     }
 
-    void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
+    void icvFindMaximaInLayer_gpu(const PtrStepSz<unsigned int>& maskSum, const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
         int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers)
     {
         const int layer_rows = img_rows >> octave;
@@ -390,10 +366,15 @@ namespace cv { namespace cuda { namespace device
         const size_t smem_size = threads.x * threads.y * 3 * sizeof(float);
 
-        if (use_mask)
-            icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
-        else
-            icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
+        if (use_mask) {
+            cudev::Texture<unsigned int> texMaskSum(maskSum);
+            Mask<true> mask(texMaskSum);
+            icvFindMaximaInLayer<<<grid, threads, smem_size>>>(mask, det, trace, maxPosBuffer, maxCounter);
+        }
+        else {
+            Mask<false> mask;
+            icvFindMaximaInLayer<<<grid, threads, smem_size>>>(mask, det, trace, maxPosBuffer, maxCounter);
+        }
 
         cudaSafeCall( cudaGetLastError() );
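
The Mask<useMask> rewrite above folds the old WithMask / WithOutMask pair into a single functor whose check() short-circuits on a compile-time constant, so the Mask<false> instantiation compiles away the texture reads entirely. A standalone host-side sketch of that template<bool> dispatch pattern (names illustrative, not part of this patch):

    #include <cstdio>

    template <bool enabled>
    struct Gate {
        // With enabled == false the early return is a compile-time constant,
        // so the compiler drops the rest of the body and the unused state.
        bool check(int v) const {
            if (!enabled) return true;
            return v >= threshold;
        }
        int threshold = 0;
    };

    template <class T>
    void runFiltered(T gate) // mirrors the kernel: the concrete Gate<true/false> arrives as a template parameter
    {
        for (int v = -2; v <= 2; ++v)
            if (gate.check(v))
                std::printf("%d ", v);
        std::printf("\n");
    }

    int main()
    {
        runFiltered(Gate<false>{});  // no masking: every value passes
        runFiltered(Gate<true>{1});  // masked: only v >= 1 passes
        return 0;
    }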
@@ -539,7 +520,7 @@ namespace cv { namespace cuda { namespace device
     __constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
     __constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
 
-    __global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)
+    __global__ void icvCalcOrientation(cudev::TexturePtr<unsigned int> texSum, const float* featureX, const float* featureY, const float* featureSize, float* featureDir)
     {
         __shared__ float s_X[128];
         __shared__ float s_Y[128];
@@ -576,8 +557,8 @@ namespace cv { namespace cuda { namespace device
             if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
                 x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
             {
-                X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);
-                Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);
+                X = c_aptW[tid] * icvCalcHaarPatternSum<2>(texSum, c_NX, 4, grad_wav_size, y, x);
+                Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(texSum, c_NY, 4, grad_wav_size, y, x);
 
                 angle = atan2f(Y, X);
                 if (angle < 0)
@@ -676,8 +657,9 @@ namespace cv { namespace cuda { namespace device
     #undef ORI_WIN
     #undef ORI_SAMPLES
 
-    void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures)
+    void icvCalcOrientation_gpu(const PtrStepSz<unsigned int>& sum, const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures)
     {
+        cudev::Texture<unsigned int> texSum(sum);
         dim3 threads;
         threads.x = 32;
         threads.y = 4;
@@ -685,7 +667,7 @@ namespace cv { namespace cuda { namespace device
         dim3 grid;
         grid.x = nFeatures;
 
-        icvCalcOrientation<<<grid, threads>>>(featureX, featureY, featureSize, featureDir);
+        icvCalcOrientation<<<grid, threads>>>(texSum, featureX, featureY, featureSize, featureDir);
         cudaSafeCall( cudaGetLastError() );
 
         cudaSafeCall( cudaDeviceSynchronize() );
@@ -724,12 +706,14 @@ namespace cv { namespace cuda { namespace device
     {
         typedef uchar elem_type;
 
+        __device__ WinReader(cudev::TexturePtr<uchar> tex_) : tex(tex_) {};
+
         __device__ __forceinline__ uchar operator ()(int i, int j) const
         {
             float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
             float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
 
-            return tex2D(imgTex, pixel_x, pixel_y);
+            return tex(pixel_y, pixel_x);
         }
 
         float centerX;
@@ -739,19 +723,17 @@ namespace cv { namespace cuda { namespace device
         float sin_dir;
         int width;
         int height;
+        cudev::TexturePtr<uchar> tex;
     };
 
-    __device__ void calc_dx_dy(const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
-        float& dx, float& dy);
-
-    __device__ void calc_dx_dy(const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
+    __device__ void calc_dx_dy(cudev::TexturePtr<uchar> tex, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
         float& dx, float& dy)
     {
         __shared__ float s_PATCH[PATCH_SZ + 1][PATCH_SZ + 1];
 
         dx = dy = 0.0f;
 
-        WinReader win;
+        WinReader win(tex);
 
         win.centerX = featureX[blockIdx.x];
         win.centerY = featureY[blockIdx.x];
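
For reference, the WinReader sampling positions above rotate the (i, j) patch coordinates about the feature center (c_x, c_y), with o = win_offset and theta the feature orientation:

    pixel_x = c_x + (j + o) * cos(theta) + (i + o) * sin(theta)
    pixel_y = c_y - (j + o) * sin(theta) + (i + o) * cos(theta)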
@@ -813,14 +795,14 @@ namespace cv { namespace cuda { namespace device
         }
     }
 
-    __global__ void compute_descriptors_64(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
+    __global__ void compute_descriptors_64(cudev::TexturePtr<uchar> texImg, PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
     {
         __shared__ float smem[32 * 16];
 
         float* sRow = smem + threadIdx.y * 32;
 
         float dx, dy;
-        calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);
+        calc_dx_dy(texImg, featureX, featureY, featureSize, featureDir, dx, dy);
 
         float dxabs = ::fabsf(dx);
         float dyabs = ::fabsf(dy);
@@ -839,14 +821,14 @@ namespace cv { namespace cuda { namespace device
         *descriptors_block = make_float4(dx, dy, dxabs, dyabs);
     }
 
-    __global__ void compute_descriptors_128(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
+    __global__ void compute_descriptors_128(cudev::TexturePtr<uchar> texImg, PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
     {
         __shared__ float smem[32 * 16];
 
         float* sRow = smem + threadIdx.y * 32;
 
         float dx, dy;
-        calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);
+        calc_dx_dy(texImg, featureX, featureY, featureSize, featureDir, dx, dy);
 
         float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y * 2;
@@ -925,13 +907,13 @@ namespace cv { namespace cuda { namespace device
             descriptor_base[threadIdx.x] = val / s_len;
     }
 
-    void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
+    void compute_descriptors_gpu(const PtrStepSzb& img, PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
     {
         // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
+        cudev::Texture<unsigned char> texImg(img);
 
         if (descriptors.cols == 64)
         {
-            compute_descriptors_64<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
+            compute_descriptors_64<<<nFeatures, dim3(32, 16)>>>(texImg, descriptors, featureX, featureY, featureSize, featureDir);
             cudaSafeCall( cudaGetLastError() );
 
             cudaSafeCall( cudaDeviceSynchronize() );
@@ -943,7 +925,7 @@ namespace cv { namespace cuda { namespace device
         }
         else
         {
-            compute_descriptors_128<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
+            compute_descriptors_128<<<nFeatures, dim3(32, 16)>>>(texImg, descriptors, featureX, featureY, featureSize, featureDir);
             cudaSafeCall( cudaGetLastError() );
 
             cudaSafeCall( cudaDeviceSynchronize() );

@@ -94,23 +94,19 @@ namespace cv { namespace cuda { namespace device
     void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
     void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
 
-    void bindImgTex(PtrStepSzb img);
-    size_t bindSumTex(PtrStepSz<unsigned int> sum);
-    size_t bindMaskSumTex(PtrStepSz<unsigned int> maskSum);
-
-    void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
+    void icvCalcLayerDetAndTrace_gpu(const PtrStepSz<unsigned int>& sum, const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
        int octave, int nOctaveLayer);
 
-    void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
+    void icvFindMaximaInLayer_gpu(const PtrStepSz<unsigned int>& maskSum, const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
        int img_rows, int img_cols, int octave, bool use_mask, int nLayers);
 
     void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
        float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
        unsigned int* featureCounter);
 
-    void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
+    void icvCalcOrientation_gpu(const PtrStepSz<unsigned int>& sum, const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
 
-    void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
+    void compute_descriptors_gpu(const PtrStepSzb& img, PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
 }
}}}
@@ -138,10 +134,7 @@ namespace
    class SURF_CUDA_Invoker
    {
    public:
-        SURF_CUDA_Invoker(cv::cuda::SURF_CUDA& surf, const GpuMat& img, const GpuMat& mask) :
-            surf_(surf),
-            img_cols(img.cols), img_rows(img.rows),
-            use_mask(!mask.empty())
+        SURF_CUDA_Invoker(cv::cuda::SURF_CUDA& surf, const GpuMat& img_, const GpuMat& mask) : surf_(surf), img(img_), img_cols(img_.cols), img_rows(img_.rows), use_mask(!mask.empty())
        {
            CV_Assert(!img.empty() && img.type() == CV_8UC1);
            CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
@@ -167,16 +160,12 @@ namespace
            loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast<float>(surf_.hessianThreshold));
 
-            bindImgTex(img);
-
            cuda::integral(img, surf_.sum);
-            sumOffset = bindSumTex(surf_.sum);
 
            if (use_mask)
            {
                cuda::min(mask, 1.0, surf_.mask1);
                cuda::integral(surf_.mask1, surf_.maskSum);
-                maskOffset = bindMaskSumTex(surf_.maskSum);
            }
        }
@@ -195,9 +184,9 @@ namespace
            const int layer_cols = img_cols >> octave;
 
            loadOctaveConstants(octave, layer_rows, layer_cols);
 
-            icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers);
+            icvCalcLayerDetAndTrace_gpu(surf_.sum, surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers);
 
-            icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer.ptr<int4>(), counters.ptr<unsigned int>() + 1 + octave,
+            icvFindMaximaInLayer_gpu(surf_.maskSum, surf_.det, surf_.trace, surf_.maxPosBuffer.ptr<int4>(), counters.ptr<unsigned int>() + 1 + octave,
                img_rows, img_cols, octave, use_mask, surf_.nOctaveLayers);
 
            unsigned int maxCounter;
@@ -230,7 +219,7 @@ namespace
            const int nFeatures = keypoints.cols;
            if (nFeatures > 0)
            {
-                icvCalcOrientation_gpu(keypoints.ptr<float>(SURF_CUDA::X_ROW), keypoints.ptr<float>(SURF_CUDA::Y_ROW),
+                icvCalcOrientation_gpu(surf_.sum, keypoints.ptr<float>(SURF_CUDA::X_ROW), keypoints.ptr<float>(SURF_CUDA::Y_ROW),
                    keypoints.ptr<float>(SURF_CUDA::SIZE_ROW), keypoints.ptr<float>(SURF_CUDA::ANGLE_ROW), nFeatures);
            }
        }
@@ -241,7 +230,7 @@ namespace
            if (nFeatures > 0)
            {
                ensureSizeIsEnough(nFeatures, descriptorSize, CV_32F, descriptors);
-                compute_descriptors_gpu(descriptors, keypoints.ptr<float>(SURF_CUDA::X_ROW), keypoints.ptr<float>(SURF_CUDA::Y_ROW),
+                compute_descriptors_gpu(img, descriptors, keypoints.ptr<float>(SURF_CUDA::X_ROW), keypoints.ptr<float>(SURF_CUDA::Y_ROW),
                    keypoints.ptr<float>(SURF_CUDA::SIZE_ROW), keypoints.ptr<float>(SURF_CUDA::ANGLE_ROW), nFeatures);
            }
        }
@@ -252,6 +241,8 @@ namespace
        SURF_CUDA& surf_;
 
+        GpuMat img;
+
        int img_cols, img_rows;
 
        bool use_mask;
@@ -259,9 +250,6 @@ namespace
        int maxCandidates;
        int maxFeatures;
 
-        size_t maskOffset;
-        size_t sumOffset;
-
        GpuMat counters;
    };
 }
